multithreaded crawler while using tor proxy - python

I am trying to build a multithreaded crawler that uses Tor proxies:
I am using following to establish tor connection:
from stem import Signal
from stem.control import Controller
# Tor control connection on port 9151, opened once at import time and shared
# by renew_tor(), which authenticates on it and sends NEWNYM.
controller = Controller.from_port(port=9151)
def connectTor():
    """Send every newly created socket through the SOCKS5 proxy at 127.0.0.1:9150.

    Sets the ``socks`` module's default proxy and then rebinds
    ``socket.socket`` to ``socks.socksocket``, so the change applies to the
    whole process, not only to the caller.
    """
    proxy_host, proxy_port = "127.0.0.1", 9150
    socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, proxy_host, proxy_port)
    socket.socket = socks.socksocket
def renew_tor():
    """Pick fresh request headers and ask Tor for a new identity.

    Rebinds the module-level ``request_headers`` (User-Agent drawn at random
    from ``BROWSERS``), then authenticates on the shared ``controller`` and
    sends it ``Signal.NEWNYM``.
    """
    global request_headers
    user_agent = random.choice(BROWSERS)
    request_headers = {
        "Accept-Language": "en-US,en;q=0.5",
        "User-Agent": user_agent,
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Referer": "http://thewebsite2.com",
        "Connection": "close",
    }
    controller.authenticate()
    controller.signal(Signal.NEWNYM)
Here is url fetcher:
def get_soup(url):
    """Fetch ``url`` and return its parsed BeautifulSoup tree.

    Keeps retrying until a page that does not contain "captcha" is obtained.
    A captcha page triggers ``renew_tor()`` before the next attempt; any
    exception is printed and the fetch is tried again.
    """
    while True:
        try:
            connectTor()
            http = requests.Session()
            reply = http.get(url, headers=request_headers)
            html = reply.content.decode('utf-8', errors='ignore')
            soup = BeautifulSoup(html, 'html.parser')
            if "captcha" not in html.lower():
                return soup
            # Captcha page: report it, switch identity and loop again.
            print("flag condition matched while url: ", url)
            renew_tor()
        except Exception as exc:
            print ("Error while URL :", url, str(exc))
I am then creating multithreaded fetch job:
# Submit one fetchjob per URL to a pool of 200 worker threads; leaving the
# ``with`` block waits for the submitted jobs to finish.
with futures.ThreadPoolExecutor(max_workers=200) as executor:
    for url in zurls:
        future = executor.submit(fetchjob, url)
then I am getting following error, which I am not seeing when I use multiprocessing:
Socket connection failed (Socket error: 0x01: General SOCKS server failure)
I would appreciate any advice on how to avoid the SOCKS error and how to improve the performance of the crawler by making it multithreaded.

This is a perfect example of why monkey patching socket.socket is bad.
This replaces the socket used by all socket connections (which is most everything) with the SOCKS socket.
When you go to connect to the controller later, it attempts to use the SOCKS protocol to communicate instead of establishing a direct connection.
Since you're already using requests, I'd suggest getting rid of SocksiPy and the socket.socket = socks.socksocket code, and using the SOCKS proxy functionality built into requests instead:
# Use the SOCKS port your Tor instance listens on. 9150 matches the port the
# question's connectTor() already uses; a standalone tor daemon is usually
# configured on 9050 instead.
# The "socks5h" scheme lets the proxy resolve host names, so DNS lookups
# also go through Tor.
proxies = {
    'http': 'socks5h://127.0.0.1:9150',
    'https': 'socks5h://127.0.0.1:9150',
}
response = r.get(url, headers=request_headers, proxies=proxies)

Related

How to make a proxy server for python requests

I have seen code like this that shows how to use a proxy for python requests.
import requests
# Both schemes are sent to the same local proxy endpoint.
proxy_url = 'http://localhost:7777'
proxies = {'http': proxy_url, 'https': proxy_url}
for target in ('http://example.org', 'https://example.org'):
    requests.get(target, proxies=proxies)
But I am wondering how can we make a very simple proxy server in Python that would be able to respond to the GET request?
You can find many examples how to do it - even in questions on Stackoverflow.
Some of them use the standard module socket (but that doesn't look simple).
Others use the standard module http, but they show code for Python 2, which used different module names.
Version for Python 3
import http.server
import socketserver
import urllib.request
class MyProxy(http.server.SimpleHTTPRequestHandler):
    """Minimal forwarding handler for plain-HTTP GET requests.

    The request path is treated as the absolute URL to fetch; the handler
    retrieves it with ``urllib`` and relays the body to the client.
    """

    def do_GET(self):
        """Fetch ``self.path`` upstream and relay the body to the client.

        The upstream fetch happens *before* any status line is written, so a
        failed fetch is reported as an error status instead of a ``200`` with
        an empty body. The upstream response is closed once it is copied.
        """
        print(self.path)
        url = self.path
        try:
            upstream = urllib.request.urlopen(url)
        except urllib.error.HTTPError as exc:
            # Upstream answered with an error status: relay that status.
            exc.close()
            self.send_error(exc.code, exc.reason)
            return
        except (urllib.error.URLError, ValueError) as exc:
            # URLError: target unreachable; ValueError: malformed URL.
            self.send_error(502, explain=str(exc))
            return
        with upstream:
            self.send_response(200)
            self.end_headers()
            self.copyfile(upstream, self.wfile)
# --- main ---
PORT = 7777
httpd = None
try:
    socketserver.TCPServer.allow_reuse_address = True  # solution for `OSError: [Errno 98] Address already in use`
    httpd = socketserver.TCPServer(('', PORT), MyProxy)
    print(f"Proxy at: http://localhost:{PORT}")
    httpd.serve_forever()
except KeyboardInterrupt:
    print("Pressed Ctrl+C")
finally:
    if httpd:
        # serve_forever() has already returned at this point, so there is no
        # loop left for shutdown() to stop; what still has to be released is
        # the listening socket.
        httpd.server_close()
Test using page httpbin.org
import requests
# Send both schemes to the local proxy on port 7777.
proxy_url = 'http://localhost:7777'
proxies = dict.fromkeys(('http', 'https'), proxy_url)
for target in (
    'http://httpbin.org/get',
    'http://httpbin.org/get?arg1=hello&arg2=world',
):
    response = requests.get(target, proxies=proxies)
    print(response.text)
But it works only for HTTP.
For HTTPS it may need to use ssl.socket from module ssl.
And it works only with GET.
For POST, PUT, DELETE, etc. it would need do_POST, do_PUT, do_DELETE, etc. with different code.
EDIT:
def do_POST(self):
    """Forward a POST to ``self.path`` and relay the upstream body.

    Reads ``Content-Length`` bytes of request body (none when the header is
    absent or zero), posts them to the target URL, and copies the reply to
    the client. The upstream request is made before a status line is sent,
    so failures are reported as an error status; the upstream response is
    closed after copying.
    """
    url = self.path
    # - post data -
    content_length = int(self.headers.get('Content-Length', 0))  # size of data
    content = self.rfile.read(content_length) if content_length else None
    # Pass the client's Content-Type through; without it urllib labels every
    # request body as form-encoded.
    forwarded = {}
    content_type = self.headers.get('Content-Type')
    if content_type:
        forwarded['Content-Type'] = content_type
    try:
        req = urllib.request.Request(url, method="POST", data=content, headers=forwarded)
        output = urllib.request.urlopen(req)
    except urllib.error.HTTPError as exc:
        # Upstream answered with an error status: relay that status.
        exc.close()
        self.send_error(exc.code, exc.reason)
        return
    except (urllib.error.URLError, ValueError) as exc:
        # URLError: target unreachable; ValueError: malformed URL.
        self.send_error(502, explain=str(exc))
        return
    # ---
    with output:
        self.send_response(200)
        self.end_headers()
        self.copyfile(output, self.wfile)
But if you need local proxy only to test your code then you could use
Python module/program: mitmproxy (Man-In-The-Middle-Proxy)
not-python, not-free (but work 30 days for free), with nice GUI: Charles Proxy
More complex OWASP ZAP, Burp Suite (community edition)

stem.connection.IncorrectSocketType: unable to use the control socket

Trying to send requests over Tor with requests[socks] and renew the IP after x amount of requests.
This is the code:
def tor():
    """Return a requests session whose http and https traffic uses the
    SOCKS5 proxy at 127.0.0.1:9050."""
    tor_session = requests.session()
    socks_url = 'socks5://127.0.0.1:9050'
    tor_session.proxies = {'http': socks_url, 'https': socks_url}
    return tor_session
# Module-level session used by get_dorks(); it carries the proxies set in tor().
session = tor()
def renew_tor_ip():
    """Open a stem controller on port 9050, authenticate with the password
    and send ``Signal.NEWNYM``.

    NOTE: 9050 is the port as posted in the question; the answer that
    follows it names 9051 as the control port.
    """
    control_port = 9050
    with Controller.from_port(port=control_port) as ctl:
        ctl.authenticate(password="mypassword")
        ctl.signal(Signal.NEWNYM)
def get_dorks(pages):
    """Request ``"url" + str(n)`` for n = 1..pages through the shared session.

    ``renew_tor_ip()`` is called once, right after request number 20.

    The original incremented the loop variable before testing ``i == 0``,
    which made that branch unreachable; the loop now counts from 1 directly
    and the unused response variable is gone. The requests issued are the
    same.
    """
    for page in range(1, pages + 1):
        session.get("url" + str(page))
        if page == 20:
            renew_tor_ip()
when running this I get
stem.connection.IncorrectSocketType: unable to use the control socket
edit: i changed my torrc file following the "a little relay" article from stem
edit2: have not found a solution yet.
9050 is not the port for the controller.
9051 is the port you are looking for.
Change `with Controller.from_port(port = 9050) as controller`
to `with Controller.from_port(port = 9051) as controller`.
You also have to enable the port in the config file of tor.

changing ip in iteration with tor python

I want to change my IP every time I run through the loop. I am trying to achieve this with Tor. I have seen a few posts with similar questions, but the solutions given there are not working. So far my code looks like this:
import socks
#import socket
import requests
import time
# Two attempts: set the default SOCKS5 proxy, then print what the
# IP-echo page returns.
for i in range(1, 3):
    socks.setdefaultproxy(proxy_type=socks.PROXY_TYPE_SOCKS5, addr="127.0.0.1", port=9050)
    try:
        reply = requests.get("http://icanhazip.com")
        print (reply.text)
    except Exception as err:
        # Wait before reporting the failure, as the original does.
        time.sleep(30)
        print (type(err))
        print (err)
I need different IP every time, instead of same IP.
edit : I have tried using approach given on How to change Tor identity in Python?. My limitation is not to use any external libraries. also solution provided by Nedim is without external library.
so far I have tried following to get different IP from mentioned link:
import socket
import sys
import os
# Attempt to authenticate on the Tor control port (9051) and request NEWNYM.
# Any failure, including a refused connection, is reported on stderr.
try:
    control_addr = ("127.0.0.1", 9051)
    tor_c = socket.create_connection(control_addr)
    secret = os.urandom(32)  # pass this to authenticate
    hashed_secret = tor_c.s2k_gen(secret)  # pass this to Tor on startup.
    command = 'AUTHENTICATE "{}"\r\nSIGNAL NEWNYM\r\n'.format(hashed_secret)
    tor_c.send(command)
    response = tor_c.recv(1024)
    expected = '250 OK\r\n250 OK\r\n'
    if not response == expected:
        sys.stderr.write('Unexpected response from Tor control port: {}\n'.format(response))
except Exception as err:
    sys.stderr.write('Error connecting to Tor control port: {}\n'.format(repr(err)))
but it is throwing following error:
Error connecting to Tor control port: ConnectionRefusedError(10061, 'No connection could be made because the target machine actively refused it', None, 10061, None)
def renew_connection(port=9051, password='password'):
    """Ask Tor for a new identity by sending ``Signal.NEWNYM``.

    Args:
        port: Tor control port to connect to.
        password: Control-port password used to authenticate.

    The controller is used as a context manager, which closes it on exit,
    so no explicit ``close()`` call is needed inside the block.
    """
    with Controller.from_port(port=port) as controller:
        controller.authenticate(password=password)
        controller.signal(Signal.NEWNYM)
def request_tor(url, headers):
    """Print the body of ``url`` fetched through Tor, then fetched directly.

    Args:
        url: Address to request.
        headers: Request headers sent with the proxied request.
    """
    # Map both schemes: with only 'http' listed, an https:// URL would be
    # requested without the proxy.
    tor_proxies = {
        'http': 'socks5h://localhost:9050',
        'https': 'socks5h://localhost:9050',
    }
    print((requests.get(url, proxies=tor_proxies, headers=headers)).text)
    # Deliberately unproxied request, for comparison.
    r = requests.get(url)
    print('direct IP:', r.text)
if __name__ == "__main__":
    # Five rounds: fetch through Tor, request a new identity, pause 5 seconds.
    url = 'http://icanhazip.com'
    headers = {'User-Agent': UserAgent().random}
    rounds = 5
    for i in range(rounds):
        request_tor(url, headers)
        renew_connection()
        time.sleep(5)

Python twisted proxy to send 2 requests

How can I work on this code to be able to send 2 separate requests. The requests would be in this order:
Request1 :
HEAD http://google.com
Host: google.com
... wait for reply from google server ...
Request2 :
GET http://yahoo.com HTTP/1.1
User-Agent: mozilla
Accept: */*
... second request sent from browser while first request is static for all requests ...
The code I’m trying to modify is:
from twisted.web import proxy, http
class SnifferProxy(proxy.Proxy):
    """Proxy protocol that prints each request before the base class handles it."""

    def allContentReceived(self):
        """Print the request's method and path, then delegate to ``proxy.Proxy``."""
        # print(...) with a single argument produces the same output as the
        # Python 2 print statement.
        print("Received data...")
        print("method = %s" % self._command)
        print("action = %s" % self._path)
        print("ended content manipulation\n\n")
        return proxy.Proxy.allContentReceived(self)
class ProxyFactory(http.HTTPFactory):
    """HTTP factory whose connections are handled by ``SnifferProxy``."""

    protocol = SnifferProxy
if __name__ == "__main__":
    from twisted.internet import reactor

    # Listen on TCP port 8080 and run the reactor until it is stopped.
    listen_port = 8080
    reactor.listenTCP(listen_port, ProxyFactory())
    reactor.run()
The twisted proxy would be connecting to another external proxy
Any help is appreciated..
I think you can get what you want by adding the call to the Proxy.allContentReceived method as a callback to a HEAD request using Agent.
from twisted.internet import reactor
from twisted.web import proxy, http
from twisted.web.client import Agent
from twisted.web.http_headers import Headers

# HTTP client bound to the reactor; SnifferProxy uses it for the HEAD request.
agent = Agent(reactor)
class SnifferProxy(proxy.Proxy):
    """Proxy that issues a fixed HEAD request before handling each client request."""

    def allContentReceived(self):
        """Send the HEAD request; hand over to ``proxy.Proxy`` once it has answered."""

        def cbHead(result):
            print("got response for HEAD")

        def doProxiedRequest(result):
            # Runs after cbHead: now let the base class process the request.
            proxy.Proxy.allContentReceived(self)

        # I assumed self._path, but it looks OP wants to do the
        # HEAD request to the same path always
        PATH = "http://foo.bar"
        head_headers = Headers({'User-Agent': ['twisted']})
        d = agent.request('HEAD', PATH, head_headers, None)
        d.addCallback(cbHead)
        d.addCallback(doProxiedRequest)

How to change Tor identity in Python?

I have the following script:
import socks
import socket
# Make SOCKS5 at 127.0.0.1:9050 the default proxy, then replace
# socket.socket so sockets created from here on go through it.
socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, "127.0.0.1", 9050)
socket.socket = socks.socksocket
import urllib2
# Print the response body of the /ip endpoint (presumably the caller's
# public IP as seen by that service).
print(urllib2.urlopen("http://www.ifconfig.me/ip").read())
which uses tor and SocksiPy
Now I want to change tor identity with each request, for example:
# Ten fetches; the identity change is still a placeholder here.
for i in range(10):
    #somehow change tor identity
    ip_page = urllib2.urlopen("http://www.ifconfig.me/ip")
    print(ip_page.read())
How can I do this?
Tor wrote a new TOR control library in Python, stem. It can be found on PyPI. They provide some nice tutorials how to work with it, one of them explains how to change your identity:
from stem import Signal
from stem.control import Controller
# Connect to the control port, authenticate, and send NEWNYM.
with Controller.from_port(port=9051) as tor_control:
    tor_control.authenticate()
    tor_control.signal(Signal.NEWNYM)
Make sure your config is correct.
Today I searched a lot about this question and finally managed to answer it myself. But first I need to say that Privoxy and Tor must be configured correctly. First the script, then a little bit about configuration:
import urllib2
from TorCtl import TorCtl
# Opener that sends plain-HTTP requests through the proxy at 127.0.0.1:8118
# (presumably the Privoxy instance configured in the text that follows).
proxy_support = urllib2.ProxyHandler({"http" : "127.0.0.1:8118"})
opener = urllib2.build_opener(proxy_support)
def newId():
    """Connect to the Tor control port at 127.0.0.1:9051 and send NEWNYM.

    NOTE(review): the connection is not closed here, and each call opens a
    new one -- confirm whether that matters for long runs.
    """
    tor_conn = TorCtl.connect(
        controlAddr="127.0.0.1",
        controlPort=9051,
        passphrase="your_password",
    )
    tor_conn.send_signal("NEWNYM")
# Install the proxy-aware opener once. It does not change between
# iterations, and the ProxyHandler the original rebuilt inside the loop was
# never used.
urllib2.install_opener(opener)
for i in range(0, 10):
    # print(...) with a single argument matches the Python 2 print statement.
    print("case " + str(i + 1))
    newId()
    print(urllib2.urlopen("http://www.ifconfig.me/ip").read())
Above script gets new IP and checks it from ifconfig.me web site. About configuration:
We need Privoxy. To use Tor with HTTP connections, Privoxy has to work with Tor. We can arrange that by adding this to the /etc/privoxy/config file:
forward-socks5 / localhost:9050 . #dot is important at the end
then we configure ControlPort in /etc/tor/torrc file. We need just uncomment this line:
ControlPort 9051
## If you enable the controlport, be sure to enable one of these
## authentication methods, to prevent attackers from accessing it.
HashedControlPassword 16:872860B76453A77D60CA2BB8C1A7042072093276A3D701AD684053EC4C
then we just restart tor:
/etc/init.d/tor restart
Another simple solution, no external libraries required, works for both IPv4 and IPv6:
import socket
import sys

# Authenticate on the Tor control port and request a new identity (NEWNYM).
# TOR_CTRL_HOST, TOR_CTRL_PORT and TOR_CTRL_PWD must be defined by the caller.
try:
    tor_c = socket.create_connection((TOR_CTRL_HOST, TOR_CTRL_PORT))
    try:
        command = 'AUTHENTICATE "{}"\r\nSIGNAL NEWNYM\r\n'.format(TOR_CTRL_PWD)
        # Sockets carry bytes: encode when the command is text (Python 3).
        if not isinstance(command, bytes):
            command = command.encode('utf-8')
        tor_c.sendall(command)
        response = tor_c.recv(1024)
    finally:
        # Always release the control connection.
        tor_c.close()
    if response != b'250 OK\r\n250 OK\r\n':
        sys.stderr.write('Unexpected response from Tor control port: {}\n'.format(response))
except Exception as e:
    sys.stderr.write('Error connecting to Tor control port: {}\n'.format(repr(e)))
This is a video where im using STEM, SockSipy, Tor 100% working :)
#!/usr/bin/python
import socks
import socket
import time
from stem.control import Controller
from stem import Signal
import urllib2
import sys
def info():
    """Print the script's banner and usage lines to stdout."""
    # print(...) with a single argument behaves the same on Python 2 and 3.
    for line in (
        "[*] Welcome to Chart-Cheat Script",
        "[*] This script works with running TOR only",
        "[*] usage is chartcheat.py domain",
        "[*] argument domain must be in format www.example.com",
        "[*] Example: chartcheat.py www.example.com",
    ):
        print(line)
if len(sys.argv) == 2:
    info()
    counter = 0
    url = str(sys.argv[1])
    with Controller.from_port(port=9051) as controller:
        controller.authenticate()
        # socket.socket is replaced only after the control connection is
        # open, so that connection itself is a plain (non-SOCKS) socket.
        socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, "127.0.0.1", 9050)
        socket.socket = socks.socksocket
        # visiting url in infinite loop
        while True:
            # Close each response; otherwise the endless loop keeps
            # accumulating open connections.
            urllib2.urlopen("http://" + url).close()
            counter = counter + 1
            print("Page " + url + " visited = " + str(counter))
            # wait till next identity will be available
            controller.signal(Signal.NEWNYM)
            time.sleep(controller.get_newnym_wait())
else:
    info()
If you are running Python 3, the urllib.request module provides what the urllib2 package provided in Python 2.
You can enable tor control server by uncommenting few lines in
/etc/tor/torrc
And use stem library to send NEWNYM signal to change circuit.
controller.signal(Signal.NEWNYM)
You can read tutorial here.
you can write something like this :
def renew_connection(port=9051, password='password'):
    """Ask Tor for a new identity by sending ``Signal.NEWNYM``.

    Args:
        port: Tor control port to connect to.
        password: Control-port password used to authenticate.

    The controller is used as a context manager, which closes it on exit,
    so no explicit ``close()`` call is needed inside the block.
    """
    with Controller.from_port(port=port) as controller:
        controller.authenticate(password=password)
        controller.signal(Signal.NEWNYM)
def request_tor(url, headers):
    """Renew the Tor identity, then print the body of ``url`` fetched via Tor.

    Args:
        url: Address to request.
        headers: Request headers to send (the original accepted this
            argument but never used it).
    """
    renew_connection()
    # The session is closed when the block exits.
    with requests.session() as session:
        # Map both schemes: with only 'http' listed, an https:// URL would
        # be requested without the proxy.
        session.proxies = {
            'http': 'socks5h://localhost:9050',
            'https': 'socks5h://localhost:9050',
        }
        print((session.get(url, headers=headers)).text)
The following could work:
# Each iteration points the default proxy at the next port (9050, 9051, ...)
# before fetching.
for i in range(10):
    #somehow change tor identity
    proxy_port = 9050 + i
    socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, "127.0.0.1", proxy_port)
    socket.socket = socks.socksocket
    print(urllib2.urlopen("http://www.ifconfig.me/ip").read())
You basically set the proxy prior to making each connection. I am assuming that you have different proxies for different IPs, since you have not stated how you intend to change the IP.

Categories