When I download a file using a single request, I do the following:
session = requests.Session()
params = {'fd': 1, 'count': 1024, 'auth': 'auth_token'}
r = session.get('https://httpbin.org/bytes/9', params=params)
print(r.content)
# b'\xb3_\\l\xe2\xbf/:\x07'
How can I do multiple requests without waiting for an answer?
The server API docs say:
You can push multiple requests over single connection without waiting
for answer, to improve performance. The server will process the
requests in the order they are received and you are guaranteed to
receive answers in the same order. It is important however to send all
requests with "Connection: keep-alive", otherwise the API server will
close the connection without processing the pending requests.
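Concretely, the behaviour described above looks roughly like this at the socket level (an illustrative sketch with plain sockets, not requests):
import socket

sock = socket.create_connection(('httpbin.org', 80))
for path in ('/bytes/9', '/bytes/9'):
    # Write both requests back-to-back on the same keep-alive connection,
    # without reading the first response in between.
    sock.sendall(('GET {} HTTP/1.1\r\n'
                  'Host: httpbin.org\r\n'
                  'Connection: keep-alive\r\n\r\n').format(path).encode())
# The responses arrive on the same socket, in the order the requests were sent;
# a real client would parse each response in turn rather than do a single recv().
print(sock.recv(65535))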
They are talking about one thread and multiple requests without waiting for the answers. I suppose this is called HTTP pipelining.
How can I do this with the Python Requests library?
A similar answer suggests using parallel calls, which is not what my question is about. It also says: "requests does pool connections, keeping the TCP connection open". How can I implement that?
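For reference, connection re-use (pooling, not pipelining) with requests only needs a Session, optionally with an explicitly sized HTTPAdapter; a minimal sketch:
import requests
from requests.adapters import HTTPAdapter

session = requests.Session()
# One pool per host, one persistent connection re-used for sequential requests.
session.mount('https://', HTTPAdapter(pool_connections=1, pool_maxsize=1))

params = {'fd': 1, 'count': 1024, 'auth': 'auth_token'}
for _ in range(3):
    # Each call still waits for its response before the next is sent (no pipelining).
    r = session.get('https://httpbin.org/bytes/9', params=params)
    print(r.content)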
Can I use any other synchronous library, if it's not possible for requests?
You can get several pages in parallel, without threads. The code below exploits HTTP pipelining by resetting the state (a private variable!) of HTTPSConnection to trick it into sending the next request ahead of time.
from http.client import HTTPSConnection, _CS_IDLE
from urllib.parse import urlparse, urlunparse


def pipeline(host, pages, max_out_bound=4, debuglevel=0):
    page_count = len(pages)
    conn = HTTPSConnection(host)
    conn.set_debuglevel(debuglevel)
    responses = [None] * page_count
    finished = [False] * page_count
    content = [None] * page_count
    headers = {'Host': host, 'Content-Length': 0, 'Connection': 'Keep-Alive'}

    while not all(finished):
        # Send
        out_bound = 0
        for i, page in enumerate(pages):
            if out_bound >= max_out_bound:
                break
            elif page and not finished[i] and responses[i] is None:
                if debuglevel > 0:
                    print('Sending request for %r...' % (page,))
                conn._HTTPConnection__state = _CS_IDLE  # private variable!
                conn.request("GET", page, None, headers)
                responses[i] = conn.response_class(conn.sock, method=conn._method)
                out_bound += 1
        # Try to read a response
        for i, resp in enumerate(responses):
            if resp is None:
                continue
            if debuglevel > 0:
                print('Retrieving %r...' % (pages[i],))
            out_bound -= 1
            skip_read = False
            resp.begin()
            if debuglevel > 0:
                print(' %d %s' % (resp.status, resp.reason))
            if 200 <= resp.status < 300:
                # Ok
                content[i] = resp.read()
                cookie = resp.getheader('Set-Cookie')
                if cookie is not None:
                    headers['Cookie'] = cookie
                skip_read = True
                finished[i] = True
                responses[i] = None
            elif 300 <= resp.status < 400:
                # Redirect
                loc = resp.getheader('Location')
                responses[i] = None
                parsed = loc and urlparse(loc)
                if not parsed:
                    # Missing or empty location header
                    content[i] = (resp.status, resp.reason)
                    finished[i] = True
                elif parsed.netloc != '' and parsed.netloc != host:
                    # Redirect to another host
                    content[i] = (resp.status, resp.reason, loc)
                    finished[i] = True
                else:
                    path = urlunparse(parsed._replace(scheme='', netloc='', fragment=''))
                    if debuglevel > 0:
                        print(' Updated %r to %r' % (pages[i], path))
                    pages[i] = path
            elif resp.status >= 400:
                # Failed
                content[i] = (resp.status, resp.reason)
                finished[i] = True
                responses[i] = None
            if resp.will_close:
                # Connection (will be) closed, need to resend
                conn.close()
                if debuglevel > 0:
                    print(' Connection closed')
                for j, f in enumerate(finished):
                    if not f and responses[j] is not None:
                        if debuglevel > 0:
                            print(' Discarding out-bound request for %r' % (pages[j],))
                        responses[j] = None
                break
            elif not skip_read:
                resp.read()  # read any data
            if any(not f and responses[j] is None for j, f in enumerate(finished)):
                # Send another pending request
                break
        else:
            break  # All responses are None?
    return content


if __name__ == '__main__':
    domain = 'en.wikipedia.org'
    pages = ['/wiki/HTTP_pipelining', '/wiki/HTTP', '/wiki/HTTP_persistent_connection']
    data = pipeline(domain, pages, max_out_bound=3, debuglevel=1)
    for i, page in enumerate(data):
        print()
        print('==== Page %r ====' % (pages[i],))
        print(page[:512])
I'm trying to get the status_code from various URLs in a CSV file using the requests Python module.
It works for some websites, but for most of them it shows 'Connection Refused', even though the websites load just fine when I visit them in a browser.
The code looks like this:
import pandas as pd
import requests
from requests.adapters import HTTPAdapter
from fake_useragent import UserAgent
import time
import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

df = pd.read_csv('Websites.csv')
output_data = pd.DataFrame(columns=['url', 'status'])
number_urls = df.shape[0]
i = 0

for url in df['urls']:
    session = requests.Session()
    adapter = HTTPAdapter(max_retries=3)
    adapter.max_retries.respect_retry_after_header = False
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    print(url)
    ua = UserAgent()
    header = {'User-Agent': str(ua.chrome)}
    try:
        # Status
        start = time.time()
        response = session.get(url, headers=header, verify=False, timeout=0.5)
        request_time = time.time() - start
        info = "Request completed in {0:.0f}ms".format(request_time)
        print(info)
        status = response.status_code
        if (status == 200):
            status = "Connection Successful"
        if (status == 404):
            status = "404 Error"
        if (status == 403):
            status = "403 Error"
        if (status == 503):
            status = "503 Error"
        print(status)
        output_data.loc[i] = [df.iloc[i, 0], status]
        i += 1
    except requests.exceptions.Timeout:
        status = "Connection Timed Out"
        print(status)
        request_time = time.time() - start
        info = "TimeOut in {0:.0f}ms".format(request_time)
        print(info)
        output_data.loc[i] = [df.iloc[i, 0], status]
        i += 1
    except requests.exceptions.ConnectionError:
        status = "Connection Refused"
        print(status)
        request_time = time.time() - start
        info = "Connection Error in {0:.0f}ms".format(request_time)
        print(info)
        output_data.loc[i] = [df.iloc[i, 0], status]
        i += 1

output_data.to_csv('dead_blocked2.csv', index=False)
print('CSV file created!')
Here's an example of one website that shows Connection Refused, even though it works: https://www.dytt8.net
I've tried using different TLS versions via the following piece of code and updating my session, but it still doesn't work:
import ssl
from urllib3.poolmanager import PoolManager  # HTTPAdapter is already imported above

class MyAdapter(HTTPAdapter):
    def init_poolmanager(self, connections, maxsize, block=False):
        self.poolmanager = PoolManager(num_pools=connections,
                                       maxsize=maxsize,
                                       block=block,
                                       ssl_version=ssl.PROTOCOL_TLSv1)
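For completeness, the custom adapter has to be mounted on the session that makes the requests, e.g.:
session = requests.Session()
# Use the TLSv1 pool manager defined above for all https:// URLs
session.mount('https://', MyAdapter(max_retries=3))
response = session.get(url, headers=header, verify=False, timeout=0.5)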
Can anyone help?
Thanks!
I've been using a function that I took from the book Web Scraping with Python, from O'Reilly, by Ryan Mitchell:
import sys
import os.path
import socket
import random
import urllib2
import contextlib
import diskCache
import logging as logger
from bs4 import BeautifulSoup

DEFAULT_AGENT = 'Mozilla/5.0 Firefox/56.0'
DEFAULT_DELAY = 3
DEFAULT_RETRIES = 10
DEFAULT_TIMEOUT = 60

socket.setdefaulttimeout(DEFAULT_TIMEOUT)


def download(url, delay=DEFAULT_DELAY, user_agent=DEFAULT_AGENT, proxies=None,
             cache=None, num_retries=DEFAULT_RETRIES, timeout=DEFAULT_TIMEOUT, data=None):
    result = None
    if cache:
        try:
            result = cache[url]
        except KeyError:
            # url is not available in cache
            pass
    if result is not None and result['code'] is not None \
            and num_retries > 0 and 500 <= result['code'] < 600:
        # server error so ignore result from cache and re-download
        result = None
    if result is None:
        proxy = random.choice(proxies) if proxies else None
        headers = {'User-agent': user_agent}
        result = call(url, headers, proxy=proxy, num_retries=num_retries, cache=cache)
        if cache:
            # save result to cache
            cache[url] = result
    return result['html']


def call(url, headers, proxy, num_retries, cache=None, data=None):
    request = urllib2.Request(url, data, headers or {})
    with contextlib.closing(urllib2.urlopen(request)) as connection:
        try:
            logger.info('Downloading: %s', url)
            html = connection.read()
            code = connection.getcode()
        except Exception as e:
            logger.exception('Download error:', str(e))
            if cache:
                del cache['url']
            html = None
            if hasattr(e, 'code'):
                code = e.code
                if num_retries > 0 and 500 <= code < 600:
                    return download(url, headers, num_retries-1, data)  # retry server errors
            else:
                code = None
    return {'html': html, 'code': code}
I wanted to know if there is a simpler way of handling the errors when downloading URLs. I've seen that the requests library is a higher-level, easier library, and maybe it could simplify this. At the very least, how would this code look in Python 3?
It would be something like
"""Functions used by the fetch module"""
# Standard library imports
import time
import socket
import logging as logger
from typing import Dict, Optional
# Third party imports
import requests
from requests.exceptions import HTTPError, Timeout
from bs4 import BeautifulSoup
# Constants
DEFAULT_AGENT = 'Mozilla/5.0 Firefox/56.0'
DEFAULT_DELAY = 3
DEFAULT_RETRIES = 10
DEFAULT_TIMEOUT = 60
socket.setdefaulttimeout(DEFAULT_TIMEOUT)
def fetch(url: str, retries: Optional[int] = DEFAULT_RETRIES) -> Dict:
"""Download an url"""
code = None
try:
logger.info('Downloading: %s', url)
resp = requests.get(url)
resp.raise_for_status()
code = resp.status_code
except (HTTPError, Timeout) as ex:
logger.exception("Couldn't download %s", ex)
return None
if code is not None and retries > 0 and \
500 <= code < 600: # Server error
logger.info('Retrying download')
time.sleep(DEFAULT_DELAY)
return fetch(url, retries-1)
return {'html': resp, 'code': code}
As you said, this is a lot easier with requests:
resp = requests.get(url, headers=headers, timeout=timeout)
print(resp.status_code)
print(resp.text)
# for an API use resp.json()
There is no exception raised by default. You can call resp.raise_for_status() if you do want to raise an exception.
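For example, to turn 4xx/5xx responses into exceptions and handle them, it would look something like this (reusing the url, headers and timeout from above):
import requests

try:
    resp = requests.get(url, headers=headers, timeout=timeout)
    resp.raise_for_status()  # raises requests.exceptions.HTTPError for 4xx/5xx
except requests.exceptions.HTTPError as err:
    print('Bad status:', err.response.status_code)
except requests.exceptions.Timeout:
    print('Request timed out')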
See http://docs.python-requests.org/en/master/user/quickstart/ for details
I'm trying to write a forwarding HTTP proxy with aiohttp. I've currently got it working with HTTP, but I want it to work with HTTPS as well (without decryption).
import asyncio
from aiohttp import web, ClientSession

loop = asyncio.get_event_loop()


async def handler(server_request):
    if server_request.method == "CONNECT":
        print(await server_request.read())
    else:
        async with ClientSession() as session:
            async with session.request(server_request.method, server_request.raw_path) as request:
                response = web.StreamResponse(status=200,
                                              reason='OK',
                                              headers={'Content-Type': 'text/html'})
                await response.prepare(server_request)
                while True:
                    chunk = await request.content.read()
                    if not chunk:
                        break
                    response.write(chunk)
                return response

server = web.Server(handler)
loop.run_until_complete(loop.create_server(server, "0.0.0.0", 8080))
try:
    loop.run_forever()
except KeyboardInterrupt:
    loop.close()
    pass
I've got to the point where I need to get the raw body to send down the tunnel to the destination, but I can't seem to access it.
If I attempt to read it, I get an exception:
b''
Unhandled exception
Traceback (most recent call last):
File "/usr/local/lib/python3.6/dist-packages/aiohttp/web_protocol.py", line 434, in start
yield from resp.prepare(request)
AttributeError: 'NoneType' object has no attribute 'prepare'
The basic idea in an HTTPS proxy is to act like a stream proxy: read raw data from the client and push raw data to the destination. Unfortunately, after skimming through aiohttp I could not find anything that would help achieve this, so you need to use basic asyncio.
The code snippet below shows the main logic behind the HTTPS part.
if head[0] == 'CONNECT':  # https proxy
    try:
        logger.info('%sBYPASSING <%s %s> (SSL connection)' %
                    ('[%s] ' % ident if verbose >= 1 else '', head[0], head[1]))
        m = REGEX_HOST.search(head[1])
        host = m.group(1)
        port = int(m.group(2))
        req_reader, req_writer = yield from asyncio.open_connection(host, port, ssl=False, loop=loop)
        client_writer.write(b'HTTP/1.1 200 Connection established\r\n\r\n')

        @asyncio.coroutine
        def relay_stream(reader, writer):
            try:
                while True:
                    line = yield from reader.read(1024)
                    if len(line) == 0:
                        break
                    writer.write(line)
            except:
                print_exc()

        tasks = [
            asyncio.async(relay_stream(client_reader, req_writer), loop=loop),
            asyncio.async(relay_stream(req_reader, client_writer), loop=loop),
        ]
        yield from asyncio.wait(tasks, loop=loop)
    except:
        print_exc()
    finally:
        return
The complete proxy code is below:
#!/usr/bin/env python3
VERSION = "v0.2.0"
"""
Copyright (c) 2013 devunt
Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the "Software"), to deal in the Software without
restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following
conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.
"""
import sys
if sys.version_info < (3, 4):
print('Error: You need python 3.4.0 or above.')
exit(1)
from argparse import ArgumentParser
from socket import TCP_NODELAY
from time import time
from traceback import print_exc
import asyncio
import logging
import random
import functools
import re
REGEX_HOST = re.compile(r'(.+?):([0-9]{1,5})')
REGEX_CONTENT_LENGTH = re.compile(r'\r\nContent-Length: ([0-9]+)\r\n', re.IGNORECASE)
REGEX_CONNECTION = re.compile(r'\r\nConnection: (.+)\r\n', re.IGNORECASE)
clients = {}
logging.basicConfig(level=logging.INFO, format='[%(asctime)s] {%(levelname)s} %(message)s')
logging.getLogger('asyncio').setLevel(logging.CRITICAL)
logger = logging.getLogger('warp')
verbose = 0
def accept_client(client_reader, client_writer, *, loop=None):
ident = hex(id(client_reader))[-6:]
task = asyncio.async(process_warp(client_reader, client_writer, loop=loop), loop=loop)
clients[task] = (client_reader, client_writer)
started_time = time()
def client_done(task):
del clients[task]
client_writer.close()
logger.debug('[%s] Connection closed (took %.5f seconds)' % (ident, time() - started_time))
logger.debug('[%s] Connection started' % ident)
task.add_done_callback(client_done)
#asyncio.coroutine
def process_warp(client_reader, client_writer, *, loop=None):
ident = str(hex(id(client_reader)))[-6:]
header = ''
payload = b''
try:
RECV_MAX_RETRY = 3
recvRetry = 0
while True:
line = yield from client_reader.readline()
if not line:
if len(header) == 0 and recvRetry < RECV_MAX_RETRY:
# handle the case when the client make connection but sending data is delayed for some reasons
recvRetry += 1
yield from asyncio.sleep(0.2, loop=loop)
continue
else:
break
if line == b'\r\n':
break
if line != b'':
header += line.decode()
m = REGEX_CONTENT_LENGTH.search(header)
if m:
cl = int(m.group(1))
while (len(payload) < cl):
payload += yield from client_reader.read(1024)
except:
print_exc()
if len(header) == 0:
logger.debug('[%s] !!! Task reject (empty request)' % ident)
return
req = header.split('\r\n')[:-1]
if len(req) < 4:
logger.debug('[%s] !!! Task reject (invalid request)' % ident)
return
head = req[0].split(' ')
if head[0] == 'CONNECT': # https proxy
try:
logger.info('%sBYPASSING <%s %s> (SSL connection)' %
('[%s] ' % ident if verbose >= 1 else '', head[0], head[1]))
m = REGEX_HOST.search(head[1])
host = m.group(1)
port = int(m.group(2))
req_reader, req_writer = yield from asyncio.open_connection(host, port, ssl=False, loop=loop)
client_writer.write(b'HTTP/1.1 200 Connection established\r\n\r\n')
#asyncio.coroutine
def relay_stream(reader, writer):
try:
while True:
line = yield from reader.read(1024)
if len(line) == 0:
break
writer.write(line)
except:
print_exc()
tasks = [
asyncio.async(relay_stream(client_reader, req_writer), loop=loop),
asyncio.async(relay_stream(req_reader, client_writer), loop=loop),
]
yield from asyncio.wait(tasks, loop=loop)
except:
print_exc()
finally:
return
phost = False
sreq = []
sreqHeaderEndIndex = 0
for line in req[1:]:
headerNameAndValue = line.split(': ', 1)
if len(headerNameAndValue) == 2:
headerName, headerValue = headerNameAndValue
else:
headerName, headerValue = headerNameAndValue[0], None
if headerName.lower() == "host":
phost = headerValue
elif headerName.lower() == "connection":
if headerValue.lower() in ('keep-alive', 'persist'):
# current version of this program does not support the HTTP keep-alive feature
sreq.append("Connection: close")
else:
sreq.append(line)
elif headerName.lower() != 'proxy-connection':
sreq.append(line)
if len(line) == 0 and sreqHeaderEndIndex == 0:
sreqHeaderEndIndex = len(sreq) - 1
if sreqHeaderEndIndex == 0:
sreqHeaderEndIndex = len(sreq)
m = REGEX_CONNECTION.search(header)
if not m:
sreq.insert(sreqHeaderEndIndex, "Connection: close")
if not phost:
phost = '127.0.0.1'
path = head[1][len(phost)+7:]
logger.info('%sWARPING <%s %s>' % ('[%s] ' % ident if verbose >= 1 else '', head[0], head[1]))
new_head = ' '.join([head[0], path, head[2]])
m = REGEX_HOST.search(phost)
if m:
host = m.group(1)
port = int(m.group(2))
else:
host = phost
port = 80
try:
req_reader, req_writer = yield from asyncio.open_connection(host, port, flags=TCP_NODELAY, loop=loop)
req_writer.write(('%s\r\n' % new_head).encode())
yield from req_writer.drain()
yield from asyncio.sleep(0.2, loop=loop)
def generate_dummyheaders():
def generate_rndstrs(strings, length):
return ''.join(random.choice(strings) for _ in range(length))
import string
return ['X-%s: %s\r\n' % (generate_rndstrs(string.ascii_uppercase, 16),
generate_rndstrs(string.ascii_letters + string.digits, 128)) for _ in range(32)]
req_writer.writelines(list(map(lambda x: x.encode(), generate_dummyheaders())))
yield from req_writer.drain()
req_writer.write(b'Host: ')
yield from req_writer.drain()
def feed_phost(phost):
i = 1
while phost:
yield random.randrange(2, 4), phost[:i]
phost = phost[i:]
i = random.randrange(2, 5)
for delay, c in feed_phost(phost):
yield from asyncio.sleep(delay / 10.0, loop=loop)
req_writer.write(c.encode())
yield from req_writer.drain()
req_writer.write(b'\r\n')
req_writer.writelines(list(map(lambda x: (x + '\r\n').encode(), sreq)))
req_writer.write(b'\r\n')
if payload != b'':
req_writer.write(payload)
req_writer.write(b'\r\n')
yield from req_writer.drain()
try:
while True:
buf = yield from req_reader.read(1024)
if len(buf) == 0:
break
client_writer.write(buf)
except:
print_exc()
except:
print_exc()
client_writer.close()
#asyncio.coroutine
def start_warp_server(host, port, *, loop = None):
try:
accept = functools.partial(accept_client, loop=loop)
server = yield from asyncio.start_server(accept, host=host, port=port, loop=loop)
except OSError as ex:
logger.critical('!!! Failed to bind server at [%s:%d]: %s' % (host, port, ex.args[1]))
raise
else:
logger.info('Server bound at [%s:%d].' % (host, port))
return server
def main():
"""CLI frontend function. It takes command line options e.g. host,
port and provides `--help` message.
"""
parser = ArgumentParser(description='Simple HTTP transparent proxy')
parser.add_argument('-H', '--host', default='127.0.0.1',
help='Host to listen [default: %(default)s]')
parser.add_argument('-p', '--port', type=int, default=8800,
help='Port to listen [default: %(default)d]')
parser.add_argument('-v', '--verbose', action='count', default=0,
help='Print verbose')
args = parser.parse_args()
if not (1 <= args.port <= 65535):
parser.error('port must be 1-65535')
if args.verbose >= 3:
parser.error('verbose level must be 1-2')
if args.verbose >= 1:
logger.setLevel(logging.DEBUG)
if args.verbose >= 2:
logging.getLogger('warp').setLevel(logging.DEBUG)
logging.getLogger('asyncio').setLevel(logging.DEBUG)
global verbose
verbose = args.verbose
loop = asyncio.get_event_loop()
try:
loop.run_until_complete(start_warp_server(args.host, args.port))
loop.run_forever()
except OSError:
pass
except KeyboardInterrupt:
print('bye')
finally:
loop.close()
if __name__ == '__main__':
exit(main())
PS: The code has been taken from https://github.com/devunt/warp. The complete code is posted here so that if the link becomes invalid in future, the answer is still valid.
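To try it out, save the script (e.g. as warp.py), start it, and point an ordinary requests call at it; a rough sketch using the script's defaults (127.0.0.1:8800):
# In one terminal: python3 warp.py -H 127.0.0.1 -p 8800
import requests

proxies = {
    'http': 'http://127.0.0.1:8800',
    'https': 'http://127.0.0.1:8800',  # HTTPS goes through the CONNECT/bypass path
}
print(requests.get('https://httpbin.org/ip', proxies=proxies).text)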
I am trying to create a data endpoint that streams either the entirety of a file or responds appropriately to range requests. Streaming the whole file seems understandable, but it's not clear to me how to deal with range requests. Particularly, I can't see how aiohttp.MultipartWriter can write to a StreamResponse.
Here's an abstracted form of my code, so far:
from aiohttp.web import Request, StreamResponse
from aiohttp.multipart import MultipartWriter


async def data_handler(req: Request) -> StreamResponse:
    is_range_request = "Range" in req.headers

    with open("my_big_file", "rb") as f:
        if is_range_request:
            status_code = 202
            content_type = "multipart/bytes"
        else:
            status_code = 200
            content_type = "application/octet-stream"

        resp = StreamResponse(status=status_code, headers={"Content-Type": content_type})
        resp.enable_chunked_encoding()
        resp.enable_compression()
        await resp.prepare(req)

        if is_range_request:
            # _parse_range_header :: str -> List[ByteRange]
            # ByteRange = Tuple[int, int] i.e., "from" and "to", inclusive
            ranges = _parse_range_header(req.headers["Range"])

            mpwriter = MultipartWriter("bytes")
            for r in ranges:
                range_from, range_to = r
                range_size = (range_to - range_from) + 1
                range_header = {"Content-Type": "application/octet-stream"}

                # FIXME Won't this block?
                f.seek(range_from)
                mpwriter.append(f.read(range_size), range_header)

            # TODO Write to response. How?...

        else:
            while True:
                data = f.read(8192)
                if not data:
                    await resp.drain()
                    break
                resp.write(data)

    return resp
This also doesn't return the response until it gets to the end, which doesn't seem correct to me: how does an upstream caller know what's going on before the response is returned, or is the asyncio machinery doing this for me automagically?
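For what it's worth, a minimal sketch of the _parse_range_header helper referenced above might look like this (assuming simple 'bytes=from-to' ranges only, with no suffix or open-ended forms):
from typing import List, Tuple

def _parse_range_header(value: str) -> List[Tuple[int, int]]:
    """Parse 'bytes=0-99,200-299' into [(0, 99), (200, 299)] (inclusive bounds)."""
    unit, _, spec = value.partition("=")
    if unit.strip() != "bytes":
        raise ValueError("Unsupported range unit: %r" % unit)
    ranges = []
    for part in spec.split(","):
        start, _, end = part.strip().partition("-")
        ranges.append((int(start), int(end)))
    return ranges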
I've got a rather weird problem. The following code in scrape1 sometimes works as it should, but most of the time it just stops at line 24, where requests.get is called. I do, however, consistently get this KeyError exception:
Exception KeyError: KeyError(140186412830800,) in module <'threading' from '/usr/lib/python2.7/threading.pyc'> ignored
The exception is only thrown when I import the module proxyfetch.py, but as long as I don't actually execute the code in proxyfetch.py, scrape1.py doesn't break (the exception is thrown after nominal execution). proxyfetch is based on DanMcInerney's elite-proxy-finder on GitHub. I just edited it so I could use it as a module that returns a list of proxies instead of printing them.
So here are the two scripts:
scrape1.py:
#scrape1.py
from bs4 import BeautifulSoup
from proxyfetch import getprox
import requests

proxcount = 3
listz = getprox(proxcount)
proxfile = open("proxysave.txt", "w")
base_url = "http://google.com"


def pagefetch(url):
    print "Test"
    http_proxy = "http://"+listz[0]
    #http_proxy = "http://103.25.203.227:3127"
    print "Test2"
    proxydict = {
        "http": http_proxy
        #"https_proxy" : https_proxy
    }
    print "Test3"
    page = requests.get(url, proxies=proxydict)  # with proxy
    #page = requests.get(url)  # without proxy
    print "Test4"
    return page


page = pagefetch(base_url)
soup = BeautifulSoup(page.text)
links = soup.find_all("a")

if links:
    for n in links:
        print n
else:
    print "I got nuthin."
And proxyfetch.py
#!/usr/bin/env python2
#proxyfetch.py
'''Finds hundreds of elite anonymity (L1) HTTP proxies then tests them all in parallel printing the fastest ones first.
Checks headers to confirm eliteness, checks if compatible with opening HTTPS sites, and confirms the proxy is working
through multiple IP checking sites'''
# TO DO:
# -Add http://free-proxy-list.net/
# -Add hidemyass
#from IPython import embed
__author__ = 'Dan McInerney'
__contact__ = 'danhmcinerney gmail'
from gevent import monkey
monkey.patch_all()
import requests
import ast
import gevent
import sys, re, time, os, argparse
import socket
from bs4 import BeautifulSoup
listz =[]
def getprox(amount):
argz = [amount, False, True]
try:
P = find_http_proxy(argz)
P.run()
except BaseException,Err:
return listz
return listz
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument('-s', '--show', help='Show this number of results. Example: "-s 5" will show the 5 fastest proxies then stop')
parser.add_argument('-a', '--all', help='Show all proxy results including the ones that failed 1 of the 3 tests', action='store_true')
parser.add_argument('-q', '--quiet', help='Only print the IP:port of the fastest proxies that pass all the tests', action='store_true')
return parser.parse_args()
class find_http_proxy():
''' Will only gather L1 (elite anonymity) proxies
which should not give out your IP or advertise
that you are using a proxy at all '''
#argz = [arg1, False, True]
def __init__(self, argz):
self.proxy_list = []
self.headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.154 Safari/537.36'}
self.show_num = argz[0]
self.show_all = False
self.quiet = True
self.errors = []
self.print_counter = 0
self.externalip = self.external_ip()
def external_ip(self):
req = requests.get('http://myip.dnsdynamic.org/', headers=self.headers)
ip = req.text
return ip
def run(self):
''' Gets raw high anonymity (L1) proxy data then calls make_proxy_list()
Currently parses data from gatherproxy.com and letushide.com '''
if not self.quiet:
print '[*] Your accurate external IP: %s' % self.externalip
letushide_list = self.letushide_req()
if not self.quiet:
print '[*] letushide.com: %s proxies' % str(len(letushide_list))
# Has a login now :(
gatherproxy_list = self.gatherproxy_req()
if not self.quiet:
print '[*] gatherproxy.com: %s proxies' % str(len(gatherproxy_list))
checkerproxy_list = self.checkerproxy_req()
if not self.quiet:
print '[*] checkerproxy.net: %s proxies' % str(len(checkerproxy_list))
self.proxy_list.append(letushide_list)
self.proxy_list.append(gatherproxy_list)
self.proxy_list.append(checkerproxy_list)
# Flatten list of lists (1 master list containing 1 list of ips per proxy website)
self.proxy_list = [ips for proxy_site in self.proxy_list for ips in proxy_site]
self.proxy_list = list(set(self.proxy_list)) # Remove duplicates
if not self.quiet:
print '[*] %d unique high anonymity proxies found' % len(self.proxy_list)
print '[*] Testing proxy speeds ...'
print ''
print ' Proxy | CC | Domain | Time/Errors'
self.proxy_checker()
return list_
def checkerproxy_req(self):
''' Make the request to checkerproxy and create a master list from that site '''
cp_ips = []
try:
url = 'http://checkerproxy.net/all_proxy'
r = requests.get(url, headers=self.headers)
html = r.text
except Exception:
print '[!] Failed to get reply from %s' % url
checkerproxy_list = []
return checkerproxy_list
checkerproxy_list = self.parse_checkerproxy(html)
return checkerproxy_list
def parse_checkerproxy(self, html):
''' Only get elite proxies from checkerproxy '''
ips = []
soup = BeautifulSoup(html)
for tr in soup.findAll('tr'):
if len(tr) == 19:
ip_found = False
elite = False
ip_port = None
tds = tr.findAll('td')
for td in tds:
if ':' in td.text:
ip_found = True
ip_port_re = re.match('(\d{1,3}\.){3}\d{1,3}:\d{1,5}', td.text)
if ip_port_re:
ip_port = ip_port_re.group()
if not ip_port:
ip_found = False
if 'Elite' in td.text:
elite = True
if ip_found == True and elite == True:
ips.append(str(ip_port))
break
return ips
def letushide_req(self):
''' Make the request to the proxy site and create a master list from that site '''
letushide_ips = []
for i in xrange(1,20): # can search maximum of 20 pages
try:
url = 'http://letushide.com/filter/http,hap,all/%s/list_of_free_HTTP_High_Anonymity_proxy_servers' % str(i)
r = requests.get(url, headers=self.headers)
html = r.text
ips = self.parse_letushide(html)
# Check html for a link to the next page
if '/filter/http,hap,all/%s/list_of_free_HTTP_High_Anonymity_proxy_servers' % str(i+1) in html:
pass
else:
letushide_ips.append(ips)
break
letushide_ips.append(ips)
except:
print '[!] Failed get reply from %s' % url
break
# Flatten list of lists (1 list containing 1 list of ips for each page)
letushide_list = [item for sublist in letushide_ips for item in sublist]
return letushide_list
def parse_letushide(self, html):
''' Parse out list of IP:port strings from the html '''
# \d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3} - matches IP addresses
# </a></td><td> - is in between the IP and the port
# .*?< - match all text (.) for as many characters as possible (*) but don't be greedy (?) and stop at the next greater than (<)
raw_ips = re.findall('\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}</a></td><td>.*?<', html)
ips = []
for ip in raw_ips:
ip = ip.replace('</a></td><td>', ':')
ip = ip.strip('<')
ips.append(ip)
return ips
def gatherproxy_req(self):
url = 'http://gatherproxy.com/proxylist/anonymity/?t=Elite'
try:
r = requests.get(url, headers = self.headers)
lines = r.text.splitlines()
except:
print '[!] Failed get reply from %s' % url
gatherproxy_list = []
return gatherproxy_list
gatherproxy_list = self.parse_gp(lines)
return gatherproxy_list
def parse_gp(self, lines):
''' Parse the raw scraped data '''
gatherproxy_list = []
for l in lines:
if 'proxy_ip' in l.lower():
l = l.replace('gp.insertPrx(', '')
l = l.replace(');', '')
l = l.replace('null', 'None')
l = l.strip()
l = ast.literal_eval(l)
proxy = '%s:%s' % (l["PROXY_IP"], l["PROXY_PORT"])
gatherproxy_list.append(proxy)
#ctry = l["PROXY_COUNTRY"]
return gatherproxy_list
def proxy_checker(self):
''' Concurrency stuff here '''
jobs = [gevent.spawn(self.proxy_checker_req, proxy) for proxy in self.proxy_list]
try:
gevent.joinall(jobs)
except KeyboardInterrupt:
sys.exit('[-] Ctrl-C caught, exiting')
def proxy_checker_req(self, proxy):
''' See how long each proxy takes to open each URL '''
proxyip = str(proxy.split(':', 1)[0])
# A lot of proxy checker sites give a different final octet for some reason
#proxy_split = proxyip.split('.')
#first_3_octets = '.'.join(proxy_split[:3])+'.'
results = []
urls = ['http://danmcinerney.org/ip.php', 'http://myip.dnsdynamic.org', 'https://www.astrill.com/what-is-my-ip-address.php', 'http://danmcinerney.org/headers.php']
for url in urls:
try:
check = requests.get(url,
headers = self.headers,
proxies = {'http':'http://'+proxy,
'https':'http://'+proxy},
timeout = 15)
time_or_error = str(check.elapsed)
html = check.text
time_or_error = self.html_handler(time_or_error, html, url)
url = self.url_shortener(url)
results.append((time_or_error, proxy, url))
except Exception as e:
time_or_error = self.error_handler(str(e))
url = self.url_shortener(url)
results.append((time_or_error, proxy, url))
self.print_handler(results, proxyip)
def html_handler(self, time_or_error, html, url):
''' Check the html for errors and if none are found return time to load page '''
html_lines = html.splitlines()
leng = len(html_lines)
ipre = '(?:[0-9]{1,3}\.){3}[0-9]{1,3}'
# Both of these urls just return the ip and nothing else
if url in ['http://danmcinerney.org/ip.php', 'http://myip.dnsdynamic.org']:
if leng == 1: # Should return 1 line of html
match = re.match(ipre, html)
if match:
if self.externalip in html:
time_or_error = 'Err: Page loaded; proxy failed'
else:
time_or_error = 'Err: Page loaded; proxy failed'
else:
time_or_error = 'Err: Page loaded; proxy failed'
return time_or_error
# This is the SSL page
if 'astrill' in url:
soup = BeautifulSoup(html)
ip = soup.find("td", { "colspan": 2 }).text # the ip is the only on with colspan = 2
match = re.match(ipre, ip)
if match:
if self.externalip in ip:
time_or_error = 'Err: Page loaded; proxy failed'
else:
time_or_error = 'Err: Page loaded; proxy failed'
return time_or_error
if '/headers' in url:
# check for proxy headers
proxy_headers = ['via: ', 'forwarded: ', 'x-forwarded-for', 'client-ip']
if leng > 15: # 15 is arbitrary, I just don't think you'll ever see more than 15 headers
time_or_error = 'Err: headers not returned'
return time_or_error
for l in html_lines:
for h in proxy_headers:
if h in l.lower():
time_or_error = 'Err: Proxy headers found'
return time_or_error
time_or_error = 'Passed: elite proxy'
return time_or_error
def print_handler(self, results, proxyip):
if self.show_all:
country_code = self.get_country_code(proxyip)
self.printer(results, country_code)
self.print_counter += 1
else:
passed_all = self.passed_all_tests(results)
if passed_all:
country_code = self.get_country_code(proxyip)
self.printer(results, country_code)
self.print_counter += 1
if self.show_num:
self.limiter()
def printer(self, results, country_code):
''' Creates the output '''
counter = 0
if not self.quiet:
print '--------------------------------------------------------------------'
for r in results:
counter += 1
time_or_error = r[0]
proxy = r[1]
url = r[2]
if self.quiet:
if counter % 4 == 0: #################### THIS results is a list of 4 tuples each, so proxies will repeat 4 times
#print proxy
global listz
listz.append(proxy)
else:
# Only print the proxy once, on the second print job
if counter == 1:
print '%s | %s | %s | %s' % (proxy.ljust(21), country_code.ljust(3), url.ljust(21), time_or_error)
else:
print '%s | %s | %s | %s' % (' '.ljust(21), ' ', url.ljust(21), time_or_error)
def get_country_code(self, proxyip):
''' Get the 3 letter country code of the proxy using geoiptool.com
Would use the geoip library, but it requires a local DB and what
is the point of that hassle other than marginal speed improvement '''
cc_line_found = False
cc = 'N/A'
try:
r = requests.get('http://www.geoiptool.com/en/?IP=%s' % proxyip, headers=self.headers)
html = r.text
html_lines = html.splitlines()
for l in html_lines:
if cc_line_found == True:
cc = l.split('(', 1)[1].split(')', 1)[0]
break
if 'country code:' in l.lower():
cc_line_found = True
except:
pass
return cc
def error_handler(self, e):
if 'Cannot connect' in e:
time_or_error = 'Err: Cannot connect to proxy'
elif 'timed out' in e.lower():
time_or_error = 'Err: Timed out'
elif 'retries exceeded' in e:
time_or_error = 'Err: Max retries exceeded'
elif 'Connection reset by peer' in e:
time_or_error = 'Err: Connection reset by peer'
elif 'readline() takes exactly 1 argument (2 given)' in e:
time_or_error = 'Err: SSL error'
else:
time_or_error = 'Err: ' + e
return time_or_error
def url_shortener(self, url):
if 'ip.php' in url:
url = 'danmcinerney.org'
elif 'headers.php' in url:
url = 'Header check'
elif 'dnsdynamic' in url:
url = 'dnsdynamic.org'
elif 'astrill' in url:
url = 'https://astrill.com'
return url
def passed_all_tests(self, results):
for r in results:
time_or_error= r[0]
if 'Err:' in time_or_error:
global testx
testx = 50
return False
return True
def limiter(self):
testx = 0
''' Kill the script if user supplied limit of successful proxy attempts (-s argument) is reached '''
if self.print_counter >= int(self.show_num):
sys.exit()