Let's say I have a website that I want to scrape, e.g. cheapoair.com.
I want to use a normal request in Python to scrape the data on the first, hypothetical page. If I end up being blocked by the server, I want to switch to a proxy. I have a list of proxy servers and a method for picking one, and I also have a list of user agent strings. However, I need help thinking through the problem.
For reference:
uagen() will return a user agent string
proxit() will return a proxy
Here is what I have so far:
import requests
from proxy_def import *
from http import cookiejar
import time
from socket import error as SocketError
import sys
start_time = time.time()
# Cookie policy that rejects all cookies
class BlockAll(cookiejar.CookiePolicy):
    return_ok = set_ok = domain_return_ok = path_return_ok = lambda self, *args, **kwargs: False
    netscape = True
    rfc2965 = hide_cookie2 = False

headers = {'User-Agent': uagen()}
print(headers)

s = requests.Session()
s.cookies.set_policy(BlockAll())
cookies = {'SetCurrency': 'USD'}

sp = proxit()

for i in range(100000000000):
    while True:
        try:
            print('trying on ', sp)
            print('with user agent headers', headers)
            s.proxies = {"http": sp}
            r = s.get("http://www.cheapoair.com", headers=headers, timeout=15, cookies=cookies)
            print(i, sp, 'success')
            print("--- %s seconds ---" % (time.time() - start_time))
        except SocketError as e:
            print('passing ', sp)
            sp = proxit()  # rotate to a new proxy
            headers = {'User-Agent': uagen()}  # and a new user agent
            print('this is the new proxy ', sp)
            print('this is the new headers ', headers)
            continue
        except requests.ConnectionError as e:
            print('passing ', sp)
            sp = proxit()
            headers = {'User-Agent': uagen()}
            print('this is the new proxy ', sp)
            print('this is the new headers ', headers)
            continue
        except requests.Timeout as e:
            print('passing ', sp)
            sp = proxit()
            headers = {'User-Agent': uagen()}
            print('this is the new proxy ', sp)
            print('this is the new headers ', headers)
            continue
        except KeyboardInterrupt:
            print("The program has been terminated")
            sys.exit(1)
        break

#print(r.text)
print('all done', '\n')
What I am looking for is an idea of how to start with a normal request (not through a proxy), and, if I end up with an error (such as being rejected by the server), switch to a proxy and try again.
I can almost picture it, but can't quite see it.
I'm thinking that if I place a variable after
for i in range(1000000000000):
but before while True: that updates sp, then it might work. Another possibility is to declare s.proxies = {"http": ""} and then, if I run into an error, switch to s.proxies = {"http": proxit()} or s.proxies = {"http": sp}.
Thanks!
I figured it out.
while True:
    try:
        # do this thing
        # but remove the variable from here and declare it before "while True"
    except SocketError as e:
        # switch headers, switch user agent string
        s.proxies = {"http": proxit()}
        continue
That will refresh the variable after it gets an error from the server.
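Putting that together, here is a minimal sketch of the pattern (assuming uagen() and proxit() from proxy_def, as in the question): start with an empty proxy entry so the first attempt is a direct connection, and only rotate to a proxy and a fresh user agent after an error.

import requests
from socket import error as SocketError
from proxy_def import uagen, proxit  # assumed helpers from the question

s = requests.Session()
headers = {'User-Agent': uagen()}
s.proxies = {"http": ""}  # empty string: first attempt is a direct connection

while True:
    try:
        r = s.get("http://www.cheapoair.com", headers=headers, timeout=15)
        print('success via', s.proxies["http"] or 'direct connection')
        break
    except (SocketError, requests.ConnectionError, requests.Timeout):
        # Any failure: switch to a proxy and a new user agent, then retry.
        s.proxies = {"http": proxit()}
        headers = {'User-Agent': uagen()}
        print('retrying via', s.proxies["http"])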
Related
I'm trying to make a proxy with a blocklist. I initially returned a 403 when a user goes to a blocked page. However, that doesn't work with HTTPS: the browser shows ERR_TUNNEL_CONNECTION_FAILED, as explained in Respond with 403 in an HTTPS proxy.
Thus, I want to redirect the user to an HTML page like this
This is my code:
import socket
import threading
import signal
import sys
import fnmatch
import errno
import time
import pdb
import re
from time import gmtime, strftime, localtime
import logging
import config
import rule
import tag as tag_store
from Ignite import flame
core = flame()
p = re.compile(r'(http://)?([\w.-]*)(:(\d*))?(/.*)?')
thread_logger = logging.getLogger('thread')
access_logger = logging.getLogger('access')
csv_logger = logging.getLogger('csv')
def proxy(browser_conn, client_addr):
    print("hi")

    def ishostAllowed(host):
        print("\n\nHost:")
        print(str(host))
        access_logger.info(str(host))
        if host.split('.')[-1].isdigit():
            thread_logger.warn("Invalid host: {0}".format(host), extra=req)
            return core.check_allow(host)
        #pdb.set_trace()
        tags = tag_store.get(host)
        if not tags:
            thread_logger.warn("{0} isn't allowed: empty tags".format(host), extra=req)
            return core.check_allow(host)
        for tag in tag_store.get(host):
            if not rule.isTagAllowed(tag):
                thread_logger.warn("{0}:{1} isn't allowed".format(host, tag), extra=req)
                return core.check_allow(host)
        return core.check(host)
    def proxy_http(request):
        try:
            # create a socket to connect to the web server
            #pdb.set_trace()
            server_conn = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            server_conn.settimeout(config.connection_timeout)
            server_conn.connect((request['host'], request['port']))
            server_conn.sendall(request['raw_data'])  # send request to webserver
            while 1:
                data = server_conn.recv(config.max_request_len)  # receive data from web server
                if (len(data) > 0):
                    browser_conn.send(data)  # send to browser
                else:
                    break
        except socket.error as error_msg:
            thread_logger.error(str(error_msg) + ":" + str(request), extra=req)
        finally:
            if server_conn:
                server_conn.close()
            if browser_conn:
                browser_conn.close()
        return
    def response(status, message):
        reply = "HTTP/1.1 {0} {1}\r\n"
        reply += "Proxy-agent: Sinktrap\r\n"
        reply += "\r\n"
        reply = reply.format(status, message)
        #pdb.set_trace()
        browser_conn.sendall(reply.encode())
    def proxy_https(request):
        #pdb.set_trace()
        try:
            server_conn = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            # If successful, send 200 code response
            server_conn.connect((req['host'], req['port']))
            response(200, 'Connection established')
        except socket.error as err:
            # If the connection could not be established, exit
            # Should properly handle the exit with http error code here
            thread_logger.error("Cannot establish https connection:" + str(err), extra=req)
            if server_conn:
                server_conn.close()
            if browser_conn:
                browser_conn.close()
            return

        # Indiscriminately forward bytes
        browser_conn.setblocking(0)
        server_conn.setblocking(0)
        timeout = time.time() + 60  # 1 minute
        while timeout - time.time() > 0:
            request_done = False
            replied_done = False
            try:
                request = browser_conn.recv(config.max_request_len)  # receive data from browser
                if (len(request) > 0):
                    server_conn.sendall(request)  # send to web server
                else:
                    request_done = True
                    #thread_logger.info("REQUEST len: " + str(len(request)), extra=req)
            except socket.error as e:
                if e.errno == errno.EWOULDBLOCK:
                    time.sleep(0.1)
                    pass
                else:
                    thread_logger.error("pipe error:" + str(e), extra=req)
                    break
            try:
                reply = server_conn.recv(config.max_request_len)  # receive data from web server
                if (len(reply) > 0):
                    browser_conn.sendall(reply)  # send to browser
                else:
                    replied_done = True
                    #thread_logger.info("reply len: " + str(len(reply)), extra=req)
            except socket.error as e:
                if e.errno == errno.EWOULDBLOCK:
                    time.sleep(0.1)
                    pass
                else:
                    thread_logger.error("pipe error:" + str(e), extra=req)
                    break
            if request_done and replied_done:
                break
        server_conn.close()
        browser_conn.close()
    raw_data = browser_conn.recv(config.max_request_len)  # get the request from browser
    req = {'raw_data': raw_data,
           'tname': threading.currentThread().getName(),
           'client_ip': client_addr[0],
           'client_port': client_addr[1]
           }
    thread_logger.info("REQUEST: {0}".format(raw_data), extra=req)
    #pdb.set_trace()
    try:
        # request_line is the first one. https://www.w3.org/Protocols/rfc2616/rfc2616-sec5.html
        msg_body_pos = len(raw_data)
        for i in range(4, len(raw_data)):
            if raw_data[i-4:i].decode() == '\r\n\r\n':
                msg_body_pos = i
                break
        lines = raw_data[:msg_body_pos-4].decode('utf-8').split('\r\n')
        if len(lines[0]) < 16:
            thread_logger.warn("INVALID REQUEST:{0}".format(raw_data), extra=req)
            return
        headers = {k: v for k, v in (x.split(':', 1) for x in lines[1:])}
        if 'Referer' in headers:
            req['Referer'] = headers['Referer']
        else:
            req['Referer'] = ''
        if 'User-Agent' in headers:
            req['User-Agent'] = headers['User-Agent']
        else:
            req['User-Agent'] = ''
        req['request_line'] = lines[0]
        req['method'], req['request_uri'], req['http_version'] = lines[0].split(' ')
        # check if the first line is a valid request. request_line might be empty
        if not req['method'] or not req['request_uri'] or not req['http_version']:
            thread_logger.warn("INVALID REQUEST:{0}".format(raw_data), extra=req)
            return
    except Exception as e:
        thread_logger.error("INVALID REQUEST:{0} {1}".format(e, raw_data), extra=req)
        logging.exception("INVALID REQUEST")
        return

    access_logger.info("", extra=req)
    #pdb.set_trace()
    m = p.match(req['request_uri'])
    req['host'] = m.group(2)
    req['port'] = int(m.group(4)) if m.group(4) else 80

    # Check if request is allowed or not
    if not ishostAllowed(req['host']):
        csv_logger.info("blocked", extra=req)
        thread_logger.warn("Block REQUEST:{0}".format(raw_data), extra=req)
        response(403, "The website has been blocked by Ignite's proxy.")
        #Breq = req
        #Breq['host'] = "azlancoding.github.io/Blocked"
        #proxy_https(Breq)
        #response(307,"https://azlancoding.github.io/BLOCKED")
        return
    csv_logger.info("allowed", extra=req)
    #pdb.set_trace()
    if req['method'] == 'CONNECT':
        proxy_https(req)
    else:
        proxy_http(req)
The original proxy is pcxy
See my GitHub project here.
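One possible direction, sketched here as an assumption rather than a tested fix: for plain HTTP requests the proxy can answer a blocked request with a 307 and a Location header instead of a bare 403, while for HTTPS the browser only ever sees the CONNECT tunnel, so a redirect would have to be sent before the tunnel is established (which most browsers ignore) or the proxy would have to intercept TLS. The response_redirect helper below is hypothetical and reuses the blocked-page URL from the commented-out code above.

def response_redirect(browser_conn, location):
    # Hypothetical helper: answer a plain-HTTP request with a redirect
    # to a "blocked" page instead of a bare 403.
    reply = ("HTTP/1.1 307 Temporary Redirect\r\n"
             "Proxy-agent: Sinktrap\r\n"
             "Location: {0}\r\n"
             "Content-Length: 0\r\n"
             "\r\n").format(location)
    browser_conn.sendall(reply.encode())

# Possible usage inside proxy(), for non-CONNECT requests only:
#   if not ishostAllowed(req['host']) and req['method'] != 'CONNECT':
#       response_redirect(browser_conn, "https://azlancoding.github.io/BLOCKED")
#       return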
import requests
import json
import threading

data = {
    "amount": 2
}

def foo(data):
    try:
        r = requests.post(url="http://www.mysite.com", data=data)
        j = json.loads(r.text)
        print(j)
    except requests.exceptions.RequestException as e:
        raise SystemExit(e)

threading.Timer(1, foo, [data]).start()
I want to run this HTTP request every second using a thread in my program. However, the program only runs the HTTP request once and exits. How do I fix this?
You need to restart the timer after each request:
def foo(data):
    try:
        r = requests.post(url="http://www.mysite.com", data=data)
        j = json.loads(r.text)
        print(j)
        threading.Timer(1, foo, [data]).start()  # New line added
    except requests.exceptions.RequestException as e:
        raise SystemExit(e)
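A slightly fuller sketch (the URL is still a placeholder) that also keeps a handle to the Timer, so the repeating request can be stopped later with cancel():

import json
import threading
import requests

data = {"amount": 2}
timer = None  # handle to the pending timer so the loop can be stopped

def foo(data):
    global timer
    try:
        r = requests.post(url="http://www.mysite.com", data=data)  # placeholder URL
        print(json.loads(r.text))
    except requests.exceptions.RequestException as e:
        raise SystemExit(e)
    # Re-arm the timer so foo() runs again in one second.
    timer = threading.Timer(1, foo, [data])
    timer.start()

foo(data)  # the first call starts the cycle
# ... later: timer.cancel() prevents the next scheduled run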
I'm new to Python and I want this code to run only once and stop, not every 30 seconds, because I want to run multiple scripts like this with different access tokens every 5 seconds from the command line.
When I tried this code it never moves on to the second one, because it's a while True loop:
import requests
import time

api_url = "https://graph.facebook.com/v2.9/"
access_token = "access token"
graph_url = "site url"
post_data = {'id': graph_url, 'scrape': True, 'access_token': access_token}

# Beware of rate limiting if trying to increase frequency.
refresh_rate = 30  # refresh rate in second

while True:
    try:
        resp = requests.post(api_url, data=post_data)
        if resp.status_code == 200:
            contents = resp.json()
            print(contents['title'])
        else:
            error = "Warning: Status Code {}\n{}\n".format(
                resp.status_code, resp.content)
            print(error)
            raise RuntimeWarning(error)
    except Exception as e:
        f = open("open_graph_refresher.log", "a")
        f.write("{} : {}".format(type(e), e))
        f.close()
        print(e)
    time.sleep(refresh_rate)
From what I understood, you're trying to execute the same piece of code for multiple access tokens. To keep it simple, have all your access tokens and graph URLs as lists and use the following code. It assumes that you know all your access tokens in advance.
import requests
import time

def scrape_facebook(api_url, access_token, graph_url):
    """Scrapes the given access token"""
    post_data = {'id': graph_url, 'scrape': True, 'access_token': access_token}
    try:
        resp = requests.post(api_url, data=post_data)
        if resp.status_code == 200:
            contents = resp.json()
            print(contents['title'])
        else:
            error = "Warning: Status Code {}\n{}\n".format(
                resp.status_code, resp.content)
            print(error)
            raise RuntimeWarning(error)
    except Exception as e:
        f = open(access_token + "_" + "open_graph_refresher.log", "a")
        f.write("{} : {}".format(type(e), e))
        f.close()
        print(e)

access_token = ['a', 'b', 'c']
graph_url = ['sss', 'xxx', 'ppp']
api_url = "https://graph.facebook.com/v2.9/"

for n in range(len(graph_url)):
    scrape_facebook(api_url, access_token[n], graph_url[n])
    time.sleep(5)
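If instead each invocation really should handle just one token and then exit (as the command-line setup in the question suggests), a small wrapper like this sketch, reusing scrape_facebook() from above and reading its arguments from sys.argv, would run once per call:

import sys

# Hypothetical command-line wrapper around scrape_facebook() defined above:
#   python refresher.py <access_token> <graph_url>
if __name__ == "__main__":
    api_url = "https://graph.facebook.com/v2.9/"
    token, url = sys.argv[1], sys.argv[2]
    scrape_facebook(api_url, token, url)  # runs once, then the script exits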
I am scraping a web site, but sometimes the laptop loses the connection, and I get (obviously) a requests.exceptions.ConnectionError. What is the right (or most elegant?) way to recover from this error? I mean: I don't want the program to stop, but to retry the connection, maybe some seconds later. This is my code, but I have the feeling it is not correct:
def make_soup(session, url):
    try:
        n = randint(1, MAX_NAPTIME)
        sleep(n)
        response = session.get(url)
    except requests.exceptions.ConnectionError as req_ce:
        error_msg = req_ce.args[0].reason.strerror
        print "Error: %s with url %s" % (error_msg, url)
        session = logout(session)
        n = randint(MIN_SLEEPTIME, MAX_SLEEPTIME)
        sleep(n)
        session = login(session)
        response = session.get(url)
    soup = BeautifulSoup(response.text)
    return soup
Any ideas?
Note that I need a session to scrape these pages, so I think that logging in again (i.e. logging back in to the site after a logout) could cause trouble.
So why not something like
import requests
import time

def retry(cooloff=5, exc_type=None):
    if not exc_type:
        exc_type = [requests.exceptions.ConnectionError]

    def real_decorator(function):
        def wrapper(*args, **kwargs):
            while True:
                try:
                    return function(*args, **kwargs)
                except Exception as e:
                    if e.__class__ in exc_type:
                        print "failed (?)"
                        time.sleep(cooloff)
                    else:
                        raise e
        return wrapper

    return real_decorator
This is a decorator that lets you retry any function until it succeeds, e.g.
@retry(exc_type=[ZeroDivisionError])
def test():
    return 1/0

print test()
This will just print "failed (?)" every 5 seconds until the end of time (or until the laws of math change).
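Applied to the question's code, a sketch (assuming the retry decorator above plus the question's session, randint/sleep, MAX_NAPTIME and BeautifulSoup imports) might look like:

@retry(cooloff=10, exc_type=[requests.exceptions.ConnectionError])
def make_soup(session, url):
    # Retried automatically whenever a ConnectionError is raised.
    sleep(randint(1, MAX_NAPTIME))
    response = session.get(url)
    return BeautifulSoup(response.text)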
Is it really necessary to log out and log back in to your session? I'd just retry the connection itself:
def make_soup(session, url):
    success = False
    response = None
    for attempt in range(1, MAXTRIES):
        try:
            response = session.get(url)
            # If session.get succeeded, we break out of the
            # for loop after setting a success flag
            success = True
            break
        except requests.exceptions.ConnectionError as req_ce:
            error_msg = req_ce.args[0].reason.strerror
            print "Error: %s with url %s" % (error_msg, url)
            print " Attempt %s of %s" % (attempt, MAXTRIES)
            sleep(randint(MIN_SLEEPTIME, MAX_SLEEPTIME))

    # Figure out if we were successful.
    # Note it may not be needed to have a flag, you can maybe just
    # check the value of response here.
    if not success:
        print "Couldn't get it after retrying many times"
        return None

    # Once we get here, we know we got a good response
    soup = BeautifulSoup(response.text)
    return soup
Right now I'm using Flask, and I'm having trouble trying to do more than one GET request using the Python requests module.
If I try to send a series of requests, the first one is completed successfully, but the other ones throw a timeout exception.
Here is part of the view's code:
import requests

sess = requests.Session()
site_url = 'http://www.example.com/api/'
steps = ['first_step', 'second_step', 'third_step']
step_responses = dict()

for s in steps:
    try:
        req = sess.get(site_url + s, timeout=5)
    except requests.exceptions.Timeout:
        return jsonify({'result': False, 'error': 'timeout'})
    except requests.exceptions.ConnectionError:
        return jsonify({'result': False, 'error': 'connection_error'})
    else:
        step_responses[s] = True
If I extract this part into a standalone .py file, it completes successfully.
import requests

sess = requests.Session()
site_url = 'http://www.example.com/api/'
steps = ['first_step', 'second_step', 'third_step']
step_responses = dict()

for s in steps:
    try:
        req = sess.get(site_url + s, timeout=5)
    except requests.exceptions.Timeout:
        step_responses[s] = 'timeout'
    except requests.exceptions.ConnectionError:
        step_responses[s] = 'conn_error'
    else:
        step_responses[s] = 'ok'

print step_responses
Works for me. You may want to check the second and third steps.
import requests

sess = requests.Session()

def module():
    site_url = 'http://stackoverflow.com/'
    steps = ['users', 'questions', 'tags']
    step_responses = dict()
    for s in steps:
        try:
            req = sess.get(site_url + s, timeout=5)
        except requests.exceptions.Timeout:
            return jsonify({'result': False, 'error': 'timeout'})
        except requests.exceptions.ConnectionError:
            return jsonify({'result': False, 'error': 'connection_error'})
        else:
            step_responses[s] = True
You might want to make sure that you read all the values from the req object. I think you might need req.text and req.status_code, or req.content.
Check half-way down the page here: http://docs.python-requests.org/en/latest/api/#request-sessions where they discuss session parameters
"class requests.adapters.HTTPAdapter(pool_connections=10, pool_maxsize=10, max_retries=0, pool_block=False)"
I'm not at all sure how to use connection pools and so forth, but the docs do say (http://docs.python-requests.org/en/latest/user/advanced/, look for Keep-Alive):
"Note that connections are only released back to the pool for reuse once all body data has been read; be sure to either set stream to False or read the content property of the Response object."