Using proxy middleware to scrape Amazon - python

import socket

class HttpProxyMiddleware(object):
    ### never retry these errors
    def __init__(self, settings):
        socket.setdefaulttimeout(3)
        self.proxies = []
        self.proxy_index = 1
        ## plant proxies
        self.proxy_list = settings.get('PROXY_LIST')
        fin = open(self.proxy_list)
        for line in fin.readlines():
            line = line.rstrip()
            line = 'http://%s' % line
            self.proxies.append(line)
        print self.proxies
        fin.close()

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings)

    def process_request(self, request, spider):
        if 'proxy' in request.meta:
            return
        if len(self.proxies) == 0:
            raise ValueError('All proxies are unusable, cannot proceed')
        proxy_address = self.proxies[self.proxy_index]
        print proxy_address
        request.meta['proxy'] = proxy_address

    def process_exception(self, request, exception, spider):
        print 'not working'
        self.proxy_index += 1
        new_request = request.copy()
        new_request.dont_filter = True
        return new_request
I have the complete proxy list from HMA and I am trying to use this middleware to scrape Amazon items. However, it turns out it hardly works: in process_request, after printing out the proxy address, it stalls for a long time and eventually fails.
I also wrote the following program to test each proxy IP in my list, and all of them work from there.
import socket
import urllib2

def main():
    socket.setdefaulttimeout(3)
    proxies = []
    ## plant proxies
    proxy_list = '/users/zehuapan/desktop/amazon/amazon/proxy_list.txt'
    fin = open(proxy_list)
    for line in fin.readlines():
        line = line.rstrip()
        line = 'http://%s' % line
        print line
        if check_validity(line):
            proxies.append(line)
    fin.close()
    print proxies

    file = open('/users/zehuapan/desktop/amazon/amazon/valid_proxy_list.txt', 'w+')
    for proxy in proxies:
        file.write(proxy + '\n')
    file.close()

def check_validity(proxy):
    try:
        proxy_handler = urllib2.ProxyHandler({'http': proxy})
        opener = urllib2.build_opener(proxy_handler)
        opener.addheaders = [('User-agent', 'Mozilla/5.0')]
        urllib2.install_opener(opener)
        req = urllib2.Request('https://www.amazon.com')
        sock = urllib2.urlopen(req)
    except urllib2.HTTPError, e:
        print 'Error code: ', e.code
        return e.code
    except Exception, detail:
        print "ERROR:", detail
        return False
    return True

if __name__ == '__main__':
    main()

Getting no response at all through a proxy is unlikely, and a working proxy goes down very fast with Amazon, usually after a few hundred requests.
While a proxy still works you can get a captcha page with a 200 status code; once it is banned you get a 503.
Also, a proxy might work for the main page and for the offers list but fail on the product page.
Hope that helps.
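Building on that, here is a minimal sketch (not the asker's middleware as posted) of how the 200-with-captcha and 503 cases could be handled in the same downloader middleware. The "Robot Check" marker text and the rotation step are assumptions, not a verified Amazon contract:

def process_response(self, request, response, spider):
    # Rotate to the next proxy when Amazon answers with a 503 or a captcha page.
    banned = response.status == 503
    captcha = b'Robot Check' in response.body  # marker string is an assumption
    if banned or captcha:
        self.proxy_index = (self.proxy_index + 1) % len(self.proxies)
        retry = request.copy()
        retry.meta['proxy'] = self.proxies[self.proxy_index]
        retry.dont_filter = True
        return retry        # returning a Request tells Scrapy to reschedule it
    return response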

Related

Why is h.getresponse() needed in Python's logging HTTPHandler?

I overrode the emit method of Python's logging HTTPHandler to adapt it to my needs, and I noticed the line
h.getresponse() #can't do anything with the result
Why is this line necessary?
I noticed that removing this line has no effect when logging over an insecure connection, but it makes the logs fail when using a secure connection.
def emit(self, record):
    """
    Emit a record.

    Send the record to the Web server as a percent-encoded dictionary
    """
    try:
        import http.client, urllib.parse
        host = self.host
        if self.secure:
            h = http.client.HTTPSConnection(host, context=self.context)
        else:
            h = http.client.HTTPConnection(host)
        url = self.url
        data = urllib.parse.urlencode(self.mapLogRecord(record))
        if self.method == "GET":
            if (url.find('?') >= 0):
                sep = '&'
            else:
                sep = '?'
            url = url + "%c%s" % (sep, data)
        h.putrequest(self.method, url)
        # support multiple hosts on one IP address...
        # need to strip optional :port from host, if present
        i = host.find(":")
        if i >= 0:
            host = host[:i]
        # See issue #30904: putrequest call above already adds this header
        # on Python 3.x.
        # h.putheader("Host", host)
        if self.method == "POST":
            h.putheader("Content-type",
                        "application/x-www-form-urlencoded")
            h.putheader("Content-length", str(len(data)))
        if self.credentials:
            import base64
            s = ('%s:%s' % self.credentials).encode('utf-8')
            s = 'Basic ' + base64.b64encode(s).strip().decode('ascii')
            h.putheader('Authorization', s)
        h.endheaders()
        if self.method == "POST":
            h.send(data.encode('utf-8'))
        h.getresponse()    #can't do anything with the result
    except Exception:
        self.handleError(record)
The getresponse() call guarantees that the request has actually been sent to (and processed by) the server, because it waits for and reads the server's reply to the request.
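A minimal sketch of the same point with http.client used directly (the host and path below are hypothetical): the POST is only known to have reached the server once getresponse() is called and the reply is read back, which is why dropping that call tends to break the secure (TLS) case first.

import http.client, urllib.parse

conn = http.client.HTTPSConnection('example.com')   # hypothetical log server
body = urllib.parse.urlencode({'msg': 'hello'})
conn.putrequest('POST', '/log')                      # hypothetical endpoint
conn.putheader('Content-type', 'application/x-www-form-urlencoded')
conn.putheader('Content-length', str(len(body)))
conn.endheaders()
conn.send(body.encode('utf-8'))
resp = conn.getresponse()   # completes the request/response round trip
resp.read()                 # drain the reply so the connection can be closed cleanly
conn.close()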

Switching proxies in Python: pseudo code

Let's say I have a website that I want to scrape, e.g. cheapoair.com.
I want to use normal requests in Python to scrape the data on the first, hypothetical page. If I end up being blocked by the server, I want to switch to a proxy. I have a list of proxy servers and a method, and I also have a list of user agent strings. However, I think I need help thinking through the problem.
For reference
uagen() will return a user agent string
proxit() will return a proxy
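Since proxy_def is not shown, here is a hypothetical sketch of what uagen() and proxit() might look like; the user agent strings and proxy addresses below are placeholders, not the asker's actual values:

import random

USER_AGENTS = ['Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
               'Mozilla/5.0 (X11; Linux x86_64)']          # placeholder strings
PROXIES = ['203.0.113.10:8080', '198.51.100.7:3128']        # example addresses

def uagen():
    # Return a random user agent string
    return random.choice(USER_AGENTS)

def proxit():
    # Return a random proxy URL
    return 'http://' + random.choice(PROXIES)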
Here is what I have so far:
import requests
from proxy_def import *
from http import cookiejar
import time
from socket import error as SocketError
import sys

start_time = time.time()

class BlockAll(cookiejar.CookiePolicy):
    return_ok = set_ok = domain_return_ok = path_return_ok = lambda self, *args, **kwargs: False
    netscape = True
    rfc2965 = hide_cookie2 = False

headers = {'User-Agent': uagen()}
print(headers)
s = requests.Session()
s.cookies.set_policy(BlockAll)
cookies = {'SetCurrency': 'USD'}
sp = proxit()

for i in range(100000000000):
    while True:
        try:
            print('trying on ', sp)
            print('with user agent headers', headers)
            s.proxies = {"http": sp}
            r = s.get("http://www.cheapoair.com", headers=headers, timeout=15, cookies=cookies)
            print(i, sp, 'success')
            print("--- %s seconds ---" % (time.time() - start_time))
        except SocketError as e:
            print('passing ', sp)
            sp = proxit()
            headers = {'User-Agent': uagen()}
            print('this is the new proxy ', sp)
            print('this is the new headers ', headers)
            continue
        except requests.ConnectionError as e:
            print('passing ', sp)
            sp = proxit()
            headers = {'User-Agent': uagen()}
            print('this is the new proxy ', sp)
            print('this is the new headers ', headers)
            continue
        except requests.Timeout as e:
            print('passing ', sp)
            sp = proxit()
            headers = {'User-Agent': uagen()}
            print('this is the new proxy ', sp)
            print('this is the new headers ', headers)
            continue
        except KeyboardInterrupt:
            print("The program has been terminated")
            sys.exit(1)
        break

#print(r.text)
print('all done',
      '\n')
What I am looking for is an idea of how to start with a normal request (not through a proxy) and, if I end up with an error (such as being rejected by the server), switch to a proxy and try again.
I can almost picture it, but can't quite see it.
I'm thinking that if I place a variable after
for i in range(1000000000000):
but before while True: that updates sp, then it might work. Another possibility is to declare s.proxies = {"http": ""} and then, if I run into an error, switch to s.proxies = {"http": proxit()} or s.proxies = {"http": sp}.
Thanks!
I figured it out.
while True:
    try:
        # do this thing
        # but remove the variable from here and declare it before "while True"
    except SocketError as e:
        # switch headers, switch user agent string
        s.proxies = {"http": proxit()}
        continue
That will refresh the variable after it gets an error from the server.
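A slightly fuller sketch of that idea, assuming uagen() and proxit() from the asker's proxy_def module: the first attempt goes out without a proxy, and a proxy (plus a fresh user agent) is only picked up after a failure.

import requests

def fetch(url):
    proxies = {}                          # first attempt goes out directly
    headers = {'User-Agent': uagen()}
    while True:
        try:
            return requests.get(url, headers=headers, proxies=proxies, timeout=15)
        except (requests.ConnectionError, requests.Timeout):
            proxies = {'http': proxit()}        # switch to (another) proxy
            headers = {'User-Agent': uagen()}   # and rotate the user agent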

How do I implement a web crawler that scrapes ad links?

To get training data, I wrote a crawler to follow the top 500 websites on Alexa to a depth of 2 and write all links found to a file. Right now, it looks for all the links in the HTML and writes them to a file. The problem is that the crawler misses all links to ads, some of which are located in iframes or in CSS files. How can I change my web crawler so that it scrapes all links, including ads? The relevant code can be found below.
class Crawler(object):
    def __init__(self, root, depth, locked=True):
        self.root = root
        self.depth = depth
        self.locked = locked
        self.host = urlparse.urlparse(root)[1]
        self.urls = []
        self.links = 0
        self.followed = 0

    def crawl(self):
        #print " in crawl"
        page = Fetcher(self.root)
        q = Queue()
        #print "made fetcher"
        try:
            page.fetch()
            if page.urls == []:
                print "Error: could not fetch urls for %s" % (self.root)
                return
                #raise KeyboardInterrupt
            else:
                target = open("output.txt", 'w')
                for url in page.urls:
                    q.put(url)
                    target.write((url+'\n').encode('utf-8'))
                followed = [self.root]
                target.close()
        except Exception as e:
            print('Error: could not fetch urls')
            raise KeyboardInterrupt
        '''
        q = Queue()
        target = open("output.txt", 'w')
        for url in page.urls:
            q.put(url)
            target.write((url+'\n').encode('utf-8'))
        followed = [self.root]
        target.close()
        #print followed
        '''
        n = 0
        while True:
            try:
                url = q.get()
            except QueueEmpty:
                break
            n += 1
            if url not in followed:
                try:
                    host = urlparse.urlparse(url)[1]
                    if self.locked and re.match(".*%s" % self.host, host):
                        followed.append(url)
                        #print url
                        self.followed += 1
                        page = Fetcher(url)
                        page.fetch()
                        for i, url in enumerate(page):
                            if url not in self.urls:
                                self.links += 1
                                q.put(url)
                                self.urls.append(url)
                                with open("data.out", 'w') as f:
                                    f.write(url)
                        if n > self.depth and self.depth > 0:
                            break
                except Exception, e:
                    print "ERROR: Can't process url '%s' (%s)" % (url, e)
                    print format_exc()

class Fetcher(object):
    def __init__(self, url):
        self.url = url
        self.urls = []

    def __getitem__(self, x):
        return self.urls[x]

    def _addHeaders(self, request):
        request.add_header("User-Agent", AGENT)

    def open(self):
        url = self.url
        try:
            request = urllib2.Request(url)
            handle = urllib2.build_opener()
        except IOError:
            return None
        return (request, handle)

    def fetch(self):
        request, handle = self.open()
        self._addHeaders(request)
        if handle:
            try:
                content = unicode(handle.open(request).read(), "utf-8",
                                  errors="replace")
                soup = BeautifulSoup(content)
                tags = soup('a')
            except urllib2.HTTPError, error:
                if error.code == 404:
                    print >> sys.stderr, "ERROR: %s -> %s" % (error, error.url)
                else:
                    print >> sys.stderr, "ERROR: %s" % error
                tags = []
            except urllib2.URLError, error:
                print >> sys.stderr, "ERROR: %s" % error
                tags = []
            for tag in tags:
                href = tag.get("href")
                if href is not None:
                    url = urlparse.urljoin(self.url, escape(href))
                    if url not in self:
                        self.urls.append(url)

def getLinks(url):
    page = Fetcher(url)
    page.fetch()
    for i, url in enumerate(page):
        print "%d. %s" % (i, url)
Static methods:
def main():
    depth = 2
    file_in = []
    reload(sys)
    sys.setdefaultencoding('utf-8')
    filename = "stuff.txt"
    text = open(filename)
    for line in text:
        file_in.append(line.rstrip())
    for i in file_in:
        print "Crawling %s (Max Depth: %d)" % (i, depth)
        crawler = Crawler(i, depth)
        crawler.crawl()
        print "\n".join(crawler.urls)
A lot of advertising is delivered via asynchronous JavaScript executed on the page. If you're just scraping the server's initial output you won't be able to obtain those other links. One method would be to use a headless browser like PhantomJS to render the HTML to a file and then run your script on that. There are other possibilities as well.
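As a rough illustration of that approach (a sketch, not the answerer's implementation): render the page in a headless browser via Selenium, then hand the final HTML to the existing BeautifulSoup link extraction. This assumes the matching driver (e.g. geckodriver for Firefox) is installed; older Selenium versions could use webdriver.PhantomJS() in the same way.

from selenium import webdriver
from bs4 import BeautifulSoup

options = webdriver.FirefoxOptions()
options.add_argument('-headless')            # headless Firefox instead of PhantomJS
driver = webdriver.Firefox(options=options)
driver.get('http://example.com')             # hypothetical target page
soup = BeautifulSoup(driver.page_source, 'html.parser')  # HTML after JS has run
links = [a.get('href') for a in soup('a')]
frames = [f.get('src') for f in soup('iframe')]          # ad iframes often live here
driver.quit()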

Python KeyError exception / exiting code without error

I have a rather weird problem. The following code in scrape1 sometimes works as it should, but most of the time it just stops at line 24, where requests.get is used. I do, however, consistently get this KeyError exception:
Exception KeyError: KeyError(140186412830800,) in module <'threading' from '/usr/lib/python2.7/threading.pyc'> ignored
The exception is only thrown when I import the module proxyfetch.py, but as long as I don't actually execute the code in proxyfetch.py, scrape1.py doesn't break (the exception is thrown after nominal execution). proxyfetch is based on DanMcInerney's elite-proxy-finder on GitHub; I just edited it so I could use it as a module that returns a list of proxies instead of printing them.
So here are the 2 scripts:
scrape1.py:
#scrape1.py
from bs4 import BeautifulSoup
from proxyfetch import getprox
import requests

proxcount = 3
listz = getprox(proxcount)
proxfile = open("proxysave.txt", "w")
base_url = "http://google.com"

def pagefetch(url):
    print "Test"
    http_proxy = "http://"+listz[0]
    #http_proxy = "http://103.25.203.227:3127"
    print "Test2"
    proxydict = {
        "http" : http_proxy
        #"https_proxy" : https_proxy
    }
    print "Test3"
    page = requests.get(url, proxies=proxydict) #with proxy
    #page = requests.get(url) #without proxy
    print "Test4"
    return page

page = pagefetch(base_url)
soup = BeautifulSoup(page.text)
links = soup.find_all("a")
if links:
    for n in links:
        print n
else:
    print "I got nuthin."
And proxyfetch.py
#!/usr/bin/env python2
#proxyfetch.py
'''Finds hundreds of elite anonymity (L1) HTTP proxies then tests them all in parallel printing the fastest ones first.
Checks headers to confirm eliteness, checks if compatible with opening HTTPS sites, and confirms the proxy is working
through multiple IP checking sites'''
# TO DO:
# -Add http://free-proxy-list.net/
# -Add hidemyass
#from IPython import embed
__author__ = 'Dan McInerney'
__contact__ = 'danhmcinerney gmail'
from gevent import monkey
monkey.patch_all()
import requests
import ast
import gevent
import sys, re, time, os, argparse
import socket
from bs4 import BeautifulSoup
listz =[]
def getprox(amount):
argz = [amount, False, True]
try:
P = find_http_proxy(argz)
P.run()
except BaseException,Err:
return listz
return listz
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument('-s', '--show', help='Show this number of results. Example: "-s 5" will show the 5 fastest proxies then stop')
parser.add_argument('-a', '--all', help='Show all proxy results including the ones that failed 1 of the 3 tests', action='store_true')
parser.add_argument('-q', '--quiet', help='Only print the IP:port of the fastest proxies that pass all the tests', action='store_true')
return parser.parse_args()
class find_http_proxy():
''' Will only gather L1 (elite anonymity) proxies
which should not give out your IP or advertise
that you are using a proxy at all '''
#argz = [arg1, False, True]
def __init__(self, argz):
self.proxy_list = []
self.headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.154 Safari/537.36'}
self.show_num = argz[0]
self.show_all = False
self.quiet = True
self.errors = []
self.print_counter = 0
self.externalip = self.external_ip()
def external_ip(self):
req = requests.get('http://myip.dnsdynamic.org/', headers=self.headers)
ip = req.text
return ip
def run(self):
''' Gets raw high anonymity (L1) proxy data then calls make_proxy_list()
Currently parses data from gatherproxy.com and letushide.com '''
if not self.quiet:
print '[*] Your accurate external IP: %s' % self.externalip
letushide_list = self.letushide_req()
if not self.quiet:
print '[*] letushide.com: %s proxies' % str(len(letushide_list))
# Has a login now :(
gatherproxy_list = self.gatherproxy_req()
if not self.quiet:
print '[*] gatherproxy.com: %s proxies' % str(len(gatherproxy_list))
checkerproxy_list = self.checkerproxy_req()
if not self.quiet:
print '[*] checkerproxy.net: %s proxies' % str(len(checkerproxy_list))
self.proxy_list.append(letushide_list)
self.proxy_list.append(gatherproxy_list)
self.proxy_list.append(checkerproxy_list)
# Flatten list of lists (1 master list containing 1 list of ips per proxy website)
self.proxy_list = [ips for proxy_site in self.proxy_list for ips in proxy_site]
self.proxy_list = list(set(self.proxy_list)) # Remove duplicates
if not self.quiet:
print '[*] %d unique high anonymity proxies found' % len(self.proxy_list)
print '[*] Testing proxy speeds ...'
print ''
print ' Proxy | CC | Domain | Time/Errors'
self.proxy_checker()
return list_
def checkerproxy_req(self):
''' Make the request to checkerproxy and create a master list from that site '''
cp_ips = []
try:
url = 'http://checkerproxy.net/all_proxy'
r = requests.get(url, headers=self.headers)
html = r.text
except Exception:
print '[!] Failed to get reply from %s' % url
checkerproxy_list = []
return checkerproxy_list
checkerproxy_list = self.parse_checkerproxy(html)
return checkerproxy_list
def parse_checkerproxy(self, html):
''' Only get elite proxies from checkerproxy '''
ips = []
soup = BeautifulSoup(html)
for tr in soup.findAll('tr'):
if len(tr) == 19:
ip_found = False
elite = False
ip_port = None
tds = tr.findAll('td')
for td in tds:
if ':' in td.text:
ip_found = True
ip_port_re = re.match('(\d{1,3}\.){3}\d{1,3}:\d{1,5}', td.text)
if ip_port_re:
ip_port = ip_port_re.group()
if not ip_port:
ip_found = False
if 'Elite' in td.text:
elite = True
if ip_found == True and elite == True:
ips.append(str(ip_port))
break
return ips
def letushide_req(self):
''' Make the request to the proxy site and create a master list from that site '''
letushide_ips = []
for i in xrange(1,20): # can search maximum of 20 pages
try:
url = 'http://letushide.com/filter/http,hap,all/%s/list_of_free_HTTP_High_Anonymity_proxy_servers' % str(i)
r = requests.get(url, headers=self.headers)
html = r.text
ips = self.parse_letushide(html)
# Check html for a link to the next page
if '/filter/http,hap,all/%s/list_of_free_HTTP_High_Anonymity_proxy_servers' % str(i+1) in html:
pass
else:
letushide_ips.append(ips)
break
letushide_ips.append(ips)
except:
print '[!] Failed get reply from %s' % url
break
# Flatten list of lists (1 list containing 1 list of ips for each page)
letushide_list = [item for sublist in letushide_ips for item in sublist]
return letushide_list
def parse_letushide(self, html):
''' Parse out list of IP:port strings from the html '''
# \d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3} - matches IP addresses
# </a></td><td> - is in between the IP and the port
# .*?< - match all text (.) for as many characters as possible (*) but don't be greedy (?) and stop at the next greater than (<)
raw_ips = re.findall('\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}</a></td><td>.*?<', html)
ips = []
for ip in raw_ips:
ip = ip.replace('</a></td><td>', ':')
ip = ip.strip('<')
ips.append(ip)
return ips
def gatherproxy_req(self):
url = 'http://gatherproxy.com/proxylist/anonymity/?t=Elite'
try:
r = requests.get(url, headers = self.headers)
lines = r.text.splitlines()
except:
print '[!] Failed get reply from %s' % url
gatherproxy_list = []
return gatherproxy_list
gatherproxy_list = self.parse_gp(lines)
return gatherproxy_list
def parse_gp(self, lines):
''' Parse the raw scraped data '''
gatherproxy_list = []
for l in lines:
if 'proxy_ip' in l.lower():
l = l.replace('gp.insertPrx(', '')
l = l.replace(');', '')
l = l.replace('null', 'None')
l = l.strip()
l = ast.literal_eval(l)
proxy = '%s:%s' % (l["PROXY_IP"], l["PROXY_PORT"])
gatherproxy_list.append(proxy)
#ctry = l["PROXY_COUNTRY"]
return gatherproxy_list
def proxy_checker(self):
''' Concurrency stuff here '''
jobs = [gevent.spawn(self.proxy_checker_req, proxy) for proxy in self.proxy_list]
try:
gevent.joinall(jobs)
except KeyboardInterrupt:
sys.exit('[-] Ctrl-C caught, exiting')
def proxy_checker_req(self, proxy):
''' See how long each proxy takes to open each URL '''
proxyip = str(proxy.split(':', 1)[0])
# A lot of proxy checker sites give a different final octet for some reason
#proxy_split = proxyip.split('.')
#first_3_octets = '.'.join(proxy_split[:3])+'.'
results = []
urls = ['http://danmcinerney.org/ip.php', 'http://myip.dnsdynamic.org', 'https://www.astrill.com/what-is-my-ip-address.php', 'http://danmcinerney.org/headers.php']
for url in urls:
try:
check = requests.get(url,
headers = self.headers,
proxies = {'http':'http://'+proxy,
'https':'http://'+proxy},
timeout = 15)
time_or_error = str(check.elapsed)
html = check.text
time_or_error = self.html_handler(time_or_error, html, url)
url = self.url_shortener(url)
results.append((time_or_error, proxy, url))
except Exception as e:
time_or_error = self.error_handler(str(e))
url = self.url_shortener(url)
results.append((time_or_error, proxy, url))
self.print_handler(results, proxyip)
def html_handler(self, time_or_error, html, url):
''' Check the html for errors and if none are found return time to load page '''
html_lines = html.splitlines()
leng = len(html_lines)
ipre = '(?:[0-9]{1,3}\.){3}[0-9]{1,3}'
# Both of these urls just return the ip and nothing else
if url in ['http://danmcinerney.org/ip.php', 'http://myip.dnsdynamic.org']:
if leng == 1: # Should return 1 line of html
match = re.match(ipre, html)
if match:
if self.externalip in html:
time_or_error = 'Err: Page loaded; proxy failed'
else:
time_or_error = 'Err: Page loaded; proxy failed'
else:
time_or_error = 'Err: Page loaded; proxy failed'
return time_or_error
# This is the SSL page
if 'astrill' in url:
soup = BeautifulSoup(html)
ip = soup.find("td", { "colspan": 2 }).text # the ip is the only on with colspan = 2
match = re.match(ipre, ip)
if match:
if self.externalip in ip:
time_or_error = 'Err: Page loaded; proxy failed'
else:
time_or_error = 'Err: Page loaded; proxy failed'
return time_or_error
if '/headers' in url:
# check for proxy headers
proxy_headers = ['via: ', 'forwarded: ', 'x-forwarded-for', 'client-ip']
if leng > 15: # 15 is arbitrary, I just don't think you'll ever see more than 15 headers
time_or_error = 'Err: headers not returned'
return time_or_error
for l in html_lines:
for h in proxy_headers:
if h in l.lower():
time_or_error = 'Err: Proxy headers found'
return time_or_error
time_or_error = 'Passed: elite proxy'
return time_or_error
def print_handler(self, results, proxyip):
if self.show_all:
country_code = self.get_country_code(proxyip)
self.printer(results, country_code)
self.print_counter += 1
else:
passed_all = self.passed_all_tests(results)
if passed_all:
country_code = self.get_country_code(proxyip)
self.printer(results, country_code)
self.print_counter += 1
if self.show_num:
self.limiter()
def printer(self, results, country_code):
''' Creates the output '''
counter = 0
if not self.quiet:
print '--------------------------------------------------------------------'
for r in results:
counter += 1
time_or_error = r[0]
proxy = r[1]
url = r[2]
if self.quiet:
if counter % 4 == 0: #################### THIS results is a list of 4 tuples each, so proxies will repeat 4 times
#print proxy
global listz
listz.append(proxy)
else:
# Only print the proxy once, on the second print job
if counter == 1:
print '%s | %s | %s | %s' % (proxy.ljust(21), country_code.ljust(3), url.ljust(21), time_or_error)
else:
print '%s | %s | %s | %s' % (' '.ljust(21), ' ', url.ljust(21), time_or_error)
def get_country_code(self, proxyip):
''' Get the 3 letter country code of the proxy using geoiptool.com
Would use the geoip library, but it requires a local DB and what
is the point of that hassle other than marginal speed improvement '''
cc_line_found = False
cc = 'N/A'
try:
r = requests.get('http://www.geoiptool.com/en/?IP=%s' % proxyip, headers=self.headers)
html = r.text
html_lines = html.splitlines()
for l in html_lines:
if cc_line_found == True:
cc = l.split('(', 1)[1].split(')', 1)[0]
break
if 'country code:' in l.lower():
cc_line_found = True
except:
pass
return cc
def error_handler(self, e):
if 'Cannot connect' in e:
time_or_error = 'Err: Cannot connect to proxy'
elif 'timed out' in e.lower():
time_or_error = 'Err: Timed out'
elif 'retries exceeded' in e:
time_or_error = 'Err: Max retries exceeded'
elif 'Connection reset by peer' in e:
time_or_error = 'Err: Connection reset by peer'
elif 'readline() takes exactly 1 argument (2 given)' in e:
time_or_error = 'Err: SSL error'
else:
time_or_error = 'Err: ' + e
return time_or_error
def url_shortener(self, url):
if 'ip.php' in url:
url = 'danmcinerney.org'
elif 'headers.php' in url:
url = 'Header check'
elif 'dnsdynamic' in url:
url = 'dnsdynamic.org'
elif 'astrill' in url:
url = 'https://astrill.com'
return url
def passed_all_tests(self, results):
for r in results:
time_or_error= r[0]
if 'Err:' in time_or_error:
global testx
testx = 50
return False
return True
def limiter(self):
testx = 0
''' Kill the script if user supplied limit of successful proxy attempts (-s argument) is reached '''
if self.print_counter >= int(self.show_num):
sys.exit()

Which is the right way of recovering from a requests.exceptions.ConnectionError?

I am scraping a web site, but sometimes the laptop loses the connection and I get (obviously) a requests.exceptions.ConnectionError. What is the right (or most elegant) way to recover from this error? I mean: I don't want the program to stop, but to retry the connection, maybe some seconds later. This is my code, but I have the feeling it is not correct:
def make_soup(session, url):
    try:
        n = randint(1, MAX_NAPTIME)
        sleep(n)
        response = session.get(url)
    except requests.exceptions.ConnectionError as req_ce:
        error_msg = req_ce.args[0].reason.strerror
        print "Error: %s con la url %s" % (error_msg, url)
        session = logout(session)
        n = randint(MIN_SLEEPTIME, MAX_SLEEPTIME)
        sleep(n)
        session = login(session)
        response = session.get(url)
    soup = BeautifulSoup(response.text)
    return soup
Any ideas?
Note that I need a session to scrape these pages, so I think that the login (i.e. logging in to the site again after a logout) could cause trouble.
So why not something like
import requests
import time

def retry(cooloff=5, exc_type=None):
    if not exc_type:
        exc_type = [requests.exceptions.ConnectionError]

    def real_decorator(function):
        def wrapper(*args, **kwargs):
            while True:
                try:
                    return function(*args, **kwargs)
                except Exception as e:
                    if e.__class__ in exc_type:
                        print "failed (?)"
                        time.sleep(cooloff)
                    else:
                        raise e
        return wrapper

    return real_decorator
Which is a decorator that allows you to call any function until it succeeds. e.g.
@retry(exc_type=[ZeroDivisionError])
def test():
    return 1/0

print test()
Which will just print "failed (?)" every 5 seconds until the end of time (or until the laws of math change).
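Applied to the asker's function, the usage could look roughly like this (MIN_SLEEPTIME is the constant from the question; note the decorator keeps retrying forever rather than a fixed number of times):

@retry(cooloff=MIN_SLEEPTIME, exc_type=[requests.exceptions.ConnectionError])
def make_soup(session, url):
    # Any ConnectionError here triggers a cooloff sleep and another attempt
    response = session.get(url)
    return BeautifulSoup(response.text)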
Is it really necessary to log out and back in to your session? I'd just retry the connection the same way:
def make_soup(session, url):
    success = False
    response = None
    for attempt in range(1, MAXTRIES):
        try:
            response = session.get(url)
            # If session.get succeeded, we break out of the
            # for loop after setting a success flag
            success = True
            break
        except requests.exceptions.ConnectionError as req_ce:
            error_msg = req_ce.args[0].reason.strerror
            print "Error: %s con la url %s" % (error_msg, url)
            print " Attempt %s of %s" % (attempt, MAXTRIES)
            sleep(randint(MIN_SLEEPTIME, MAX_SLEEPTIME))

    # Figure out if we were successful.
    # Note it may not be needed to have a flag, you can maybe just
    # check the value of response here.
    if not success:
        print "Couldn't get it after retrying many times"
        return None

    # Once we get here, we know we got a good response
    soup = BeautifulSoup(response.text)
    return soup
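As the comments above hint, the success flag can also be dropped. One possible variant (a sketch reusing the same names from the answer, not part of the original answer) uses Python's for/else, where the else branch only runs if the loop never hit break:

def make_soup(session, url):
    for attempt in range(1, MAXTRIES):
        try:
            response = session.get(url)
            break
        except requests.exceptions.ConnectionError:
            sleep(randint(MIN_SLEEPTIME, MAX_SLEEPTIME))
    else:
        # Only reached if every attempt failed (no break)
        print "Couldn't get it after retrying many times"
        return None
    return BeautifulSoup(response.text)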
