Download and handle errors - python

I've been using a function that I took from the book Web Scraping with Python, published by O'Reilly and written by Ryan Mitchell:
import sys
import os.path
import socket
import random
import urllib2
import contextlib
import diskCache
import logging as logger
from bs4 import BeautifulSoup

DEFAULT_AGENT = 'Mozilla/5.0 Firefox/56.0'
DEFAULT_DELAY = 3
DEFAULT_RETRIES = 10
DEFAULT_TIMEOUT = 60

socket.setdefaulttimeout(DEFAULT_TIMEOUT)

def download(url, delay=DEFAULT_DELAY, user_agent=DEFAULT_AGENT, proxies=None,
             cache=None, num_retries=DEFAULT_RETRIES, timeout=DEFAULT_TIMEOUT, data=None):
    result = None
    if cache:
        try:
            result = cache[url]
        except KeyError:
            # url is not available in cache
            pass
    if result is not None and result['code'] is not None \
            and num_retries > 0 and 500 <= result['code'] < 600:
        # server error so ignore result from cache and re-download
        result = None
    if result is None:
        proxy = random.choice(proxies) if proxies else None
        headers = {'User-agent': user_agent}
        result = call(url, headers, proxy=proxy, num_retries=num_retries, cache=cache)
        if cache:
            # save result to cache
            cache[url] = result
    return result['html']

def call(url, headers, proxy, num_retries, cache=None, data=None):
    request = urllib2.Request(url, data, headers or {})
    try:
        logger.info('Downloading: %s', url)
        with contextlib.closing(urllib2.urlopen(request)) as connection:
            html = connection.read()
            code = connection.getcode()
    except Exception as e:
        logger.exception('Download error: %s', str(e))
        if cache:
            del cache[url]
        html = None
        if hasattr(e, 'code'):
            code = e.code
            if num_retries > 0 and 500 <= code < 600:
                # retry server errors
                return call(url, headers, proxy, num_retries - 1, cache=cache, data=data)
        else:
            code = None
    return {'html': html, 'code': code}
I wanted to know if there is a simpler way of handling errors when downloading URLs. I've seen that the requests library is higher level and easier to use, so maybe it could simplify this. At the very least, how would this code look in Python 3?
It would be something like this:
"""Functions used by the fetch module"""

# Standard library imports
import time
import socket
import logging as logger
from typing import Dict, Optional

# Third party imports
import requests
from requests.exceptions import HTTPError, Timeout
from bs4 import BeautifulSoup

# Constants
DEFAULT_AGENT = 'Mozilla/5.0 Firefox/56.0'
DEFAULT_DELAY = 3
DEFAULT_RETRIES = 10
DEFAULT_TIMEOUT = 60

socket.setdefaulttimeout(DEFAULT_TIMEOUT)

def fetch(url: str, retries: int = DEFAULT_RETRIES) -> Optional[Dict]:
    """Download a url."""
    try:
        logger.info('Downloading: %s', url)
        # use the module constants so the defaults actually apply
        resp = requests.get(url, headers={'User-Agent': DEFAULT_AGENT},
                            timeout=DEFAULT_TIMEOUT)
        resp.raise_for_status()
    except (HTTPError, Timeout) as ex:
        logger.exception("Couldn't download %s", url)
        # raise_for_status() raised, so pull the status code off the exception
        code = ex.response.status_code if isinstance(ex, HTTPError) else None
        if code is not None and retries > 0 and 500 <= code < 600:  # Server error
            logger.info('Retrying download')
            time.sleep(DEFAULT_DELAY)
            return fetch(url, retries - 1)
        return None
    return {'html': resp.text, 'code': resp.status_code}
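A quick usage sketch of fetch (example.com is just a placeholder host):
result = fetch('http://example.com')
if result is not None:
    print(result['code'], len(result['html']))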

As you said, this is a lot easier with requests:
resp = requests.get(url, headers=headers, timeout=timeout)
print(resp.status_code)
print(resp.text)
# for an API use resp.json()
No exception is raised by default for error status codes. You can call resp.raise_for_status() if you do want to raise an exception.
See http://docs.python-requests.org/en/master/user/quickstart/ for details
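For example, a minimal sketch of turning error statuses into exceptions (httpstat.us is a test service that returns whatever status code you ask for):
import requests

try:
    resp = requests.get('http://httpstat.us/503', timeout=5)
    resp.raise_for_status()  # raises requests.exceptions.HTTPError for 4xx/5xx responses
except requests.exceptions.HTTPError as err:
    print('Request failed:', err)
else:
    print(resp.status_code, resp.text)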

Related

How do I use a different session and proxy after every 5 requests in python?

Below is my code, but currently it sometimes takes a different proxy on every new request, and for some reason the requests are taking far longer than usual to return a response.
I also have no way of knowing whether the session stays the same for all five requests or changes on every new request.
I'm rotating proxies using next(proxy_cycle), and I'm hitting the endpoint like this in Postman: http://127.0.0.1:8000/test/http://ipinfo.io/json
I don't know what exactly I'm doing wrong.
from fastapi import FastAPI
import requests
from bs4 import BeautifulSoup
import lxml
import asyncio
import uvicorn
from itertools import cycle

app = FastAPI()
counter_lock = asyncio.Lock()
counter = 0

def make_5_req_per_session(url, proxy, session):
    try:
        response = session.get(url, proxies=proxy, timeout=10)
        if response.status_code == 200:
            return response.json()
        else:
            return None
    except Exception as e:
        # print(e)
        print("dead proxy")
        return None

def load_proxies():
    # read proxies from a text file
    with open("proxies.txt", "r") as proxies_file:
        list_of_proxies = []
        for proxy in proxies_file:
            list_of_proxies.append(proxy.strip())
        return list_of_proxies

list_of_proxies = load_proxies()
proxy_cycle = cycle(list_of_proxies)

# first time proxy and session
proxy = {"http": next(proxy_cycle)}
session = requests.Session()

@app.get("/test/{url:path}")
async def hello_world(
    url,
    proxy=proxy,
    session=session,
):
    global counter
    async with counter_lock:
        counter += 1
        if counter == 5:
            counter = 0
            # make new proxy and session after 5 requests
            proxy = {"http": next(proxy_cycle)}
            session = requests.Session()
        data = make_5_req_per_session(url, proxy, session)
        # if the response is None, then try again with new proxy
        while data is None:
            proxy = {"http": next(proxy_cycle)}
            session = requests.Session()
            data = make_5_req_per_session(url, proxy, session)
        return {"url": f" {counter} {proxy} {data} "}

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)
These are my proxies in "proxies.txt":
111.21.183.58:9091
112.105.59.218:80
110.164.3.7:8888
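One thing worth noting: default argument values in Python are evaluated once, when the function is defined, so proxy=proxy and session=session capture the first proxy and session forever; rebinding those names inside the handler only changes that one call's locals. A minimal sketch that keeps the rotating state at module level instead (same FastAPI setup as above; the handler name and state dict are illustrative, not from the original):
from itertools import cycle
import asyncio
import requests
from fastapi import FastAPI

app = FastAPI()
counter_lock = asyncio.Lock()
proxy_cycle = cycle(["111.21.183.58:9091", "112.105.59.218:80", "110.164.3.7:8888"])

# shared rotating state lives at module level, not in default arguments
state = {"proxy": {"http": next(proxy_cycle)}, "session": requests.Session(), "count": 0}

@app.get("/test/{url:path}")
async def proxied(url: str):
    async with counter_lock:
        state["count"] += 1
        if state["count"] >= 5:  # rotate proxy and session every 5 requests
            state["count"] = 0
            state["proxy"] = {"http": next(proxy_cycle)}
            state["session"] = requests.Session()
        proxy, session = state["proxy"], state["session"]
    resp = session.get(url, proxies=proxy, timeout=10)
    return {"proxy": proxy, "status": resp.status_code}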

Urllib syntax translation from python 3.5 to 2.7

I have a piece of code which was written in Python 3.5 and uses the urllib module. Now, I tried to convert this so that it will work with Python 2.7, but I get some errors from the urllib module.
E.g.:
Traceback (most recent call last):
File "alert.py", line 13, in <module>
import urllib.request as urllib
ImportError: No module named request
Now, I know that the urllib package was reorganized between Python 2 and 3, so I'm coming here to ask for some help with the lines that use urllib.
import urllib.request as urllib
from http.cookiejar import CookieJar
from os.path import isfile
from os.path import join as joinPath
from sys import exc_info
from traceback import print_tb
from urllib.parse import urlencode

# constant
APPLICATION_PATH = '/srv/path/'
ALERT_POINT_PATH = joinPath(APPLICATION_PATH, 'alert_contact')
URL_REQUEST_TIMEOUT = 42
SMS_BOX_URL = 'xx.xxxx.xxx.xxx'

def initWebConnection():  # init web connection
    response = 0
    initUrlLibResponse = initUrlLib()  # init urllib
    if initUrlLibResponse:
        response = 1
    return response

def initUrlLib():  # init urllib
    response = 0
    try:
        cookieJar = CookieJar()  # cookies
        opener = urllib.build_opener(urllib.HTTPCookieProcessor(cookieJar))
        urllib.install_opener(opener)
    except Exception as e:
        response = 1
        # ex_type, ex, tb = exc_info()
    return response

def urlRequest(url, data=None):  # make url request
    contentResponse = None
    try:
        request = None
        if data:
            dataRequest = urlencode(data)
            dataRequest = dataRequest.encode('UTF-8')
            request = urllib.Request(url, dataRequest)
        else:
            request = urllib.Request(url)
        response = urllib.urlopen(url=request, timeout=URL_REQUEST_TIMEOUT)  # make request
        # get response
        contentResponse = response.read()
    except Exception as e:
        contentResponse = None
        # ex_type, ex, tb = exc_info()
    return contentResponse

try:
    evt.data = 'Some name'
    # check production state
    isInProduction = False
    if evt.prodState == 1000:
        isInProduction = True
    if isInProduction:
        initWebConnection()
        # check alert point
        if isfile(ALERT_POINT_PATH):
            alertContactContent = None
            with open(ALERT_POINT_PATH, 'r') as alertContactFile:
                alertContactContent = alertContactFile.read()
            alertContactContent = alertContactContent.splitlines()
            if alertContactContent:
                evt.summary = '#[ DNS: ALERT ]# {}'.format(evt.summary)
                for alertContactContentLine in alertContactContent:
                    webRequestData = dict(
                        ## TO DO: set the url parameters appropriately
                        phone=alertContactContentLine,
                        message='NEW ALERT: {}'.format(evt.ipAddress),
                    )
                    webRequestResponse = urlRequest(SMS_BOX_URL, webRequestData)
            else:
                evt.summary = '#[ ERROR: SMS ALERT NO CONTACT ]# {}'.format(evt.summary)
except Exception as e:
    ex_type, ex, tb = exc_info()
    print('\n #[ERROR]#exception: {ex}\n'.format(ex=e))
    print('\n #[ERROR]#exception traceback: {trace}\n'.format(trace=print_tb(tb)))
    evt.summary = '#[ DNS:ERROR traceback in event message ]# {}'.format(evt.summary)
    evt.message = '#[ DNS:ERROR ex_type:\n {} \nex: {} \n traceback:\n {} \n]# {}'.format(ex_type, ex,
                                                                                          print_tb(tb),
                                                                                          evt.message)
You can change the import lines from
import urllib.request as urllib
from http.cookiejar import CookieJar
from urllib.parse import urlencode
to
import urllib2 as urllib
from cookielib import CookieJar
from urllib import urlencode
for Python 2.7
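If the same file has to run under both versions, one common pattern (just a sketch, not something the question requires) is to try the Python 3 names first and fall back to the Python 2 ones:
try:  # Python 3
    import urllib.request as urllib
    from http.cookiejar import CookieJar
    from urllib.parse import urlencode
except ImportError:  # Python 2.7
    import urllib2 as urllib
    from cookielib import CookieJar
    from urllib import urlencode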

How to implement retry mechanism into python requests library?

I would like to add a retry mechanism to the python requests library, so scripts that use it will retry on non-fatal errors.
At this moment I consider three kinds of errors to be recoverable:
HTTP return codes 502, 503, 504
host not found (less important now)
request timeout
At the first stage, I want to retry the specified 5xx errors every minute.
I want to be able to add this functionality transparently, without having to manually implement recovery for each HTTP call made from inside these scripts or libraries that are using python-requests.
This snippet of code will make all HTTP requests from the same session retry for a total of 5 times, sleeping between retries with an increasing backoff of 0s, 2s, 4s, 8s, 16s (the first retry is done immediately). It will retry on basic connectivity issues (including DNS lookup failures), and HTTP status codes of 502, 503 and 504.
import logging
import requests
from requests.adapters import HTTPAdapter, Retry
logging.basicConfig(level=logging.DEBUG)
s = requests.Session()
retries = Retry(total=5, backoff_factor=1, status_forcelist=[ 502, 503, 504 ])
s.mount('http://', HTTPAdapter(max_retries=retries))
s.get("http://httpstat.us/503")
See Retry class for details.
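Note that the adapter above is only mounted for plain http:// URLs; if you also call https endpoints, mount the same retry configuration for that prefix as well:
s.mount('https://', HTTPAdapter(max_retries=retries))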
This is a snippet of code I used to retry the requests made with urllib2. Maybe you could use it for your purposes:
retries = 1
success = False
while not success:
    try:
        response = urllib2.urlopen(request)
        success = True
    except Exception as e:
        wait = retries * 30
        print 'Error! Waiting %s secs and re-trying...' % wait
        sys.stdout.flush()
        time.sleep(wait)
        retries += 1
The waiting time grows incrementally to avoid being banned by the server.
Possible solution using the retrying package:
from retrying import retry
from requests.exceptions import ConnectionError  # requests raises its own ConnectionError
import requests

def retry_if_connection_error(exception):
    """ Specify an exception you need. or just True"""
    #return True
    return isinstance(exception, ConnectionError)

# if exception retry with 2 second wait
@retry(retry_on_exception=retry_if_connection_error, wait_fixed=2000)
def safe_request(url, **kwargs):
    return requests.get(url, **kwargs)

response = safe_request('http://test.com')
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

MAX_RETRY = 2
MAX_RETRY_FOR_SESSION = 2
BACK_OFF_FACTOR = 0.3
TIME_BETWEEN_RETRIES = 1000
ERROR_CODES = (500, 502, 504)

def requests_retry_session(retries=MAX_RETRY_FOR_SESSION,
                           back_off_factor=BACK_OFF_FACTOR,
                           status_force_list=ERROR_CODES,
                           session=None):
    session = session or requests.Session()
    retry = Retry(total=retries, read=retries, connect=retries,
                  backoff_factor=back_off_factor,
                  status_forcelist=status_force_list,
                  method_whitelist=frozenset(['GET', 'POST']))  # renamed allowed_methods in urllib3 >= 1.26
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session

class ConfigService:
    def __init__(self):
        self.session = requests_retry_session(session=requests.Session())

    def call_to_api(self):
        config_url = 'http://localhost:8080/predict/'
        headers = {
            "Content-Type": "application/json",
            "x-api-key": self.x_api_key
        }
        response = self.session.get(config_url, headers=headers)
        return response
I was able to obtain the desired level of reliability by extending the requests.Session class.
Here is the code https://bitbucket.org/bspeakmon/jira-python/src/a7fca855394402f58507ca4056de87ccdbd6a213/jira/resilientsession.py?at=master
EDIT That code was:
from requests import Session
from requests.exceptions import ConnectionError
import logging
import time

class ResilientSession(Session):
    """
    This class is supposed to retry requests that do return temporary errors.

    At this moment it supports: 502, 503, 504
    """

    def __recoverable(self, error, url, request, counter=1):
        if hasattr(error, 'status_code'):
            if error.status_code in [502, 503, 504]:
                error = "HTTP %s" % error.status_code
            else:
                return False
        DELAY = 10 * counter
        logging.warn("Got recoverable error [%s] from %s %s, retry #%s in %ss" % (error, request, url, counter, DELAY))
        time.sleep(DELAY)
        return True

    def get(self, url, **kwargs):
        counter = 0
        while True:
            counter += 1
            try:
                r = super(ResilientSession, self).get(url, **kwargs)
            except ConnectionError as e:
                r = e.message
            if self.__recoverable(r, url, 'GET', counter):
                continue
            return r

    def post(self, url, **kwargs):
        counter = 0
        while True:
            counter += 1
            try:
                r = super(ResilientSession, self).post(url, **kwargs)
            except ConnectionError as e:
                r = e.message
            if self.__recoverable(r, url, 'POST', counter):
                continue
            return r

    def delete(self, url, **kwargs):
        counter = 0
        while True:
            counter += 1
            try:
                r = super(ResilientSession, self).delete(url, **kwargs)
            except ConnectionError as e:
                r = e.message
            if self.__recoverable(r, url, 'DELETE', counter):
                continue
            return r

    def put(self, url, **kwargs):
        counter = 0
        while True:
            counter += 1
            try:
                r = super(ResilientSession, self).put(url, **kwargs)
            except ConnectionError as e:
                r = e.message
            if self.__recoverable(r, url, 'PUT', counter):
                continue
            return r

    def head(self, url, **kwargs):
        counter = 0
        while True:
            counter += 1
            try:
                r = super(ResilientSession, self).head(url, **kwargs)
            except ConnectionError as e:
                r = e.message
            if self.__recoverable(r, url, 'HEAD', counter):
                continue
            return r

    def patch(self, url, **kwargs):
        counter = 0
        while True:
            counter += 1
            try:
                r = super(ResilientSession, self).patch(url, **kwargs)
            except ConnectionError as e:
                r = e.message
            if self.__recoverable(r, url, 'PATCH', counter):
                continue
            return r

    def options(self, url, **kwargs):
        counter = 0
        while True:
            counter += 1
            try:
                r = super(ResilientSession, self).options(url, **kwargs)
            except ConnectionError as e:
                r = e.message
            if self.__recoverable(r, url, 'OPTIONS', counter):
                continue
            return r
A method to retry certain logic if some exception has occurred, at time intervals t1=1 sec, t2=2 sec, t3=4 sec.
We can increase or decrease the time interval as well.
import time

MAX_RETRY = 3
retries = 0
while True:
    try:
        call_to_api()  # some business logic goes here
        break
    except Exception as exception:
        retries += 1
        if retries <= MAX_RETRY:
            print("ERROR=Method failed. Retrying ... #%s" % retries)
            time.sleep(1 << (retries - 1))  # retry happens after a time that is a power of 2
        else:
            raise Exception(exception)

Sequential requests using python-requests

Right now I'm using Flask, and I'm having trouble trying to do more than one GET request using the python requests module.
If I try to send a series of requests, the first one is completed successfully, but the other ones throw a timeout exception.
Here is part of the view's code:
import requests

sess = requests.Session()
site_url = 'http://www.example.com/api/'
steps = ['first_step', 'second_step', 'third_step']
step_responses = dict()
for s in steps:
    try:
        req = sess.get(site_url + s, timeout=5)
    except requests.exceptions.Timeout:
        return jsonify({'result': False, 'error': 'timeout'})
    except requests.exceptions.ConnectionError:
        return jsonify({'result': False, 'error': 'connection_error'})
    else:
        step_responses[s] = True
If I extract this part into a standalone .py file, it completes successfully.
import requests

sess = requests.Session()
site_url = 'http://www.example.com/api/'
steps = ['first_step', 'second_step', 'third_step']
step_responses = dict()
for s in steps:
    try:
        req = sess.get(site_url + s, timeout=5)
    except requests.exceptions.Timeout:
        step_responses[s] = 'timeout'
    except requests.exceptions.ConnectionError:
        step_responses[s] = 'conn_error'
    else:
        step_responses[s] = 'ok'
print step_responses
Works for me. You may want to check the second and third steps
import requests

sess = requests.Session()

def module():
    site_url = 'http://stackoverflow.com/'
    steps = ['users', 'questions', 'tags']
    step_responses = dict()
    for s in steps:
        try:
            req = sess.get(site_url + s, timeout=5)
        except requests.exceptions.Timeout:
            return jsonify({'result': False, 'error': 'timeout'})
        except requests.exceptions.ConnectionError:
            return jsonify({'result': False, 'error': 'connection_error'})
        else:
            step_responses[s] = True
You might want to make sure that you read all the values from the req object.
I think you might need req.text and req.status_code or req.content
Check half-way down the page here: http://docs.python-requests.org/en/latest/api/#request-sessions where they discuss session parameters
"class requests.adapters.HTTPAdapter(pool_connections=10, pool_maxsize=10, max_retries=0, pool_block=False)"
I'm not at all sure how to use connection pools and so forth, but the docs do say (http://docs.python-requests.org/en/latest/user/advanced/, look for Keep-Alive):
"Note that connections are only released back to the pool for reuse once all body data has been read; be sure to either set stream to False or read the content property of the Response object."

Buildbot force build via python

Just a little question: is it possible to force a build in Buildbot via a python script or the command line (and not via the web interface)?
Thank you!
If you have a PBChangeSource configured in your master.cfg, you can send a change from the command line:
buildbot sendchange --master {MASTERHOST}:{PORT} --auth {USER}:{PASS}
--who {USER} {FILENAMES..}
You can make a python script using the urllib2 or requests library to simulate a POST to the web UI:
import urllib2
import urllib
import cookielib
import uuid
import unittest
import sys
from StringIO import StringIO

class ForceBuildApi():
    MAX_RETRY = 3

    def __init__(self, server):
        self.server = server
        cookiejar = cookielib.CookieJar()
        self.urlOpener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookiejar))

    def login(self, user, passwd):
        data = urllib.urlencode(dict(username=user,
                                     passwd=passwd))
        url = self.server + "login"
        request = urllib2.Request(url, data)
        res = self.urlOpener.open(request).read()
        if res.find("The username or password you entered were not correct") > 0:
            raise Exception("invalid password")

    def force_build(self, builder, reason, **kw):
        """Create a buildbot build request

        several attempts are created in case of errors
        """
        reason = reason + " ID=" + str(uuid.uuid1())
        kw['reason'] = reason
        data_str = urllib.urlencode(kw)
        url = "%s/builders/%s/force" % (self.server, builder)
        print url
        request = urllib2.Request(url, data_str)
        file_desc = None
        for i in xrange(self.MAX_RETRY):
            try:
                file_desc = self.urlOpener.open(request)
                break
            except Exception as e:
                print >>sys.stderr, "error when doing force build", e
        if file_desc is None:
            print >>sys.stderr, "too many errors, giving up"
            return None
        for line in file_desc:
            if 'alert' in line:
                print >>sys.stderr, "invalid arguments", url, data_str
                return None
            if 'Authorization Failed' in line:
                print >>sys.stderr, "Authorization Failed"
                return
        return reason

class ForceBuildApiTest(unittest.TestCase):
    def setUp(self):
        from mock import Mock  # pip install mock for test
        self.api = ForceBuildApi("server/")
        self.api.urlOpener = Mock()
        urllib2.Request = Mock()
        uuid.uuid1 = Mock()
        uuid.uuid1.return_value = "myuuid"
        sys.stderr = StringIO()

    def test_login(self):
        from mock import call
        self.api.login("log", "pass")
        self.assertEquals(len(self.api.urlOpener.open.call_args_list), 1)
        req = urllib2.Request.call_args_list
        self.assertEquals([call('server/login', 'passwd=pass&username=log')], req)

    def test_force(self):
        from mock import call
        self.api.urlOpener.open.return_value = ["blabla"]
        r = self.api.force_build("builder1", reason="reason", param1="foo", param2="bar")
        self.assertEquals(len(self.api.urlOpener.open.call_args_list), 1)
        req = urllib2.Request.call_args_list
        self.assertEquals([call('server//builders/builder1/force', 'reason=reason+ID%3Dmyuuid&param2=bar&param1=foo')], req)
        self.assertEquals(r, "reason ID=myuuid")

    def test_force_fail1(self):
        from mock import call
        self.api.urlOpener.open.return_value = ["alert bla"]
        r = self.api.force_build("builder1", reason="reason", param1="foo", param2="bar")
        self.assertEquals(len(self.api.urlOpener.open.call_args_list), 1)
        req = urllib2.Request.call_args_list
        self.assertEquals([call('server//builders/builder1/force', 'reason=reason+ID%3Dmyuuid&param2=bar&param1=foo')], req)
        self.assertEquals(sys.stderr.getvalue(), "invalid arguments server//builders/builder1/force reason=reason+ID%3Dmyuuid&param2=bar&param1=foo\n")
        self.assertEquals(r, None)

    def test_force_fail2(self):
        from mock import call
        def raise_exception(*a, **kw):
            raise Exception("oups")
        self.api.urlOpener.open = raise_exception
        r = self.api.force_build("builder1", reason="reason", param1="foo", param2="bar")
        req = urllib2.Request.call_args_list
        self.assertEquals([call('server//builders/builder1/force', 'reason=reason+ID%3Dmyuuid&param2=bar&param1=foo')], req)
        self.assertEquals(sys.stderr.getvalue(), "error when doing force build oups\n"*3 + "too many errors, giving up\n")
        self.assertEquals(r, None)

    def test_force_fail3(self):
        from mock import call
        self.api.urlOpener.open.return_value = ["bla", "blu", "Authorization Failed"]
        r = self.api.force_build("builder1", reason="reason", param1="foo", param2="bar")
        req = urllib2.Request.call_args_list
        self.assertEquals([call('server//builders/builder1/force', 'reason=reason+ID%3Dmyuuid&param2=bar&param1=foo')], req)
        self.assertEquals(sys.stderr.getvalue(), "Authorization Failed\n")
        self.assertEquals(r, None)

if __name__ == '__main__':
    unittest.main()
