Right now I'm using Flask, and I'm having trouble making more than one GET request with the Python requests module.
If I send a series of requests, the first one completes successfully, but the subsequent ones throw a timeout exception.
Here is part of the view's code:
import requests

sess = requests.Session()
site_url = 'http://www.example.com/api/'
steps = ['first_step', 'second_step', 'third_step']
step_responses = dict()
for s in steps:
    try:
        req = sess.get(site_url + s, timeout=5)
    except requests.exceptions.Timeout:
        return jsonify({'result': False, 'error': 'timeout'})
    except requests.exceptions.ConnectionError:
        return jsonify({'result': False, 'error': 'connection_error'})
    else:
        step_responses[s] = True
If I extract this part into a standalone .py file, it completes successfully.
import requests

sess = requests.Session()
site_url = 'http://www.example.com/api/'
steps = ['first_step', 'second_step', 'third_step']
step_responses = dict()
for s in steps:
    try:
        req = sess.get(site_url + s, timeout=5)
    except requests.exceptions.Timeout:
        step_responses[s] = 'timeout'
    except requests.exceptions.ConnectionError:
        step_responses[s] = 'conn_error'
    else:
        step_responses[s] = 'ok'
print(step_responses)
Works for me. You may want to check the second and third steps:
import requests
from flask import jsonify  # jsonify comes from Flask, as in the question

sess = requests.Session()

def module():
    site_url = 'http://stackoverflow.com/'
    steps = ['users', 'questions', 'tags']
    step_responses = dict()
    for s in steps:
        try:
            req = sess.get(site_url + s, timeout=5)
        except requests.exceptions.Timeout:
            return jsonify({'result': False, 'error': 'timeout'})
        except requests.exceptions.ConnectionError:
            return jsonify({'result': False, 'error': 'connection_error'})
        else:
            step_responses[s] = True
You might want to make sure that you read all the values from the req object.
I think you might need req.text and req.status_code or req.content
Check half-way down the page here: http://docs.python-requests.org/en/latest/api/#request-sessions where they discuss session parameters
"class requests.adapters.HTTPAdapter(pool_connections=10, pool_maxsize=10, max_retries=0, pool_block=False)"
I'm not at all sure how to use connection pools and so forth, but the docs (http://docs.python-requests.org/en/latest/user/advanced/, look for "Keep-Alive") do say:
"Note that connections are only released back to the pool for reuse once all body data has been read; be sure to either set stream to False or read the content property of the Response object."
I'm new to Python, and I don't see much information on Stack Overflow about paginating with the links method. The loop works perfectly in that it pulls all the data I want, but it only breaks when there's a timeout error after my Mac falls asleep. Sometimes it runs for 2 hours until my Mac sleeps. I'm wondering if there's a faster way to retrieve this data. Here is my Python script:
import requests
import pandas as pd

res = []
url = "https://horizon.stellar.org/accounts/GCQJVAXWHB23WBNIG7TWEWHWUGGB6HWBC2ASPF5HMSADO5R5UKI4T7SD/trades"
querystring = {"limit": "200"}

try:
    while True:
        response = requests.request("GET", url, params=querystring)
        data = response.json()
        res += data['_embedded']['records']
        if "href" not in data['_links']['next']:
            break
        url = data['_links']['next']['href']
except Exception as ex:
    print("Exception:", ex)

df = pd.json_normalize(res)
df.to_csv('stellar_t7sd_trades.csv')
It returns with the following:
Exception: ('Connection aborted.', TimeoutError(60, 'Operation timed out'))
But it returns the desired data into the csv file.
Is there a problem with my loop in that it doesn't properly break when it's done returning the data? I'm just trying to figure out a way so it doesn't run for 2 hours; other than that, I get the desired data.
I solved this by adding a break after n number of loop iterations. This only works because I know exactly how many iterations of the loop will pull the data I need.
import requests
import pandas as pd

res = []
url = "https://horizon.stellar.org/accounts/GCQJVAXWHB23WBNIG7TWEWHWUGGB6HWBC2ASPF5HMSADO5R5UKI4T7SD/trades"
querystring = {"limit": "200"}
n = 32

try:
    while n > 0:  # was: while True
        response = requests.request("GET", url, params=querystring)
        n -= 1
        data = response.json()
        res += data['_embedded']['records']
        if "href" not in data['_links']['next']:
            break
        elif n == 32:
            break
        url = data['_links']['next']['href']
except Exception as ex:
    print("Exception:", ex)

df = pd.json_normalize(res)
df.to_csv('stellar_t7sd_tradestest.csv')
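If the endpoint returns an empty records list once the data is exhausted (which seems to be the case here, since the next link is always present), a sketch that stops on an empty page avoids hard-coding the iteration count:

import requests
import pandas as pd

res = []
url = "https://horizon.stellar.org/accounts/GCQJVAXWHB23WBNIG7TWEWHWUGGB6HWBC2ASPF5HMSADO5R5UKI4T7SD/trades"
querystring = {"limit": "200"}

while True:
    data = requests.get(url, params=querystring).json()
    records = data['_embedded']['records']
    if not records:                      # an empty page means we have read past the last trade
        break
    res += records
    url = data['_links']['next']['href']

df = pd.json_normalize(res)
df.to_csv('stellar_t7sd_trades.csv')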
with open("student.csv", "r") as csv_ledger:
r = csv.DictReader(csv_ledger)
data = [dict(d) for d in r ]
groups = {}
for k, g in groupby(data, lambda r: (r['name'])):
items = []
for i in g:
#data processing
try:
post_api = requests.post(ENDPOINT_URL, json=groups, headers=headers)
except requests.ConnectionError:
print("Something went wrong")
finally:
print("resume post request")
Currently, my code cannot resume the POST request when the internet connection is lost; the try/except above doesn't handle it.
Not sure you can resume the way you want without server-side support as well, but you can retry from the client. Here is a simple blocking example, though you may want to throw it in a thread.
import csv
import socket
from itertools import groupby
from time import sleep

import requests

def is_internet_on():
    try:
        socket.setdefaulttimeout(3)
        # port 53 (DNS) on 1.1.1.1 is usually reachable when the internet is up
        socket.socket(socket.AF_INET, socket.SOCK_STREAM).connect(('1.1.1.1', 53))
        return True
    except OSError:
        return False

def do_call_later(url, headers, body):
    while not is_internet_on():
        sleep(5)
    requests.post(url, json=body, headers=headers)

with open("student.csv", "r") as csv_ledger:
    r = csv.DictReader(csv_ledger)
    data = [dict(d) for d in r]

groups = {}
for k, g in groupby(data, lambda r: (r['name'])):
    items = []
    for i in g:
        # data processing
        pass

try:
    timeout_arg = (
        # first tuple value is the connection timeout:
        # how long to wait before the initial connection is established
        1.0,
        # second tuple value is the read timeout: how long the client
        # will wait after the initial connection before dropping the
        # connection because no response was sent
        1.0
    )
    post_api = requests.post(ENDPOINT_URL, json=groups, headers=headers, timeout=timeout_arg)
except requests.ConnectionError:
    do_call_later(ENDPOINT_URL, headers, groups)
finally:
    print("resume post request")
Edit: docs for the timeout call: https://github.com/kennethreitz/requests/blob/master/requests/api.py#L34
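Since do_call_later blocks while it waits for connectivity, one way to "throw it in a thread" as suggested above could be the following sketch, reusing ENDPOINT_URL, headers and groups from the snippet:

import threading

# run the blocking retry loop in the background so the rest of the program keeps going;
# ENDPOINT_URL, headers and groups are the same names used in the snippet above
retry_thread = threading.Thread(target=do_call_later, args=(ENDPOINT_URL, headers, groups))
retry_thread.start()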
Not sure it can work like this.
The idea behind a resumable request is to send the data in chunks and then assemble them at the backend. Therefore, if a request fails part-way, it can resume later by sending the remaining chunks.
The backend also has to be able to accept the chunks and assemble them.
Take a look at the Python library resumable.
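To illustrate the idea only (this is not the resumable library's API; the /upload endpoint, chunk size and X-Chunk-Offset header are made-up names that your backend would have to understand), a chunked upload loop could look roughly like this:

import os
import requests

CHUNK_SIZE = 1024 * 1024                       # 1 MiB per chunk, an arbitrary choice
UPLOAD_URL = "http://example.com/upload"       # hypothetical endpoint

def upload_in_chunks(path):
    size = os.path.getsize(path)
    offset = 0
    with open(path, "rb") as f:
        while offset < size:
            chunk = f.read(CHUNK_SIZE)
            # the backend has to track which offsets it already received,
            # so a failed chunk can simply be re-sent later
            requests.post(
                UPLOAD_URL,
                data=chunk,
                headers={"X-Chunk-Offset": str(offset)},
                timeout=10,
            )
            offset += len(chunk)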
I'm new to Python and I want this code to run only once and then stop, not every 30 seconds, because I want to run multiple scripts like this with different access tokens every 5 seconds from the command line.
When I tried this code, it never moves on to the second script because of the while True loop:
import requests
import time

api_url = "https://graph.facebook.com/v2.9/"
access_token = "access token"
graph_url = "site url"
post_data = {'id': graph_url, 'scrape': True, 'access_token': access_token}

# Beware of rate limiting if trying to increase frequency.
refresh_rate = 30  # refresh rate in seconds

while True:
    try:
        resp = requests.post(api_url, data=post_data)
        if resp.status_code == 200:
            contents = resp.json()
            print(contents['title'])
        else:
            error = "Warning: Status Code {}\n{}\n".format(
                resp.status_code, resp.content)
            print(error)
            raise RuntimeWarning(error)
    except Exception as e:
        f = open("open_graph_refresher.log", "a")
        f.write("{} : {}".format(type(e), e))
        f.close()
        print(e)
    time.sleep(refresh_rate)
From what I understood, you're trying to execute this piece of code for multiple access tokens. To keep things simple, put all your access tokens and graph URLs in lists and use the following code. It assumes that you know all your access tokens in advance.
import requests
import time

def scrape_facebook(api_url, access_token, graph_url):
    """Scrapes the given graph URL with the given access token."""
    post_data = {'id': graph_url, 'scrape': True, 'access_token': access_token}
    try:
        resp = requests.post(api_url, data=post_data)
        if resp.status_code == 200:
            contents = resp.json()
            print(contents['title'])
        else:
            error = "Warning: Status Code {}\n{}\n".format(
                resp.status_code, resp.content)
            print(error)
            raise RuntimeWarning(error)
    except Exception as e:
        f = open(access_token + "_" + "open_graph_refresher.log", "a")
        f.write("{} : {}".format(type(e), e))
        f.close()
        print(e)

access_token = ['a', 'b', 'c']
graph_url = ['sss', 'xxx', 'ppp']
api_url = "https://graph.facebook.com/v2.9/"

for n in range(len(graph_url)):
    scrape_facebook(api_url, access_token[n], graph_url[n])
    time.sleep(5)
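As a small aside, if the tokens and URLs are paired one-to-one, zip makes the loop a bit cleaner than indexing:

for token, g_url in zip(access_token, graph_url):
    scrape_facebook(api_url, token, g_url)
    time.sleep(5)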
Let's say I have a website that I want to scrape, e.g. cheapoair.com.
I want to use a normal request in Python to scrape the data on the first, hypothetical page. If I end up being blocked by the server, I want to switch to a proxy. I have a list of proxy servers and a method to pick one, and I also have a list of user agent strings. However, I need help thinking through the problem.
For reference
uagen() will return a user agent string
proxit() will return a proxy
Here is what I have so far:
import requests
from proxy_def import *
from http import cookiejar
import time
from socket import error as SocketError
import sys

start_time = time.time()

class BlockAll(cookiejar.CookiePolicy):
    return_ok = set_ok = domain_return_ok = path_return_ok = lambda self, *args, **kwargs: False
    netscape = True
    rfc2965 = hide_cookie2 = False

headers = {'User-Agent': uagen()}
print(headers)

s = requests.Session()
s.cookies.set_policy(BlockAll)
cookies = {'SetCurrency': 'USD'}
sp = proxit()

for i in range(100000000000):
    while True:
        try:
            print('trying on ', sp)
            print('with user agent headers', headers)
            s.proxies = {"http": sp}
            r = s.get("http://www.cheapoair.com", headers=headers, timeout=15, cookies=cookies)
            print(i, sp, 'success')
            print("--- %s seconds ---" % (time.time() - start_time))
        except SocketError as e:
            print('passing ', sp)
            sp = proxit()
            headers = {'User-Agent': uagen()}
            print('this is the new proxy ', sp)
            print('this is the new headers ', headers)
            continue
        except requests.ConnectionError as e:
            print('passing ', sp)
            sp = proxit()
            headers = {'User-Agent': uagen()}
            print('this is the new proxy ', sp)
            print('this is the new headers ', headers)
            continue
        except requests.Timeout as e:
            print('passing ', sp)
            sp = proxit()
            headers = {'User-Agent': uagen()}
            print('this is the new proxy ', sp)
            print('this is the new headers ', headers)
            continue
        except KeyboardInterrupt:
            print("The program has been terminated")
            sys.exit(1)
        break

#print(r.text)
print('all done', '\n')
What I am looking for is an idea of how to start with a normal request (not through a proxy), and if I end up with an error (such as being rejected by the server), switch to a proxy and try again.
I can almost picture it, but can't quite see it.
I'm thinking that if I place a variable after
for i in range(1000000000000):
but before while True: that updates sp, then it might work. Another possibility is to declare s.proxies = {"http": ""} and then, if I run into an error, switch to s.proxies = {"http": proxit()} or s.proxies = {"http": sp}.
Thanks!
I figured it out.
while True:
    try:
        # do this thing
        # but remove the variable from here and declare it before "while True"
        pass
    except SocketError as e:
        # switch headers, switch user agent string
        s.proxies = {"http": proxit()}
        continue
That will refresh the proxy variable after it gets an error from the server.
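Pulling that together, a minimal sketch of the "try directly first, fall back to a proxy" flow might look like this; it assumes uagen() and proxit() behave as described in the question (returning a user agent string and a proxy URL, respectively):

import requests

s = requests.Session()
s.proxies = {}                           # start with a direct connection, no proxy
headers = {'User-Agent': uagen()}

while True:
    try:
        r = s.get("http://www.cheapoair.com", headers=headers, timeout=15)
        break                            # the request went through, stop retrying
    except (requests.ConnectionError, requests.Timeout):
        s.proxies = {"http": proxit()}   # blocked or timed out: rotate to a proxy
        headers = {'User-Agent': uagen()}  # and rotate the user agent string as well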
I'm writing a program for downloading images from the internet, and I would like to speed it up by making multiple requests at once.
I wrote the code you can see here on GitHub.
Right now I can only request a webpage like this:
from urllib.request import Request, urlopen

def myrequest(url):
    worked = False
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    while not worked:
        try:
            webpage_read = urlopen(req).read()
            worked = True
        except:
            print("failed to connect to \n{}".format(url))
    return webpage_read

url = "http://www.mangahere.co/manga/mysterious_girlfriend_x"
webpage_read = myrequest(url).decode("utf-8")
The while loop is there because I definitely want to download every single picture, so I keep trying until it works (nothing can go wrong except urllib.error.HTTPError: HTTP Error 504: Gateway Time-out).
My question is: how do I run this multiple times at once?
My idea is to have "a commander" which runs 5 (or 85) Python scripts, gives each a URL and collects the webpage from each one once it is finished, but this is definitely a silly solution :)
EDIT:
I used _thread, but it doesn't seem to speed up the program. Was that supposed to be the solution, and am I doing it wrong? That is my new question.
You can use the link to get to my code on GitHub.
def thrue_thread_download_pics(path, url, ep, name):
    lock.acquire()
    global goal
    goal += 1
    lock.release()
    webpage_read = myrequest("{}/{}.html".format(url, ep))
    url_to_pic = webpage_read.decode("utf-8").split('" onerror="')[0].split('<img src="')[-1]
    pic = myrequest(url_to_pic)
    myfile = open("{}/pics/{}.jpg".format(path, name), "wb")
    myfile.write(pic)
    myfile.close()
    global finished
    finished += 1
and I'm using it here:
for url_ep in urls_eps:
    url, maxep = url_ep.split()
    maxep = int(maxep)
    chap = url.split("/")[-1][2:]
    if "." in chap:
        chap = chap.replace(".", "")
    else:
        chap = "{}0".format(chap)
    for ep in range(1, maxep + 1):
        ted = time.time()
        name = "{}{}".format(chap, "{}{}".format((2 - len(str(ep))) * "0", ep))
        if name in downloaded:
            continue
        _thread.start_new_thread(thrue_thread_download_pics, (path, url, ep, name))

checker = -1
while finished != goal:
    if finished != checker:
        checker = finished
        print("{} of {} downloaded".format(finished, goal))
    time.sleep(0.1)
Requests Futures is built on top of the very popular requests library and uses non-blocking IO:
from requests_futures.sessions import FuturesSession
session = FuturesSession()
# These requests will run at the same time
future_one = session.get('http://httpbin.org/get')
future_two = session.get('http://httpbin.org/get?foo=bar')
# Get the first result
response_one = future_one.result()
print(response_one.status_code)
print(response_one.text)
# Get the second result
response_two = future_two.result()
print(response_two.status_code)
print(response_two.text)
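If you would rather stick with plain requests and the standard library, concurrent.futures gives a similar pattern; here is a sketch using the same httpbin URLs as above:

import requests
from concurrent.futures import ThreadPoolExecutor

urls = ['http://httpbin.org/get', 'http://httpbin.org/get?foo=bar']

def fetch(url):
    return requests.get(url, timeout=10)

# run the requests on a small thread pool and collect the responses in order
with ThreadPoolExecutor(max_workers=5) as pool:
    for response in pool.map(fetch, urls):
        print(response.status_code)
        print(response.text)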