I have a list of ~250,000 URLs that I need to get data for from an API.
I have created a class using the grequests library to make asynchronous calls. However, the API limit is 100 calls per second, which grequests exceeds.
Code using grequests:
import grequests

lst = ['url.com', 'url2.com']

class Test:
    def __init__(self):
        self.urls = lst

    def exception(self, request, exception):
        print("Problem: {}: {}".format(request.url, exception))

    def async_calls(self):
        return grequests.map((grequests.get(u) for u in self.urls), exception_handler=self.exception, size=100000)

    def collate_responses(self, results):
        return [x.text for x in results]

test = Test()

# here we collect the results returned by the async function
results = test.async_calls()
Is there any way I can use the requests library to make 100 calls per second?
I tried requests, but it times out after roughly 100000 calls.
In this case I am passing an ID into the URL.
import time
import requests

L = [1, 2, 3]
lst = []  # collects the response bodies

for i in L:
    # print(row)
    url = 'url.com/Id={}'.format(i)
    xml_data1 = requests.get(url).text
    lst.append(xml_data1)
    time.sleep(1)
    print(xml_data1)
Use multithreading.
from multiprocessing.dummy import Pool as ThreadPool
import requests

def some_fun(url):
    # each worker thread fetches a single url
    xml_data1 = requests.get(url).text
    print(xml_data1)
    return xml_data1

if __name__ == '__main__':
    lst = ['url.com', 'url2.com']
    c_pool = ThreadPool(30)  # add as many threads as you can
    results = c_pool.map(some_fun, lst)
    c_pool.close()
    c_pool.join()
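The pool above parallelizes the calls but does not by itself enforce the 100 calls per second cap mentioned in the question. One way to add that, as a minimal sketch (the RateLimiter class is illustrative, not part of any library; the URL pattern and the ~250,000 IDs are taken from the question):

import threading
import time
import requests
from multiprocessing.dummy import Pool as ThreadPool

class RateLimiter:
    """Thread-safe limiter: allows at most `rate` call starts per second."""
    def __init__(self, rate):
        self.interval = 1.0 / rate
        self.lock = threading.Lock()
        self.next_slot = time.monotonic()

    def wait(self):
        # reserve the next start slot, then sleep until it arrives
        with self.lock:
            now = time.monotonic()
            if self.next_slot < now:
                self.next_slot = now
            delay = self.next_slot - now
            self.next_slot += self.interval
        if delay > 0:
            time.sleep(delay)

limiter = RateLimiter(100)  # 100 calls per second, per the API limit in the question

def fetch(i):
    limiter.wait()                    # block until a start slot is free
    url = 'url.com/Id={}'.format(i)   # placeholder URL from the question
    return requests.get(url).text

if __name__ == '__main__':
    ids = range(250000)               # the ~250,000 IDs
    pool = ThreadPool(30)
    results = pool.map(fetch, ids)
    pool.close()
    pool.join()

The limiter spaces out call starts at 10 ms intervals across all threads, so total throughput stays at roughly 100 requests per second no matter how many worker threads are running.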
Cheers!
Related
The situation is that sometimes a request does not load or gets stuck in Python. If that happens, or any other error occurs, I would like to retry it "n" times, waiting up to a maximum of 3 seconds for each attempt, and if all attempts fail, print the message f"Could not process {type_1} and {type_2}". Everything runs in parallel with concurrent.futures. Could you help me with that?
import requests
import concurrent.futures
import json

data = [['PEN', 'USD'], ['USD', 'EUR']]

def currency(element):
    type_1 = element[0]
    type_2 = element[1]
    s = requests.Session()
    url = f'https://usa.visa.com/cmsapi/fx/rates?amount=1&fee=0&utcConvertedDate=07%2F26%2F2022&exchangedate=07%2F26%2F2022&fromCurr={type_1}&toCurr={type_2}'
    a = s.get(url)
    response = json.loads(a)
    value = response["convertedAmount"]
    return value

with concurrent.futures.ProcessPoolExecutor() as executor:
    results = executor.map(currency, data)
    for value in results:
        print(value)
Your code is almost there. Here, I modified a few things:
from concurrent.futures import ThreadPoolExecutor
import time
import requests

def convert_currency(tup):
    from_currency, to_currency = tup
    url = (
        "https://usa.visa.com/cmsapi/fx/rates?amount=1&fee=0"
        "&utcConvertedDate=07%2F26%2F2022&exchangedate=07%2F26%2F2022&"
        f"fromCurr={from_currency}&toCurr={to_currency}"
    )
    session = requests.Session()
    for _ in range(3):
        try:
            response = session.get(url, timeout=3)
            if response.ok:
                return response.json()["convertedAmount"]
        except requests.exceptions.ConnectTimeout:
            time.sleep(3)
    return f"Could not process {from_currency} and {to_currency}"

data = [["VND", "XYZ"], ['PEN', 'USD'], ["ABC", "XYZ"], ['USD', 'EUR'], ["USD", "XXX"]]

with ThreadPoolExecutor() as executor:
    results = executor.map(convert_currency, data)
    for value in results:
        print(value)
Notes
The request is retried 3 times (see the for loop); a sketch after these notes shows how to make the attempt count configurable, as the question asks
Use timeout= to specify the timeout (in seconds)
The .ok attribute tells you whether the call was successful
There is no need to import json, as the response object can be JSON-decoded with its .json() method
You might experiment between ThreadPoolExecutor and ProcessPoolExecutor to see which one performs better
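The question asks for a configurable number of attempts rather than a hard-coded 3. One hedged way to do that while keeping executor.map is to bind the count with functools.partial (a sketch building on the code above; the attempts parameter and the broader RequestException catch are additions, not part of the original answer):

from concurrent.futures import ThreadPoolExecutor
from functools import partial
import time
import requests

def convert_currency(tup, attempts=3):
    from_currency, to_currency = tup
    url = (
        "https://usa.visa.com/cmsapi/fx/rates?amount=1&fee=0"
        "&utcConvertedDate=07%2F26%2F2022&exchangedate=07%2F26%2F2022&"
        f"fromCurr={from_currency}&toCurr={to_currency}"
    )
    session = requests.Session()
    for _ in range(attempts):            # n attempts instead of a fixed 3
        try:
            response = session.get(url, timeout=3)
            if response.ok:
                return response.json()["convertedAmount"]
        except requests.exceptions.RequestException:
            # covers time-outs and other request errors, per "any error occurs"
            time.sleep(3)
    return f"Could not process {from_currency} and {to_currency}"

data = [["PEN", "USD"], ["USD", "EUR"]]
with ThreadPoolExecutor() as executor:
    # bind attempts=5 so map() still receives a one-argument callable
    results = executor.map(partial(convert_currency, attempts=5), data)
    for value in results:
        print(value)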
The code below is a sample from my complete program; I tried to make it understandable.
It sends requests to a REST API. It starts with a URL and the number of pages for this specific search, and tries to fetch the content of each page.
Each page has several results. Each result becomes a FinalObject.
Because there are as many API requests as there are pages, I decided to use multi-threading and the concurrent.futures module.
It works, but as I'm new to coding and Python, I still have these two questions:
How can I use ThreadPoolExecutor sequentially in this case?
Is there a better way to handle multi-threading in this case?
from concurrent.futures import ThreadPoolExecutor
from requests import get as re_get

def main_function(global_page_number, headers, url_request):
    # create a list of page numbers
    pages_numbers_list = [i for i in range(global_page_number)]

    # for each page, call the page_handler (multi-threading)
    with ThreadPoolExecutor(max_workers=10) as executor:
        for item in pages_numbers_list:
            executor.submit(
                page_handler,
                item,
                url_request,
                headers
            )

def page_handler(page_number, url_request, headers):
    # we change the page number in the url request
    url_request = change_page(url_request, page_number)
    # new request with the new url
    result = re_get(url_request, headers=headers)
    result = result.json()
    # in the result, we find the list of dicts used to create the
    # final objects
    final_object_creation(result['results_list'])

def change_page(url_request, new_page_number):
    "increment the value of the 'page=' attribute in the url"
    current_nb_page = ''
    start_nb = url_request.find("page=") + len('page=')
    while start_nb < len(url_request) and url_request[start_nb].isdigit():
        current_nb_page += url_request[start_nb]
        start_nb += 1
    new_url_request = url_request.replace("page=" + current_nb_page,
                                          "page=" + str(new_page_number))
    return new_url_request

def final_object_creation(results_list):
    'builds the final objects from the result of requests.get()'
    global current_id_decision, dict_decisions
    # each item in the results list should become an instance of the final object
    for item in results_list:
        # set the identifier of the new Decision object
        current_id_decision += 1
        new_id = current_id_decision
        # create the Decision object and add it to the decisions dict
        dict_decisions[new_id] = FinalObject(item)

class FinalObject:
    def __init__(self, content):
        self.content = content

current_id_decision = 0
dict_decisions = {}

main_function(1000, "headers", "https://api/v1.0/search?page=0&query=test")
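On the "sequential" question: executor.map returns results in the order of its inputs even though the requests run concurrently, so the pages can be consumed in page order on the main thread. A minimal sketch, assuming page_handler is changed to return result['results_list'] instead of calling final_object_creation itself:

from concurrent.futures import ThreadPoolExecutor

def main_function(global_page_number, headers, url_request):
    pages_numbers_list = range(global_page_number)
    with ThreadPoolExecutor(max_workers=10) as executor:
        # map() yields results in input (page) order, even though the requests run concurrently
        all_results = executor.map(
            lambda page: page_handler(page, url_request, headers),
            pages_numbers_list,
        )
        # consume the pages sequentially, in page order, on the main thread
        for results_list in all_results:
            final_object_creation(results_list)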
I have this snippet
config = {10: 'https://www.youtube.com/', 5: 'https://www.youtube.com/', 7: 'https://www.youtube.com/',
          3: 'https://sportal.com/', 11: 'https://sportal.com/'}

def test(arg):
    for key in arg.keys():
        requests.get(arg[key], timeout=key)

test(config)
That way, things happen synchronously. I want to do it asynchronously: iterate through the loop without waiting for the response for each address and go ahead to the next one, and so on until I have iterated through all addresses in the dictionary. Then I want to wait until I get all responses for all addresses, and after that get out of the test function. I know that I can do it with threading, but I read that it can be done better with the asyncio library; however, I couldn't implement it. If anyone has even better suggestions, I am open to them. Here is my try:
async def test(arg):
    loop = asyncio.get_event_loop()
    tasks = [loop.run_in_executor(requests.get(arg[key], timeout=key) for key in arg.keys())]
    await asyncio.gather(*tasks)

asyncio.run(test(config))
Here is the solution:
def addresses(adr, to):
    requests.get(adr, timeout=to)

async def test(arg):
    loop = asyncio.get_event_loop()
    tasks = [loop.run_in_executor(None, addresses, arg[key], key) for key in arg.keys()]
    await asyncio.gather(*tasks)

asyncio.run(test(config))
Now it works asynchronously with the asyncio library rather than with hand-managed threads (run_in_executor still offloads the blocking requests calls to an executor's thread pool under the hood).
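If the responses themselves are needed (the version above discards them), a small hedged tweak is to return them from addresses and collect whatever asyncio.gather hands back, in the same order as the tasks:

import asyncio
import requests

def addresses(adr, to):
    # return the response instead of dropping it
    return requests.get(adr, timeout=to)

async def test(arg):
    loop = asyncio.get_event_loop()
    # config maps timeout -> url, so items() yields (timeout, url) pairs
    tasks = [loop.run_in_executor(None, addresses, url, timeout)
             for timeout, url in arg.items()]
    responses = await asyncio.gather(*tasks)
    for response in responses:
        print(response.status_code, response.url)

asyncio.run(test(config))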
Some good answers here. I had trouble with this myself (I do a lot of webscraping) and so I created a package to help me async-scrape (https://pypi.org/project/async-scrape/).
It supports GET and POST. I tried to make it as easy to use as possible. You just need to specify a handler function for the response when you instantiate and then use the scrape_all method to do the work.
It uses the term scrape because I've built in some handlers for common errors that come up when scraping websites.
It can also do things like limiting the call rate if you find you're getting blocked.
An example of its use:
# Create an instance
from async_scrape import AsyncScrape

def post_process(html, resp, **kwargs):
    """Function to process the gathered response from the request"""
    if resp.status == 200:
        return "Request worked"
    else:
        return "Request failed"

async_Scrape = AsyncScrape(
    post_process_func=post_process,
    post_process_kwargs={},
    fetch_error_handler=None,
    use_proxy=False,
    proxy=None,
    pac_url=None,
    acceptable_error_limit=100,
    attempt_limit=5,
    rest_between_attempts=True,
    rest_wait=60,
    call_rate_limit=None,
    randomise_headers=True
)

urls = [
    "https://www.google.com",
    "https://www.bing.com",
]

resps = async_Scrape.scrape_all(urls)
To do this inside a loop, I collect the results, add them to a set, and pop off the old ones.
E.g.:
from async_scrape import AsyncScrape
from bs4 import BeautifulSoup as bs

def post_process(html, resp, **kwargs):
    """Function to process the gathered response from the request"""
    new_urls = bs(html, "html.parser").find_all("a", {"class": "new_link_on_website"})
    return [new_urls, resp]

async_scrape = AsyncScrape(
    post_process_func=post_process,
    post_process_kwargs={}
)

# Run the loop
urls = set(["https://initial_webpage.com/"])
processed = set()
all_resps = []

while len(urls):
    resps = async_scrape.scrape_all(urls)
    # Split successful and failed responses
    success_resps = [r for r in resps if not r["error"]]
    errored_resps = [r for r in resps if r["error"]]
    # Get what you want from the responses
    for r in success_resps:
        # Add found urls to urls
        urls |= set(r["func_resp"][0])  # "func_resp" is the key to the return from your handler function
        # Collect the response
        all_resps.append(r["func_resp"][1])
        # Add to processed urls
        processed.add(r["url"])  # "url" is the key to the url from the response
    # Remove processed urls
    urls = urls - processed
I'm trying to get a dataframe from an API response.
For optimization I run parallel threads, but the runtime is still really high.
A code example:
def parall_func(tuple):
    output = pd.DataFrame()
    list_caracts = list(map(str, tuple[2]))
    item = [(tuple[1])]
    q = len(list_caracts)

    headers = {
        'Content-Type': 'application/json'
    }

    raw_data = json.dumps(
        {"item": item, "list_caracts": list_caracts, "sizePage": q, "numberPage": 1}
    )

    try:
        url = "https://thisisaurl.com/rep/store"
        response = requests.get(url, headers=headers, data=raw_data)
        resp_to_json = json.loads(response.text)
        for i in resp_to_json['tag']:
            output = output.append([i])
    except:
        print("Error: ", sys.exc_info()[0])
        raise

    return output

pool = Threads(cpu_count())
df_parall = list(pool.imap(parall_func, df_queries.itertuples(name=None)))
pool.close()

Final = pd.concat(df_parall, ignore_index=True)
Can you help me correct this, or suggest another logic or structure different from pandas?
The final response has about 3 million records.
After I get the structure I need, I do some calcs and then connect to a DB with pyodbc to save the data.
The two things I would try are:
Create a requests.Session instance and use that to issue your GET requests. According to the documentation for this:
The Session object allows you to persist certain parameters across requests. It also persists cookies across all requests made from the Session instance, and will use urllib3’s connection pooling. So if you’re making several requests to the same host, the underlying TCP connection will be reused, which can result in a significant performance increase (see HTTP persistent connection).
Since you are using multithreading, limiting yourself to a number of threads equal to the number of cores you have will leave performance on the table. Try creating 500 threads. The only question is whether the website will complain that too many requests per second are being made.
By the way, your source had an indentation error. I have supplied the missing import statements as I suppose they should be, and I have renamed the argument tuple to tpl since tuple is a built-in type and you should not redefine built-in types without a good reason.
from multiprocessing.pool import ThreadPool as Threads
from requests import Session
from functools import partial
import pandas as pd
import json
import sys

def parall_func(session, tpl):
    output = pd.DataFrame()
    list_caracts = list(map(str, tpl[2]))
    item = [(tpl[1])]
    q = len(list_caracts)

    raw_data = json.dumps(
        {"item": item, "list_caracts": list_caracts, "sizePage": q, "numberPage": 1}
    )

    try:
        url = "https://thisisaurl.com/rep/store"
        response = session.get(url, data=raw_data)
        resp_to_json = json.loads(response.text)
        for i in resp_to_json['tag']:
            output = output.append([i])
    except:
        print("Error: ", sys.exc_info()[0])
        raise

    return output

with Session() as session:
    headers = {
        'Content-Type': 'application/json'
    }
    session.headers = headers

    pool = Threads(500)
    df_parall = list(pool.imap(partial(parall_func, session), df_queries.itertuples(name=None)))
    pool.close()

Final = pd.concat(df_parall, ignore_index=True)
Update
One additional thing you can try is to replace building output through multiple append operations with a single concat:
def parall_func(session, tpl):
    list_caracts = list(map(str, tpl[2]))
    item = [(tpl[1])]
    q = len(list_caracts)

    raw_data = json.dumps(
        {"item": item, "list_caracts": list_caracts, "sizePage": q, "numberPage": 1}
    )

    try:
        url = "https://thisisaurl.com/rep/store"
        response = session.get(url, data=raw_data)
        resp_to_json = json.loads(response.text)
        dataframes = [pd.DataFrame([i]) for i in resp_to_json['tag']]
        output = pd.concat(dataframes)
    except:
        print("Error: ", sys.exc_info()[0])
        raise

    return output
If the above doesn't improve performance, one last thing to try is to have the creation of the dataframes done using multiprocessing:
from multiprocessing.pool import ThreadPool as Threads, Pool as MultiProcessingPool
from requests import Session
from functools import partial
import pandas as pd
import json
import sys

def create_data_frames(response):
    resp_to_json = json.loads(response.text)
    dataframes = [pd.DataFrame([i]) for i in resp_to_json['tag']]
    # Perhaps you might want to specify ignore_index=True on the following:
    output = pd.concat(dataframes)
    return output

def parall_func(session, multiprocessing_pool, tpl):
    list_caracts = list(map(str, tpl[2]))
    item = [(tpl[1])]
    q = len(list_caracts)

    raw_data = json.dumps(
        {"item": item, "list_caracts": list_caracts, "sizePage": q, "numberPage": 1}
    )

    try:
        url = "https://thisisaurl.com/rep/store"
        response = session.get(url, data=raw_data)
        output = multiprocessing_pool.apply(create_data_frames, args=(response,))
    except:
        print("Error: ", sys.exc_info()[0])
        raise

    return output

with Session() as session:
    headers = {
        'Content-Type': 'application/json'
    }
    session.headers = headers

    multiprocessing_pool = MultiProcessingPool()
    pool = Threads(500)
    df_parall = list(pool.imap(partial(parall_func, session, multiprocessing_pool), df_queries.itertuples(name=None)))

    multiprocessing_pool.close()
    multiprocessing_pool.join()
    pool.close()
    pool.join()

Final = pd.concat(df_parall, ignore_index=True)
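The question also mentions saving the final dataframe to a database with pyodbc. A minimal bulk-insert sketch, assuming a hypothetical table my_table whose columns col_a and col_b match the dataframe (the connection string, table and column names are placeholders):

import pyodbc

def save_to_db(df, conn_str):
    with pyodbc.connect(conn_str) as conn:
        cursor = conn.cursor()
        # fast_executemany speeds up bulk inserts on SQL Server ODBC drivers
        cursor.fast_executemany = True
        cursor.executemany(
            "INSERT INTO my_table (col_a, col_b) VALUES (?, ?)",  # placeholder table/columns
            list(df[["col_a", "col_b"]].itertuples(index=False, name=None)),
        )
        conn.commit()

# e.g. save_to_db(Final, "DRIVER={ODBC Driver 17 for SQL Server};SERVER=...;DATABASE=...;UID=...;PWD=...")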
I have two functions (recharge_list and sms_list) in my Server class below.
import os
import json
import requests
import cherrypy
import ConfigParser
from bs4 import BeautifulSoup

class Server():
    @cherrypy.expose
    def index(self):
        return "Seems Like You're Lost :D"

    @cherrypy.expose
    def recharge_list(self, carrier, state):
        details_array = []
        small_details_array = []
        price_cell_array = []
        lst = []
        url = "link{}/{}".format(carrier, state)
        try:
            if self.t_arr.get(url) is not None:
                return json.dumps({'data': self.t_arr[url]})
        except AttributeError:
            self.t_arr = {}
        r = requests.get(url)
        data = r.text
        soup = BeautifulSoup(data, "html.parser")
        table = soup.find('table', {'class': 'table'})
        s = ""
        detailtext = table.findAll('div', {'class': 'detailtext'})
        for det in detailtext:
            details_array.append(det.text)
        smalldetails = table.findAll('div', {'style': 'padding-top:5px'})
        for smallDet in smalldetails:
            small_details_array.append(smallDet.text)
        price_cells = table.findAll('td', {'class': 'pricecell'})
        for price_cell in price_cells:
            price_cell_array.append(price_cell.text)
        for i in range(len(details_array)):
            d_arr = {}
            d_arr['detail'] = details_array[i]
            temp = small_details_array[i].split('\n')
            d_arr['talktime'] = temp[1]
            d_arr['keyword'] = temp[3]
            tempnew = price_cell_array[i].split('\n')
            d_arr['price'] = tempnew[1]
            d_arr['validity'] = tempnew[3]
            # global list
            lst.append(d_arr)
        self.t_arr[url] = lst
        return json.dumps({'data': self.t_arr[url]})

    @cherrypy.expose
    def sms_list(self, carrier, state):
        details_array = []
        price_cell_array = []
        lst = []
        url = "link/{}/{}".format(carrier, state)
        try:
            if self.t_arr.get(url) is not None:
                return json.dumps({'data': self.t_arr[url]})
        except AttributeError:
            self.t_arr = {}
        r = requests.get(url)
        data = r.text
        soup = BeautifulSoup(data, "html.parser")
        table = soup.find('div', {'id': 'SMS'})
        table2 = table.find('table', {'class': 'table'})
        print(table2)
        s = ""
        detailtext = table2.findAll('div', {'class': 'detailtext'})
        for det in detailtext:
            details_array.append(det.text)
        smalldetails = table2.findAll('div', {'style': 'padding-top:5px'})
        price_cells = table.findAll('td', {'class': 'pricecell'})
        for price_cell in price_cells:
            price_cell_array.append(price_cell.text)
        for i in range(len(details_array)):
            d_arr = {}
            d_arr['detail'] = details_array[i]
            tempnew = price_cell_array[i].split('\n')
            d_arr['price'] = tempnew[1]
            d_arr['validity'] = tempnew[3]
            # global list
            lst.append(d_arr)
        self.t_arr[url] = lst
        return json.dumps({'data': self.t_arr[url]})

if __name__ == '__main__':
    ''' Setting up the Server with Specified Configuration'''
    cherrypy.config.update({'server.socket_host': '0.0.0.0', })
    cherrypy.config.update({'server.socket_port': int(os.environ.get('PORT', '5000')), })
    cherrypy.quickstart(Server())
The problem is, when I run my server, recharge_list works, but then I have to terminate the server from the terminal and restart it to execute the sms_list function.
My understanding is that, once created, the Server object is able to execute only the first function that was called.
What should I edit in my code so that I can execute both functions without restarting the server?
My understanding is that, once created, the Server object is able to execute only the first function that was called.
This is not so. Each time an HTTP request arrives, the web server calls the function associated with the URL of that request.
What should I edit in my code so that I can execute both functions without restarting the server?
In sms_list (and not in recharge_list), replace every instance of t_arr with t_sms_arr.
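As a sketch, the caching lines inside sms_list would then look like this (everything else stays the same, and recharge_list keeps using t_arr):

    try:
        if self.t_sms_arr.get(url) is not None:
            return json.dumps({'data': self.t_sms_arr[url]})
    except AttributeError:
        self.t_sms_arr = {}
    # ... unchanged scraping code ...
    self.t_sms_arr[url] = lst
    return json.dumps({'data': self.t_sms_arr[url]})

This keeps the two handlers' caches independent of each other.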