Process a queue with parallel/async requests

Process a queue with parallel/async requests - python

I want to implement a parallel request.get() function, that processes a queue of requests and puts the result in a list, which, when finished, is processed by a standard sequential code. I tried the following, but my code doesn´t end and does not print the IDs.
import requests
from queue import Queue
from threading import Thread
BASE = 'http://www.uniprot.org'
KB_ENDPOINT = '/uniprot/'
FORMAT = ".xml"
num_threads = 10
ID_q = Queue()
ID_data = Queue()
# worker function
def get_ID_data(ID_q, ID_data, BASE, KB_ENDPOINT, FORMAT):
while True:
ID = ID_q.get()
print(ID)
ID_data.put(requests.get(BASE + KB_ENDPOINT + ID + FORMAT))
ID_q.task_done()
ID_data.task_done()
# initialize worker
for i in range(num_threads):
worker = Thread(target=get_ID_data, args=(ID_q, ID_data, BASE, KB_ENDPOINT, FORMAT))
worker.setDaemon(True)
worker.start()
# load IDs and put in queue
ID_list = ["A6ZMA9", "N1P5E6",
"H0GM11", "H0GZ91",
"A0A0L8VK54", "G2WKA0",
"C8ZEQ4", "B5VPH8",
"B3LLU5", "C7GL72",
"J8QFS9", "J8Q1C1",
"A0A0L8RDV1"]
for ID in ID_list:
ID_q.put(ID)
ID_q.join()
# work with ID_data
print(ID_data)
Update:
I changed #pkqxdd answer using asyncio and aiohttp to this:
import asyncio,aiohttp
IDs = ["A6ZMA9", "N1P5E6",
"H0GM11", "H0GZ91",
"A0A0L8VK54", "G2WKA0",
"C8ZEQ4", "B5VPH8",
"B3LLU5", "C7GL72",
"J8QFS9", "J8Q1C1",
"A0A0L8RDV1"]
BASE = 'http://www.uniprot.org'
KB_ENDPOINT = '/uniprot/'
FORMAT = ".xml"
async def get_data_coroutine(session, ID):
async with session.get(BASE + KB_ENDPOINT + ID + FORMAT) as response:
res = await response.text()
print(ID)
if not res:
raise NameError('{} is not available'.format(ID))
return res
async def main(loop):
async with aiohttp.ClientSession(loop=loop) as session:
tasks = [get_data_coroutine(session, ID) for ID in IDs]
return await asyncio.gather(*tasks)
loop = asyncio.get_event_loop()
result = loop.run_until_complete(main(loop))

Since you've mentioned async, I'm assuming you are using Python3.6 or higher.
The library requests doesn't really support async programming and it's kinda a dead end trying to make it async. A better idea is to use aiohttp instead.
You can achieve your goal with simple codes like this:
import asyncio,aiohttp
BASE = 'http://www.uniprot.org'
KB_ENDPOINT = '/uniprot/'
FORMAT = ".xml"
ID_list = ["A6ZMA9", "N1P5E6",
"H0GM11", "H0GZ91",
"A0A0L8VK54", "G2WKA0",
"C8ZEQ4", "B5VPH8",
"B3LLU5", "C7GL72",
"J8QFS9", "J8Q1C1",
"A0A0L8RDV1"]
session=aiohttp.ClientSession()
async def get_data(ID):
async with session.get(BASE + KB_ENDPOINT + ID + FORMAT) as response:
return await response.text()
coros=[]
for ID in ID_list:
coros.append(get_data(ID))
loop=asyncio.get_event_loop()
fut=asyncio.gather(*coros)
loop.run_until_complete(fut)
print(fut.result())
(Yes, I see the warning. But I don't really want to make the answer more complicated. You should change it to suit your purpose better.)

Related

Using Python asyncio and aiohttp - one request is fast, two or more very slow

I'm trying to speed up multiple requests to the US Census API by using asyncio and aiohttp.
If I make a request with only one lat/lon pair, it is fast, less than 1 second. With two or more, it is very slow, always 20 seconds or more.
Can't figure out why.
import pprint
import aiohttp
import time
async def getCensusInfo(lat, lon, session):
url = f'https://geocoding.geo.census.gov/geocoder/geographies/coordinates?x={lon}&y={lat}&format=json&benchmark=Public_AR_Current&vintage=Census2020_Current&layers=all'
async with session.get(url) as response:
result = await response.json()
return result
async def main():
async with aiohttp.ClientSession() as session:
tasks = []
locations = [(27.652250703997332, -80.42654388841413)]
locations = [(27.652250703997332, -80.42654388841413), (27.459669616175788, -80.30859777448217)]
for location in locations:
tasks.append(getCensusInfo(location[0], location[1], session))
results = await asyncio.gather(*tasks, return_exceptions=True)
for result in results:
pprint.pprint(result["result"]["geographies"]["Incorporated Places"][0], sort_dicts=False, indent = 4)
start_time = time.perf_counter()
asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
asyncio.run(main())
duration = time.perf_counter() - start_time
print(f'Time to download geoCode data: {duration} seconds')

asyncio not working on Google Cloud Functions

I have this function which works fine locally on my machine with python 3.8, but it throws runtime error on Google Cloud Functions.
def telegram_test(request):
request_json = request.get_json()
import datetime
import pandas as pd
from pyrogram import Client
session_string = "...............38Q8uTHG5gHwyWD8nW6h................."
# the rest of the authantication
api_id = 32494131641215
api_hash = "ioadsfsjnjksfgnfriuthg#qw]/zwq ]w/\lc ec,"
# one of bbc channels on telegram you want to access
channel_name = 'pyrogram'
# if you only want to get messages older than 7 days in unix style
seven_days = int((datetime.datetime.now() - datetime.timedelta(days=7)).timestamp())
# call telegram with parameters such as limit and date
# save the result to dataframe
with Client(session_string,api_id,api_hash, takeout=True,workers=2) as app:
hist_iter = app.iter_history(channel_name,offset_date=seven_days, limit=100)
msglist = [msg.__dict__ for msg in hist_iter]
df = pd.DataFrame(msglist)
print(df.head(5))
return f'it works!:{request_json}'
The error message I get from GCF log:
File "/opt/python3.8/lib/python3.8/asyncio/events.py", line 639, in
get_event_loop raise RuntimeError('There is no current event loop in
thread %r.' RuntimeError: There is no current event loop in thread
'ThreadPoolExecutor-0_0'.
Update
I updated the code, the runtime error gone. but I am getting time out error.
I put the timeout 180 secondes, but still when I test the function times out on 60 seconds.
Here is the updated code. Is there something I am doing wrong?
async def foo():
from datetime import datetime, timedelta
from pandas import DataFrame
from pyrogram import Client
import asyncio
session_string = "********zNmkubA4ibjsdjhsdfjlhweruifnjkldfioY5DE*********"
api_id = 325511548224831351
api_hash = "jdffjgtrkjhfklmrtgjtrm;sesews;;wex"
channel_name = 'cnn'
with Client(session_string, api_id, api_hash, takeout=True) as app:
hist_iter = app.iter_history(
channel_name, limit=10)
msglist = [msg.__dict__ for msg in hist_iter]
df = DataFrame(msglist)
return df
async def bar():
return await foo()
def test(request):
from asyncio import run
return run(bar())

The solution in the end was to change from Pyrogram to telethon and create the asyncio manaually before creating the client.
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
Note: you need valid session string, otherwise when you test the function, it will wait for you to auth with mobile number. so first run this code locally and authenticate, then copy the session string to the cloud function.
Here is the full code:
from telethon.sessions import StringSession
from telethon import TelegramClient
from pandas import DataFrame
import datetime
import asyncio
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
api_id = 101010101
api_hash = "jhafcgahagfbahgdbw17171736456gerf"
session_string = "hjksdhjbdsfhgbdsabeyitrgdsbfsdbdiyfhsbddasbdjdksf="
channel_name = 'bbcuzbek'
seven_days = int((datetime.datetime.now() -
datetime.timedelta(days=7)).timestamp())
client = TelegramClient(StringSession(session_string),
api_id, api_hash, loop=loop)
time_format = "%d/%m/%Y, %H:%M:%S"
download_date = datetime.datetime.now(
tz=datetime.timezone.utc).strftime(time_format)
cols = ["id", "date", "text", "views", "download_date"]
async def foo():
all_msgs = [[message.id, message.date.strftime(time_format), message.text, message.views, download_date] async for message in client.iter_messages(entity=channel_name, offset_date=seven_days, limit=10)]
df = DataFrame(data=all_msgs, columns=cols)
# write it to BQ
# print(df)
# async for message in client.iter_messages(entity=channel_name, offset_date=seven_days, limit=10):
# print(message.id, message.date, message.text, message.views)
print("it runs")
print(len(df))
return None
def test(request):
with client:
return client.loop.run_until_complete(foo())

bar() is redundant
You're trying to return a dataframe. Is it a valid HTTP response?
with -> async with
hist_iter = app.iter_history() -> hist_iter = await app.iter_history()
M.b. it waits for input?

python asyncio asynchronously fetch data by key from a dict when the key becomes available

Like title told, my use case is like this:
I have one aiohttp server, which accept request from client, when i have the request i generate one unique request id for it, and then i send the {req_id: req_pyaload} dict to some workers (the worker is not in python thus running in another process), when the workers complete the work, i get back the response and put them in a result dict like this: {req_id_1: res_1, req_id_2: res_2}.
Then I want my aiohttp server handler to await on above result dict, so when the specific response become available (by req_id) it can send it back.
I build below example code to try to simulate the process, but got stuck in implementing the coroutine async def fetch_correct_res(req_id) which should asynchronously/unblockly fetch the correct response by req_id.
import random
import asyncio
import shortuuid
n_tests = 1000
idxs = list(range(n_tests))
req_ids = []
for _ in range(n_tests):
req_ids.append(shortuuid.uuid())
res_dict = {}
async def fetch_correct_res(req_id):
pass
async def handler(req):
res = await fetch_correct_res(req)
assert req == res, "the correct res for the req should exactly be the req itself."
print("got correct res for req: {}".format(req))
async def randomly_put_res_to_res_dict():
for _ in range(n_tests):
random_idx = random.choice(idxs)
await asyncio.sleep(random_idx / 1000)
res_dict[req_ids[random_idx]] = req_ids[random_idx]
print("req: {} is back".format(req_ids[random_idx]))
So:
Is it possible to make this solution work? how?
If above solution is not possible, what should be the correct solution for this use case with asyncio?
Many thanks.
The only approach i can think of for now to make this work is: pre-created some asyncio.Queue with pre-assigned id, then for each incoming request assign one queue to it, so the handler just await on this queue, when the response come back i put it into this pre-assigned queue only, after the request fulfilled, i collect back the queue to use it for next incoming request. Not very elegant, but will solve the problem.

See if the below sample implementation fulfils your need
basically you want to respond back to the request(id) with your response(unable to predict the order) in an asynchronous way
So at the time of request handling, populate the dict with {request_id: {'event':<async.Event>, 'result': <result>}} and await on asyncio.Event.wait(), once the response is received, signal the event with asyncio.Event.set() which will release the await and then fetch the response from the dict based on the request id
I modified your code slightly to pre-populate the dict with request id and put the await on asyncio.Event.wait() until the signal comes from the response
import random
import asyncio
import shortuuid
n_tests = 10
idxs = list(range(n_tests))
req_ids = []
for _ in range(n_tests):
req_ids.append(shortuuid.uuid())
res_dict = {}
async def fetch_correct_res(req_id, event):
await event.wait()
res = res_dict[req_id]['result']
return res
async def handler(req, loop):
print("incoming request id: {}".format(req))
event = asyncio.Event()
data = {req :{}}
res_dict.update(data)
res_dict[req]['event']=event
res_dict[req]['result']='pending'
res = await fetch_correct_res(req, event)
assert req == res, "the correct res for the req should exactly be the req itself."
print("got correct res for req: {}".format(req))
async def randomly_put_res_to_res_dict():
random.shuffle(req_ids)
for i in req_ids:
await asyncio.sleep(random.randrange(2,4))
print("req: {} is back".format(i))
if res_dict.get(i) is not None:
event = res_dict[i]['event']
res_dict[i]['result'] = i
event.set()
loop = asyncio.get_event_loop()
tasks = asyncio.gather(handler(req_ids[0], loop),
handler(req_ids[1], loop),
handler(req_ids[2], loop),
handler(req_ids[3], loop),
randomly_put_res_to_res_dict())
loop.run_until_complete(tasks)
loop.close()
sample response from the above code
incoming request id: NDhvBPqMiRbteFD5WqiLFE
incoming request id: fpmk8yC3iQcgHAJBKqe2zh
incoming request id: M7eX7qeVQfWCCBnP4FbRtK
incoming request id: v2hAfcCEhRPUDUjCabk45N
req: VeyvAEX7YGgRZDHqa2UGYc is back
req: M7eX7qeVQfWCCBnP4FbRtK is back
got correct res for req: M7eX7qeVQfWCCBnP4FbRtK
req: pVvYoyAzvK8VYaHfrFA9SB is back
req: soP8NDxeQKYjgeT7pa3wtG is back
req: j3rcg5Lp59pQXuvdjCAyZe is back
req: NDhvBPqMiRbteFD5WqiLFE is back
got correct res for req: NDhvBPqMiRbteFD5WqiLFE
req: v2hAfcCEhRPUDUjCabk45N is back
got correct res for req: v2hAfcCEhRPUDUjCabk45N
req: porzHqMqV8SAuttteHRwNL is back
req: trVVqZrUpsW3tfjQajJfb7 is back
req: fpmk8yC3iQcgHAJBKqe2zh is back
got correct res for req: fpmk8yC3iQcgHAJBKqe2zh

This may work (note: I removed UUID in order to know req id in advance)
import random
import asyncio
n_tests = 1000
idxs = list(range(n_tests))
req_ids = []
for i in range(n_tests):
req_ids.append(i)
res_dict = {}
async def fetch_correct_res(req_id):
while not res_dict.get(req_id):
await asyncio.sleep(0.1)
return req_ids[req_id]
async def handler(req):
print("fetching req: ", req)
res = await fetch_correct_res(req)
assert req == res, "the correct res for the req should exactly be the req itself."
print("got correct res for req: {}".format(req))
async def randomly_put_res_to_res_dict(future):
for i in range(n_tests):
res_dict[req_ids[i]] = req_ids[i]
await asyncio.sleep(0.5)
print("req: {} is back".format(req_ids[i]))
future.set_result("done")
loop = asyncio.get_event_loop()
future = asyncio.Future()
asyncio.ensure_future(randomly_put_res_to_res_dict(future))
loop.run_until_complete(handler(10))
loop.close()
Is it the best solution? according to me No, basically its kind of requesting long running job status, and you should have (REST) api for doing the job submission and knowing job status like:
http POST server:port/job
{some job json paylod}
Response: 200 OK {"req_id": 1}
http GET server:port/job/1
Response: 200 OK {"req_id": 1, "status": "in process"}
http GET server:port/job/1
Response: 200 OK {"req_id": 1, "status": "done", "result":{}}

how to get response_time and response_size while using aiohttp

Is it possible to get response time and response size for each request made using aiohttp?
The documentation seems not to have those properties anywhere.
Thanks

len(response.text()) will return size of decompressed response.
If you want the size of the raw compressed response you need to set auto_decompress=False during creation of aiohttp.ClientSession. After that you can get it with len(await response.read()).
But it'll make response.text() unavailable since it needs uncompressed response. To make it available again you'll have to decompress it manually:
import time
import zlib
import brotli
async with aiohttp.ClientSession(auto_decompress=False) as session:
start = time.monotonic()
response = await session.get(url='www.test.com')
response_time = time.monotonic() - start
response_size = len(await response.read())
encoding = response.headers['Content-Encoding']
if encoding == 'gzip':
response._body = zlib.decompress(response._body, 16 + zlib.MAX_WBITS)
elif encoding == 'deflate':
response._body = zlib.decompress(response._body, -zlib.MAX_WBITS)
elif encoding == 'br':
response._body == brotli.decompress(response._body)
response_text = await response.text()
About time.time() from pymotw.com:
Because time.time() looks at the system clock, and the system clock can be changed by the user or system services for synchronizing clocks across multiple computers, calling time.time() repeatedly may produce values that go forwards and backwards. This can result in unexpected behavior when trying to measure durations or otherwise use those times for computation. Avoid those situations by using time.monotonic(), which always returns values that go forward.
aiohttp docs suggest to use loop.time() (which is also monotonic):
async def on_request_start(session, trace_config_ctx, params):
trace_config_ctx.start = asyncio.get_event_loop().time()
async def on_request_end(session, trace_config_ctx, params):
elapsed = asyncio.get_event_loop().time() - trace_config_ctx.start
print("Request took {}".format(elapsed))
trace_config = aiohttp.TraceConfig()
trace_config.on_request_start.append(on_request_start)
trace_config.on_request_end.append(on_request_end)
async with aiohttp.ClientSession(trace_configs=[trace_config]) as client:
client.get('http://example.com/some/redirect/')

One possibility might be:
measure point in time before request
measure point in time after request
the difference is the response time
with 'response.text()' you get the response and can determine the length with 'len()'
A small self-contained example could look like this:
import time
import asyncio
from aiohttp import ClientSession
async def fetch(session, url):
start = time.time()
async with session.get(url) as response:
result = await response.text()
end = time.time()
print(url, ": ", end - start, "response length:", len(result))
return result
async def crawl(urls: set):
async with ClientSession() as session:
tasks = []
for url in urls:
tasks.append(
fetch(session, url)
)
await asyncio.gather(*tasks)
if __name__ == "__main__":
urlSet = {"https://www.software7.biz/tst/number.php",
"https://www.software7.biz/tst/number1.php",
"https://www.software7.biz"}
asyncio.run(crawl(urlSet))
Test
The two endpoints number.php and number1.php have a delay on server side of 3 respective 1 second and are returning a two digit number each.
The output in the debug console looks like this then:
https://www.software7.biz : 0.16438698768615723 response length: 4431
https://www.software7.biz/tst/number1.php : 1.249755859375 response length: 2
https://www.software7.biz/tst/number.php : 3.214473009109497 response length: 2

You can get the size of the response content from the headers:
response.headers['content-length']

Asyncio and aiohttp returning task instead of results

I have a script to run parallel requests against an API within a class. However, the results I'm getting is basically a task instead of the actual results. Any reason why?
I mimicked the modified Client code on https://pawelmhm.github.io/asyncio/python/aiohttp/2016/04/22/asyncio-aiohttp.html.
import asyncio
from aiohttp import ClientSession
class Requestor:
async def _async_request(self, url, session, sema_sz=10):
sema = asyncio.Semaphore(sema_sz)
async with sema:
async with session.get(url) as response:
req = await response.json()
return req
async def _async_chunk_request(self, url, chunks, headers=None, sema_sz=10):
async with ClientSession(headers=headers) as session:
futures = [asyncio.ensure_future(self._async_request(url.format(chunk), session, sema_sz)) for chunk in chunks]
responses = asyncio.gather(*futures)
await responses
def get_request(self, url, chunks):
loop = asyncio.get_event_loop()
bulk_req = asyncio.ensure_future(self._async_chunk_request(url, chunks))
loop.run_until_complete(bulk_req)
return bulk_req
bulk_req is actually a Task variable and not the results and shows this in PyCharm, Task finished coro=<Requestor._async_chunk_request() done, defined at ...
When I debug, I see that req has a full and proper response value, so there's no issue with that. I feel like it's something to do with the actual gathering of the futures?

Your _chunk_request does not return anything.
async def _chunk_request(...):
...
...
await responses
I made a toy example trying to mimic your process. If I ended _chunk_request like you did, i got the same result - a finished Task with No results. Changing _chunk_request to return something fixed it:
async def _chunk_request(...):
...
...
return await responses
If you only need the return values from the tasks, get_request should return the result of the loop.run_until_complete() call.
My toy example
import asyncio
import random
from pprint import pprint
async def response(n):
asyncio.sleep(random.choice([1,3,5]))
return f'i am {n}'
async def _request(n):
req = await response(n)
#print(req)
return req
async def _chunk_request(chunks):
futures = [asyncio.ensure_future(_request(chunk)) for chunk in chunks]
#pprint(futures)
responses = asyncio.gather(*futures, return_exceptions=True)
#pprint(responses)
return await responses
def get_request(chunks):
loop = asyncio.get_event_loop()
bulk_req = asyncio.ensure_future(_chunk_request(chunks))
return loop.run_until_complete(bulk_req)
In [7]: result = get_request(range(1,6))
In [8]: print(result)
['i am 1', 'i am 2', 'i am 3', 'i am 4', 'i am 5']

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Process a queue with parallel/async requests - python

Related

Using Python asyncio and aiohttp - one request is fast, two or more very slow

asyncio not working on Google Cloud Functions

python asyncio asynchronously fetch data by key from a dict when the key becomes available

how to get response_time and response_size while using aiohttp

Asyncio and aiohttp returning task instead of results

Categories

Resources