How to use python aiohttp library to download multiple webpages?

I'm trying to asynchronously scrape data from a leaderboard for a video game. There are weekly and daily challenges. I've based my code so far on this async client with semaphores. The difference is that I'm trying to contain the part where the event loop is run inside a function. Here's the relevant portion of my code:
from urllib.parse import urljoin
import asyncio
import aiohttp

async def fetch(url, session):
    async with session.get(url) as response:
        return await response.read()

async def bound_fetch(url, session, sem):
    async with sem:
        await fetch(url, session)

async def fetch_pages(url, pages, session):
    tasks = []
    sem = asyncio.Semaphore(LIMIT)
    for page in range(pages + 1):
        task_url = urljoin(url, str(page))
        task = asyncio.ensure_future(bound_fetch(task_url, session, sem))
        tasks.append(task)
    await asyncio.gather(*tasks)

def leaderboard_crawler(date, entries=0, pages=1):
    website = "https://www.thronebutt.com/archive/"
    date_url = urljoin(website, date + "/")
    entries_per_page = 30
    number_of_entries = entries or pages * entries_per_page
    full_pages, last_page = divmod(number_of_entries, 30)
    entry_list = [30 for x in range(full_pages)]
    if last_page != 0:
        entry_list.append(last_page)
    loop = asyncio.get_event_loop()
    with aiohttp.ClientSession() as session:
        future = asyncio.ensure_future(fetch_pages(date_url, pages, session))
        date_html = loop.run_until_complete(future)
    return date_html

def weekly_leaderboard(week, year, entries=0, pages=1):
    weekly_date = "{0:02d}{1}".format(week, year)
    return leaderboard_crawler(weekly_date, entries, pages)

def daily_leaderboard(day, month, year, entries=0, pages=1):
    daily_date = "{0:02d}{1:02d}{2}".format(day, month, year)
    return leaderboard_crawler(daily_date, entries, pages)
I think the problem is in the asyncio.gather(*tasks) portion of the fetch_pages function. I can't figure out how to pass its result to leaderboard_crawler. Right now date_html is None. I've tried return await asyncio.gather(*tasks), which returns an array of Nones. I've also tried wrapping it in asyncio.ensure_future and passing that to loop.run_until_complete, but that doesn't seem to work either.

The reason is simple: you are missing return statements in your call stack:
async def bound_fetch(url, session, sem):
    async with sem:
        # await fetch(url, session)  # missing return
        return await fetch(url, session)  # this one is right

async def fetch_pages(url, pages, session):
    tasks = []
    sem = asyncio.Semaphore(LIMIT)
    for page in range(pages + 1):
        task_url = urljoin(url, str(page))
        task = asyncio.ensure_future(bound_fetch(task_url, session, sem))
        tasks.append(task)
    # await asyncio.gather(*tasks)  # missing return
    return await asyncio.gather(*tasks)  # this one is right
The working example is here:
from urllib.parse import urljoin
import asyncio
import aiohttp

async def fetch(url, session):
    async with session.get(url) as response:
        return await response.read()

async def bound_fetch(url, session, sem):
    async with sem:
        return await fetch(url, session)

async def fetch_pages(url, pages, session):
    tasks = []
    sem = asyncio.Semaphore(5)
    for page in range(pages + 1):
        task_url = urljoin(url, str(page))
        task = asyncio.ensure_future(bound_fetch(task_url, session, sem))
        tasks.append(task)
    return await asyncio.gather(*tasks)

def leaderboard_crawler(date, entries=0, pages=1):
    website = "https://www.thronebutt.com/archive/"
    date_url = urljoin(website, date + "/")
    entries_per_page = 30
    number_of_entries = entries or pages * entries_per_page
    full_pages, last_page = divmod(number_of_entries, 30)
    entry_list = [30 for x in range(full_pages)]
    if last_page != 0:
        entry_list.append(last_page)

    async def crawl():
        # In current aiohttp, ClientSession must be created and entered
        # inside a coroutine, so wrap the session handling here.
        async with aiohttp.ClientSession() as session:
            return await fetch_pages(date_url, pages, session)

    loop = asyncio.get_event_loop()
    date_html = loop.run_until_complete(crawl())
    return date_html

def weekly_leaderboard(week, year, entries=0, pages=1):
    weekly_date = "{0:02d}{1}".format(week, year)
    return leaderboard_crawler(weekly_date, entries, pages)

def daily_leaderboard(day, month, year, entries=0, pages=1):
    daily_date = "{0:02d}{1:02d}{2}".format(day, month, year)
    return leaderboard_crawler(daily_date, entries, pages)
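For completeness, a quick usage sketch (my addition, not part of the original answer; it assumes the thronebutt archive really serves pages at .../archive/<date>/<page>). gather returns the raw page bodies in submission order, and since range(pages + 1) starts at 0, you get pages + 1 results:

if __name__ == '__main__':
    # Week 10 of 2017 is an arbitrary example date.
    pages_html = weekly_leaderboard(10, 2017, pages=2)
    for i, body in enumerate(pages_html):
        print(i, len(body), 'bytes')  # each body is the bytes from response.read()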

Related

How to run an async function through the task scheduler (aioschedule)?

I am writing a personal telegram bot; one of its functions is to show the balance of my accounts on market.csgo.com. My code:
import asyncio
import aiogram
import aiohttp
...

async def get_balance(session, profiles_dict, message):
    async with session.get(f'https://market.csgo.com/api/v2/get-money?key={profiles_dict[1][1]}') as resp:
        html = await resp.json()
        each_wallet = int(html['money'])
        await bot.send_message(message.from_user.id,
                               f'🟢 {profiles_dict[0]} : <i>{each_wallet}</i>',
                               disable_web_page_preview=True, parse_mode=types.ParseMode.HTML)
...

@dp.message_handler(content_types=['text'])
async def main(message):
    profiles = users()
    async with aiohttp.ClientSession(trust_env=True) as session:
        tasks = []
        if message.text == 'Balance 💸':
            await bot.send_message(message.from_user.id, 'Information request. Wait..')
            for i in profiles.items():
                task = asyncio.ensure_future(get_balance(session, i, message))
                tasks.append(task)
            await asyncio.gather(*tasks)
        if message.text == 'On Sale 💰':
            ...
        if message.text == 'Timeout Items ⌛':
            ...

executor.start_polling(dp, skip_updates=False)
get_balance() works in async mode: it sends aiohttp requests to the API and outputs the information via await bot.send_message().
Right now the function is launched through a keyboard button, but how do I make it run every hour? I am aware of the asynchronous task scheduler aioschedule and have seen this example, but it runs a function without arguments, whereas mine takes three: async def get_balance(session, profiles_dict, message). I tried to do this:
import asyncio
import aioschedule

async def scheduler(session, profiles_dict, message):
    aioschedule.every().hour.do(get_balance(session, profiles_dict, message))
    while True:
        await aioschedule.run_pending()
        await asyncio.sleep(1)

async def on_startup(session, profiles_dict, message):
    asyncio.create_task(scheduler(session, profiles_dict, message))

if __name__ == '__main__':
    executor.start_polling(dp, skip_updates=False, on_startup=on_startup(session, profiles_dict, message))
Obviously it doesn't work that way.
My question is:
How do I run an async function with arguments that sends aiohttp requests through the aioschedule task scheduler, and display the result through the aiogram telegram bot?
Solution:
import aiogram
import asyncio
import aiohttp
import aioschedule
...

async def get_balance(session, profiles_dict):
    async with session.get(f'https://market.csgo.com/api/v2/get-money?key={profiles_dict[1][1]}') as resp:
        html = await resp.json()
        each_wallet = int(html['money'])
        await bot.send_message(MY_TELEGRAM_ID,
                               f'🟢 {profiles_dict[0]} : <i>{each_wallet}</i>',
                               disable_web_page_preview=True, parse_mode=types.ParseMode.HTML)
...

@dp.message_handler(content_types=['text'])
async def main(message):
    profiles = users()
    async with aiohttp.ClientSession(trust_env=True) as session:
        tasks = []
        if message.text == 'Balance 💸':
            await bot.send_message(message.from_user.id, 'Information request. Wait..')
            for i in profiles.items():
                task = asyncio.ensure_future(get_balance(session, i))
                tasks.append(task)
            await asyncio.gather(*tasks)
        if message.text == 'On Sale 💰':
            ...
        if message.text == 'Timeout Items ⌛':
            ...

# Client session get_balance function
async def session_get_balance():
    profiles = users()
    async with aiohttp.ClientSession(trust_env=True) as session:
        tasks = []
        for i in profiles.items():
            task = asyncio.ensure_future(get_balance(session, i))
            tasks.append(task)
        await asyncio.gather(*tasks)

# Schedule functions by time
async def scheduler():
    aioschedule.every().hour.do(session_get_balance)
    while True:
        await aioschedule.run_pending()
        await asyncio.sleep(1)

# Function at start
async def on_startup(_):
    asyncio.create_task(scheduler())

# Launch telegram bot
if __name__ == "__main__":
    executor.start_polling(dp, skip_updates=True, on_startup=on_startup)
Since this is my personal bot, instead of message.from_user.id I specified my MY_TELEGRAM_ID.
await bot.send_message(MY_TELEGRAM_ID,
                       f'🟢 {profiles_dict[0]} : <i>{each_wallet}</i>',
                       disable_web_page_preview=True, parse_mode=types.ParseMode.HTML)
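As an aside on the original attempt: .do() expects a callable, but get_balance(session, profiles_dict, message) calls the coroutine function immediately and hands .do() a coroutine object instead. aioschedule, like the schedule library it is based on, forwards any extra arguments of .do() on to the job, so a sketch of that direct fix (assuming a long-lived session and message are in scope) should also work:

aioschedule.every().hour.do(get_balance, session, profiles_dict, message)

The zero-argument session_get_balance() wrapper above is still the cleaner design, because it creates a fresh ClientSession for every scheduled run instead of keeping one open for hours.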

With PyTest, run_until_complete continues without finishing

Basically, what it does is make 20 async requests to Google.
If I launch it without PyTest, as just a snippet of code like this, it works:
import asyncio
import aiohttp

async def get(
    session: aiohttp.ClientSession,
) -> dict:
    url = "https://www.google.com/"
    resp = await session.request('GET', url=url)
    data = await resp.json()
    return data

async def sessions():
    async with aiohttp.ClientSession() as session:
        tasks = []
        for i in range(20):
            tasks.append(get(session=session))
        return await asyncio.gather(*tasks, return_exceptions=True)

def main():
    loop = asyncio.new_event_loop()
    try:
        asyncio.set_event_loop(loop)
        htmls = loop.run_until_complete(sessions())
    finally:
        loop.close()
    print(htmls)
But when I use PyTest, despite it being (almost) the same code, the "htmls" variable at the end is not assigned any value:
import aiohttp
import asyncio

class TestGoogle:  # class wrapper implied by the self parameters below; name assumed

    async def get(
        session: aiohttp.ClientSession,
    ) -> dict:
        url = "https://www.google.com/"
        resp = await session.request('GET', url=url)
        data = await resp.json()
        return data

    async def sessions(self):
        async with aiohttp.ClientSession() as session:
            tasks = []
            for i in range(20):
                tasks.append(self.get(session=session))
            return await asyncio.gather(*tasks, return_exceptions=True)

    def test_example(self):
        loop = asyncio.new_event_loop()
        try:
            asyncio.set_event_loop(loop)
            htmls = loop.run_until_complete(self.sessions())
        finally:
            loop.close()
        print(htmls)
Why is this? It is as if loop.run_until_complete(self.sessions()) were not waiting for it to finish.
It is resolved: the get() method needed self as its first parameter :S
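For reference, a minimal sketch of the fix (only the signature changes; the class name is my placeholder from above):

class TestGoogle:
    async def get(
        self,  # <- the missing parameter
        session: aiohttp.ClientSession,
    ) -> dict:
        resp = await session.request('GET', url="https://www.google.com/")
        return await resp.json()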

How to ensure only one coroutine of a specific kind exists

In my class I have a method that fetches a website (visible below).
I've noticed that other methods that use this method can end up opening multiple requests to the same site (while one request is pending, self._page is still None).
How can I avoid that? I mean: when _get_page is called while another call is already pending, just return a future tied to the first call instead of repeating the page request.
async def _get_page(self) -> HtmlElement:
    if self._page is None:
        async with self._get_session().get(self._url) as page:
            self._page = lxml.html.document_fromstring(await page.text())
    return self._page
How can I avoid [multiple requests]?
You could use an asyncio.Lock:
def __init__(self, ...):
    ...
    self._page_lock = asyncio.Lock()

async def _get_page(self) -> HtmlElement:
    async with self._page_lock:
        if self._page is None:
            async with self._get_session().get(self._url) as page:
                self._page = lxml.html.document_fromstring(await page.text())
        return self._page
Update for Python 3.8 and Jupyter notebook
import asyncio
import aiohttp
from lxml import html

class MyClass:
    def __init__(self):
        self._url = 'https://www.google.com'
        self._page = None
        self._futures = []
        self._working = False
        self._session = aiohttp.ClientSession()

    async def _close(self):
        if self._session:
            session = self._session
            self._session = None
            await session.close()

    def _get_session(self):
        return self._session

    async def _get_page(self):
        if self._page is None:
            if self._working:
                print('will await current page request')
                loop = asyncio.get_event_loop()
                future = loop.create_future()
                self._futures.append(future)
                return await future
            else:
                self._working = True
                session = self._get_session()
                print('making url request')
                async with session.get(self._url) as page:
                    print('status =', page.status)
                    print('making page request')
                    self._page = html.document_fromstring(await page.text())
                    print('Got page text')
                for future in self._futures:
                    print('setting result to awaiting request')
                    future.set_result(self._page)
                self._futures = []
                self._working = False
        return self._page

async def main():
    futures = []
    m = MyClass()
    futures.append(asyncio.ensure_future(m._get_page()))
    futures.append(asyncio.ensure_future(m._get_page()))
    futures.append(asyncio.ensure_future(m._get_page()))
    results = await asyncio.gather(*futures)
    for result in results:
        print(result[0:80])
    await m._close()

if __name__ == '__main__':
    asyncio.run(main())
    # await main()  # in Jupyter notebook and IPython
Note that on Windows 10 I have seen at termination:
RuntimeError: Event loop is closed
See https://github.com/aio-libs/aiohttp/issues/4324
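A common workaround for that Windows teardown error (my addition, not taken from the linked issue) is to switch to the selector event loop, whose transports aiohttp shuts down more gracefully than the default proactor loop on Windows:

import sys
import asyncio

if sys.platform == 'win32':
    # Must run before the event loop is created, e.g. before asyncio.run(main())
    asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())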

Why does using pytest-asyncio and @parametrize cause tests to run for longer than without

I have a test. It sends a GET request to a list of URLs and checks that the response status is below 500.
@pytest.mark.asyncio
@pytest.mark.parametrize('url_test_list', get_all_url_list(HOST_FOR_TEST))
async def test_check_status_urls(self, url_test_list):
    returned_status = await get(url_test_list)
    assert returned_status < 500
And this is my "get" function:
async def get(url):
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            response_status = response.status
            return response_status
It works, but it is slow. It takes about 3 minutes to complete.
But when I use this test without @parametrize, with my "get" function taking the whole URL list, it runs in about 1 minute. My code in the second case:
@pytest.mark.asyncio
async def test_check_status_urls(self):
    url_list = make_url_list()
    returned_status = await get(url_list)
    assert all(status < 500 for status in returned_status)

async def get(urls):
    good_list = []
    async with aiohttp.ClientSession() as session:
        for url in urls:
            async with session.get(url) as response:
                response_status = response.status
                good_list.append(response_status)
    return good_list
I would like to have the best of both worlds here. Is there a way I can have the tests run quickly, but also run as individual units?
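One pattern that should give both (a sketch of mine, not an answer from the thread; get_all_url_list and HOST_FOR_TEST are the question's own names, everything else is assumed) is to fetch all statuses concurrently once, in a session-scoped fixture, and let each parametrized test merely assert on its cached result:

import asyncio
import aiohttp
import pytest

URLS = get_all_url_list(HOST_FOR_TEST)  # from the question's code

@pytest.fixture(scope='session')
def status_by_url():
    # One concurrent crawl for the whole test session.
    async def fetch_all():
        async with aiohttp.ClientSession() as session:
            async def fetch(url):
                async with session.get(url) as response:
                    return url, response.status
            pairs = await asyncio.gather(*(fetch(url) for url in URLS))
            return dict(pairs)
    return asyncio.run(fetch_all())

@pytest.mark.parametrize('url', URLS)
def test_check_status_urls(url, status_by_url):
    # Each URL still passes or fails individually.
    assert status_by_url[url] < 500

The requests happen in one gather (fast, like the second version), while pytest still reports one result per URL (granular, like the first).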

python aiohttp and asyncio without an event loop

My understanding is that an "async def" function needs to be driven by an event loop, e.g.
loop = asyncio.get_event_loop()
loop.run_until_complete(<method>)
I created some code below without the loop. It still supports 100 async calls without an issue. Did I miss anything?
The dummy server sleeps for 5 seconds:
from aiohttp import web
import asyncio
import time

async def hello(request):
    # time.sleep(1)
    await asyncio.sleep(5)
    return web.Response(text='dummy done')

app = web.Application()
app.add_routes([web.get('/', hello)])
web.run_app(app, host='127.0.0.1', port=8081)
The actual server taking requests:
import json
from aiohttp import web
import aiohttp
import asyncio

n = 0

def mcowA(n):
    print(n, " : A")
    return

async def fetch(session, url):
    async with session.get(url) as response:
        return await response.text()

def mcowB(n):
    print(n, " : B")
    return

async def runMcows(request):
    global n
    n = n + 1
    mcowA(n)
    async with aiohttp.ClientSession() as session:
        html = await fetch(session, 'http://localhost:8081')
    print(n, html)
    mcowB(n)
    return web.Response(text=html)

try:
    app = web.Application()
    app.add_routes([web.get('/', runMcows)])
    # loop = asyncio.get_event_loop(web.run_app(app))
    # loop.run_forever()
    web.run_app(app)
finally:
    loop.close()  # NameError: 'loop' is never defined since the lines above are commented out
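For what it's worth, the answer to "did I miss anything?" is that web.run_app creates and runs the event loop for you. Roughly, and only as a simplified sketch of the idea using aiohttp's public AppRunner/TCPSite API (not run_app's actual source):

import asyncio
from aiohttp import web

async def serve_forever(app, host='127.0.0.1', port=8080):
    # Stand-in for aiohttp's internal startup coroutine: start the app,
    # then park forever so the loop keeps processing incoming requests.
    runner = web.AppRunner(app)
    await runner.setup()
    site = web.TCPSite(runner, host, port)
    await site.start()
    await asyncio.Event().wait()  # never set; handlers run on this loop

def run_app_sketch(app, **kwargs):
    loop = asyncio.new_event_loop()
    try:
        loop.run_until_complete(serve_forever(app, **kwargs))
    finally:
        loop.close()

So the handlers are still driven by an event loop; it is just hidden inside web.run_app, which also makes the trailing loop.close() above both unnecessary and broken.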
