I am making a script that gets the HTML of almost 20 000 pages and parses it to get just a portion of it.
I managed to get the 20 000 pages' content in a dataframe with aynchronous requests using asyncio and aiohttp but this script still wait for all the pages to be fetched to parse them.
async def get_request(session, url, params=None):
async with session.get(url, headers=HEADERS, params=params) as response:
return await response.text()
async def get_html_from_url(urls):
tasks = []
async with aiohttp.ClientSession() as session:
for url in urls:
tasks.append(get_request(session, url))
html_page_response = await asyncio.gather(*tasks)
return html_page_response
html_pages_list = asyncio_loop.run_until_complete(get_html_from_url(urls))
Once I have the content of each page I managed to use multiprocessing's Pool to parallelize the parsing.
get_whatiwant_from_html(html_content):
parsed_html = BeautifulSoup(html_content, "html.parser")
clean = parsed_html.find("div", class_="class").get_text()
# Some re.subs
clean = re.sub("", "", clean)
clean = re.sub("", "", clean)
clean = re.sub("", "", clean)
return clean
pool = Pool(4)
what_i_want = pool.map(get_whatiwant_from_html, html_content_list)
This code mixes asynchronously the fetching and the parsing but I would like to integrate multiprocessing into it:
async def process(url, session):
html = await getRequest(session, url)
return await get_whatiwant_from_html(html)
async def dispatch(urls):
async with aiohttp.ClientSession() as session:
coros = (process(url, session) for url in urls)
return await asyncio.gather(*coros)
result = asyncio.get_event_loop().run_until_complete(dispatch(urls))
Is there any obvious way to do this? I thought about creating 4 processes that each run the asynchronous calls but the implementation looks a bit complex and I'm wondering if there is another way.
I am very new to asyncio and aiohttp so if you have anything to advise me to read to get a better understanding, I will be very happy.
You can use ProcessPoolExecutor.
With run_in_executor you can do IO in your main asyncio process.
But your heavy CPU calculations in separate processes.
async def get_data(session, url, params=None):
loop = asyncio.get_event_loop()
async with session.get(url, headers=HEADERS, params=params) as response:
html = await response.text()
data = await loop.run_in_executor(None, partial(get_whatiwant_from_html, html))
return data
async def get_data_from_urls(urls):
tasks = []
async with aiohttp.ClientSession() as session:
for url in urls:
tasks.append(get_data(session, url))
result_data = await asyncio.gather(*tasks)
return result_data
executor = concurrent.futures.ProcessPoolExecutor(max_workers=10)
asyncio_loop.set_default_executor(executor)
results = asyncio_loop.run_until_complete(get_data_from_urls(urls))
You can increase your parsing speed by changing your BeautifulSoup parser from html.parser to lxml which is by far the fastest, followed by html5lib. html.parser is the slowest of them all.
Your bottleneck is not processing issue but IO. You might want multiple threads and not process:
E.g. here is a template program that scraping and sleep to make it slow but ran in multiple threads and thus complete task faster.
from concurrent.futures import ThreadPoolExecutor
import random,time
from bs4 import BeautifulSoup as bs
import requests
URL = 'http://quotesondesign.com/wp-json/posts'
def quote_stream():
'''
Quoter streamer
'''
param = dict(page=random.randint(1, 1000))
quo = requests.get(URL, params=param)
if quo.ok:
data = quo.json()
author = data[0]['title'].strip()
content = bs(data[0]['content'], 'html5lib').text.strip()
print(f'{content}\n-{author}\n')
else:
print('Connection Issues :(')
def multi_qouter(workers=4):
with ThreadPoolExecutor(max_workers=workers) as executor:
_ = [executor.submit(quote_stream) for i in range(workers)]
if __name__ == '__main__':
now = time.time()
multi_qouter(workers=4)
print(f'Time taken {time.time()-now:.2f} seconds')
In your case, create a function that performs the task you want from starry to finish. This function would accept url and necessary parameters as arguments. After that create another function that calls the previous function in different threads, each thread having its our url. So instead of i in range(..), for url in urls. You can run 2000 threads at once, but I would prefer chunks of say 200 running parallel.
Related
I am making a python function which makes a lot of requests to an api. The function works like this:
async def get_one(session, url):
try:
with session.get(url) as resp:
resp = await resp.json()
except:
resp = None
return resp, url
async def get_all(session, urls):
tasks = [asyncio.create_task(get_one(session, url)) for url in urls]
results = await asyncio.gather(*tasks)
return results
async def make_requests(urls):
timeout = aiohttp.ClientTimeout(sock_read=10, sock_connect=10, total=0.1*len(urls))
connector = aiohttp.TCPConnector(limit=125)
async with aiohttp.ClientSession(connector=connector, skip_auto_headers=['User-Agent'], timeout=timeout) as session:
data = await get_all(session, ids)
return data
def main(urls):
results = []
while urls:
retry = []
response = asyncio.run(make_requests(urls))
for resp, url in response:
if resp is not None:
results.append(resp)
else:
retry.append(url)
urls = retry
return results
The problem is my function keeps building up memory, especially when there are more errors in the try-except block inside the 'get_one' function, the more times I have to retry, the more memory it consumes (something is preventing python from collecting the garbage).
I have come accross an old answer (Asyncio with memory leak (Python)) stating that create_task() is responsible for this (or ensure_future), as it keeps a reference to the original task.
But it is still not clear to me if this is really the case, or how to solve this issue if it is. Any help will appreciated, thank you!
I am using asyncio in python to get data from a large number of urls.
Here I am trying to get data from yahoo.com 1000 times. And ~90% of the requests fail. Reducing the number of parallel requests reduces the % of fails. Trying to understand why this happens.
import asyncio
import aiohttp
could_not_fetch = []
fetching data from the url. almost 90% of them fail here.
async def fetch_page(url, session, id):
try:
async with session.get(url, timeout = 3) as res:
html = await res.text()
return html
except:
could_not_fetch.append(id)
async def process(id, url, session):
html = await fetch_page(url, session,id)
using aiohttp.ClientSession() to request data from urls, also passing their index.
async def dispatch(urls):
async with aiohttp.ClientSession() as session:
coros = (process(id, url, session) for id, url in enumerate(urls))
return await asyncio.gather(*coros)
using asyncio to get data from yahoo.com 1000 times. if I reduce the number to 100, a much lesser ~ 10% of requests fail.
def main():
loop = asyncio.get_event_loop()
loop = loop.run_until_complete(dispatch(1000 * ['https://yahoo.com/']))
print('could_not_fetch', len(could_not_fetch))
if __name__ == '__main__':
main()
Trying to understand why these requests fail and how to rectify this while doing 1k requests at a time.
I am running a webscraper class who's method name is self.get_with_random_proxy_using_chain.
I am trying to send multithreaded calls to the same url, and would like that once there is a result from any thread, the method returns a response and closes other still active threads.
So far my code looks like this (probably naive):
from concurrent.futures import ThreadPoolExecutor, as_completed
# class initiation etc
max_workers = cpu_count() * 5
urls = [url_to_open] * 50
with ThreadPoolExecutor(max_workers=max_workers) as executor:
future_to_url=[]
for url in urls: # i had to do a loop to include sleep not to overload the proxy server
future_to_url.append(executor.submit(self.get_with_random_proxy_using_chain,
url,
timeout,
update_proxy_score,
unwanted_keywords,
unwanted_status_codes,
random_universe_size,
file_path_to_save_streamed_content))
sleep(0.5)
for future in as_completed(future_to_url):
if future.result() is not None:
return future.result()
But it runs all the threads.
Is there a way to close all threads once the first future has completed.
I am using windows and python 3.7x
So far I found this link, but I don't manage to make it work (pogram still runs for a long time).
As far as I know, running futures cannot be cancelled. Quite a lot has been written about this. And there are even some workarounds.
But I would suggest taking a closer look at the asyncio module. It is quite well suited for such tasks.
Below is a simple example, when several concurrent requests are made, and upon receiving the first result, the rest are canceled.
import asyncio
from typing import Set
from aiohttp import ClientSession
async def fetch(url, session):
async with session.get(url) as response:
return await response.read()
async def wait_for_first_response(tasks):
done, pending = await asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED)
for p in pending:
p.cancel()
return done.pop().result()
async def request_one_of(*urls):
tasks = set()
async with ClientSession() as session:
for url in urls:
task = asyncio.create_task(fetch(url, session))
tasks.add(task)
return await wait_for_first_response(tasks)
async def main():
response = await request_one_of("https://wikipedia.org", "https://apple.com")
print(response)
asyncio.run(main())
I've written a script in asyncio in association with aiohttp library to parse the content of a website asynchronously. I've tried to apply the logic within the following script the way it is usually applied in scrapy.
However, when I execute my script, it acts like how syncronous libraries like requests or urllib.request do. Therefore, it is very slow and doesn't serve the purpose.
I know I can get around this by defining all the next page link within the link variable. But, am I not doing the task with my existing script in the right way already?
Within the script what processing_docs() function does is collect all the links of the different posts and pass the refined links to the fetch_again() function to fetch the title from it's target page. There is a logic applied within processing_docs() function which collects the next_page link and supply the same to fetch() function to repeat the same. This next_page call is making the script slower whereas we usually do the same inscrapyand get expected performance.
My question is: How can I achieve the same keeping the existing logic intact?
import aiohttp
import asyncio
from lxml.html import fromstring
from urllib.parse import urljoin
link = "https://stackoverflow.com/questions/tagged/web-scraping"
async def fetch(url):
async with aiohttp.ClientSession() as session:
async with session.get(url) as response:
text = await response.text()
result = await processing_docs(session, text)
return result
async def processing_docs(session, html):
tree = fromstring(html)
titles = [urljoin(link,title.attrib['href']) for title in tree.cssselect(".summary .question-hyperlink")]
for title in titles:
await fetch_again(session,title)
next_page = tree.cssselect("div.pager a[rel='next']")
if next_page:
page_link = urljoin(link,next_page[0].attrib['href'])
await fetch(page_link)
async def fetch_again(session,url):
async with session.get(url) as response:
text = await response.text()
tree = fromstring(text)
title = tree.cssselect("h1[itemprop='name'] a")[0].text
print(title)
if __name__ == '__main__':
loop = asyncio.get_event_loop()
loop.run_until_complete(asyncio.gather(*(fetch(url) for url in [link])))
loop.close()
Whole point of using asyncio is that you may run multiple fetches concurrently (in parallel to each other). Let's look at your code:
for title in titles:
await fetch_again(session,title)
This part means that each new fetch_again will be started only after previous was awaited (finished). If you do things this way, yes, there's no difference with using sync approach.
To invoke all power of asyncio start multiple fetches concurrently using asyncio.gather:
await asyncio.gather(*[
fetch_again(session,title)
for title
in titles
])
You'll see significant speedup.
You can go event futher and start fetch for next page concurrently with fetch_again for titles:
async def processing_docs(session, html):
coros = []
tree = fromstring(html)
# titles:
titles = [
urljoin(link,title.attrib['href'])
for title
in tree.cssselect(".summary .question-hyperlink")
]
for title in titles:
coros.append(
fetch_again(session,title)
)
# next_page:
next_page = tree.cssselect("div.pager a[rel='next']")
if next_page:
page_link = urljoin(link,next_page[0].attrib['href'])
coros.append(
fetch(page_link)
)
# await:
await asyncio.gather(*coros)
Important note
While such approach allows you to do things much faster you may want to limit number of concurrent requests at the time to avoid significant resources usage on both your machine and on server.
You can use asyncio.Semaphore for this purpose:
semaphore = asyncio.Semaphore(10)
async def fetch(url):
async with semaphore:
async with aiohttp.ClientSession() as session:
async with session.get(url) as response:
text = await response.text()
result = await processing_docs(session, text)
return result
I have the following problem that my code for api requests is really non deterministic. I use asyncio to make asynchronous requests, because I want to send multiple requests and have big frequency of changes(that's why I am sending 30 the same requests). Sometimes my code executes really quickly about 0.5s but sometimes it stucks after sending for example a half of the requests. Could anyone see some code bugs which can produce the following error? Or such thing is caused by some delays of the server responses?
import asyncio
from aiohttp import ClientSession
async def fetch(url, session):
async with session.get(url) as response:
data = await response.json()
print(data)
return await response.read()
async def run(r):
url = "https://www.bitstamp.net/api/ticker/"
tasks = []
async with ClientSession() as session:
for i in range(r):
task = asyncio.ensure_future(fetch(url.format(i), session))
tasks.append(task)
responses = asyncio.gather(*tasks)
await responses
t1 = time.time()
number = 30
loop = asyncio.get_event_loop()
future = asyncio.ensure_future(run(number))
loop.run_until_complete(future)
t2= time.time()
print(t2-t1)