Fetch file from localhost without web server - python

I want to fetch file from localhost without web server asynchronously. Seems that it is possible to do using file:// scheme. The following code sample is taken from documentation, but obviously it doesn't work:
import aiohttp
import asyncio
async def fetch(session, url):
async with session.get(url) as response:
return await response.text()
async def main():
async with aiohttp.ClientSession() as session:
html = await fetch(session, 'file://localhost/Users/user/test.txt')
print(html)
if __name__ == '__main__':
loop = asyncio.get_event_loop()
loop.run_until_complete(main())
How to make it work?
The one way I see is to use "curl file://path" in separate thread pool using run_in_executor, but I think there should be a way to fix code

If you need to obtain the contents of a local file, you can do it with ordinary Python built-ins, such as:
with open('Users/user/test.txt') as rd:
html = rd.read()
If the file is not very large, and is stored on a local filesystem, you don't even need to make it async, as reading it will be fast enough not to disturb the event loop. If the file is large or reading it might be slow for other reasons, you should read it through run_in_executor to prevent it from blocking other asyncio code. For example (untested):
def read_file_sync(file_name):
with open('Users/user/test.txt') as rd:
return rd.read()
async def read_file(file_name):
loop = asyncio.get_event_loop()
html = await loop.run_in_executor(None, read_file_sync, file_name)
return html

Related

Parallelize requests call with asyncio in Python

I'm trying to create an interface to an API, and I want to have the option to easily run the requests sync or asynchronously, and I came up with the following code.
import asyncio
import requests
def async_run(coro_list):
loop = asyncio.get_event_loop()
futures = [loop.run_in_executor(None, asyncio.run, coro) for coro in coro_list]
result = loop.run_until_complete(asyncio.gather(*futures))
return result
def sync_get(url):
return requests.get(url)
async def async_get(url):
return sync_get(url)
coro_list = [async_get("https://google.com"), async_get("https://google.com")]
responses = async_run(coro_list)
print(responses)
For me it's very intuitive to either call sync_get or create a list of async_get and call async_run, and requires no knowledge of async Python to understand how it works.
The only problem is that loop.run_in_executor(None, asyncio.run, coro) doesn't sound too optimal, and I couldn't find anyone else running this code on Github. So I'm wondering, is there a simpler way to accomplish the objective of abstracting these threading and asyncio concepts in some similar way, or is this code already optimal?
asyncio.run() is usually used as the main entry to run async code from sync code.
loop.run_in_executor(None, asyncio.run, coro) cause an event loop created in executor threads to run coro in coro_list. Why not directly run sync_get in executor threads?
import asyncio
import requests
def async_run(url_list):
loop = asyncio.get_event_loop()
futures = [loop.run_in_executor(None, sync_get, url) for url in url_list]
result = await asyncio.gather(*futures)
return result
def sync_get(url):
return requests.get(url)
#
# async def async_get(url):
# return sync_get(url)
url_list = ["https://google.com", "https://google.com"]
responses = asyncio.run(async_run(url_list))
print(responses)
There are async libaries, eg. aiohttp and httpx, to accomplish similar work.
At the end I chose not to cover completely asyncio under my interface.
Still with the goal of having not having to manage 2 "requests" functions, I made the API async first, and run the synchronous one with asyncio and I ended up with something like this.
def sync_request():
return asyncio.run(async_request(...))
async def async_request():
return await aiohttp.request(...) # pseudo code

aiohttp download large list of pdf files

i am trying to download large number of pdf files asynchronously, python requests does not work well with async functionalities
but i am finding aiohttp hard to implement with pdf downloads, and can't find a thread for this specific task, for someone new into python async world to understand easily.
yeah it can be done with threadpoolexecutor but in this case better to keep in one thread.
this code works but need to do with 100 or so urls
asynchronously
import aiohttp
import aiofiles
async with aiohttp.ClientSession() as session:
url = "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf"
async with session.get(url) as resp:
if resp.status == 200:
f = await aiofiles.open('download_pdf.pdf', mode='wb')
await f.write(await resp.read())
await f.close()
Thanks in advance.
You could do try something like this. For the sake of simplicity the same dummy pdf will be downloaded multiple times to disk with different file names:
from asyncio import Semaphore, gather, run, wait_for
from random import randint
import aiofiles
from aiohttp.client import ClientSession
# Mock a list of different pdfs to download
pdf_list = [
"https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf",
"https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf",
"https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf",
]
MAX_TASKS = 5
MAX_TIME = 5
async def download(pdf_list):
tasks = []
sem = Semaphore(MAX_TASKS)
async with ClientSession() as sess:
for pdf_url in pdf_list:
# Mock a different file name each iteration
dest_file = str(randint(1, 100000)) + ".pdf"
tasks.append(
# Wait max 5 seconds for each download
wait_for(
download_one(pdf_url, sess, sem, dest_file),
timeout=MAX_TIME,
)
)
return await gather(*tasks)
async def download_one(url, sess, sem, dest_file):
async with sem:
print(f"Downloading {url}")
async with sess.get(url) as res:
content = await res.read()
# Check everything went well
if res.status != 200:
print(f"Download failed: {res.status}")
return
async with aiofiles.open(dest_file, "+wb") as f:
await f.write(content)
# No need to use close(f) when using with statement
if __name__ == "__main__":
run(download(pdf_list))
Keep in mind that firing multiple concurrent request to a server might get your IP banned for a period of time. In that case, consider adding a sleep call (which kind of defeats the purpose of using aiohttp) or switching to a classic sequential script. In order to keep things concurrent but kinder to the server, the script will fire max 5 requests at any given time (MAX_TASKS).

Python aiohttp module: ambiguous .content attribute

Here is a little code snippet:
import aiohttp
import aiofiles
async def fetch(url):
# starting a session
async with aiohttp.ClientSession() as session:
# starting a get request
async with session.get(url) as response:
# getting response content
content = await response.content
return content
async def save_file(file_name, content):
async with aiofiles.open(f'./binary/{file_name}', 'wb') as f:
while True:
chunk = content.read(1024)
if not chunk:
break
f.write(chunk)
I am trying to download some binary files using the aiohttp library and then passing them to a coroutine using aiofiles library to write the file in the disk.
I have read the documentation but still couldn't figure out if I can pass content = await response.content or is it closed when the handle async with.. is closed? Because on a secondary blog, I found:
According to aiohttp’s documentation, because the response object was created in a context manager, it technically calls release() implicitly.
Which confuses me, should I embed the logic of the second function inside the response handle or is my logic correct?
The async context manager will close the resources related to the request, so if you return from the function, you have to make sure you've read everything of interest. So you have two options:
read the entire response into memory, e.g. with content = await response.read() or, if the file doesn't fit into memory (and also if you want to speed things up by reading and writing in parallel)
use a queue or an async iterator to parallelize reading and writing.
Here is an untested implementation of #2:
async def fetch(url):
# return an async generator over contents of URL
async with aiohttp.ClientSession() as session:
async with session.get(url) as response:
# getting response content in chunks no larger than 4K
for chunk in response.content.iter_chunked(4096):
yield chunk
async def save_file(file_name, content_iter):
async with aiofiles.open(f'./binary/{file_name}', 'wb') as f:
for chunk in content_iter:
f.write(chunk) # maybe you need to await this?
async def main():
save_file(file_name, fetch(url))
Thanks to user4815162342's code I could find a solution by parellelizing the fetch and write coroutines. I would've checked his code as the accepted solution but since I had to add some code to make it work, here it is:
# fetch binary from server
async def fetch(url):
async with aiohttp.ClientSession() as session:
async with session.get(url) as response:
async for chunk in response.content.iter_chunked(4096):
yield chunk
# write binary function
async def save_file(file_name, chunk_iter):
list(map(create_dir_tree, list_binary_sub_dirs))
async with aiofiles.open(f'./binary/bin_ts/{file_name}', 'wb') as f:
async for chunk in chunk_iter:
await f.write(chunk)
async def main(urls):
tasks = []
for url in urls:
print('running on sublist')
file_name = url.rpartition('/')[-1]
request_ts = fetch(url)
tasks.append(save_file(file_name, request_ts))
await asyncio.gather(*tasks)
asyncio.run(main(some_list_of_urls))

Retrieving data from python's coroutine object

I am trying to learn async, and now I am trying to get whois information for a batch of domains. I found this lib aiowhois, but there are only a few strokes of information, not enough for such newbie as I am.
This code works without errors, but I don't know how to print data from parsed whois variable, which is coroutine object.
resolv = aiowhois.Whois(timeout=10)
async def coro(url, sem):
parsed_whois = await resolv.query(url)
async def main():
tasks = []
sem = asyncio.Semaphore(4)
for url in domains:
task = asyncio.Task(coro(url, sem))
tasks.append(task)
await asyncio.gather(*tasks)
loop = asyncio.get_event_loop()
loop.run_until_complete(main())
You can avoid using tasks. Just apply gather to the coroutine directly.
In case you are confused about the difference, this SO QA might help you (especially the second answer).
You can have each coroutine return its result, without resorting to global variables:
async def coro(url):
return await resolv.query(url)
async def main():
domains = ...
ops = [coro(url) for url in domains]
rets = await asyncio.gather(*ops)
print(rets)
Please see the official docs to learn more about how to use gather or wait or even more options
Note: if you are using the latest python versions, you can also simplify the loop running with just
asyncio.run(main())
Note 2: I have removed the semaphore from my code, as it's unclear why you need it and where.
all_parsed_whois = [] # make a global
async def coro(url, sem):
all_parsed_whois.append(await resolv.query(url))
If you want the data as soon as it is available you could task.add_done_callback()
python asyncio add_done_callback with async def

Combine aiohttp with multiprocessing

I am making a script that gets the HTML of almost 20 000 pages and parses it to get just a portion of it.
I managed to get the 20 000 pages' content in a dataframe with aynchronous requests using asyncio and aiohttp but this script still wait for all the pages to be fetched to parse them.
async def get_request(session, url, params=None):
async with session.get(url, headers=HEADERS, params=params) as response:
return await response.text()
async def get_html_from_url(urls):
tasks = []
async with aiohttp.ClientSession() as session:
for url in urls:
tasks.append(get_request(session, url))
html_page_response = await asyncio.gather(*tasks)
return html_page_response
html_pages_list = asyncio_loop.run_until_complete(get_html_from_url(urls))
Once I have the content of each page I managed to use multiprocessing's Pool to parallelize the parsing.
get_whatiwant_from_html(html_content):
parsed_html = BeautifulSoup(html_content, "html.parser")
clean = parsed_html.find("div", class_="class").get_text()
# Some re.subs
clean = re.sub("", "", clean)
clean = re.sub("", "", clean)
clean = re.sub("", "", clean)
return clean
pool = Pool(4)
what_i_want = pool.map(get_whatiwant_from_html, html_content_list)
This code mixes asynchronously the fetching and the parsing but I would like to integrate multiprocessing into it:
async def process(url, session):
html = await getRequest(session, url)
return await get_whatiwant_from_html(html)
async def dispatch(urls):
async with aiohttp.ClientSession() as session:
coros = (process(url, session) for url in urls)
return await asyncio.gather(*coros)
result = asyncio.get_event_loop().run_until_complete(dispatch(urls))
Is there any obvious way to do this? I thought about creating 4 processes that each run the asynchronous calls but the implementation looks a bit complex and I'm wondering if there is another way.
I am very new to asyncio and aiohttp so if you have anything to advise me to read to get a better understanding, I will be very happy.
You can use ProcessPoolExecutor.
With run_in_executor you can do IO in your main asyncio process.
But your heavy CPU calculations in separate processes.
async def get_data(session, url, params=None):
loop = asyncio.get_event_loop()
async with session.get(url, headers=HEADERS, params=params) as response:
html = await response.text()
data = await loop.run_in_executor(None, partial(get_whatiwant_from_html, html))
return data
async def get_data_from_urls(urls):
tasks = []
async with aiohttp.ClientSession() as session:
for url in urls:
tasks.append(get_data(session, url))
result_data = await asyncio.gather(*tasks)
return result_data
executor = concurrent.futures.ProcessPoolExecutor(max_workers=10)
asyncio_loop.set_default_executor(executor)
results = asyncio_loop.run_until_complete(get_data_from_urls(urls))
You can increase your parsing speed by changing your BeautifulSoup parser from html.parser to lxml which is by far the fastest, followed by html5lib. html.parser is the slowest of them all.
Your bottleneck is not processing issue but IO. You might want multiple threads and not process:
E.g. here is a template program that scraping and sleep to make it slow but ran in multiple threads and thus complete task faster.
from concurrent.futures import ThreadPoolExecutor
import random,time
from bs4 import BeautifulSoup as bs
import requests
URL = 'http://quotesondesign.com/wp-json/posts'
def quote_stream():
'''
Quoter streamer
'''
param = dict(page=random.randint(1, 1000))
quo = requests.get(URL, params=param)
if quo.ok:
data = quo.json()
author = data[0]['title'].strip()
content = bs(data[0]['content'], 'html5lib').text.strip()
print(f'{content}\n-{author}\n')
else:
print('Connection Issues :(')
def multi_qouter(workers=4):
with ThreadPoolExecutor(max_workers=workers) as executor:
_ = [executor.submit(quote_stream) for i in range(workers)]
if __name__ == '__main__':
now = time.time()
multi_qouter(workers=4)
print(f'Time taken {time.time()-now:.2f} seconds')
In your case, create a function that performs the task you want from starry to finish. This function would accept url and necessary parameters as arguments. After that create another function that calls the previous function in different threads, each thread having its our url. So instead of i in range(..), for url in urls. You can run 2000 threads at once, but I would prefer chunks of say 200 running parallel.

Categories