I am trying to make a request to server A, whose response will be a list of requests that I then need to make to server B.
Currently, the request to server A is just a simple synchronous request, like this:
import requests
req = requests.get('https://server-a.com')
data = req.json()
list_of_requests = data['requests'] # requests for server B
Since list_of_requests can be a few thousand items long, I would like to use async to speed up the requests to B.
I've looked at several examples of async HTTP requests using aiohttp, such as this one:
https://towardsdatascience.com/fast-and-async-in-python-accelerate-your-requests-using-asyncio-62dafca83c33
import aiohttp
import asyncio
import json
from aiohttp import ClientSession, ClientResponseError

GOOGLE_BOOKS_URL = "https://www.googleapis.com/books/v1/volumes?q=isbn:"
LIST_ISBN = [
    '9780002005883',
    '9780002238304',
    '9780002261982',
    '9780006163831',
    '9780006178736',
    '9780006280897',
    '9780006280934',
    '9780006353287',
    '9780006380832',
    '9780006470229',
]

def extract_fields_from_response(response):
    """Extract fields from the API's response."""
    item = response.get("items", [{}])[0]
    volume_info = item.get("volumeInfo", {})
    title = volume_info.get("title", None)
    subtitle = volume_info.get("subtitle", None)
    description = volume_info.get("description", None)
    published_date = volume_info.get("publishedDate", None)
    return (
        title,
        subtitle,
        description,
        published_date,
    )

async def get_book_details_async(isbn, session):
    """Get book details using the Google Books API (asynchronously)."""
    url = GOOGLE_BOOKS_URL + isbn
    try:
        response = await session.request(method='GET', url=url)
        response.raise_for_status()
        print(f"Response status ({url}): {response.status}")
    except ClientResponseError as http_err:  # raised by aiohttp's raise_for_status()
        print(f"HTTP error occurred: {http_err}")
        return None
    except Exception as err:
        print(f"An error occurred: {err}")
        return None
    response_json = await response.json()
    return response_json

async def run_program(isbn, session):
    """Wrapper for running the program in an asynchronous manner."""
    try:
        response = await get_book_details_async(isbn, session)
        parsed_response = extract_fields_from_response(response)
        print(f"Response: {json.dumps(parsed_response, indent=2)}")
    except Exception as err:
        print(f"Exception occurred: {err}")

# Note: top-level await like this only works in a notebook/REPL; in a script,
# wrap it in an async main() and run it with asyncio.run(main()).
async with ClientSession() as session:
    await asyncio.gather(*[run_program(isbn, session) for isbn in LIST_ISBN])
However, all of the examples I have looked at start with the list of requests already defined. My question is: what is the proper, Pythonic way/pattern of combining a single sync request and then using its result to 'spawn' async tasks?
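Conceptually, I'm imagining something along these lines (just a rough sketch; what server B expects and what each item in list_of_requests looks like are placeholders):
import asyncio
import requests
from aiohttp import ClientSession

def get_list_of_requests():
    # one plain synchronous request to server A
    data = requests.get('https://server-a.com').json()
    return data['requests']

async def fetch_b(request_url, session):
    # placeholder: whatever needs to happen for each item returned by server A
    async with session.get(request_url) as resp:
        return await resp.text()

async def fetch_all(list_of_requests):
    async with ClientSession() as session:
        tasks = [fetch_b(r, session) for r in list_of_requests]
        return await asyncio.gather(*tasks)

if __name__ == '__main__':
    list_of_requests = get_list_of_requests()            # sync part
    results = asyncio.run(fetch_all(list_of_requests))   # async part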
Thanks a bunch!
I have a file defining a list of RSS feeds:
RSS_FEEDS = [
    "https://www.fanpage.it/feed/",
    "https://www.ilfattoquotidiano.it/feed/",
    "https://forbes.it/feed/",
    "https://formiche.net/feed/",
]
I wrote the following test:
import requests
from feeds import RSS_FEEDS
for rssfeed in RSS_FEEDS:
    response = requests.get(rssfeed)
    assert response.status_code == 200
Are there more efficient ways (that download less data)?
How would you handle a slow response versus a dead link?
The above only tells me whether the URL is fetchable, but how could I assess whether it's a valid RSS feed?
You could solve it using the aiohttp library together with asyncio, like this:
from aiohttp import ClientSession
from asyncio import gather, create_task, run, set_event_loop, set_event_loop_policy
from traceback import format_exc
import sys

# This is necessary on my Windows computer
if sys.version_info[0] == 3 and sys.version_info[1] >= 8 and sys.platform.startswith('win'):  # Check for operating system
    from asyncio import ProactorEventLoop, WindowsSelectorEventLoopPolicy
    set_event_loop(ProactorEventLoop())
    set_event_loop_policy(WindowsSelectorEventLoopPolicy())  # Bug is not present in Linux

RSS_FEEDS = [
    "https://www.fanpage.it/feed/",
    "https://www.ilfattoquotidiano.it/feed/",
    "https://forbes.it/feed/",
    "https://formiche.net/feed/",
]

async def GetRessource(url: str, session: ClientSession) -> dict:
    try:
        async with session.get(url) as response:
            if response.status == 200:
                return response.status
            else:
                r: str = await response.text()
                print(f"Error, got response code: {response.status} message: {r}")
    except Exception:
        print(f"General Exception:\n{format_exc()}")
        return {}

async def GetUrls() -> None:
    async with ClientSession() as session:
        Tasks: list = [create_task(GetRessource(url, session)) for url in RSS_FEEDS]
        Results: list = await gather(*Tasks, return_exceptions=False)
        for result in Results:
            assert result == 200

async def main():
    await GetUrls()

if __name__ == "__main__":
    run(main())
Result of Results:
200
200
200
200
It checks the URLs concurrently.
To optimize network usage, pass a timeout parameter to the GET request to limit how long you wait for a response, and use stream=True so that only part of the response is downloaded in chunks rather than the entire file.
To handle a slow or dead link, the timeout parameter makes the request raise an exception if the response takes too long; catch and handle the exceptions the GET request can raise, such as Timeout, ConnectionError and HTTPError (e.g. retry, log the error).
To validate an RSS stream, use a library like feedparser to parse the response and determine whether it's a valid RSS feed, and check for the elements/attributes an RSS feed requires (e.g. channel, item, title, link).
import requests
import feedparser
from requests.exceptions import Timeout, ConnectionError, HTTPError

from feeds import RSS_FEEDS

for rssfeed in RSS_FEEDS:
    try:
        response = requests.get(rssfeed, timeout=5)
        response.raise_for_status()
        feed = feedparser.parse(response.content)
        if not feed.bozo:
            pass  # feed is valid
        else:
            pass  # feed is invalid
    except (Timeout, ConnectionError, HTTPError) as e:
        # handle exceptions here (e.g. retry, log error)
        pass
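If the goal is only to check reachability while downloading as little as possible, a lighter-weight variant is a HEAD request, falling back to a streamed GET that reads just the first chunk. This is a sketch under the assumption that the feeds' servers behave reasonably; some servers don't implement HEAD properly, so treat it as an optimization rather than a guarantee:
import requests
from requests.exceptions import RequestException

def is_reachable(url: str, timeout: float = 5.0) -> bool:
    try:
        # HEAD transfers headers only; many (not all) servers support it
        resp = requests.head(url, timeout=timeout, allow_redirects=True)
        if resp.status_code < 400:
            return True
        # Fall back to a streamed GET and read only the first chunk
        with requests.get(url, timeout=timeout, stream=True) as resp:
            next(resp.iter_content(chunk_size=1024), None)
            return resp.status_code == 200
    except RequestException:
        return False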
Hopefully this is not too stupid a question, but I am having trouble with aiohttp cookie processing.
Aiohttp's CookieJar class mentions it implements cookie storage adhering to RFC 6265, which states that:
cookies for a given host are shared across all the ports on that host
Cookies do not provide isolation by port. If a cookie is readable by a service running on one port, the cookie is also readable by a service running on another port of the same server.
But if I create two aiohttp servers, one that makes you "log in" and gives you a cookie back, and another with an endpoint that expects you to have that cookie, both hosted on localhost (so on two different ports), the cookie will not be processed.
Here's a set of 4 tests using aiohttp, pytest and pytest-aiohttp to explain:
import functools

import pytest
from aiohttp import web

pytestmark = pytest.mark.asyncio

def attach_session(f):
    @functools.wraps(f)
    async def wrapper(request: web.Request):
        session_id = request.cookies.get("testcookie")
        request["mysession"] = session_id
        response = await f(request)
        response.set_cookie("testcookie", session_id)
        return response
    return wrapper

def is_logged_in(f):
    @functools.wraps(f)
    @attach_session
    async def wrapper(request: web.Request):
        session = request["mysession"]
        if not session:
            raise web.HTTPUnauthorized
        return await f(request)
    return wrapper

async def login(_: web.Request):
    response = web.Response()
    response.set_cookie("testcookie", "somerandomstring")
    return response

@is_logged_in
async def some_endpoint(request: web.Request):
    return web.Response(text="sweet")

@pytest.fixture
def auth_client(event_loop, aiohttp_client):
    app = web.Application()
    app.router.add_post("/login", login)
    return event_loop.run_until_complete(aiohttp_client(app))

@pytest.fixture
def core_client(event_loop, aiohttp_client):
    app = web.Application()
    app.router.add_get("/some_endpoint", some_endpoint)
    return event_loop.run_until_complete(aiohttp_client(app))

async def test_login(auth_client):
    resp = await auth_client.post("/login")
    assert resp.status == 200
    assert resp.cookies.get("testcookie").value == "somerandomstring"

async def test_some_endpoint_anonymous(core_client):
    resp = await core_client.get("/some_endpoint")
    assert resp.status == 401

async def test_some_endpoint_as_logged_in(auth_client, core_client):
    resp1 = await auth_client.post("/login")
    resp2 = await core_client.get("/some_endpoint", cookies=resp1.cookies)
    assert resp2.status == 401

async def test_some_endpoint_as_logged_in_again(auth_client, core_client):
    resp1 = await auth_client.post("/login")
    _cookie = list(resp1.cookies.values())[0]
    resp2 = await core_client.get(
        "/some_endpoint", cookies={_cookie.key: _cookie.value}
    )
    assert resp2.status == 200
But from my understanding, the test_some_endpoint_as_logged_in test should work. Why is it returning 401, while doing the same thing but sending the cookie as a dict returns 200?
I think the correct way of sharing the cookies between the clients is to load the SimpleCookie object from resp1 into core_client.session.cookie_jar.
Changing the code of test_some_endpoint_as_logged_in as follows should fix it:
async def test_some_endpoint_as_logged_in(auth_client, core_client):
    resp1 = await auth_client.post("/login")
    core_client.session.cookie_jar.update_cookies(resp1.cookies)
    resp2 = await core_client.get("/some_endpoint")
    assert resp2.status == 200
Cookie data is kept in the session object; since auth_client and core_client are different sessions, each with its own cookie data, cookies are not shared between them. It is comparable to using two different browsers, each with its own cookie jar.
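If you actually want two ClientSessions to share cookies, another option (a sketch outside the test fixtures above; the hosts, ports and paths are hypothetical) is to construct both sessions with the same CookieJar instance; note that aiohttp.CookieJar(unsafe=True) is required for cookies on IP-address hosts such as 127.0.0.1:
import asyncio
import aiohttp

async def main():
    # Both sessions share one cookie jar, so a cookie set through one session
    # is visible to the other.
    shared_jar = aiohttp.CookieJar(unsafe=True)  # unsafe=True allows IP hosts like 127.0.0.1
    async with aiohttp.ClientSession(cookie_jar=shared_jar) as auth_session, \
               aiohttp.ClientSession(cookie_jar=shared_jar) as core_session:
        await auth_session.post("http://127.0.0.1:8080/login")
        resp = await core_session.get("http://127.0.0.1:8081/some_endpoint")
        print(resp.status)

asyncio.run(main())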
How can I create an HTTP error handler route in an aiohttp web server?
from aiohttp import web
routes = list()
routes.append(web.route('POST', '/reg', handler))
How can I create, for example:
routes.append(web.error_handler(404, handler404))
A common way to create custom error pages in aiohttp is to use a middleware:
from aiohttp import web

@web.middleware
async def error_middleware(request, handler):
    try:
        response = await handler(request)
        # this is needed to handle the ``return web.HTTPNotFound()`` case
        if response.status == 404:
            return web.Response(text='First custom 404 message', status=404)
        return response
    except web.HTTPException as ex:
        # this is needed to handle the ``raise web.HTTPNotFound()`` case
        if ex.status == 404:
            return web.Response(text='Second custom 404 message', status=404)
        raise
    # this is needed to handle a non-HTTPException
    except Exception:
        return web.Response(text='Oops, something went wrong', status=500)

app = web.Application(middlewares=[error_middleware])
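For example, continuing the snippet above with two hypothetical handlers, both end up producing the custom 404 message, whether they return or raise web.HTTPNotFound:
async def returns_404(request):
    # handled by the first branch (response.status == 404)
    return web.HTTPNotFound()

async def raises_404(request):
    # handled by the second branch (except web.HTTPException)
    raise web.HTTPNotFound()

app.router.add_get('/returns', returns_404)
app.router.add_get('/raises', raises_404)
web.run_app(app)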
One more example is here
I'm trying to use an HTTPS proxy for async requests using the asyncio library. For an HTTP proxy there is a clear instruction here, but I get stuck when it comes to an HTTPS proxy. Moreover, I would like to reuse the same session rather than creating a new session every time I send a request.
This is what I've tried so far (the proxies used within the script are taken directly from a free proxy site, so consider them placeholders):
import asyncio
import aiohttp
from bs4 import BeautifulSoup

proxies = [
    'http://89.22.210.191:41258',
    'http://91.187.75.48:39405',
    'http://103.81.104.66:34717',
    'http://124.41.213.211:41828',
    'http://93.191.100.231:3128'
]

async def get_text(url):
    global proxies, proxy_url
    while True:
        check_url = proxy_url
        proxy = f'http://{proxy_url}'
        print("trying using:", check_url)
        async with aiohttp.ClientSession() as session:
            try:
                async with session.get(url, proxy=proxy, ssl=False) as resp:
                    return await resp.text()
            except Exception:
                if check_url == proxy_url:
                    proxy_url = proxies.pop()

async def field_info(field_link):
    text = await get_text(field_link)
    soup = BeautifulSoup(text, 'lxml')
    for item in soup.select(".summary .question-hyperlink"):
        print(item.get_text(strip=True))

if __name__ == '__main__':
    proxy_url = proxies.pop()
    links = ["https://stackoverflow.com/questions/tagged/web-scraping?sort=newest&page={}&pagesize=50".format(page) for page in range(2,5)]
    loop = asyncio.get_event_loop()
    future = asyncio.ensure_future(asyncio.gather(*(field_info(url) for url in links)))
    loop.run_until_complete(future)
    loop.close()
How can I use https proxies within the script along with reusing the same session?
This script creates a dictionary, proxy_session_map, where the keys are proxies and the values are sessions. That way we know which session belongs to which proxy.
If there is an error while using a proxy, I add that proxy to the disabled_proxies set so it won't be used again:
import asyncio
import aiohttp
from bs4 import BeautifulSoup
from random import choice

proxies = [
    'http://89.22.210.191:41258',
    'http://91.187.75.48:39405',
    'http://103.81.104.66:34717',
    'http://124.41.213.211:41828',
    'http://93.191.100.231:3128'
]

disabled_proxies = set()
proxy_session_map = {}

async def get_text(url):
    while True:
        try:
            available_proxies = [p for p in proxies if p not in disabled_proxies]

            if available_proxies:
                proxy = choice(available_proxies)
            else:
                proxy = None

            if proxy not in proxy_session_map:
                proxy_session_map[proxy] = aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=5))

            print("trying using:", proxy)

            async with proxy_session_map[proxy].get(url, proxy=proxy, ssl=False) as resp:
                return await resp.text()

        except Exception as e:
            if proxy:
                print("error, disabling:", proxy)
                disabled_proxies.add(proxy)
            else:
                # we haven't used a proxy, so return an empty string
                return ''

async def field_info(field_link):
    text = await get_text(field_link)
    soup = BeautifulSoup(text, 'lxml')
    for item in soup.select(".summary .question-hyperlink"):
        print(item.get_text(strip=True))

async def main():
    links = ["https://stackoverflow.com/questions/tagged/web-scraping?sort=newest&page={}&pagesize=50".format(page) for page in range(2,5)]
    tasks = [field_info(url) for url in links]

    await asyncio.gather(*tasks)

    # close all sessions:
    for s in proxy_session_map.values():
        await s.close()

if __name__ == '__main__':
    asyncio.run(main())
Prints (for example):
trying using: http://89.22.210.191:41258
trying using: http://124.41.213.211:41828
trying using: http://124.41.213.211:41828
error, disabling: http://124.41.213.211:41828
trying using: http://93.191.100.231:3128
error, disabling: http://124.41.213.211:41828
trying using: http://103.81.104.66:34717
BeautifulSoup to get image name from P class picture tag in Python
Scrape instagram public information from google cloud functions [duplicate]
Webscraping using R - the full website data is not loading
Facebook Public Data Scraping
How it is encode in javascript?
... and so on.
I have a server which waits for a request containing a picture:
@app.route("/uploader_ios", methods=['POST'])
def upload_file_ios():
    imagefile = request.files['imagefile']
I can submit a POST request quite easily using requests in Python, like so:
url = "<myserver>/uploader_ios"
files = {'imagefile': open(fname, 'rb')}
%time requests.post(url, files=files).json() # 2.77s
However, what I would like to do is submit 1,000 or perhaps 100,000 requests at the same time. I wanted to try this using asyncio because I have been able to use it for GET requests without a problem. However, I can't seem to create a valid POST request that the server accepts.
My attempt is below:
import aiohttp
import asyncio
import json

# Testing with small amount
concurrent = 2
url_list = ['<myserver>/uploader_ios'] * 10

def handle_req(data):
    return json.loads(data)['English']

def chunked_http_client(num_chunks, s):
    # Use semaphore to limit number of requests
    semaphore = asyncio.Semaphore(num_chunks)

    @asyncio.coroutine
    # Return co-routine that will work asynchronously and respect
    # locking of semaphore
    def http_get(url):
        nonlocal semaphore
        with (yield from semaphore):
            # Attach files
            files = aiohttp.FormData()
            files.add_field('imagefile', open(fname, 'rb'))
            response = yield from s.request('post', url, data=files)
            print(response)
            body = yield from response.content.read()
            yield from response.wait_for_close()
        return body
    return http_get

def run_experiment(urls, _session):
    http_client = chunked_http_client(num_chunks=concurrent, s=_session)
    # http_client returns futures, save all the futures to a list
    tasks = [http_client(url) for url in urls]
    dfs_route = []
    # wait for futures to be ready then iterate over them
    for future in asyncio.as_completed(tasks):
        data = yield from future
        try:
            out = handle_req(data)
            dfs_route.append(out)
        except Exception as err:
            print("Error {0}".format(err))
    return dfs_route

with aiohttp.ClientSession() as session:  # We create a persistent connection
    loop = asyncio.get_event_loop()
    calc_routes = loop.run_until_complete(run_experiment(url_list, session))
The issue is that the response I get is:
.../uploader_ios) [400 BAD REQUEST]>
I am assuming this is because I am not correctly attaching the image file.
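For reference, a minimal sketch of how the file field might be attached using modern aiohttp async/await syntax; the field name 'imagefile' matches the Flask handler above, while the filename, content type, file list and concurrency limit are assumptions:
import asyncio
import aiohttp

URL = '<myserver>/uploader_ios'

async def upload(session, semaphore, fname):
    async with semaphore:
        data = aiohttp.FormData()
        # Give the part a filename and content type so it arrives as a regular
        # multipart file upload, which Flask exposes via request.files
        data.add_field('imagefile',
                       open(fname, 'rb'),
                       filename=fname,
                       content_type='image/jpeg')
        async with session.post(URL, data=data) as resp:
            return await resp.json()

async def main(fnames, concurrent=2):
    semaphore = asyncio.Semaphore(concurrent)
    async with aiohttp.ClientSession() as session:
        return await asyncio.gather(*[upload(session, semaphore, f) for f in fnames])

# results = asyncio.run(main(['test.jpg']))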