Here's the problem: I have a list of authorization headers to check. For that, I use aiohttp:
def make_tasks(session, proxies, unchecked_headers):
    tasks = list()
    for unchecked_header in unchecked_headers:
        current_proxy = proxies.pop()
        headers['authorization'] = unchecked_header
        t = session.get(url, proxy=current_proxy, headers=headers)
        tasks.append(t)
    return tasks

async def check_headers(proxies, unchecked_headers):
    async with aiohttp.ClientSession() as s:
        tasks = make_tasks(s, proxies, unchecked_headers)
        results = await asyncio.gather(*tasks)
        for result in results:
            ...
Now, depending on the response code, I need to log some information about that authorization header. The issue is that nowhere in the response is the actual header mentioned. So I get all of the responses back, yet I don't know which response corresponds to which header, since they complete asynchronously.
I looked around and didn't find a way to check which headers were sent initially on the ClientResponse object. What can I do here?
You can use asyncio.as_completed plus a wrapper around session.get that returns the response together with any additional data you want. For example:
import aiohttp
import asyncio

url = "https://httpbin.org/get"

unchecked_headers = [
    "A",
    "B",
    "C",
]

proxies = ["Proxy1", "Proxy2", "Proxy3"]
headers = {}

def make_tasks(session, proxies, unchecked_headers):
    async def _wrapper(t, *args):
        response = await t
        return response, *args

    tasks = list()
    for unchecked_header in unchecked_headers:
        current_proxy = proxies.pop()
        headers["authorization"] = unchecked_header

        # I commented this out because I don't have access to proxy:
        # t = session.get(url, proxy=current_proxy, headers=headers)
        t = session.get(url, headers=headers)

        tasks.append(_wrapper(t, current_proxy, unchecked_header))
    return tasks

async def check_headers(proxies, unchecked_headers):
    async with aiohttp.ClientSession() as s:
        for task in asyncio.as_completed(
            make_tasks(s, proxies, unchecked_headers)
        ):
            response, proxy, header = await task
            print(response.url, proxy, header)

async def main():
    await check_headers(proxies, unchecked_headers)

if __name__ == "__main__":
    asyncio.run(main())
Prints:
https://httpbin.org/get Proxy3 A
https://httpbin.org/get Proxy1 C
https://httpbin.org/get Proxy2 B
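Alternatively, since asyncio.gather returns results in the same order as the awaitables you pass in, you could keep the gather call from the question and simply zip the results back with the headers and proxies used to build the tasks. A minimal sketch of that idea, assuming the same url and module-level names as the example above (proxy omitted, as in the code above):

async def check_headers(proxies, unchecked_headers):
    async with aiohttp.ClientSession() as s:
        used_proxies = []
        tasks = []
        for unchecked_header in unchecked_headers:
            current_proxy = proxies.pop()
            used_proxies.append(current_proxy)
            # a fresh dict per request avoids mutating one shared headers object
            tasks.append(s.get(url, headers={"authorization": unchecked_header}))
        results = await asyncio.gather(*tasks)
        # gather preserves input order, so results line up with the headers/proxies
        for header, proxy, response in zip(unchecked_headers, used_proxies, results):
            print(response.status, proxy, header)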
I have a CSV file in which one column serves as the data that is passed in two of the POST calls. I have three different URLs which need to be called one after another in the code. I define the session using aiohttp and loop over that column, sending each cell value in the first two POST calls. The data obtained from those responses is stored in a list, and that list is used for a third POST call to a different URL.
The first two calls seem to execute fine and pretty fast (I printed their responses and verified), but when execution reaches the third one, this error shows up:
OSError: [Errno 24] Too many open files
I have tried solutions which suggest specifying connector=aiohttp.TCPConnector(verify_ssl=False), i.e. aiohttp.ClientSession(connector=aiohttp.TCPConnector(verify_ssl=False)), but this doesn't work. What would be the best way to tackle this? The second URL I currently use is an HTTP localhost URL, which will eventually be changed to an HTTPS cloud-based URL.
Here is the example code depicting the situation:
import aiohttp
import pandas as pd
import asyncio
import requests
import json
import time

start_time = time.time()

df = pd.read_csv('Sample.csv', header=None, dtype=str)
RequestBodyvalues = df.iloc[:, [0]].values

async def get_FirstUrlresponse(session, url, requestBody, headers):
    async with session.post(url, data=json.dumps(requestBody), headers=headers) as resp:
        response = await resp.json()
        return response

async def get_SecondUrlresponse(session, url, requestBody, headers):
    async with session.post(url, data=json.dumps(requestBody), headers=headers) as resp:
        response = await resp.json()
        return response

async def get_ThirdUrlresponse(session, url, requestBody, headers):
    async with session.post(url, data=json.dumps(requestBody), headers=headers) as resp:
        response = await resp.json()
        return response

async def main():
    async with aiohttp.ClientSession() as session:
        FirstUrlTasks = []
        SecondUrlTasks = []
        for reqBody in RequestBodyvalues:
            firstUrl = 'https://firstUrl.com/searchByValue'
            secondUrl = 'http://secondUrl.com/someEndpoint'
            requestBody = {'value': reqBody}
            headers = {'Authorization': 'Bearer Token',
                       'content-type': 'application/json'}
            FirstUrlTasks.append(asyncio.ensure_future(get_FirstUrlresponse(session, firstUrl, requestBody, headers)))
            SecondUrlTasks.append(asyncio.ensure_future(get_SecondUrlresponse(session, secondUrl, requestBody, headers)))
        firstUrlResponses = await asyncio.gather(*FirstUrlTasks)
        secondUrlresponses = await asyncio.gather(*SecondUrlTasks)

        valuesForThridUrl = []
        for secondUrlresponse in secondUrlresponses:
            # Logic to fetch values to pass to Third Url stored in list
            pass

        ThirdUrlTasks = []
        for value in valuesForThridUrl:
            ThirdUrl = 'https://thirdUrl.com/someEndpoint'
            requestBody = {'reqBody': value}
            headers = {'Authorization': 'Bearer Token',
                       'content-type': 'application/json'}
            ThirdUrlTasks.append(asyncio.ensure_future(get_ThirdUrlresponse(session, ThirdUrl, requestBody, headers)))
        thirdUrlresponses = await asyncio.gather(*ThirdUrlTasks)

asyncio.run(main())
Check the open file limit with this command:
ulimit -n
and then increase the limit:
ulimit -n NUM
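Raising the limit treats the symptom; another option (not part of the original answer, just a hedged suggestion) is to cap how many connections aiohttp opens at once, so thousands of gathered requests don't each hold an open socket and file descriptor at the same time. A rough sketch using aiohttp's TCPConnector limit, with the request-building code elided:

import asyncio
import aiohttp

async def main():
    # at most 100 connections are open at any moment (100 is also aiohttp's default)
    connector = aiohttp.TCPConnector(limit=100)
    async with aiohttp.ClientSession(connector=connector) as session:
        # build FirstUrlTasks / SecondUrlTasks / ThirdUrlTasks and gather them as before
        ...

asyncio.run(main())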
I'm creating an optimized multi-threading app using asyncio and want to add a rotating proxy into the mix.
Starting with a sample taken from this outstanding article:
Speed Up Your Python Program With Concurrency
I added a rotating proxy and it stopped working. The code simply exits the function after touching the line for the proxy.
This little snippet of code works on its own, but not when added to the main script shown in the full sample below.
import asyncio
import random as rnd

async def download_site():
    proxy_list = [
        ('38.39.205.220:80'),
        ('38.39.204.100:80'),
        ('38.39.204.101:80'),
        ('38.39.204.94:80')
    ]
    await asyncio.sleep(1)
    proxy = rnd.choice(proxy_list)
    print(proxy)

asyncio.run(download_site())
And here's the full sample:
import asyncio
import time
import aiohttp

# Sample code taken from here:
# https://realpython.com/python-concurrency/#asyncio-version
# Info for adding headers for the proxy (Scroll toward the bottom)
# https://docs.aiohttp.org/en/stable/client_advanced.html
# Good read to possible improve performance on large lists of URLs
# https://asyncio.readthedocs.io/en/latest/webscraper.html
# RUN THIS METHOD TO SEE HOW IT WORKS.

# # Original Code (working...)
# async def download_site(session, url):
#     async with session.get(url, proxy="http://proxy.com") as response:
#         print("Read {0} from {1}".format(response.content_length, url))

def get_proxy(self):
    proxy_list = [
        (754, '38.39.205.220:80'),
        (681, '38.39.204.100:80'),
        (682, '38.39.204.101:80'),
        (678, '38.39.204.94:80')
    ]
    proxy = random.choice(proxy_list)
    print(proxy[1])
    return proxy

async def download_site(session, url):
    proxy_list = [
        ('38.39.205.220:80'),
        ('38.39.204.100:80'),
        ('38.39.204.101:80'),
        ('38.39.204.94:80')
    ]
    await asyncio.sleep(1)
    proxy = rnd.choice(proxy_list)
    print(proxy)
    async with session.get(url, proxy="http://" + proxy) as response:
        print("Read {0} from {1}".format(response.content_length, url))

async def download_all_sites(sites):
    async with aiohttp.ClientSession() as session:
        tasks = []
        for url in sites:
            task = asyncio.ensure_future(download_site(session, url))
            tasks.append(task)
        await asyncio.gather(*tasks, return_exceptions=True)

# Modified to loop thru only 1 URL to make debugging simple
if __name__ == "__main__":
    sites = [
        "https://www.jython.org",
        # "http://olympus.realpython.org/dice",
    ]  # * 80
    start_time = time.time()
    asyncio.get_event_loop().run_until_complete(download_all_sites(sites))
    duration = time.time() - start_time
    print(f"Downloaded {len(sites)} sites in {duration} seconds")
Thank you for any help you can offer.
You use return_exceptions=True, but you don't actually check the returned results for errors. You can use asyncio.as_completed to handle exceptions and get each result as soon as it is ready:
import asyncio
import random
import traceback

import aiohttp

URLS = ("https://stackoverflow.com",)
TIMEOUT = 5
PROXIES = (
    "http://38.39.205.220:80",
    "http://38.39.204.100:80",
    "http://38.39.204.101:80",
    "http://38.39.204.94:80",
)

def get_proxy():
    return random.choice(PROXIES)

async def download_site(session, url):
    proxy = get_proxy()
    print(f"Got proxy: {proxy}")
    async with session.get(url, proxy=f"{proxy}", timeout=TIMEOUT) as resp:
        print(f"{url}: {resp.status}")
        return await resp.text()

async def main():
    tasks = []
    async with aiohttp.ClientSession() as session:
        for url in URLS:
            tasks.append(asyncio.create_task(download_site(session, url)))
        for coro in asyncio.as_completed(tasks):
            try:
                html = await coro
            except Exception:
                traceback.print_exc()
            else:
                print(len(html))

if __name__ == "__main__":
    asyncio.run(main())
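If you would rather keep gather with return_exceptions=True, the important part is to inspect the returned list, because failures come back as exception objects in the results instead of being raised. A small sketch of that variant, reusing the download_site coroutine and URLS tuple from above:

async def main():
    async with aiohttp.ClientSession() as session:
        tasks = [asyncio.create_task(download_site(session, url)) for url in URLS]
        # exceptions are returned in-place, in the same order as the tasks
        results = await asyncio.gather(*tasks, return_exceptions=True)
        for url, result in zip(URLS, results):
            if isinstance(result, Exception):
                print(f"{url} failed: {result!r}")
            else:
                print(f"{url}: {len(result)} characters of HTML")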
I have X initial URLs that are paginated; in order to get the next set of data, I have to grab the next URL from the response header until there is no next URL. I'm having trouble getting this working right. I'm trying a queue approach that I found here.
import asyncio
from aiohttp import ClientSession, TCPConnector

async def get(session, url):
    headers = {
        'Authorization': 'Bearer KEY',
    }
    async with session.get(url, headers=headers) as response:
        json = await response.json()
        return json, response

async def process(session, url, q):
    try:
        try:
            views, response = await get(session, url)
            scode = response.status
            if scode == 404:
                return
        except Exception as e:
            print(e)
            return
        try:
            await q.put(str(response.links["next"]["url"]))
        except:
            pass
        <do something with views>
    except Exception as e:
        print(e)

async def fetch_worker(session, q):
    while True:
        url = await q.get()
        try:
            await process(session, url, q)
        except Exception as e:
            print(e)
        finally:
            q.task_done()

async def d():
    <code to query and put data into stdrows>
    connector = TCPConnector(limit=500)
    async with ClientSession(connector=connector) as session:
        url = '<some base url>'
        for i in range(500):
            tasks.append(asyncio.create_task(fetch_worker(session, url_queue)))
        for row in stdrows:
            await url_queue.put(url.format(row[1]))
        await asyncio.gather(*tasks)
        await url_queue.join()

asyncio.run(d())
This does not appear to be running at 500 tasks per second. Is it even possible to reach that rate without knowing all the URLs ahead of time? I am hoping to fetch the next URL from whatever initial URL (or from its paginated URL) while I work with views.
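For reference, the usual shape of the queue/worker pattern described here (as in the asyncio.Queue documentation) starts the workers as background tasks, waits on queue.join() so that URLs the workers discover along the way are also processed, and only then cancels the workers. A sketch along those lines, assuming the stdrows, url, and fetch_worker names from the code above:

async def d():
    url_queue = asyncio.Queue()
    connector = TCPConnector(limit=500)
    async with ClientSession(connector=connector) as session:
        url = '<some base url>'
        # workers run in the background, pulling URLs (including newly
        # discovered "next" URLs) off the queue
        workers = [asyncio.create_task(fetch_worker(session, url_queue)) for _ in range(500)]
        for row in stdrows:
            await url_queue.put(url.format(row[1]))
        # blocks until task_done() has been called for every queued URL
        await url_queue.join()
        # the queue is drained, so stop the otherwise-infinite workers
        for w in workers:
            w.cancel()
        await asyncio.gather(*workers, return_exceptions=True)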
I'm trying to check a set of URLs for their status codes and return all that respond with a 4xx or 5xx code.
In total I need to check about 12,500 URLs, and my script works fine for up to about 7,000 URLs. Above that, the script crashes with a ResourceWarning: unclosed transport error.
I'm using python-3.6 and aiohttp 3.5.4
Any idea what's causing this?
import asyncio
import warnings
from collections import defaultdict

from aiohttp import ClientSession

async def fetch(url, session):
    async with session.get(url) as response:
        data = response.status
        return url, data

async def bound_fetch(sem, url, session):
    async with sem:
        return await fetch(url, session)

async def check_urls(url_list):
    ''' get status code for all urls and write into dictionary '''
    base_url = <base_url>
    tasks = []
    sem = asyncio.Semaphore(10)
    async with ClientSession() as session:
        for url in url_list:
            full_url = base_url + url
            task = asyncio.ensure_future(bound_fetch(sem, full_url.format(), session))
            tasks.append(task)
        results = await asyncio.gather(*tasks)
    results_dict = defaultdict(list)
    for res in results:
        if res[1] != 200 and res[1] != 301 and res[1] != 302:
            print(f'ERROR {str(res[1])} {res[0]}')
            results_dict[res[1]].append(res[0])
    print(f'URLs checked, found {str(len(results_dict))} errors')

''' main function '''
loop = asyncio.get_event_loop()
loop.set_debug(True)
warnings.simplefilter('always', ResourceWarning)
future = asyncio.ensure_future(check_urls(list_of_urls))
loop.run_until_complete(future)
First of all, here's the code:
import random
import asyncio
from aiohttp import ClientSession
import csv

headers = []

def extractsites(file):
    sites = []
    readfile = open(file, "r")
    reader = csv.reader(readfile, delimiter=",")
    raw = list(reader)
    for a in raw:
        sites.append((a[1]))
    return sites

async def bound_fetch(sem, url):
    async with sem:
        print("doing request for " + url)
        async with ClientSession() as session:
            async with session.get(url) as response:
                responseheader = await response.headers
                print(headers)

async def run():
    urls = extractsites("cisco-umbrella.csv")
    tasks = []
    sem = asyncio.Semaphore(100)
    for i in urls:
        task = asyncio.ensure_future(bound_fetch(sem, "http://" + i))
        tasks.append(task)
    headers = await asyncio.wait(*tasks)
    print(headers)

def main():
    loop = asyncio.get_event_loop()
    future = asyncio.ensure_future(run())
    loop.run_until_complete(future)

if __name__ == '__main__':
    main()
As per my last question I'm following this blog post:
https://pawelmhm.github.io/asyncio/python/aiohttp/2016/04/22/asyncio-aiohttp.html
I tried to adapt my code as closely as possible to the example implementation, but this code is still not making any requests or printing the headers in bound_fetch as I wish.
Can somebody spot what's wrong with this code?
response.headers is a regular property; there is no need to put await before the call.
asyncio.wait, on the other hand, accepts a list of futures (not unpacked positional arguments) and returns a (done, pending) pair.
It looks like you should replace the await asyncio.wait(*tasks) call with await asyncio.gather(*tasks) (see the gather docs).
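Putting both points together, a sketch of how bound_fetch and run could look (an illustration of the suggested changes, not the original poster's exact code; gather returns each coroutine's return value, so bound_fetch returns the headers instead of only printing a module-level variable):

async def bound_fetch(sem, url):
    async with sem:
        print("doing request for " + url)
        async with ClientSession() as session:
            async with session.get(url) as response:
                # .headers is a plain property, so it is not awaited
                return response.headers

async def run():
    urls = extractsites("cisco-umbrella.csv")
    sem = asyncio.Semaphore(100)
    tasks = [asyncio.ensure_future(bound_fetch(sem, "http://" + i)) for i in urls]
    # gather takes the awaitables as separate arguments and returns their results in order
    headers = await asyncio.gather(*tasks)
    print(headers)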