How to do parallel requests using python?
My code so far:
x = ("address")
x.signin(signin.request())
# this code signs in the user and generate the response how to do a parallel response.
Looked into those but couldn't figure out where to use that
import asyncio
import requests
async def main():
loop = asyncio.get_event_loop()
futures = [
loop.run_in_executor(
None,
requests.get,
'http://example.org/'
)
for i in range(20)
]
for response in await asyncio.gather(*futures):
pass
loop = asyncio.get_event_loop()
loop.run_until_complete(main())
looked into this thread also:
What is the fastest way to send 100,000 HTTP requests in Python?
Related
import requests
import json
from tqdm import tqdm
list of links to loop through
links =['https://www.google.com/','https://www.google.com/','https://www.google.com/']
for loop for the link using requests
data = []
for link in tqdm(range(len(links))):
response = requests.get(links[link])
response = response.json()
data.append(response)
the above for loop is used to loop through all the list of links but its time consuming when I tried to loop on around a thousand links any help.
Simplest way is to turn it multithreaded. Best way is probably asynchronous.
Multithreaded solution:
import requests
from tqdm.contrib.concurrent import thread_map
links =['https://www.google.com/','https://www.google.com/','https://www.google.com/']
def get_data(url):
response = requests.get(url)
response = response.json() # Do note this might fail at times
return response
data = thread_map(get_data, links)
Or without using tqdm.contrib.concurrent.thread_map:
import requests
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
links =['https://www.google.com/','https://www.google.com/','https://www.google.com/']
def get_data(url):
response = requests.get(url)
response = response.json() # Do note this might fail at times
return response
executor = ThreadPoolExecutor()
data = list(tqdm(executor.map(get_data, links), total=len(links)))
As suggested in the comment you can use asyncio and aiohttp.
import asyncio
import aiohttp
urls = ["your", "links", "here"]
# create aio connector
conn = aiohttp.TCPConnector(limit_per_host=100, limit=0, ttl_dns_cache=300)
# set number of parallel requests - if you are requesting different domains you are likely to be able to set this higher, otherwise you may be rate limited
PARALLEL_REQUESTS = 10
# Create results array to collect results
results = []
async def gather_with_concurrency(n):
# Create semaphore for async i/o
semaphore = asyncio.Semaphore(n)
# create an aiohttp session using the previous connector
session = aiohttp.ClientSession(connector=conn)
# await logic for get request
async def get(URL):
async with semaphore:
async with session.get(url, ssl=False) as response:
obj = await response.read()
# once object is acquired we append to list
results.append(obj)
# wait for all requests to be gathered and then close session
await asyncio.gather(*(get(url) for url in urls))
await session.close()
# get async event loop
loop = asyncio.get_event_loop()
# run using number of parallel requests
loop.run_until_complete(gather_with_concurrency(PARALLEL_REQUESTS))
# Close connection
conn.close()
# loop through results and do something to them
for res in results:
do_something(res)
I have tried to comment on the code as well as possible.
I have used BS4 to parse requests in this manner (in the do_something logic), but it will really depend on your use case.
I'm new to Python multiprocessing. I don't quite understand the difference between Pool and Process. Can someone suggest which one I should use for my needs?
I have thousands of http GET requests to send. After sending each and getting the response, I want to store to response (a simple int) to a (shared) dict. My final goal is to write all data in the dict to a file.
This is not CPU intensive at all. All my goal is the speed up sending the http GET requests because there are too many. The requests are all isolated and do not depend on each other.
Shall I use Pool or Process in this case?
Thanks!
----The code below is added on 8/28---
I programmed with multiprocessing. The key challenges I'm facing are:
1) GET request can fail sometimes. I have to set 3 retries to minimize the need to rerun my code/all requests. I only want to retry the failed ones. Can I achieve this with async http requests without using Pool?
2) I want to check the response value of every requests, and have exception handling
The code below is simplified from my actual code. It is working fine, but I wonder if it's the most efficient way of doing things. Can anyone give any suggestions? Thanks a lot!
def get_data(endpoint, get_params):
response = requests.get(endpoint, params = get_params)
if response.status_code != 200:
raise Exception("bad response for " + str(get_params))
return response.json()
def get_currency_data(endpoint, currency, date):
get_params = {'currency': currency,
'date' : date
}
for attempt in range(3):
try:
output = get_data(endpoint, get_params)
# additional return value check
# ......
return output['value']
except:
time.sleep(1) # I found that sleeping for 1s almost always make the retry successfully
return 'error'
def get_all_data(currencies, dates):
# I have many dates, but not too many currencies
for currency in currencies:
results = []
pool = Pool(processes=20)
for date in dates:
results.append(pool.apply_async(get_currency_data, args=(endpoint, date)))
output = [p.get() for p in results]
pool.close()
pool.join()
time.sleep(10) # Unfortunately I have to give the server some time to rest. I found it helps to reduce failures. I didn't write the server. This is not something that I can control
Neither. Use asynchronous programming. Consider the below code pulled directly from that article (credit goes to Paweł Miech)
#!/usr/local/bin/python3.5
import asyncio
from aiohttp import ClientSession
async def fetch(url, session):
async with session.get(url) as response:
return await response.read()
async def run(r):
url = "http://localhost:8080/{}"
tasks = []
# Fetch all responses within one Client session,
# keep connection alive for all requests.
async with ClientSession() as session:
for i in range(r):
task = asyncio.ensure_future(fetch(url.format(i), session))
tasks.append(task)
responses = await asyncio.gather(*tasks)
# you now have all response bodies in this variable
print(responses)
def print_responses(result):
print(result)
loop = asyncio.get_event_loop()
future = asyncio.ensure_future(run(4))
loop.run_until_complete(future)
Just maybe create a URL's array, and instead of the given code, loop against that array and issue each one to fetch.
EDIT: Use requests_futures
As per #roganjosh comment below, requests_futures is a super-easy way to accomplish this.
from requests_futures.sessions import FuturesSession
sess = FuturesSession()
urls = ['http://google.com', 'https://stackoverflow.com']
responses = {url: sess.get(url) for url in urls}
contents = {url: future.result().content
for url, future in responses.items()
if future.result().status_code == 200}
EDIT: Use grequests to support Python 2.7
You can also us grequests, which supports Python 2.7 for performing asynchronous URL calling.
import grequests
urls = ['http://google.com', 'http://stackoverflow.com']
responses = grequests.map(grequests.get(u) for u in urls)
print([len(r.content) for r in rs])
# [10475, 250785]
EDIT: Using multiprocessing
If you want to do this using multiprocessing, you can. Disclaimer: You're going to have a ton of overhead by doing this, and it won't be anywhere near as efficient as async programming... but it is possible.
It's actually pretty straightforward, you're mapping the URL's through the http GET function:
import requests
urls = ['http://google.com', 'http://stackoverflow.com']
from multiprocessing import Pool
pool = Pool(8)
responses = pool.map(requests.get, urls)
The size of the pool will be the number of simultaneously issues GET requests. Sizing it up should increase your network efficiency, but it'll add overhead on the local machine for communication and forking.
Again, I don't recommend this, but it certainly is possible, and if you have enough cores it's probably faster than doing the calls synchronously.
I have a server which waits for a request containing a pictures:
#app.route("/uploader_ios", methods=['POST'])
def upload_file_ios():
imagefile = request.files['imagefile']
I can submit a post request quite easily using requests in python like so:
url = "<myserver>/uploader_ios"
files = {'imagefile': open(fname, 'rb')}
%time requests.post(url, files=files).json() # 2.77s
However, what I would like to do is submit 1000 or perhaps 100,000 requests at the same time. I wanted to try to do this using asyncio because I have been able to use this for get requests without a problem. However I can't see to create a valid post request that the server accepts.
My attempt is below:
import aiohttp
import asyncio
import json
# Testing with small amount
concurrent = 2
url_list = ['<myserver>/uploader_ios'] * 10
def handle_req(data):
return json.loads(data)['English']
def chunked_http_client(num_chunks, s):
# Use semaphore to limit number of requests
semaphore = asyncio.Semaphore(num_chunks)
#asyncio.coroutine
# Return co-routine that will work asynchronously and respect
# locking of semaphore
def http_get(url):
nonlocal semaphore
with (yield from semaphore):
# Attach files
files = aiohttp.FormData()
files.add_field('imagefile', open(fname, 'rb'))
response = yield from s.request('post', url, data=files)
print(response)
body = yield from response.content.read()
yield from response.wait_for_close()
return body
return http_get
def run_experiment(urls, _session):
http_client = chunked_http_client(num_chunks=concurrent, s=_session)
# http_client returns futures, save all the futures to a list
tasks = [http_client(url) for url in urls]
dfs_route = []
# wait for futures to be ready then iterate over them
for future in asyncio.as_completed(tasks):
data = yield from future
try:
out = handle_req(data)
dfs_route.append(out)
except Exception as err:
print("Error {0}".format(err))
return dfs_route
with aiohttp.ClientSession() as session: # We create a persistent connection
loop = asyncio.get_event_loop()
calc_routes = loop.run_until_complete(run_experiment(url_list, session))
The issue is that the response I get is:
.../uploader_ios) [400 BAD REQUEST]>
I am assuming this is because I am not correctly attaching the image-file
I'm trying to rewrite this Python2.7 code to the new async world order:
def get_api_results(func, iterable):
pool = multiprocessing.Pool(5)
for res in pool.map(func, iterable):
yield res
map() blocks until all results have been computed, so I'm trying to rewrite this as an async implementation that will yield results as soon as they are ready. Like map(), return values must be returned in the same order as iterable. I tried this (I need requests because of legacy auth requirements):
import requests
def get(i):
r = requests.get('https://example.com/api/items/%s' % i)
return i, r.json()
async def get_api_results():
loop = asyncio.get_event_loop()
futures = []
for n in range(1, 11):
futures.append(loop.run_in_executor(None, get, n))
async for f in futures:
k, v = await f
yield k, v
for r in get_api_results():
print(r)
but with Python 3.6 I'm getting:
File "scratch.py", line 16, in <module>
for r in get_api_results():
TypeError: 'async_generator' object is not iterable
How can I accomplish this?
Regarding your older (2.7) code - multiprocessing is considered a powerful drop-in replacement for the much simpler threading module for concurrently processing CPU intensive tasks, where threading does not work so well. Your code is probably not CPU bound - since it just needs to make HTTP requests - and threading might have been enough for solving your problem.
However, instead of using threading directly, Python 3+ has a nice module called concurrent.futures that with a cleaner API via cool Executor classes. This module is available also for python 2.7 as an external package.
The following code works on python 2 and python 3:
# For python 2, first run:
#
# pip install futures
#
from __future__ import print_function
import requests
from concurrent import futures
URLS = [
'http://httpbin.org/delay/1',
'http://httpbin.org/delay/3',
'http://httpbin.org/delay/6',
'http://www.foxnews.com/',
'http://www.cnn.com/',
'http://europe.wsj.com/',
'http://www.bbc.co.uk/',
'http://some-made-up-domain.coooom/',
]
def fetch(url):
r = requests.get(url)
r.raise_for_status()
return r.content
def fetch_all(urls):
with futures.ThreadPoolExecutor(max_workers=5) as executor:
future_to_url = {executor.submit(fetch, url): url for url in urls}
print("All URLs submitted.")
for future in futures.as_completed(future_to_url):
url = future_to_url[future]
if future.exception() is None:
yield url, future.result()
else:
# print('%r generated an exception: %s' % (
# url, future.exception()))
yield url, None
for url, s in fetch_all(URLS):
status = "{:,.0f} bytes".format(len(s)) if s is not None else "Failed"
print('{}: {}'.format(url, status))
This code uses futures.ThreadPoolExecutor, based on threading. A lot of the magic is in as_completed() used here.
Your python 3.6 code above, uses run_in_executor() which creates a futures.ProcessPoolExecutor(), and does not really use asynchronous IO!!
If you really want to go forward with asyncio, you will need to use an HTTP client that supports asyncio, such as aiohttp. Here is an example code:
import asyncio
import aiohttp
async def fetch(session, url):
print("Getting {}...".format(url))
async with session.get(url) as resp:
text = await resp.text()
return "{}: Got {} bytes".format(url, len(text))
async def fetch_all():
async with aiohttp.ClientSession() as session:
tasks = [fetch(session, "http://httpbin.org/delay/{}".format(delay))
for delay in (1, 1, 2, 3, 3)]
for task in asyncio.as_completed(tasks):
print(await task)
return "Done."
loop = asyncio.get_event_loop()
resp = loop.run_until_complete(fetch_all())
print(resp)
loop.close()
As you can see, asyncio also has an as_completed(), now using real asynchronous IO, utilizing only one thread on one process.
You put your event loop in another co-routine. Don't do that. The event loop is the outermost 'driver' of async code, and should be run synchronous.
If you need to process the fetched results, write more coroutines that do so. They could take the data from a queue, or could be driving the fetching directly.
You could have a main function that fetches and processes results, for example:
async def main(loop):
for n in range(1, 11):
future = loop.run_in_executor(None, get, n)
k, v = await future
# do something with the result
loop = asyncio.get_event_loop()
loop.run_until_complete(main(loop))
I'd make the get() function properly async too using an async library like aiohttp so you don't have to use the executor at all.
i want to write an asynchronous http client using twisted framework which fires 5 requests asynchronously/simultaneously to 5 different servers. Then compare those responses and display a result. Could someone please help regarding this.
For this situation I'd suggest using treq and DeferredList to aggregate the responses then fire a callback when all the URLs have been returned. Here is a quick example:
import treq
from twisted.internet import reactor, defer, task
def fetchURL(*urls):
dList = []
for url in urls:
d = treq.get(url)
d.addCallback(treq.content)
dList.append(d)
return defer.DeferredList(dList)
def compare(responses):
# the responses are returned in a list of tuples
# Ex: [(True, b'')]
for status, content in responses:
print(content)
def main(reactor):
urls = [
'http://swapi.co/api/films/schema',
'http://swapi.co/api/people/schema',
'http://swapi.co/api/planets/schema',
'http://swapi.co/api/species/schema',
'http://swapi.co/api/starships/schema',
]
d = fetchURL(*urls) # returns Deferred
d.addCallback(compare) # fire compare() once the URLs return w/ a response
return d # wait for the DeferredList to finish
task.react(main)
# usually you would run reactor.run() but react() takes care of that
In the main function, a list of URLs are passed into fecthURL(). There, each site will make an async request and return a Deferred that will be appended to a list. Then the final list will be used to create and return a DeferredList obj. Finally we add a callback (compare() in this case) to the DeferredList that will access each response. You would put your comparison logic in the compare() function.
You don't necessarily need twisted to make asynchronous http requests. You can use python threads and the wonderful requests package.
from threading import Thread
import requests
def make_request(url, results):
response = requests.get(url)
results[url] = response
def main():
results = {}
threads = []
for i in range(5):
url = 'http://webpage/{}'.format(i)
t = Thread(target=make_request, kwargs={'url': url, 'results': results})
t.start()
threads.append(t)
for t in threads():
t.join()
print results