EDIT2: Solved! Thanks to Michael Butscher's comment, I made a shallow copy of req_params by renaming the argument req_params to req_params_arg and then adding req_params = req_params_arg.copy() in get_assets_api_for_range().
The variable was shared between the threads, so copying before use solved the problem.
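For reference, a minimal sketch of that fix (authenticate() and auth_config are the same helpers used in the code below; only the renamed argument and the copy line are new):

def get_assets_api_for_range(range_to_fetch, req_params_arg):
    # Work on a per-call copy so threads never mutate the shared dict
    req_params = req_params_arg.copy()
    req_sess = authenticate()
    req_params.update({'range': range_to_fetch})
    print("Fetching {}".format(req_params['range']))
    r = req_sess.get(url=auth_config['endpoint_url'] + '/assets',
                     params=req_params)
    if r.status_code != 200:
        raise ConnectionError("API Get assets error: {}".format(r))
    return process_get_assets_from_api_in_df(r.json()["asset_list"])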
EDIT: It seems that "python requests" doesn't like being called simultaneously by several threads; I activated debug mode and I can see that the requests sent to the API are sometimes identical (same range asked), which leads to the duplicates. Weird behavior... Do you think I need to use aiohttp or asyncio?
I'm struggling with the concurrent.futures module in order to fetch a large volume of data from an API.
The API I'm using limits results to 100 per page, so I'm calling it multiple times in order to get all the data.
To accelerate the process I tried to use multithreading (ThreadPoolExecutor) with a maximum of 10 threads.
It works fine and it's very quick but the results are different each time... Sometimes I get duplicates, sometimes not.
Something seems not to be thread-safe, but I cannot figure out where... Maybe it is hidden in the functions that use pandas behind the scenes?
I tried to echo the ranges being fetched and they look correct (not in order, but that's expected):
Fetching 300-400
Fetching 700-800
Fetching 800-900
Fetching 500-600
Fetching 400-500
Fetching 0-100
Fetching 200-300
Fetching 100-200
Fetching 900-1000
Fetching 600-700
Fetching 1100-1159
Fetching 1000-1100
Another weird behavior: when I put the line req_sess = authenticate() after the line print("Fetching {}".format(req_params['range'])) in the get_assets_api_for_range function, only one page (the last one, I believe) is fetched multiple times.
Thanks for your help!!
Here is my code (I removed some parts, it should be enough I think); the main function called is get_assets_from_api_in_df():
from functools import partial
import pandas as pd
import requests
import concurrent.futures as cfu
def get_assets_api_for_range(range_to_fetch, req_params):
req_sess = authenticate()
req_params.update({'range': range_to_fetch})
print("Fetching {}".format(req_params['range']))
r = req_sess.get(url=auth_config['endpoint_url'] + '/assets',
params=req_params)
if r.status_code != 200:
raise ConnectionError("API Get assets error: {}".format(r))
json_response = r.json()
# This function, in turn, processes the list into a DataFrame
return process_get_assets_from_api_in_df(json_response["asset_list"])
def get_assets_from_api_in_df() -> pd.DataFrame:
GET_NB_MAX = 100
# First, fetch 1 value to get nb to fetch
req_sess = authenticate()
r = req_sess.get(url=auth_config['endpoint_url'] + '/assets',
params={'range': '0-1'})
if r.status_code != 200:
raise ConnectionError("API Get assets error: {}".format(r))
json_response = r.json()
nb_to_fetch_total = json_response['total']
print("Nb to fetch: {}".format(nb_to_fetch_total))
# Building a queue of ranges to fetch
ranges_to_fetch_queue = []
for nb in range(0, nb_to_fetch_total, GET_NB_MAX):
if nb + GET_NB_MAX < nb_to_fetch_total:
range_str = str(nb) + '-' + str(nb + GET_NB_MAX)
else:
range_str = str(nb) + '-' + str(nb_to_fetch_total)
ranges_to_fetch_queue.append(range_str)
params = {
}
func_to_call = partial(get_assets_api_for_range,
req_params=params)
with cfu.ThreadPoolExecutor(max_workers=10) as executor:
result = list(executor.map(func_to_call, ranges_to_fetch_queue))
print("Fetch finished, merging data...")
return pd.concat(result, ignore_index=True)
I am currently trying to get live stock prices from the Yahoo Finance website using the BeautifulSoup and requests libraries. I'm finding that the bottleneck for speed is that the request for the webpage takes around 0.5 seconds on average. Below are my code and output.
from bs4 import BeautifulSoup, SoupStrainer
import requests
import time
# transform percent text into numerical (float) value
def process_percent_change(percent_text):
processed_text = []
for c in percent_text:
if c == '.' or c == '+' or c == '-' or c.isdigit():
processed_text.append(c)
if processed_text[0] == '-':
return -float("".join(processed_text[1:]))
else:
return float("".join(processed_text[1:]))
# parse most current ticker data from beautiful soup object
def get_current_data():
# define the page to be parsed for data
spy_link = "https://finance.yahoo.com/quote/SPY?p=SPY&.tsrc=fin-srch" # yahoo finance link to SPY etf info
webpage_start = time.time()
page = requests.get(spy_link).text
webpage_end = time.time()
webpage_time = webpage_end-webpage_start
print("webpage get time: {}".format(webpage_time))
parse_start = time.time()
strainer = SoupStrainer('div', attrs = {'data-reactid': '30'}) # only get relevant div info
data_element = BeautifulSoup(page, 'lxml', parse_only=strainer)
current_data = {}
# store current data inside a dictionary
current_data["current_price"] = float(data_element.find('span', attrs={"data-reactid": "32"}).text)
change = data_element.find('span', attrs={"data-reactid": "33"}).text.split()
current_data["value_change"] = float(change[0])
current_data["percent_change"] = process_percent_change(change[1])
parse_end = time.time()
parse_time = parse_end-parse_start
print("parse time: {}".format(parse_time))
# return data to be used
return current_data
if __name__ == "__main__":
current_data = get_current_data()
webpage get time: 0.42667412757873535
parse time: 0.05974388122558594
What I want to do is either speed up the request or find a more efficient way of monitoring the website for changes. I've noticed that after loading the website in my browser, the price will change without having to refresh the page. Here is a gif example of what I mean:
https://media.giphy.com/media/nZmbqDeu7rWT2vmrks/giphy.gif
Here's a link to the specific yahoo finance page: https://finance.yahoo.com/quote/SPY?p=SPY&.tsrc=fin-srch
Is there a way to monitor the stock price such as in the gif/web browser example? If there isn't, is there a way to speed up requests (without buying a faster connection)?
Any help is appreciated.
This seems like a reasonable time for what you are doing (downloading and parsing a webpage).
Realistically, I think 0.4 s should be enough for monitoring, but if you really need a higher update frequency (i.e. you have some new crazy trading algorithm) you can:
Try to parse a more lightweight page. (But you need to find one!)
Create a pool of threads performing many requests at the same time. Note that this could violate Yahoo's ToS, and restrictions could be applied to your account/IP.
Use some sort of market API (free or paid) to avoid downloading an entire webpage and all its dependencies when you could just receive something like a 20-byte JSON (or similar) message. In this case too, read about the maximum polling frequency you can use.
Use Selenium and perform just one connection to the webpage. When you detect a change in the div you are interested in, read the updated value (a rough sketch of this follows below).
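For the Selenium option, here is a minimal sketch. It assumes Chrome with a matching chromedriver is installed, and it reuses the span[data-reactid="32"] selector from the question's own parsing code (Yahoo changes its markup regularly, so treat that selector as an assumption you may need to adjust):

import time
from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
driver.get("https://finance.yahoo.com/quote/SPY?p=SPY&.tsrc=fin-srch")

last_price = None
while True:
    # Re-read the element each iteration; the page updates it in place over its own connection
    price = driver.find_element(By.CSS_SELECTOR, 'span[data-reactid="32"]').text
    if price != last_price:
        print("price changed:", price)
        last_price = price
    time.sleep(0.1)  # poll the already-loaded DOM instead of re-downloading the page

Because the page is loaded only once, each check is just a DOM read rather than a fresh ~0.4 s HTTP request.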
I am trying to extract the purchase_state from Google Play, following the steps below:
import base64
import requests
import smtplib
from collections import OrderedDict
import mysql.connector
from mysql.connector import errorcode
......
Query the DB, returning thousands of rows with a purchase_id field from my table.
For each row from the DB, extract purchase_id, then query Google Play for each of them. For example, if the previous step returns 1000 rows, Google is queried 1000 times (refresh token + query).
Add a new purchase status field from Google Play to a new dictionary, along with some other fields grabbed from the MySQL query.
The last step is a loop over my dict, as follows, to prepare the desired report.
AFTER EDIT:
def build_dic_from_db(data,access_token):
dic = {}
for row in data:
product_id = row['product_id']
purchase_id = row['purchase_id']
status = check_purchase_status(access_token, product_id,purchase_id)
cnt = 1
if row['user'] not in dic:
dic[row['user']] = {'id':row['user_id'],'country': row['country_name'],'reg_ts': row['user_registration_timestamp'],'last_active_ts': row['user_last_active_action_timestamp'],
'total_credits': row['user_credits'],'total_call_sec_this_month': row['outgoing_call_seconds_this_month'],'user_status':row['user_status'],'mobile':row['user_mobile_phone_number_num'],'plus':row['user_assigned_msisdn_num'],
row['product_id']:{'tAttemp': cnt,'tCancel': status}}
else:
if row['product_id'] not in dic[row['user']]:
dic[row['user']][row['product_id']] = {'tAttemp': cnt,'tCancel':status}
else:
dic[row['user']][row['product_id']]['tCancel'] += status
dic[row['user']][row['product_id']]['tAttemp'] += cnt
return dic
The problem is that my code runs slowly (total execution time: 448.7483880519867 seconds), and I am wondering if there is a way to improve my script. Any suggestions?
I hope I'm right about this, but the bottleneck seems to be the connection to the Play Store. Doing it sequentially will take a long time, whereas the server can handle many requests at a time. So here's a way to process your jobs with executors (concurrent.futures is in the standard library on Python 3; on Python 2 you need the futures backport).
With this example, you'll be able to send 100 requests at the same time.
from concurrent import futures
EXECUTORS = futures.ThreadPoolExecutor(max_workers=100)
jobs = dict()
for row in data:
product_id = row['product_id']
purchase_id = row['purchase_id']
job = EXECUTORS.submit(check_purchase_status,
access_token, product_id,purchase_id)
jobs[job] = row
for job in futures.as_completed(jobs.keys()):
# here collect your results and do something useful with them :)
status = job.result()
# make the connection with current row
row = jobs[job]
# now you have your status and the row
And by the way, try to use temporary variables; otherwise you're constantly accessing your dictionary with the same keys, which is not good for the performance or the readability of your code (see the short example below).
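For instance, the nested lookups in the loop from the question could be cached in locals (a sketch; user_entry and product_entry are just hypothetical names):

user_entry = dic[row['user']]
product_id = row['product_id']
if product_id not in user_entry:
    user_entry[product_id] = {'tAttemp': cnt, 'tCancel': status}
else:
    product_entry = user_entry[product_id]
    product_entry['tCancel'] += status
    product_entry['tAttemp'] += cnt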
My question involves learning how to retrieve my entire list of friends using Facebook's Python API. The current result returns an object with a limited number of friends and a link to the 'next' page. How do I use this to fetch the next set of friends? (Please post links to possible duplicates.) Any help would be much appreciated. In general, I need to learn about the pagination involved in using the API.
import facebook
import json
ACCESS_TOKEN = "my_token"
g = facebook.GraphAPI(ACCESS_TOKEN)
print json.dumps(g.get_connections("me","friends"),indent=1)
Sadly, the documentation of pagination has been an open issue for almost two years. You should be able to paginate like this (based on this example) using requests:
import facebook
import requests
ACCESS_TOKEN = "my_token"
graph = facebook.GraphAPI(ACCESS_TOKEN)
friends = graph.get_connections("me","friends")
allfriends = []
# Wrap this block in a while loop so we can keep paginating requests until
# finished.
while(True):
try:
for friend in friends['data']:
allfriends.append(friend['name'].encode('utf-8'))
# Attempt to make a request to the next page of data, if it exists.
friends=requests.get(friends['paging']['next']).json()
except KeyError:
# When there are no more pages (['paging']['next']), break from the
# loop and end the script.
break
print allfriends
Update: There's a new generator method available which implements the above behavior and can be used to iterate over all friends like this:
for friend in graph.get_all_connections("me", "friends"):
# Do something with this friend.
Meanwhile, while searching for an answer, I found a much better approach:
import facebook
access_token = ""
graph = facebook.GraphAPI(access_token = access_token)
totalFriends = []
friends = graph.get_connections("me", "/friends&summary=1")
while 'paging' in friends:
for i in friends['data']:
totalFriends.append(i['id'])
friends = graph.get_connections("me", "/friends&summary=1&after=" + friends['paging']['cursors']['after'])
At the end you will get one response where data is empty, and then there will be no 'paging' key, so at that point the loop will break and all the data will have been stored.
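A slightly cleaner variant of the same loop is sketched below. It assumes that facebook-sdk's get_connections forwards extra keyword arguments as query parameters (which recent versions do); adjust if your version differs:

totalFriends = []
friends = graph.get_connections("me", "friends", summary=1)
while friends.get('data'):
    for f in friends['data']:
        totalFriends.append(f['id'])
    # stop when there is no 'after' cursor left
    after = friends.get('paging', {}).get('cursors', {}).get('after')
    if after is None:
        break
    friends = graph.get_connections("me", "friends", summary=1, after=after)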
I couldn't find this anywhere; these answers seem super complicated, and there's no way I would even use an SDK if I had to do stuff like that, when paging a simple POST request is so easy to begin with. However:
FacebookAdsApi.init(my_app_id, my_app_secret, my_access_token)
my_account = AdAccount('act_23423423423423423')
# Below, I set the limit to the max rows, 250.
# More importantly, paging: the SDK has a really sneaky way of doing this.
# Enclose the request in list(); the results end up the same, but this makes the script request new objects until there are no more.
# I tested this example against the Graph API, and as of right now (1/22, 9:47 AM) I get 81 from Graph and 81 here.
fields = ['name']
params = {'limit':250}
ads = list(my_account.get_ads(
fields = fields,
params = params,
))
Trick from the docs: "NOTE: We wrap the return value of get_ad_accounts with list() because get_ad_accounts returns an EdgeIterator object (located in facebook_business.adobjects) and we want to get the full list right away instead of having the iterator lazily loading accounts."
https://github.com/facebook/facebook-python-business-sdk
In this example you offset/paginate by one at a time. I think my while loop is simple, since it only checks whether the pagination key "next" is None; if it doesn't exist, it means we have finished looping, and you will have your results in a list.
In this example I am just searching for all the people called Jacob.
import requests
import facebook
token = access_token="your token goes here"
fb = facebook.GraphAPI(access_token=token)
limit = 1
offset = 0
data = {"q": "jacob",
"type": "user",
"fields": "id",
"limit": limit,
"offset": offset}
req = fb.request('/search', args=data, method='GET')
users = []
for item in req['data']:
users.append(item["id"])
pag = req['paging']
while pag.get("next") is not None:
offset += limit
data["offset"] = offset
req = fb.request('/search', args=data, method='GET')
for item in req['data']:
users.append(item["id"])
    pag = req.get('paging', {})  # guard against a last page without a 'paging' key
print users
I'm working on a project for school and I am trying to get data about movies. I've managed to write a script to get the data I need from IMDbPY and Open Movie DB API (omdbapi.com). The challenge I'm experiencing is that I'm trying to get data for 22,305 movies and each request takes about 0.7 seconds. Essentially my current script will take about 8 hours to complete. Looking for any way to maybe use multiple requests at the same time or any other suggestions to significantly speed up the process of getting this data.
import urllib2
import json
import pandas as pd
import time
import imdb
start_time = time.time() #record time at beginning of script
#used to make imdb.com think we are getting this data from a browser
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = { 'User-Agent' : user_agent }
#Open Movie Database Query url for IMDb IDs
url = 'http://www.omdbapi.com/?tomatoes=true&i='
#read the ids from the imdb_id csv file
imdb_ids = pd.read_csv('ids.csv')
cols = [u'Plot', u'Rated', u'tomatoImage', u'Title', u'DVD', u'tomatoMeter',
u'Writer', u'tomatoUserRating', u'Production', u'Actors', u'tomatoFresh',
u'Type', u'imdbVotes', u'Website', u'tomatoConsensus', u'Poster', u'tomatoRotten',
u'Director', u'Released', u'tomatoUserReviews', u'Awards', u'Genre', u'tomatoUserMeter',
u'imdbRating', u'Language', u'Country', u'imdbpy_budget', u'BoxOffice', u'Runtime',
u'tomatoReviews', u'imdbID', u'Metascore', u'Response', u'tomatoRating', u'Year',
u'imdbpy_gross']
#create movies dataframe
movies = pd.DataFrame(columns=cols)
i=0
for i in range(len(imdb_ids)-1):
start = time.time()
req = urllib2.Request(url + str(imdb_ids.ix[i,0]), None, headers) #request page
response = urllib2.urlopen(req) #actually call the html request
the_page = response.read() #read the json from the omdbapi query
movie_json = json.loads(the_page) #convert the json to a dict
#get the gross revenue and budget from IMDbPy
data = imdb.IMDb()
movie_id = imdb_ids.ix[i,['imdb_id']]
movie_id = movie_id.to_string()
movie_id = int(movie_id[-7:])
data = data.get_movie_business(movie_id)
data = data['data']
data = data['business']
#get the budget $ amount out of the budget IMDbPy string
try:
budget = data['budget']
budget = budget[0]
budget = budget.replace('$', '')
budget = budget.replace(',', '')
budget = budget.split(' ')
budget = str(budget[0])
except:
None
#get the gross $ amount out of the gross IMDbPy string
try:
budget = data['budget']
budget = budget[0]
budget = budget.replace('$', '')
budget = budget.replace(',', '')
budget = budget.split(' ')
budget = str(budget[0])
#get the gross $ amount out of the gross IMDbPy string
gross = data['gross']
gross = gross[0]
gross = gross.replace('$', '')
gross = gross.replace(',', '')
gross = gross.split(' ')
gross = str(gross[0])
except:
None
#add gross to the movies dict
try:
movie_json[u'imdbpy_gross'] = gross
except:
movie_json[u'imdbpy_gross'] = 0
#add gross to the movies dict
try:
movie_json[u'imdbpy_budget'] = budget
except:
movie_json[u'imdbpy_budget'] = 0
#create new dataframe that can be merged to movies DF
tempDF = pd.DataFrame.from_dict(movie_json, orient='index')
tempDF = tempDF.T
#add the new movie to the movies dataframe
movies = movies.append(tempDF, ignore_index=True)
end = time.time()
time_took = round(end-start, 2)
percentage = round(((i+1) / float(len(imdb_ids))) * 100,1)
print i+1,"of",len(imdb_ids),"(" + str(percentage)+'%)','completed',time_took,'sec'
#increment counter
i+=1
#save the dataframe to a csv file
movies.to_csv('movie_data.csv', index=False)
end_time = time.time()
print round((end_time-start_time)/60,1), "min"
Use the eventlet library to fetch concurrently
As advised in the comments, you should fetch your feeds concurrently. This can be done using threading, multiprocessing, or eventlet.
Install eventlet
$ pip install eventlet
Try the web crawler sample from eventlet
See: http://eventlet.net/doc/examples.html#web-crawler
Understanding concurrency with eventlet
With threading, the system takes care of switching between your threads. This brings a big problem when you have to access common data structures, as you never know which other thread is currently accessing your data. You then start playing with synchronized blocks, locks, and semaphores, just to synchronize access to your shared data structures.
With eventlet it is much simpler: only one green thread runs at a time, and execution jumps between them only at I/O instructions or other eventlet calls. The rest of your code runs uninterrupted, without the risk that another thread messes with your data.
You only have to take care of the following:
All I/O operations must be non-blocking (this is mostly easy; eventlet provides non-blocking versions of most of the I/O you need).
Your remaining code must not be CPU-expensive, as that would block switching between "green" threads for long periods and the benefit of "green" multithreading would be gone.
The great advantage of eventlet is that it allows you to write code in a straightforward way without spoiling it (too) much with locks, semaphores, etc.
Apply eventlet to your code
If I understand correctly, the list of URLs to fetch is known in advance, and the order of their processing in your analysis is not important. This should allow an almost direct copy of the example from eventlet. I see that the index i has some significance, so you might consider combining the URL and the index into a tuple and processing them as independent jobs (a sketch follows below).
There are definitely other methods, but personally I have found eventlet really easy to use compared to other techniques, while getting really good results (especially with fetching feeds). You just have to grasp the main concepts and be a bit careful to follow eventlet's requirements (stay non-blocking).
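A minimal sketch of that idea, reusing the url prefix and the imdb_ids DataFrame from the question (fetch_one is a hypothetical helper; eventlet.green.urllib2 is eventlet's non-blocking drop-in for the standard urllib2):

import eventlet
from eventlet.green import urllib2  # non-blocking drop-in for urllib2

def fetch_one(job):
    i, omdb_url = job
    # return the original index together with the downloaded JSON text
    return i, urllib2.urlopen(omdb_url).read()

jobs = [(i, url + str(imdb_ids.ix[i, 0])) for i in range(len(imdb_ids))]
pool = eventlet.GreenPool(20)  # 20 concurrent green threads
for i, the_page in pool.imap(fetch_one, jobs):
    print i, len(the_page)  # plug the existing json.loads / IMDbPy processing in here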
Fetching urls using requests and eventlet - erequests
There are various packages for asynchronous processing with requests; one of them uses eventlet and is named erequests, see https://github.com/saghul/erequests
Simple sample fetching a set of URLs
import erequests
# have list of urls to fetch
urls = [
'http://www.heroku.com',
'http://python-tablib.org',
'http://httpbin.org',
'http://python-requests.org',
'http://kennethreitz.com'
]
# erequests.async.get(url) creates asynchronous request
async_reqs = [erequests.async.get(url) for url in urls]
# each async request is ready to go, but not yet performed
# erequests.map will call each async request to the action
# what returns processed request `req`
for req in erequests.map(async_reqs):
if req.ok:
content = req.content
# process it here
print "processing data from:", req.url
Problems for processing this specific question
We are able to fetch and somehow process all the URLs we need. But in this question, processing is bound to a particular record in the source data, so we will need to match each processed request with the index of the record it belongs to in order to get further details for final processing.
As we will see later, asynchronous processing does not honour the order of requests: some are processed sooner, some later, and map yields whatever is completed.
One option is to attach the index of a given URL to the request and use it later when processing the returned data.
Complex sample of fetching and processing URLs while preserving URL indices
Note: the following sample is rather complex; if you can live with the solution provided above, skip this. But make sure you are not running into the problems detected and resolved below (URLs being modified, requests following redirects).
import erequests
from itertools import count, izip
from functools import partial
urls = [
'http://www.heroku.com',
'http://python-tablib.org',
'http://httpbin.org',
'http://python-requests.org',
'http://kennethreitz.com'
]
def print_url_index(index, req, *args, **kwargs):
content_length = req.headers.get("content-length", None)
todo = "PROCESS" if req.status_code == 200 else "WAIT, NOT YET READY"
print "{todo}: index: {index}: status: {req.status_code}: length: {content_length}, {req.url}".format(**locals())
async_reqs = (erequests.async.get(url, hooks={"response": partial(print_url_index, i)}) for i, url in izip(count(), urls))
for req in erequests.map(async_reqs):
pass
Attaching hooks to request
requests (and erequests too) allows defining hooks for the event called response. Each time the request gets a response, this hook function is called and can do something or even modify the response.
The following line defines a hook for response:
erequests.async.get(url, hooks={"response": partial(print_url_index, i)})
Passing url index to the hook function
The signature of any hook shall be func(req, *args, **kwargs).
But we also need to pass the index of the URL we are processing into the hook function.
For this purpose we use functools.partial, which allows creating simplified functions by fixing some of the parameters to a specific value. This is exactly what we need: if you look at the print_url_index signature, we just need to fix the value of index, and the rest will fit the requirements of a hook function.
In our call we use partial with the simplified function print_url_index, providing for each URL its unique index.
The index could be provided in the loop by enumerate; with a larger number of parameters we may work in a more memory-efficient way and use count, which generates an incremented number each time, starting by default from 0.
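For example, the generator expression above could equivalently be written with enumerate (same behaviour, just without count/izip):

async_reqs = (erequests.async.get(url, hooks={"response": partial(print_url_index, i)})
              for i, url in enumerate(urls))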
Let us run it:
$ python ereq.py
WAIT, NOT YET READY: index: 3: status: 301: length: 66, http://python-requests.org/
WAIT, NOT YET READY: index: 4: status: 301: length: 58, http://kennethreitz.com/
WAIT, NOT YET READY: index: 0: status: 301: length: None, http://www.heroku.com/
PROCESS: index: 2: status: 200: length: 7700, http://httpbin.org/
WAIT, NOT YET READY: index: 1: status: 301: length: 64, http://python-tablib.org/
WAIT, NOT YET READY: index: 4: status: 301: length: None, http://kennethreitz.org
WAIT, NOT YET READY: index: 3: status: 302: length: 0, http://docs.python-requests.org
WAIT, NOT YET READY: index: 1: status: 302: length: 0, http://docs.python-tablib.org
PROCESS: index: 3: status: 200: length: None, http://docs.python-requests.org/en/latest/
PROCESS: index: 1: status: 200: length: None, http://docs.python-tablib.org/en/latest/
PROCESS: index: 0: status: 200: length: 12064, https://www.heroku.com/
PROCESS: index: 4: status: 200: length: 10478, http://www.kennethreitz.org/
This shows that:
requests are not processed in the order they were generated
some requests follow a redirection, so the hook function is called multiple times
carefully inspecting the URL values, we can see that no URL from the original urls list is reported by a response; even for index 2 we got an extra / appended. That is why a simple lookup of the response URL in the original list of URLs would not help us.
When web-scraping we generally have two types of bottlenecks:
IO blocks - whenever we make a request, we need to wait for the server to respond, which can block our entire program.
CPU blocks - when parsing web scraped content, our code might be limited by CPU processing power.
CPU Speed
CPU blocks are an easy fix - we can spawn more processes. Generally, 1 CPU core can efficiently handle 1 process. So if our scraper is running on a machine that has 12 CPU cores, we can spawn 12 processes for a 12x speed boost:
from concurrent.futures import ProcessPoolExecutor
def parse(html):
... # CPU intensive parsing
htmls = [...]
with ProcessPoolExecutor() as executor:
for result in executor.map(parse, htmls):
print(result)
Python's ProcessPoolExecutor spawns an optimal number of worker processes (equal to the number of CPU cores by default) and distributes tasks across them.
IO Speed
For IO blocking we have more options, as our goal is to get rid of useless waiting, which can be done through threads, processes, and asyncio loops.
If we're making thousands of requests, we can't spawn hundreds of processes. Threads are less expensive, but there's a better option still: asyncio loops.
Asyncio loops can execute tasks in no specific order. In other words, while task A is blocked, task B can take over the program. This is perfect for web scraping, as there's very little computation overhead going on. We can scale to thousands of requests in a single program.
Unfortunately, for asyncio to work, we need to use Python packages that support asyncio. For example, by using httpx and asyncio we can speed up our scraping significantly:
# comparing synchronous `requests`:
import requests
from time import time
_start = time()
for i in range(50):
    requests.get("http://httpbin.org/delay/1")
print(f"finished in: {time() - _start:.2f} seconds")
# finished in: 52.21 seconds
# versus asynchronous `httpx`
import httpx
import asyncio
from time import time
_start = time()
async def main():
async with httpx.AsyncClient() as client:
tasks = [client.get("http://httpbin.org/delay/1") for i in range(50)]
for response_future in asyncio.as_completed(tasks):
response = await response_future
print(f"finished in: {time() - _start:.2f} seconds")
asyncio.run(main())
# finished in: 3.55 seconds
Combining Both
With async code we can avoid IO-blocks and with processes we can scale up CPU intensive parsing - a perfect combo to optimize web scraping:
import asyncio
import multiprocessing
from concurrent.futures import ProcessPoolExecutor
from time import sleep, time
import httpx
async def scrape(urls):
"""this is our async scraper that scrapes"""
results = []
async with httpx.AsyncClient(timeout=httpx.Timeout(30.0)) as client:
scrape_tasks = [client.get(url) for url in urls]
for response_f in asyncio.as_completed(scrape_tasks):
response = await response_f
# emulate data parsing/calculation
sleep(0.5)
...
results.append("done")
return results
def scrape_wrapper(args):
i, urls = args
print(f"subprocess {i} started")
result = asyncio.run(scrape(urls))
print(f"subprocess {i} ended")
return result
def multi_process(urls):
_start = time()
batches = []
batch_size = multiprocessing.cpu_count() - 1 # let's keep 1 core for ourselves
print(f"scraping {len(urls)} urls through {batch_size} processes")
for i in range(0, len(urls), batch_size):
batches.append(urls[i : i + batch_size])
with ProcessPoolExecutor() as executor:
for result in executor.map(scrape_wrapper, enumerate(batches)):
print(result)
print("done")
print(f"multi-process finished in {time() - _start:.2f}")
def single_process(urls):
_start = time()
results = asyncio.run(scrape(urls))
print(f"single-process finished in {time() - _start:.2f}")
if __name__ == "__main__":
urls = ["http://httpbin.org/delay/1" for i in range(100)]
multi_process(urls)
# multi-process finished in 7.22
single_process(urls)
# single-process finished in 51.28
These foundational concepts sound complex, but once you narrow things down to the root of the issue, the fixes are very straightforward and already present in Python!
For more details on this subject see my blog Web Scraping Speed: Processes, Threads and Async