How can I test concurrency using ThreadPoolExecutor? - python

I have written a program that I would like to run concurrently, and I have implemented it, but I am having a hard time testing whether it will actually spin up more than one thread. Does anyone have any suggestions? I just want to see whether this code will ever use 2, 3, 4, or 5 workers.
def read_files():
    t0 = time.process_time()
    cols = ['fname', ' lname', ' age']
    path = 'data'
    files = glob.glob(os.path.join(path, "*.csv"))
    # with open('data/url') as f:
    #     for line in f:
    #         files.append(line.rstrip('\n'))
    bad_files = []
    df_list = []
    for file in files:
        try:
            temp = pd.read_csv(file)
            if temp.columns.to_list() == cols:
                df_list.append(temp)
            else:
                bad_files.append(file)
        except ParserError as pe:
            bad_files.append(file)
            logging.error(f'Parsing Error on {file}. Error: {pe}')
        except ValueError as ve:
            logging.error(f'Value error on reading the csv: {temp}, error: {ve}')
            bad_files.append(file)
        except urllib.error.HTTPError as he:
            bad_files.append(file)
            logging.error(f'Http Error {he}, Code {he.code}')
        except Exception as e:
            bad_files.append(file)
            logging.error(f'Error grabbing data from given {file} possible HTTP error. Error: {e}')
    print(f'Files that were not read {bad_files}')
    df = pd.concat(df_list)
    t1 = time.process_time()
    print(f'It took {t1 - t0} seconds, to read and fill the dataframe.')
    return df


def run_calculations(df):
    if len(df.index) % 2 == 0:
        print(f'Even number of entries, pandas median() method will add both middle numbers and find the average.')
    average = round(df[' age'].mean())
    median = df[' age'].median()
    names_arr = df[df[' age'] == median].values[0]
    fname = names_arr[0]
    lname = names_arr[1]
    print(f'The Average Age is {int(average)}, The Median Age is {int(median)}. {fname} {lname} is {int(median)}')


if __name__ == '__main__':
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        executor.submit(run_calculations(read_files()))
        print(f'I have used {len(executor._threads)} thread(s) for processing')

The short answer is: no, your code will not use more than one worker.
The reason is that executor.submit expects a callable (plus its arguments), but run_calculations(read_files()) is evaluated right away in the main thread, so what you actually submit is its return value, which is None.
A quick fix would be to replace executor.submit(run_calculations(read_files())) with executor.submit(lambda: run_calculations(read_files())).
The following snippet will help to explain how to submit a callable to executor:
import time
import threading
from concurrent.futures import ThreadPoolExecutor


def task(time_to_sleep):
    time.sleep(time_to_sleep)
    print(id(threading.current_thread()))


def use_single_worker():
    print("in use single worker")
    with ThreadPoolExecutor(max_workers=5) as executor:
        # a single thread id will get dumped multiple times
        futures = [executor.submit(task(i)) for i in range(10)]
        for future in futures:
            try:
                future.result()
            except Exception:
                pass


def use_multiple_workers():
    print("in use multiple workers")
    with ThreadPoolExecutor(max_workers=5) as executor:
        # different thread ids will get dumped
        # (the default argument binds i now; a bare lambda would capture the
        #  loop variable late and see its final value)
        futures = [executor.submit(lambda i=i: task(i)) for i in range(10)]
        for future in futures:
            try:
                future.result()
            except Exception:
                pass


if __name__ == '__main__':
    use_single_worker()
    use_multiple_workers()
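If the goal is to actually see several workers doing useful work on the original problem, one option (a sketch only; read_one and the per-file submission are assumptions about how the work could be split, not the original code) is to submit one task per CSV file instead of a single task for the whole job:

import concurrent.futures
import glob
import os

import pandas as pd


def read_one(path):
    # read a single CSV file; the validation/filtering from the question's
    # read_files() would go here as well
    return pd.read_csv(path)


if __name__ == '__main__':
    files = glob.glob(os.path.join('data', '*.csv'))
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        future_to_file = {executor.submit(read_one, f): f for f in files}
        frames = []
        for future in concurrent.futures.as_completed(future_to_file):
            try:
                frames.append(future.result())
            except Exception as e:
                print(f'Skipping {future_to_file[future]}: {e}')
    if frames:
        df = pd.concat(frames)

Logging threading.current_thread().name inside read_one is also a less intrusive way to confirm that several workers are active than inspecting the private executor._threads attribute.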

Related

Why is my code still slow after threading for only 15k records, and how can I fix it?

I have a script that takes links from a file, visits each one, gets the redirected link, and stores it back. But it works too slowly on a file with 15k records. How can I make it faster? I have already used threading.
I've tried multiple approaches with threading, but I cannot make it quick. Is there any solution to my problem?
import concurrent.futures
import sys
import pandas as pd
import requests
from threading import Thread
from queue import Queue

out_put_file = ""
linkes = None
out = []
urls = []
old = []
file_name = None
concurrent = 10000
q = None
count = 0
df = None


def do_work():
    while True:
        global q
        url = q.get()
        res = get_status(url)
        q.task_done()


def get_status(o_url):
    try:
        res = requests.get(o_url)
        if res:
            out.append(res.url)
            old.append(o_url)
            print(count)
            count = count + 1
            return [res.status_code, res.url, o_url]
    except:
        pass
    return [ans.status_code, ans.url, url]


def process_data():
    global q
    global file_name
    global linkes
    global df
    file_name = input("Enter file name : ")
    file_name = file_name.strip()
    print("Generating .......")
    df = pd.read_csv(file_name + ".csv")
    old_links = df["shopify"]
    for i in old_links:
        if type(i) != str:
            urls.append(i)
            continue
        if not i.startswith("http"):
            linkes = "http://" + i
            urls.append(linkes)
        else:
            urls.append(i)
    df["shopify"] = urls
    q = Queue(concurrent * 2)
    for i in range(concurrent):
        t = Thread(target=do_work)
        t.daemon = True
        t.start()
    try:
        for url in urls:
            if type(url) != str:
                continue
            q.put(url.strip())
        q.join()
    except KeyboardInterrupt:
        sys.exit(1)


process_data()
for i in range(len(df['shopify'])):
    for j in range(len(old)):
        if df['shopify'][i] == old[j]:
            df['shopify'][i] = out[j]
df = df[~df['shopify'].astype(str).str.startswith('http:')]
df = df.dropna()
df.to_csv(file_name + "-new.csv", index=False)
Sample rows from the input CSV:
Email,shopify,Proofy_Status_Name
hello#knobblystudio.com,http://puravidabracelets.myshopify.com,Deliverable
service#cafe-select.co.uk,cafe-select.co.uk,Deliverable
mtafich#gmail.com,,Deliverable
whoopies#stevessnacks.com,stevessnacks.com,Deliverable
customerservice#runwayriches.com,runwayriches.com,Deliverable
shop#blackdogride.com.au,blackdogride.com.au,Deliverable
anavasconcelos.nica#gmail.com,grass4you.com,Deliverable
info#prideandprestigehair.com,prideandprestigehair.com,Deliverable
info#dancinwoofs.com,dancinwoofs.com,Deliverable
Threads in CPython do not execute Python code simultaneously due to the Global Interpreter Lock. You might want to use the multiprocessing module instead, or ProcessPoolExecutor() from concurrent.futures. If you decide to use ProcessPoolExecutor, pass the URLs to the callback and have the callback return the old and redirected URL, which you then retrieve via the result() method of the future you get from executor.submit. When using processes, global variables are not shared, unlike threads.
There have been attempts to remove the global interpreter lock, but so far they have been rejected largely because single-threaded performance suffered without it.
Something like the following might work. I renamed the concurrent variable because it would shadow the concurrent module and probably cause an error. This code is untested because I don't have the csv file to test with.
import concurrent.futures
from concurrent.futures import ProcessPoolExecutor
import sys
import pandas as pd
import requests
import numpy as np
from threading import Thread
from queue import Queue

out_put_file = ""
linkes = None
out = []
urls = []
old = []
futures = []
file_name = None
concurrent_ = 10000
q = None
count = 0
df = None


def do_work(urls):
    results = []
    for url in urls:
        res = get_status(url)
        if res:
            results.append((res[2], res[1]))
        else:
            results.append((url, url))
    return results


def get_status(o_url):
    try:
        res = requests.get(o_url)
        if res:
            out.append(res.url)
            old.append(o_url)
            # print(count)
            # count = count + 1
            return [res.status_code, res.url, o_url]
    except:
        pass


def load_url(url, timeout):
    ans = requests.get(url, timeout=timeout)
    return [ans.status_code, ans.url, url]


def process_data():
    global q
    global file_name
    global linkes
    global df
    global urls
    file_name = input("Enter file name : ")
    file_name = file_name.strip()
    print("Generating .......")
    df = pd.read_csv(file_name + ".csv")
    old_links = df["shopify"]
    for i in old_links:
        if type(i) != str:
            urls.append(i)
            continue
        if not i.startswith("http"):
            linkes = "http://" + i
            urls.append(linkes)
        else:
            urls.append(i)
    df["shopify"] = urls
    workers = 50
    with ProcessPoolExecutor(max_workers=workers) as executor:
        url_arrays = np.array_split(urls, workers)
        for urls in url_arrays:
            f = executor.submit(do_work, urls)
            futures.append(f)


process_data()
# iterate the futures in submission order so the results stay aligned with the DataFrame rows
df['shopify'] = [res[1] for f in futures for res in f.result()]
df = df[~df['shopify'].astype(str).str.startswith('http:')]
df = df.dropna()
df.to_csv(file_name+"-new.csv",index=False)

How to read a file of URLs and web scrape them with multithreading

I am implementing a web scraping script in Python that reads a JSON file to get a list of URLs, and scrapes each of them.
This file contains over 60K rows, of which around 50K are unique (so first I remove duplicates).
To do this, I currently have the following:
import contextlib
from bs4 import BeautifulSoup
import feedparser
import pandas
import requests
import time

BASE_URL = 'https://www.iso.org'


def create_iso_details_json(p_merged_iso_df):
    merged_iso_details_df = p_merged_iso_df.drop_duplicates(subset=['Link']).drop(columns=['TC', 'ICS'], axis=1)
    iso_details_dfs = [parse_iso_details(iso, stage, link)
                       for iso, stage, link in zip(merged_iso_details_df['Standard and/or project'], merged_iso_details_df['Stage'], merged_iso_details_df['Link'])
                       if link != '']
    merged_iso_details_df = pandas.concat(iso_details_dfs)
    print('Total rows retrieved: ', len(merged_iso_details_df.index))
    merged_iso_details_df.to_json('iso_details.json', orient="records")


def parse_iso_details(p_iso, p_stage, p_url):
    print('URL: ', p_url)
    soup = BeautifulSoup(requests.get(p_url).text, 'html.parser')
    try:
        feed_details_url = BASE_URL + soup.find('section', {'id': 'product-details'}).find('a', {'class': 'ss-icon ss-social-circle text-warning text-sm'})['href']
    except AttributeError:
        print('Could not find feed data for URL: ', p_url)
    print(feed_details_url)
    iso_details_dfs = []
    if feed_details_url is not None:
        iso_details_dfs.append(read_iso_details(feed_details_url, p_iso, p_stage))
    with contextlib.suppress(ValueError):
        return pandas.concat(iso_details_dfs)


def read_iso_details(p_feed_details_url, p_iso, p_stage):
    data = {'Standard and/or project': p_iso, 'Stage': p_stage}
    df = pandas.DataFrame(data, index=[0])
    feed = feedparser.parse(p_feed_details_url)
    df['Publication date'] = [entry.published for entry in feed.entries]
    return df


def main():
    start_time = time.time()
    merged_iso_df = pandas.read_json('input_file.json', dtype={"Stage": str})
    create_iso_details_json(merged_iso_df)
    print(f"--- {time.time() - start_time} seconds ---")


if __name__ == "__main__":
    main()
I am merging the results in a pandas DataFrame to write it to another JSON file later.
Now, this takes a lot of time, since the process makes one request per input URL and each request takes between 0.5 and 1 second.
I would like to implement this process with multithreading (not multiprocessing) so the processing time decreases significantly.
What is the best approach to achieve this? Should I split the input JSON file into as many parts as threads I create? And how do I merge the results of each thread into one to write the output JSON file?
Thank you in advance.
This website explains multithreading pretty well. What you could do is split the URLs into equal parts and run them simultaneously. The drawback is that you basically just divide the time it would take by the number of threads you use, but to my knowledge this is the best thing you can do without overcomplicating it.
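For instance, a minimal sketch of that idea with concurrent.futures (scrape_one below is only a stand-in for the per-URL requests/BeautifulSoup/feedparser work from the question, and the column names are made up):

import concurrent.futures

import pandas
import requests


def scrape_one(url):
    # stand-in for the real per-URL work; here it just records the URL and
    # the HTTP status code in a one-row DataFrame
    resp = requests.get(url, timeout=10)
    return pandas.DataFrame({'Link': [url], 'Status': [resp.status_code]})


def scrape_all(urls, num_threads=8):
    # the executor spreads the per-URL tasks over num_threads worker threads;
    # executor.map returns results in the same order as the input URLs
    with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
        dfs = list(executor.map(scrape_one, urls))
    # merging the per-thread results is then a single concat
    return pandas.concat(dfs, ignore_index=True)

Because the work is I/O bound (waiting on HTTP responses), the threads overlap the waiting time, so there is no need to split the input file itself; the executor handles the distribution and preserves the order of the results.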
I finally managed to implement the process with multithreading, as @Yui posted in their answer.
The real problem was merging the results of each thread into one, so I decided to have each thread write its result to a CSV file in append mode. Then, when all threads are finished, I read the CSV and write the results into the required JSON file.
# Imports needed by this version (the same libraries as the original script,
# plus Queue, Thread, numpy and os):
import contextlib
import os
import time
from queue import Queue
from threading import Thread

import feedparser
import numpy
import pandas
import requests
from bs4 import BeautifulSoup

BASE_URL = 'https://www.iso.org'
NUM_THREADS = 4
q = Queue()
INPUT_FILE = 'iso_tc_ics.json'
ISO_DETAILS_CSV_FILE = 'iso_details.csv'
OUTPUT_FILE = 'iso_details.json'


def create_iso_details_json(p_queue):
    iso_details_df = p_queue.get()
    iso_details_dfs = [parse_iso_details(iso, stage, link)
                       for iso, stage, link in zip(iso_details_df['Standard and/or project'], iso_details_df['Stage'], iso_details_df['Link'])
                       if link != '']
    iso_details_df = pandas.concat(iso_details_dfs)
    print('Rows retrieved: ', len(iso_details_df.index))
    return iso_details_df


def parse_iso_details(p_iso, p_stage, p_url):
    print('URL: ', p_url)
    soup = BeautifulSoup(requests.get(p_url).text, 'html.parser')
    try:
        feed_details_url = BASE_URL + soup.find('section', {'id': 'product-details'}).find('a', {'class': 'ss-icon ss-social-circle text-warning text-sm'})['href']
    except AttributeError:
        print('Could not find feed data for URL: ', p_url)
    print(feed_details_url)
    iso_details_dfs = []
    if feed_details_url is not None:
        iso_details_dfs.append(read_iso_details(feed_details_url, p_iso, p_stage))
    with contextlib.suppress(ValueError):
        return pandas.concat(iso_details_dfs)


def read_iso_details(p_feed_details_url, p_iso, p_stage):
    data = {'Standard and/or project': p_iso, 'Stage': p_stage}
    df = pandas.DataFrame(data, index=[0])
    feed = feedparser.parse(p_feed_details_url)
    df['Publication date'] = [entry.published for entry in feed.entries]
    return df


def main():
    global q
    result_df = create_iso_details_json(q)
    with open(ISO_DETAILS_CSV_FILE, 'a') as f:
        result_df.to_csv(f, mode='a', index=False, header=not f.tell(), encoding='ISO-8859-1')
    q.task_done()


def init():
    merged_iso_df = pandas.read_json(INPUT_FILE, dtype={"Stage": str})
    merged_iso_details_df = merged_iso_df.drop_duplicates(subset=['Link']).drop(columns=['TC', 'ICS'], axis=1)
    iso_details_df_chunks = numpy.array_split(merged_iso_details_df, NUM_THREADS)
    for iso_details_df in iso_details_df_chunks:
        q.put(iso_details_df)
    for _ in range(NUM_THREADS):
        worker = Thread(target=main)
        worker.daemon = True
        worker.start()


def end():
    q.join()
    result_iso_details_df = pandas.read_csv(ISO_DETAILS_CSV_FILE, dtype={"Stage": str}, encoding='ISO-8859-1')
    print('Total rows retrieved: ', len(result_iso_details_df.index))
    result_iso_details_df.to_json(OUTPUT_FILE, orient="records")
    with contextlib.suppress(OSError):
        os.remove(ISO_DETAILS_CSV_FILE)


if __name__ == "__main__":
    start_time = time.time()
    init()
    end()
    print(f"--- {time.time() - start_time} seconds ---")
I would go with asyncio and aiohttp. Here is a complete example of how to make multiple requests concurrently and collect the results at the end:
import aiohttp
import asyncio


async def geturl(url, session):
    async with session.get(url) as resp:
        if resp.status == 200:
            return (await resp.json())['name']
        else:
            return "ERROR"


async def main():
    urls = [f'https://pokeapi.co/api/v2/pokemon/{i}' for i in range(1, 10)]
    async with aiohttp.ClientSession() as session:
        tasks = [geturl(url, session) for url in urls]
        # asyncio.gather will run all the tasks concurrently
        # and return their results once all tasks have returned
        all_results = await asyncio.gather(*tasks)
        print(all_results)


asyncio.run(main())
This will print the names of the first nine Pokémon (range(1, 10)), by the way; you can tweak it for your needs.

Try, except not working in Python

I am trying to add error handling to this program with try and except blocks, so that the program does not shut down if something goes wrong while handling the data. (This is a simplified version of my code.) When I run it this way (given that the scheduled times are accurate), nothing seems to work: the functions scheduled in report_scheduler never actually run.
Here is the code I am looking at:
import schedule


def forex_data_report():
    from forex_python.converter import CurrencyRates
    import csv
    current_dir = os.getcwd()
    date_time = time.strftime('%m-%d-%Y_at_%I-%M-%S-%p')
    c = CurrencyRates()
    usd_eur = c.get_rate('EUR', 'USD')
    usd_gbp = c.get_rate('GBP', 'USD')
    usd_yen = c.get_rate('JPY', 'USD')
    usd_aud = c.get_rate('AUD', 'USD')
    eur_gbp = c.get_rate('GBP', 'EUR')
    clean_file_location = current_dir + '\\Reports\\Forex_Data\\Forex_Data.csv'
    with open(clean_file_location, 'a', newline='') as outfile:
        writer = csv.writer(outfile)
        writer.writerow([date_time, usd_eur, usd_gbp, usd_yen, usd_aud, eur_gbp])
    send_outlook_w_attach('Key Currencies', clean_file_location)
    print('Updated Key Currencies Data.')


def competitor_stock_data_report():
    import datetime
    import pandas_datareader.data as web
    import csv
    current_dir = os.getcwd()
    date_print = time.strftime('%m-%d-%Y_at_%I-%M-%S-%p')
    date_time = datetime.datetime.now()
    date = date_time.date()
    stocklist = ['LAZ','AMG','BEN','LM','EVR','GHL','HLI','MC','PJT','MS','GS','JPM','AB']
    start = datetime.datetime(date.year-1, date.month, date.day-1)
    end = datetime.datetime(date.year, date.month, date.day-1)
    clean_file_location = current_dir + '\\Reports\\XXX\\Stock_Data.csv'
    for x in stocklist:
        df = web.DataReader(x, 'google', start, end)
        with open(clean_file_location, 'a', newline='') as outfile:
            writer = csv.writer(outfile)
            writer.writerow([date_print, x, df.loc[df.index[0], 'Close'], df.loc[df.index[-1], 'Close']])
    send_outlook_w_attach('Competitor Stock Data vs. XXX', clean_file_location)
    print('Updated XXX Competitor Stock Performance Data.')


def report_scheduler():
    try:
        schedule.every().day.at("00:00").do(forex_data_report)
    except:
        pass
    try:
        schedule.every().friday.at("00:01").do(competitor_stock_data_report)
    except:
        pass
    while True:
        schedule.run_pending()
        time.sleep(1)


if __name__ == '__main__':
    print('Starting background - HANDLER - process...')
    report_scheduler()
I understand the pass is not error handling, but I do need some sort of way to tell the program to continue, even if the data is not being updated/an error occurs.
Thanks.
Without digging deeply into your code: probably an exception is being raised and then caught, and the pass statement means you never get any output.
Have you checked that it runs without the try/except blocks?
Also, this might help:
except Exception as e:
    print("Exception raised: {}".format(e))
At least then you will get a printout of your exception. You might also want to look into logging the exception.
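For example, a minimal logging-based sketch (risky_operation is just a hypothetical stand-in for one of your scheduled report functions):

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def risky_operation():
    # hypothetical stand-in for the real report-building work
    raise ValueError("something went wrong while building the report")


try:
    risky_operation()
except Exception:
    # logger.exception records the message plus the full traceback at ERROR level
    logger.exception("Scheduled report failed")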
I'm not familiar with the libraries you are using - it would be very helpful if you would post a complete program that we could tinker with ourselves, and name all of the third-party libraries involved - but I suspect you want the try-except blocks inside forex_data_report and competitor_stock_data_report. You aren't concerned with exceptions thrown by the act of scheduling the periodic task, are you? You want to swallow exceptions from the periodic tasks themselves. Experiment with code structured like this:
def forex_data_report():
    # unchanged


def forex_data_report_wrapper():
    try:
        forex_data_report()
    except Exception as e:
        logger.exception(e)

# similarly for competitor_stock_data_report


def report_scheduler():
    schedule.every().day.at("00:00").do(forex_data_report_wrapper)
    schedule.every().friday.at("00:01").do(competitor_stock_data_report_wrapper)
    while True:
        schedule.run_pending()
        time.sleep(1)
Note also that I am using except Exception instead of except. A bare except catches things you almost certainly don't want to catch, such as KeyboardInterrupt and StopIteration.
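A small illustration of the difference (not from the original code): a KeyboardInterrupt raised by Ctrl+C during the sleep is swallowed by the bare except, whereas except Exception would let it propagate and stop the loop.

import time

# Press Ctrl+C while this runs: the bare except catches KeyboardInterrupt,
# so the loop keeps going instead of exiting.
for _ in range(30):
    try:
        time.sleep(1)
    except:
        print("exception swallowed by bare except; still running")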

Python Multiprocessing Exception Handling Data Chunking

I'm trying to speed up some data processing using the multiprocessing module; the idea is that I can send a chunk of data to each process I start up, to utilize all the cores on my machine instead of just one at a time.
So I built an iterator for the data using the pandas read_fwf() function, with chunksize=50000 lines at a time. My problem is that eventually the iterator should raise StopIteration, which I try to catch in an except block in the child process and pass along to the parent via a Queue, to let the parent know it can stop spawning child processes. I have no idea what's wrong, but what happens is that it gets to the end of the data and then keeps spawning processes which essentially do nothing.
def MyFunction(data_iterator, results_queue, Placeholder, message_queue):
    try:
        current_data = data_iterator.next()
        #does other stuff here
        #that isn't important
        placeholder_result = "Eggs and Spam"
        results_queue.put(placeholder_result)
        return None
    except StopIteration:
        message_queue.put("Out Of Data")
        return None

results_queue = Queue()   #for passing results from each child process
message_queue = Queue()   #for passing the stop iteration message
cpu_count = cpu_count()   #num of cores on the machine
Data_Remaining = True     #loop control
output_values = []        #list to put results in
print_num_records = 0     #used to print how many lines have been processed
my_data_file = "some_data.dat"
data_iterator = BuildDataIterator(my_data_file)

while Data_Remaining:
    processes = []
    for process_num in range(cpu_count):
        if __name__ == "__main__":
            p = Process(target=MyFunction, args=(data_iterator, results_queue, Placeholder, message_queue))
            processes.append(p)
            p.start()
            print "Process " + str(process_num) + " Started"         #print some stuff to
            print_num_records = print_num_records + 50000            #show how far along
            print "Processing records through: ", print_num_records  #my data file I am
    for i, p in enumerate(processes):
        print "Joining Process " + str(i)
        output_values.append(results_queue.get())
        p.join(None)
    if not message_queue.empty():
        message = message_queue.get()
    else:
        message = ""
    if message == "Out Of Data":
        Data_Remaining = False
        print "STOP ITERATION NOW PLEASE"
Update:
I discovered a problem with the data iterator. There are approximately 8 million rows in my data set, and after it processes the 8 million it never actually raises StopIteration; it keeps returning the same 14 rows of data over and over. Here is the code that builds my data iterator:
def BuildDataIterator(my_data_file):
    #data_columns is a list of 2-tuples
    #headers is a list of strings
    #num_lines is 50000
    data_reader = read_fwf(my_data_file, colspecs=data_columns, header=None, names=headers, chunksize=num_lines)
    data_iterator = data_reader.__iter__()
    return data_iterator
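One pattern that sidesteps this kind of issue is to keep the chunk iterator in the parent process and hand each chunk to a multiprocessing.Pool, so StopIteration never has to be caught in a child. A sketch (Python 3; process_chunk and the column specs are placeholders, not the original code):

import multiprocessing

import pandas as pd


def process_chunk(chunk):
    # hypothetical stand-in for the real per-chunk processing
    return len(chunk)


def run(my_data_file, data_columns, headers, num_lines=50000):
    reader = pd.read_fwf(my_data_file, colspecs=data_columns, header=None,
                         names=headers, chunksize=num_lines)
    with multiprocessing.Pool() as pool:
        # imap pulls chunks lazily in the parent and stops when the reader is
        # exhausted; each chunk is pickled and sent to a worker process
        for result in pool.imap(process_chunk, reader):
            print(result)


if __name__ == "__main__":
    # the file name, column specs and headers here are placeholders
    run("some_data.dat", data_columns=[(0, 10), (10, 20)], headers=["a", "b"])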

Learning python and threading. I think my code runs infinitely. Help me find bugs?

So I've started learning python now, and I absolutely am in love with it.
I'm building a small scale facebook data scraper. Basically, it will use the Graph API and scrape the first names of the specified number of users. It works fine in a single thread (or no thread I guess).
I used online tutorials to come up with the following multithreaded version (updated code):
import requests
import json
import time
import threading
import Queue

GraphURL = 'http://graph.facebook.com/'
first_names = {}  # will store first names and their counts
queue = Queue.Queue()


def getOneUser(url):
    http_response = requests.get(url)  # open the request URL
    if http_response.status_code == 200:
        data = http_response.text.encode('utf-8', 'ignore')  # Get the text of response, and encode it
        json_obj = json.loads(data)  # load it as a json object
        # name = json_obj['name']
        return json_obj['first_name']
        # last = json_obj['last_name']
    return None


class ThreadGet(threading.Thread):
    """ Threaded name scraper """
    def __init__(self, queue):
        threading.Thread.__init__(self)
        self.queue = queue

    def run(self):
        while True:
            #print 'thread started\n'
            url = GraphURL + str(self.queue.get())
            first = getOneUser(url)  # get one user's first name
            if first is not None:
                if first_names.has_key(first):  # if name has been encountered before
                    first_names[first] = first_names[first] + 1  # increment the count
                else:
                    first_names[first] = 1  # add the new name
            self.queue.task_done()
            #print 'thread ended\n'


def main():
    start = time.time()
    for i in range(6):
        t = ThreadGet(queue)
        t.setDaemon(True)
        t.start()
    for i in range(100):
        queue.put(i)
    queue.join()
    for name in first_names.keys():
        print name + ': ' + str(first_names[name])
    print '----------------------------------------------------------------'
    print '================================================================'
    # Print top first names
    for key in first_names.keys():
        if first_names[key] > 2:
            print key + ': ' + str(first_names[key])
    print 'It took ' + str(time.time()-start) + 's'


main()
To be honest, I don't understand some of the parts of the code but I get the main idea. The output is nothing. I mean the shell has nothing in it, so I believe it keeps on running.
So what I am doing is filling queue with integers that are the user id's on fb. Then each ID is used to build the api call URL. getOneUser returns the name of one user at a time. That task (ID) is marked as 'done' and it moves on.
What is wrong with the code above?
Your usage of first_names is not thread-safe. You could add a lock to protect the increment. Otherwise the code should work. You might also be hitting some Facebook API limit, i.e., you should limit your request rate.
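A minimal sketch of that locking idea (count_name and name_lock are illustrative names, not part of the question's code):

import threading

first_names = {}
name_lock = threading.Lock()  # guards every read-modify-write of first_names


def count_name(first):
    # the increment is atomic with respect to other threads that take the lock
    with name_lock:
        first_names[first] = first_names.get(first, 0) + 1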
You could simplify your code by using a thread pool and counting the names in the main thread:
#!/usr/bin/env python
import json
import urllib2
from collections import Counter
from multiprocessing.dummy import Pool  # use threads


def get_name(url):
    try:
        return json.load(urllib2.urlopen(url))['first_name']
    except Exception:
        return None  # error


urls = ('http://graph.facebook.com/%d' % i for i in xrange(100))
p = Pool(5)  # 5 concurrent connections
first_names = Counter(p.imap_unordered(get_name, urls))
print first_names.most_common()
To see what errors you get, you could add logging:
#!/usr/bin/env python
import json
import logging
import urllib2
from collections import Counter
from multiprocessing.dummy import Pool  # use threads

logging.basicConfig(level=logging.DEBUG,
                    format="%(asctime)s %(threadName)s %(message)s")


def get_name(url):
    try:
        name = json.load(urllib2.urlopen(url))['first_name']
    except Exception as e:
        logging.debug('error: %s url: %s', e, url)
        return None  # error
    else:
        logging.debug('done url: %s', url)
        return name


urls = ('http://graph.facebook.com/%d' % i for i in xrange(100))
p = Pool(5)  # 5 concurrent connections
first_names = Counter(p.imap_unordered(get_name, urls))
print first_names.most_common()
A simple way to limit the number of requests per given time period is to use a semaphore:
#!/usr/bin/env python
import json
import logging
import time
import urllib2
from collections import Counter
from multiprocessing.dummy import Pool  # use threads
from threading import _BoundedSemaphore as BoundedSemaphore, Timer

logging.basicConfig(level=logging.DEBUG,
                    format="%(asctime)s %(threadName)s %(message)s")


class RatedSemaphore(BoundedSemaphore):
    """Limit to 1 request per `period / value` seconds (over long run)."""
    def __init__(self, value=1, period=1):
        BoundedSemaphore.__init__(self, value)
        t = Timer(period, self._add_token_loop,
                  kwargs=dict(time_delta=float(period) / value))
        t.daemon = True
        t.start()

    def _add_token_loop(self, time_delta):
        """Add token every time_delta seconds."""
        while True:
            try:
                BoundedSemaphore.release(self)
            except ValueError:  # ignore if already max possible value
                pass
            time.sleep(time_delta)  # ignore EINTR

    def release(self):
        pass  # do nothing (only time-based release() is allowed)


def get_name(gid, rate_limit=RatedSemaphore(value=100, period=600)):
    url = 'http://graph.facebook.com/%d' % gid
    try:
        with rate_limit:
            name = json.load(urllib2.urlopen(url))['first_name']
    except Exception as e:
        logging.debug('error: %s url: %s', e, url)
        return None  # error
    else:
        logging.debug('done url: %s', url)
        return name


p = Pool(5)  # 5 concurrent connections
first_names = Counter(p.imap_unordered(get_name, xrange(200)))
print first_names.most_common()
After the initial burst, it should make a single request every 6 seconds.
Consider using batch requests.
Your original run function only processed one item from the queue. In all you've only removed 5 items from the queue.
Usually run functions look like
def run(self):
    while True:
        doUsefulWork()
i.e. they have a loop which causes the recurring work to be done.
[Edit] OP edited code to include this change.
Some other useful things to try:
1. Add a print statement into the run function: you'll find that it is only called 5 times.
2. Remove the queue.join() call; this is what is causing the module to block. Then you will be able to probe the state of the queue.
3. Put the entire body of run into a function. Verify that you can use that function in a single-threaded manner to get the desired results, then try it with just a single worker thread, and finally go for multiple worker threads (see the sketch below).
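As an illustration of point 3, a sketch (Python 3; fetch_first_name is a hypothetical stand-in for getOneUser) where the same worker body can be exercised single-threaded first and then by several worker threads:

import queue
import threading


def fetch_first_name(user_id):
    # hypothetical stand-in for the real HTTP call to the Graph API
    return "Name%d" % (user_id % 3)


def handle_one(q, counts, lock):
    user_id = q.get()
    name = fetch_first_name(user_id)
    with lock:
        counts[name] = counts.get(name, 0) + 1
    q.task_done()


def worker(q, counts, lock):
    # the recurring loop that keeps handling items until the program exits
    while True:
        handle_one(q, counts, lock)


if __name__ == "__main__":
    q = queue.Queue()
    counts = {}
    lock = threading.Lock()
    for i in range(100):
        q.put(i)
    # single-threaded check first: while not q.empty(): handle_one(q, counts, lock)
    for _ in range(6):
        threading.Thread(target=worker, args=(q, counts, lock), daemon=True).start()
    q.join()
    print(counts)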
