I have a script that retrieves URLs from a file (1_1.txt).
I configured the ThreadPoolExecutor with a maximum of 50 workers (at the end of the code), but even with just one worker the memory grows very fast (several GB) while the get_url function, which issues the requests calls, is running.
The other functions (file_writer and file_writer_html) don't seem to be involved, because I already tried without them.
When I remove the code that performs the HTTP requests and the associated variables, the bug disappears.
I also installed the memory-profiler module (as you can see in the code), and it shows that it is the variables of the get_url function (result_request, soup, html, headers, ...) that make the RAM grow.
import requests
from concurrent.futures import ThreadPoolExecutor
import fileinput
from bs4 import BeautifulSoup
import traceback
from threading import Thread
from requests.packages.urllib3.exceptions import InsecureRequestWarning
import warnings
from random import random
from queue import Queue
from memory_profiler import profile
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
warnings.filterwarnings("ignore", category=UserWarning, module='bs4')
count_requests = 0
host_error = 0
ongoing = 0
##profile
def get_url(url):
    global queue
    global queue_html
    global count_requests
    global ongoing
    global host_error
    try:
        result_request = requests.get(url, verify=False, timeout=40)
        soup = BeautifulSoup(result_request.text, 'html.parser')
        title = soup.title.get_text().splitlines(False)
        html = result_request.content.splitlines(False)
        headers = str(result_request.headers)
        headers = headers.splitlines(False)
        title = str(title)
        html = str(html)
        headers = str(headers)
        title = title[0:10000]
        html = html[0:10000000]
        headers = headers[0:10000]
        count_requests = count_requests + 1
        queue.put(f'{url} - {title} \n')
        queue_html.put(f'{url} - {html} - HEADER: {headers} \n')
    except:
        queue.put(f'FAILED : {url} \n')
        host_error = host_error + 1
# dedicated file writing task
def file_writer(filepath, queue):
    global count_requests
    # open the file
    with open(filepath, 'a', encoding="utf-8") as file:
        # run until the event is set
        while True:
            # get a line of text from the queue
            line = queue.get()
            # check if we are done
            if line is None:
                # exit the loop
                break
            # write it to file
            file.write(line)
            # flush the buffer
            file.flush()
            # mark the unit of work complete
            queue.task_done()
    # mark the exit signal as processed, after the file was closed
    queue.task_done()
# dedicated file writing task
def file_writer_html(filepath_html, queue_html):
    # open the file
    with open(filepath_html, 'a', encoding="utf-8") as file:
        # run until the event is set
        while True:
            # get a line of text from the queue
            line = queue_html.get()
            print(str("requests success : ") + str(count_requests) + str(" | requests error ") + str(host_error), end='\r')
            # check if we are done
            if line is None:
                # exit the loop
                break
            # write it to file
            file.write(line)
            # flush the buffer
            file.flush()
            # mark the unit of work complete
            queue_html.task_done()
    # mark the exit signal as processed, after the file was closed
    queue_html.task_done()
# create the shared queue
queue = Queue()
queue_html = Queue()
# define the shared file paths
filepath = 'output.txt'
filepath_html = 'output_html.txt'
# create and start the file writer thread
writer_thread = Thread(target=file_writer, args=(filepath,queue), daemon=True)
writer_thread.start()
writer_html_thread = Thread(target=file_writer_html, args=(filepath_html,queue_html), daemon=True)
writer_html_thread.start()
# wait for all tasks in the queue to be processed
queue.join()
with open("1_1.txt") as stream:
urls = [line.strip() for line in stream]
with ThreadPoolExecutor(max_workers=50) as pool:
pool.map(get_url, urls)
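One likely contributor is that 50 workers can produce multi-megabyte strings faster than the two writer threads can drain them, so unwritten lines pile up in queue_html, on top of the very large intermediate strings built per response. A minimal sketch of one way to bound that, assuming the goal is only to cap what each response contributes (get_url_bounded and every limit below are illustrative, not part of the original script):
import requests
from bs4 import BeautifulSoup
from queue import Queue

# Sketch: bounded queues make producers block instead of accumulating lines in memory.
queue = Queue(maxsize=100)
queue_html = Queue(maxsize=100)

def get_url_bounded(url):
    try:
        with requests.get(url, verify=False, timeout=40, stream=True) as r:
            raw = r.raw.read(1_000_000, decode_content=True)  # read at most ~1 MB of the body
            headers = str(r.headers)[:10000]
        soup = BeautifulSoup(raw, 'html.parser')
        title = str(soup.title.get_text())[:10000] if soup.title else ''
        queue.put(f'{url} - {title} \n')
        queue_html.put(f'{url} - {raw[:1_000_000]} - HEADER: {headers} \n')
    except Exception:
        queue.put(f'FAILED : {url} \n')
The point of the sketch is that no multi-megabyte string is ever built from result_request.content, and a slow writer back-pressures the workers instead of letting memory grow.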
I am quite new to XML and to writing efficient code, and the code I am using takes quite a long time to run.
I want to extract the elevation for given lat/long values as fast as possible (I have a lot of lat/long points). This is how I tried it:
import xml.etree.ElementTree as ET
from urllib.request import urlopen
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
def elevation(lat, long):
    query = ('http://openwps.statkart.no/skwms1/wps.elevation2?request=Execute&service=WPS&version=1.0.0'
             f'&identifier=elevation&datainputs=lat={lat};lon={long};epsg=4326')
    parsing = "{http://www.opengis.net/wps/1.0.0}"
    with urlopen(query) as f:
        tree = ET.parse(f)
        root = tree.getroot()
    return float(root.findall(f".//{parsing}Data/*")[0].text)
I use this function on a data set extracted from a CSV file, which contains several datasets within the same file separated by a "new_sheep" line:
df = pd.read_csv("/Users/ninsalv/Documents/Sheepdata/Data.csv", delimiter=';',
                 dtype={"Initial start": "str", "Start": "str", "Stop": "str"})
print(df.head())
dataset = 1
Lat = []
Long = []
temp = 0
for i in range(len(df)):
    if "new_sheep" in df.iloc[i][0]:
        temp += 1
        continue
    if temp == dataset:
        Lat.append(df.iloc[i][3])
        Long.append(df.iloc[i][4])
    if temp > dataset:
        break
step = np.linspace(0,len(Lat),len(Lat))
altitude = []
for i in range(len(Lat)):
    altitude.append(elevation(Lat[i], Long[i]))
    if (i % 100) == 0:
        print("round number ", i)
plt.plot(step, altitude)
This works, but it takes almost a minute to look up every 100 altitudes, and I have about 7000-15000 points to check in my dataset. Does anybody know whether XML, pandas, or something else could make my code faster?
What you need to do is fetch the data you are looking for (the HTTP requests) in parallel. You can use multithreading for that.
See the example below.
import requests
from requests.sessions import Session
import time
from threading import Thread,local
from queue import Queue
url_list = [] # TODO long list of urls to be populated by your code
q = Queue(maxsize=0) #Use a queue to store all URLs
for url in url_list:
    q.put(url)
thread_local = local() #The thread_local will hold a Session object
def get_session() -> Session:
    if not hasattr(thread_local, 'session'):
        thread_local.session = requests.Session()  # Create a new Session if not exists
    return thread_local.session
def download_link() -> None:
    '''download link worker, get URL from queue until no url left in the queue'''
    session = get_session()
    while not q.empty():
        url = q.get()
        with session.get(url) as response:
            print(f'Read {len(response.content)} from {url}')
        q.task_done()  # tell the queue, this url downloading work is done
def download_all(urls) -> None:
    '''Start 10 threads, each thread as a wrapper of downloader'''
    thread_num = 10
    for i in range(thread_num):
        t_worker = Thread(target=download_link)
        t_worker.start()
    q.join()  # main thread wait until all url finished downloading
print("start work")
start = time.time()
download_all(url_list)
end = time.time()
print(f'download {len(url_list)} links in {end - start} seconds')
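Applied to the elevation question above, the same idea can also be written with concurrent.futures; this is only a sketch that reuses the elevation(lat, long) function defined earlier (the worker count is arbitrary):
from concurrent.futures import ThreadPoolExecutor

def elevations_parallel(lats, longs, workers=10):
    # Each elevation() call is one HTTP request, so threads overlap the waiting time.
    # map() preserves input order, so results line up with the Lat/Long lists.
    with ThreadPoolExecutor(max_workers=workers) as pool:
        return list(pool.map(elevation, lats, longs))

# altitude = elevations_parallel(Lat, Long)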
So I have been trying to multi-thread some internet connections in Python. I have been using the multiprocessing module so I can get around the "Global Interpreter Lock", but it seems that the system only gives one open connection port to Python, or at least it only allows one connection to happen at once. Here is an example of what I am saying.
*Note that this is running on a Linux server.
from multiprocessing import Process, Queue
import urllib
import random
# Generate 10,000 random urls to test and put them in the queue
queue = Queue()
for each in range(10000):
    rand_num = random.randint(1000,10000)
    url = ('http://www.' + str(rand_num) + '.com')
    queue.put(url)
# Main function for checking to see if a generated url is active
def check(q):
    while True:
        try:
            url = q.get(False)
            try:
                request = urllib.urlopen(url)
                del request
                print url + ' is an active url!'
            except:
                print url + ' is not an active url!'
        except:
            if q.empty():
                break
# Then start all the threads (50)
for thread in range(50):
    task = Process(target=check, args=(queue,))
    task.start()
So if you run this you will notice that it starts 50 instances of the function but only runs them one at a time. You may think that the 'Global Interpreter Lock' is doing this, but it isn't. Try changing the function to a mathematical function instead of a network request and you will see that all fifty threads run simultaneously.
So will I have to work with sockets? Or is there something I can do that will give Python access to more ports? Or is there something I am not seeing? Let me know what you think! Thanks!
*Edit
So I wrote this script to test things better with the requests library. It seems as though I had not tested it very well with this before. (I had mainly used urllib and urllib2)
from multiprocessing import Process, Queue
from threading import Thread
from Queue import Queue as Q
import requests
import time
# A main timestamp
main_time = time.time()
# Generate 100 urls to test and put them in the queue
queue = Queue()
for each in range(100):
    url = ('http://www.' + str(each) + '.com')
    queue.put(url)
# Timer queue
time_queue = Queue()
# Main function for checking to see if a generated url is active
def check(q, t_q):  # args are queue and time_queue
    while True:
        try:
            url = q.get(False)
            # Make a timestamp
            t = time.time()
            try:
                request = requests.head(url, timeout=5)
                t = time.time() - t
                t_q.put(t)
                del request
            except:
                t = time.time() - t
                t_q.put(t)
        except:
            break
# Then start all the threads (20)
thread_list = []
for thread in range(20):
    task = Process(target=check, args=(queue, time_queue))
    task.start()
    thread_list.append(task)
# Join all the threads so the main process doesn't quit
for each in thread_list:
    each.join()
main_time_end = time.time()
# Put the timerQueue into a list to get the average
time_queue_list = []
while True:
    try:
        time_queue_list.append(time_queue.get(False))
    except:
        break
# Results of the time
average_response = sum(time_queue_list) / float(len(time_queue_list))
total_time = main_time_end - main_time
line = "Multiprocessing: Average response time: %s sec. -- Total time: %s sec." % (average_response, total_time)
print line
# A main timestamp
main_time = time.time()
# Generate 100 urls to test and put them in the queue
queue = Q()
for each in range(100):
    url = ('http://www.' + str(each) + '.com')
    queue.put(url)
# Timer queue
time_queue = Queue()
# Main function for checking to see if a generated url is active
def check(q, t_q):  # args are queue and time_queue
    while True:
        try:
            url = q.get(False)
            # Make a timestamp
            t = time.time()
            try:
                request = requests.head(url, timeout=5)
                t = time.time() - t
                t_q.put(t)
                del request
            except:
                t = time.time() - t
                t_q.put(t)
        except:
            break
# Then start all the threads (20)
thread_list = []
for thread in range(20):
    task = Thread(target=check, args=(queue, time_queue))
    task.start()
    thread_list.append(task)
# Join all the threads so the main process doesn't quit
for each in thread_list:
    each.join()
main_time_end = time.time()
# Put the timerQueue into a list to get the average
time_queue_list = []
while True:
    try:
        time_queue_list.append(time_queue.get(False))
    except:
        break
# Results of the time
average_response = sum(time_queue_list) / float(len(time_queue_list))
total_time = main_time_end - main_time
line = "Standard Threading: Average response time: %s sec. -- Total time: %s sec." % (average_response, total_time)
print line
# Do the same thing all over again but this time do each url at a time
# A main timestamp
main_time = time.time()
# Generate 100 urls and test them
timer_list = []
for each in range(100):
    url = ('http://www.' + str(each) + '.com')
    t = time.time()
    try:
        request = requests.head(url, timeout=5)
        timer_list.append(time.time() - t)
    except:
        timer_list.append(time.time() - t)
main_time_end = time.time()
# Results of the time
average_response = sum(timer_list) / float(len(timer_list))
total_time = main_time_end - main_time
line = "Not using threads: Average response time: %s sec. -- Total time: %s sec." % (average_response, total_time)
print line
As you can see, it multithreads very well. In fact, most of my tests show that the threading module is actually faster than the multiprocessing module. (I don't understand why!) Here are some of my results.
Multiprocessing: Average response time: 2.40511314869 sec. -- Total time: 25.6876308918 sec.
Standard Threading: Average response time: 2.2179402256 sec. -- Total time: 24.2941861153 sec.
Not using threads: Average response time: 2.1740363431 sec. -- Total time: 217.404567957 sec.
This was done on my home network, the response time on my server is much faster. I think my question has been answered indirectly, since I was having my problems on a much more complex script. All of the suggestions helped me optimize it very well. Thanks to everyone!
it starts 50 instances on the function but only runs one at a time
You have misinterpreted the results of htop. Only a few, if any, copies of python will be runnable at any specific instance. Most of them will be blocked waiting for network I/O.
The processes are, in fact, running parallel.
Try changing the function to a mathematical function instead of a network request and you will see that all fifty threads run simultaneously.
Changing the task to a mathematical function merely illustrates the difference between CPU-bound (e.g. math) and IO-bound (e.g. urlopen) processes. The former is always runnable, the latter is rarely runnable.
it only prints one at a time. If it was actually running multiple processes it would print many out at once.
It prints one at a time because you are writing lines to a terminal. Because the lines are indistinguishable, you wouldn't be able to tell if they are written all by one thread, or each by a separate thread in turn.
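If you want the terminal output itself to show the interleaving, a small sketch (kept in the Python 2 style of the code above, with an illustrative report function) is to tag every line with the worker's name:
from multiprocessing import Process, current_process

# Sketch: prefix each printed line with the process name, so you can see
# several workers interleaving instead of guessing from identical lines.
def report(url):
    print '%s checked %s' % (current_process().name, url)

if __name__ == '__main__':
    for i in range(5):
        Process(target=report, args=('http://www.%d.com' % i,)).start()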
First of all, using multiprocessing to parallelize network I/O is overkill. Using the built-in threading module or a lightweight greenlet library like gevent is a much better option with less overhead. The GIL has nothing to do with blocking I/O calls, so you don't have to worry about it at all.
Secondly, an easy way to see whether your subprocesses/threads/greenlets are running in parallel, if you are monitoring stdout, is to print something at the very beginning of the function, right after the subprocesses/threads/greenlets are spawned. For example, modify your check() function like so:
def check(q):
    print 'Start checking urls!'
    while True:
        ...
If your code is correct, you should see many Start checking urls! lines printed out before any of the url + ' is [not] an active url!' lines. It works on my machine, so it looks like your code is correct.
It appears that your issue is actually with the serial behavior of gethostbyname(3). This is discussed in this SO thread.
Try this code that uses the Twisted asynchronous I/O library:
import random
import sys
from twisted.internet import reactor
from twisted.internet import defer
from twisted.internet.task import cooperate
from twisted.web import client
SIMULTANEOUS_CONNECTIONS = 25
# Generate 10,000 random urls to test and put them in the queue
pages = []
for each in range(10000):
    rand_num = random.randint(1000,10000)
    url = ('http://www.' + str(rand_num) + '.com')
    pages.append(url)
# Main function for checking to see if generated url is active
def check(page):
    def successback(data, page):
        print "{} is an active URL!".format(page)
    def errback(err, page):
        print "{} is not an active URL!; errmsg:{}".format(page, err.value)
    d = client.getPage(page, timeout=3)  # timeout in seconds
    d.addCallback(successback, page)
    d.addErrback(errback, page)
    return d
def generate_checks(pages):
    for i in xrange(0, len(pages)):
        page = pages[i]
        #print "Page no. {}".format(i)
        yield check(page)
def work(pages):
    print "started work(): {}".format(len(pages))
    batch_size = len(pages) / SIMULTANEOUS_CONNECTIONS
    for i in xrange(0, len(pages), batch_size):
        task = cooperate(generate_checks(pages[i:i+batch_size]))
print "starting..."
reactor.callWhenRunning(work, pages)
reactor.run()
I am stuck on a difficult problem and not able to figure out which way to go; in the attempts I have made all day I have posted several times. This is not a duplicate question, since I need clarity on how I can use multiple threads to grab multiple chunks simultaneously from a server but write them to disk atomically, by locking the file write operation for single-thread access while the next thread waits for the lock to be released.
import argparse,logging, Queue, os, requests, signal, sys, time, threading
import utils as _fdUtils
DESKTOP_PATH = os.path.expanduser("~/Desktop")
appName = 'FileDownloader'
logFile = os.path.join(DESKTOP_PATH, '%s.log' % appName)
_log = _fdUtils.fdLogger(appName, logFile, logging.DEBUG, logging.DEBUG, console_level=logging.DEBUG)
queue = Queue.Queue()
STOP_REQUEST = threading.Event()
maxSplits = threading.BoundedSemaphore(3)
threadLimiter = threading.BoundedSemaphore(5)
lock = threading.Lock()
Update 1:
def _grabAndWriteToDisk(url, saveTo, first=None, queue=None, mode='wb', irange=None):
    """ Function to download file..

        Args:
            url(str): url of file to download
            saveTo(str): path where to save file
            first(int): starting byte of the range
            queue(Queue.Queue): queue object to set status for file download
            mode(str): mode of file to be downloaded
            irange(str): range of byte to download
    """
    fileName = url.split('/')[-1]
    filePath = os.path.join(saveTo, fileName)
    fileSize = int(_fdUtils.getUrlSizeInBytes(url))
    downloadedFileSize = 0 if not first else first
    block_sz = 8192
    irange = irange if irange else '0-%s' % fileSize
    # print mode
    resp = requests.get(url, headers={'Range': 'bytes=%s' % irange}, stream=True)
    fileBuffer = resp.raw.read()
    with open(filePath, mode) as fd:
        downloadedFileSize += len(fileBuffer)
        fd.write(fileBuffer)
        status = r"%10d [%3.2f%%]" % (downloadedFileSize, downloadedFileSize * 100. / fileSize)
        status = status + chr(8)*(len(status)+1)
        sys.stdout.write('%s\r' % status)
        time.sleep(.05)
        sys.stdout.flush()
        if downloadedFileSize == fileSize:
            STOP_REQUEST.set()
            if queue:
                queue.task_done()
            _log.info("Download Completed %s%% for file %s, saved to %s",
                      downloadedFileSize * 100. / fileSize, fileName, saveTo)
class ThreadedFetch(threading.Thread):
    """ docstring for ThreadedFetch
    """
    def __init__(self, queue):
        super(ThreadedFetch, self).__init__()
        self.queue = queue
        self.lock = threading.Lock()

    def run(self):
        threadLimiter.acquire()
        try:
            items = self.queue.get()
            url = items[0]
            saveTo = DESKTOP_PATH if not items[1] else items[1]
            split = items[-1]
            # grab split chunks in separate thread.
            if split > 1:
                maxSplits.acquire()
                try:
                    sizeInBytes = int(_fdUtils.getUrlSizeInBytes(url))
                    byteRanges = _fdUtils.getRange(sizeInBytes, split)
                    mode = 'wb'
                    th = threading.Thread(target=_grabAndWriteToDisk, args=(url, saveTo, first, self.queue, mode, _range))
                    _log.info("Pulling for range %s using %s", _range, th.getName())
                    th.start()
                    # _grabAndWriteToDisk(url, saveTo, first, self.queue, mode, _range)
                    mode = 'a'
                finally:
                    maxSplits.release()
            else:
                while not STOP_REQUEST.isSet():
                    self.setName("primary_%s" % url.split('/')[-1])
                    # if download whole file in single chunk, no need
                    # to start a new thread, so directly download here.
                    _grabAndWriteToDisk(url, saveTo, 0, self.queue)
        finally:
            threadLimiter.release()
def main(appName, flag='with'):
    args = _fdUtils.getParser()
    urls_saveTo = {}
    if flag == 'with':
        _fdUtils.Watcher()
    elif flag != 'without':
        _log.info('unrecognized flag: %s', flag)
        sys.exit()
    # spawn a pool of threads, and pass them the queue instance
    # each url will be downloaded concurrently
    for i in xrange(len(args.urls)):
        t = ThreadedFetch(queue)
        t.daemon = True
        t.start()
    split = 4
    try:
        for url in args.urls:
            # TODO: put split as value of url as tuple with saveTo
            urls_saveTo[url] = args.saveTo
        # populate queue with data
        for url, saveTo in urls_saveTo.iteritems():
            queue.put((url, saveTo, split))
        # wait on the queue until everything has been processed
        queue.join()
        _log.info('Finished all downloads.')
    except (KeyboardInterrupt, SystemExit):
        _log.critical('! Received keyboard interrupt, quitting threads.')
I expect multiple threads to grab the chunks, but in the terminal I see the same thread grabbing every range:
INFO - Pulling for range 0-25583 using Thread-1
INFO - Pulling for range 25584-51166 using Thread-1
INFO - Pulling for range 51167-76748 using Thread-1
INFO - Pulling for range 76749-102331 using Thread-1
INFO - Download Completed 100.0% for file 607800main_kepler1200_1600-1200.jpg
but what I am expecting is
INFO - Pulling for range 0-25583 using Thread-1
INFO - Pulling for range 25584-51166 using Thread-2
INFO - Pulling for range 51167-76748 using Thread-3
INFO - Pulling for range 76749-102331 using Thread-4
Please do not mark this as a duplicate without understanding it, and please recommend a better approach if there is one for what I am trying to do.
Cheers :/
If you want the downloads to happen simultaneously and only the writing to the file to be atomic, then don't put the lock around downloading and writing to the file, but just around the write part.
As every thread uses its own file object to write, I don't see the need to lock that access anyway. What you do have to make sure of is that every thread writes to the correct offset, so you need a seek() call on the file before writing the data chunk. Otherwise you would have to write the chunks in file order, which would make things more complicated.
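As a rough sketch of that seek()-based layout (the helper names and the pre-allocation step below are illustrative, not taken from the question's utils module):
import requests
import threading

def grab_chunk(url, file_path, start, end):
    # Download bytes start..end and write them at the matching offset.
    resp = requests.get(url, headers={'Range': 'bytes=%d-%d' % (start, end)}, stream=True)
    with open(file_path, 'r+b') as fd:
        fd.seek(start)                      # each thread writes at its own offset
        for block in resp.iter_content(8192):
            fd.write(block)

def download_split(url, file_path, size_in_bytes, splits=4):
    # Pre-allocate the file once so every worker can open it and seek into place.
    with open(file_path, 'wb') as fd:
        fd.truncate(size_in_bytes)
    chunk = size_in_bytes // splits
    threads = []
    for i in range(splits):
        start = i * chunk
        end = size_in_bytes - 1 if i == splits - 1 else start + chunk - 1
        t = threading.Thread(target=grab_chunk, args=(url, file_path, start, end))
        t.start()
        threads.append(t)
    for t in threads:
        t.join()
No lock is needed in this sketch because the byte ranges never overlap; if you keep a single shared file object instead, the seek()+write() pair is exactly the part you would have to lock.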
I have a very simple Python script using gevent.pool to download URLs (see below). The script runs fine for a couple of days and then locks up. I noticed that the memory usage is very high at that time. Am I using gevent incorrectly?
import sys
from gevent import monkey
monkey.patch_all()
import urllib2
from gevent.pool import Pool
inputFile = open(sys.argv[1], 'r')
urls = []
counter = 0
for line in inputFile:
    counter += 1
    urls.append(line.strip())
inputFile.close()
outputDirectory = sys.argv[2]
def fetch(url):
    try:
        body = urllib2.urlopen("http://" + url, None, 5).read()
        if len(body) > 0:
            outputFile = open(outputDirectory + "/" + url, 'w')
            outputFile.write(body)
            outputFile.close()
            print "Success", url
    except:
        pass
pool = Pool(int(sys.argv[3]))
pool.map(fetch, urls)
body = urllib2.urlopen("http://" + url, None, 5).read()
The line above reads the entire content into memory as a string. To prevent that, change fetch() as follows:
def fetch(url):
    try:
        u = urllib2.urlopen("http://" + url, None, 5)
        try:
            with open(outputDirectory + "/" + url, 'w') as outputFile:
                while True:
                    chunk = u.read(65536)
                    if not chunk:
                        break
                    outputFile.write(chunk)
        finally:
            u.close()
        print "Success", url
    except:
        print "Fail", url
So I've started learning python now, and I absolutely am in love with it.
I'm building a small scale facebook data scraper. Basically, it will use the Graph API and scrape the first names of the specified number of users. It works fine in a single thread (or no thread I guess).
I used online tutorials to come up with the following multithreaded version (updated code):
import requests
import json
import time
import threading
import Queue
GraphURL = 'http://graph.facebook.com/'
first_names = {} # will store first names and their counts
queue = Queue.Queue()
def getOneUser(url):
    http_response = requests.get(url)  # open the request URL
    if http_response.status_code == 200:
        data = http_response.text.encode('utf-8', 'ignore')  # Get the text of response, and encode it
        json_obj = json.loads(data)  # load it as a json object
        # name = json_obj['name']
        return json_obj['first_name']
        # last = json_obj['last_name']
    return None
class ThreadGet(threading.Thread):
    """ Threaded name scraper """
    def __init__(self, queue):
        threading.Thread.__init__(self)
        self.queue = queue

    def run(self):
        while True:
            #print 'thread started\n'
            url = GraphURL + str(self.queue.get())
            first = getOneUser(url)  # get one user's first name
            if first is not None:
                if first_names.has_key(first):  # if name has been encountered before
                    first_names[first] = first_names[first] + 1  # increment the count
                else:
                    first_names[first] = 1  # add the new name
            self.queue.task_done()
            #print 'thread ended\n'
def main():
    start = time.time()
    for i in range(6):
        t = ThreadGet(queue)
        t.setDaemon(True)
        t.start()
    for i in range(100):
        queue.put(i)
    queue.join()
    for name in first_names.keys():
        print name + ': ' + str(first_names[name])
    print '----------------------------------------------------------------'
    print '================================================================'
    # Print top first names
    for key in first_names.keys():
        if first_names[key] > 2:
            print key + ': ' + str(first_names[key])
    print 'It took ' + str(time.time()-start) + 's'
main()
To be honest, I don't understand some parts of the code, but I get the main idea. The output is nothing: the shell shows nothing, so I believe it keeps on running.
What I am doing is filling the queue with integers that are Facebook user IDs. Each ID is then used to build the API call URL, and getOneUser returns the first name of one user at a time. That task (ID) is marked as done and it moves on.
What is wrong with the code above?
Your usage of first_names is not thread-safe. You could add a lock to protect the increment. Otherwise the code should work. You might also be hitting some Facebook API limit, i.e., you should limit your request rate.
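A minimal sketch of the lock-protected increment (only the Lock object and the count_name helper are new; the rest mirrors the question's code):
import threading

first_names = {}
first_names_lock = threading.Lock()

def count_name(first):
    # Serialize the read-modify-write on the shared dict.
    if first is not None:
        with first_names_lock:
            first_names[first] = first_names.get(first, 0) + 1
In CPython the individual dict operations are already atomic under the GIL, but the get-then-set sequence is still a race without the lock.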
You could simplify your code by using a thread pool and counting the names in the main thread:
#!/usr/bin/env python
import json
import urllib2
from collections import Counter
from multiprocessing.dummy import Pool # use threads
def get_name(url):
    try:
        return json.load(urllib2.urlopen(url))['first_name']
    except Exception:
        return None  # error
urls = ('http://graph.facebook.com/%d' % i for i in xrange(100))
p = Pool(5) # 5 concurrent connections
first_names = Counter(p.imap_unordered(get_name, urls))
print first_names.most_common()
To see what errors you get, you could add logging:
#!/usr/bin/env python
import json
import logging
import urllib2
from collections import Counter
from multiprocessing.dummy import Pool # use threads
logging.basicConfig(level=logging.DEBUG,
                    format="%(asctime)s %(threadName)s %(message)s")
def get_name(url):
    try:
        name = json.load(urllib2.urlopen(url))['first_name']
    except Exception as e:
        logging.debug('error: %s url: %s', e, url)
        return None  # error
    else:
        logging.debug('done url: %s', url)
        return name
urls = ('http://graph.facebook.com/%d' % i for i in xrange(100))
p = Pool(5) # 5 concurrent connections
first_names = Counter(p.imap_unordered(get_name, urls))
print first_names.most_common()
A simple way to limit number of requests per given time period is to use a semaphore:
#!/usr/bin/env python
import json
import logging
import time
import urllib2
from collections import Counter
from multiprocessing.dummy import Pool # use threads
from threading import _BoundedSemaphore as BoundedSemaphore, Timer
logging.basicConfig(level=logging.DEBUG,
                    format="%(asctime)s %(threadName)s %(message)s")
class RatedSemaphore(BoundedSemaphore):
    """Limit to 1 request per `period / value` seconds (over long run)."""
    def __init__(self, value=1, period=1):
        BoundedSemaphore.__init__(self, value)
        t = Timer(period, self._add_token_loop,
                  kwargs=dict(time_delta=float(period) / value))
        t.daemon = True
        t.start()

    def _add_token_loop(self, time_delta):
        """Add token every time_delta seconds."""
        while True:
            try:
                BoundedSemaphore.release(self)
            except ValueError:  # ignore if already max possible value
                pass
            time.sleep(time_delta)  # ignore EINTR

    def release(self):
        pass  # do nothing (only time-based release() is allowed)
def get_name(gid, rate_limit=RatedSemaphore(value=100, period=600)):
    url = 'http://graph.facebook.com/%d' % gid
    try:
        with rate_limit:
            name = json.load(urllib2.urlopen(url))['first_name']
    except Exception as e:
        logging.debug('error: %s url: %s', e, url)
        return None  # error
    else:
        logging.debug('done url: %s', url)
        return name
p = Pool(5) # 5 concurrent connections
first_names = Counter(p.imap_unordered(get_name, xrange(200)))
print first_names.most_common()
After the initial burst, it should make a single request every 6 seconds.
Consider using batch requests.
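For reference, a batched call looks roughly like the sketch below; this assumes a valid access token and the Graph API's documented batch parameter (up to 50 sub-requests per call), so verify the details against the current API docs before relying on it:
import json
import requests

def get_first_names_batch(user_ids, access_token):
    # One POST carrying many GET sub-requests instead of one request per user (illustrative).
    batch = [{'method': 'GET', 'relative_url': str(uid)} for uid in user_ids]
    resp = requests.post('https://graph.facebook.com',
                         data={'access_token': access_token,
                               'batch': json.dumps(batch)})
    names = []
    for item in resp.json():
        if item and item.get('code') == 200:
            names.append(json.loads(item['body']).get('first_name'))
    return names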
Your original run function only processed one item from the queue. In all you've only removed 5 items from the queue.
Usually, run functions look like this:
def run(self):
    while True:
        doUsefulWork()
i.e. they have a loop which causes the recurring work to be done.
[Edit] OP edited code to include this change.
Some other useful things to try:
- Add a print statement into the run function: you'll find that it is only called 5 times.
- Remove the queue.join() call; this is what is causing the module to block, and then you will be able to probe the state of the queue.
- Put the entire body of run into a function (see the sketch below). Verify that you can use that function in a single-threaded manner to get the desired results, then try it with just a single worker thread, and finally go for multiple worker threads.
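As a sketch of that last suggestion applied to the scraper above (the names mirror the question's code; process_one itself is new and illustrative):
def process_one(user_id):
    # The former body of ThreadGet.run(), without the loop or queue handling.
    url = GraphURL + str(user_id)
    first = getOneUser(url)
    if first is not None:
        first_names[first] = first_names.get(first, 0) + 1

# 1) verify it single-threaded:  for i in range(100): process_one(i)
# 2) then call it from one worker thread, and finally from several.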