Limiting Threads within Python Threading, Queue - python

Im using the following code to multithread urlib2. However what is the best way to limit the number of threads that it consumes ??
class ApiMultiThreadHelper:
def __init__(self,api_calls):
self.q = Queue.Queue()
self.api_datastore = {}
self.api_calls = api_calls
self.userpass = '#####'
def query_api(self,q,api_query):
self.q.put(self.issue_request(api_query))
def issue_request(self,api_query):
self.api_datastore.update({api_query:{}})
for lookup in ["call1","call2"]:
query = api_query+lookup
request = urllib2.Request(query)
request.add_header("Authorization", "Basic %s" % self.userpass)
f = urllib2.urlopen(request)
response = f.read()
f.close()
self.api_datastore[api_query].update({lookup:response})
return True
def go(self):
threads = []
for i in self.api_calls:
t = threading.Thread(target=self.query_api, args = (self.q,i))
t.start()
threads.append(t)
for t in threads:
t.join()

You should use a thread pool. Here's my implementation I've made years ago (Python 3.x friendly):
import traceback
from threading import Thread
try:
import queue as Queue # Python3.x
except ImportError:
import Queue
class ThreadPool(object):
def __init__(self, no=10):
self.alive = True
self.tasks = Queue.Queue()
self.threads = []
for _ in range(no):
t = Thread(target=self.worker)
t.start()
self.threads.append(t)
def worker(self):
while self.alive:
try:
fn, args, kwargs = self.tasks.get(timeout=0.5)
except Queue.Empty:
continue
except ValueError:
self.tasks.task_done()
continue
try:
fn(*args, **kwargs)
except Exception:
# might wanna add some better error handling
traceback.print_exc()
self.tasks.task_done()
def add_job(self, fn, args=[], kwargs={}):
self.tasks.put((fn, args, kwargs))
def join(self):
self.tasks.join()
def deactivate(self):
self.alive = False
for t in self.threads:
t.join()
You can also find a similar class in multiprocessing.pool module (don't ask me why it is there). You can then refactor your code like this:
def go(self):
tp = ThreadPool(20) # <-- 20 thread workers
for i in self.api_calls:
tp.add_job(self.query_api, args=(self.q, i))
tp.join()
tp.deactivate()
Number of threads is now defined a priori.

Related

how can you use threading in python, so that it would change the value of i in loop which is outside class in a function [duplicate]

Is there a Pool class for worker threads, similar to the multiprocessing module's Pool class?
I like for example the easy way to parallelize a map function
def long_running_func(p):
c_func_no_gil(p)
p = multiprocessing.Pool(4)
xs = p.map(long_running_func, range(100))
however I would like to do it without the overhead of creating new processes.
I know about the GIL. However, in my usecase, the function will be an IO-bound C function for which the python wrapper will release the GIL before the actual function call.
Do I have to write my own threading pool?
I just found out that there actually is a thread-based Pool interface in the multiprocessing module, however it is hidden somewhat and not properly documented.
It can be imported via
from multiprocessing.pool import ThreadPool
It is implemented using a dummy Process class wrapping a python thread. This thread-based Process class can be found in multiprocessing.dummy which is mentioned briefly in the docs. This dummy module supposedly provides the whole multiprocessing interface based on threads.
In Python 3 you can use concurrent.futures.ThreadPoolExecutor, i.e.:
executor = ThreadPoolExecutor(max_workers=10)
a = executor.submit(my_function)
See the docs for more info and examples.
Yes, and it seems to have (more or less) the same API.
import multiprocessing
def worker(lnk):
....
def start_process():
.....
....
if(PROCESS):
pool = multiprocessing.Pool(processes=POOL_SIZE, initializer=start_process)
else:
pool = multiprocessing.pool.ThreadPool(processes=POOL_SIZE,
initializer=start_process)
pool.map(worker, inputs)
....
For something very simple and lightweight (slightly modified from here):
from Queue import Queue
from threading import Thread
class Worker(Thread):
"""Thread executing tasks from a given tasks queue"""
def __init__(self, tasks):
Thread.__init__(self)
self.tasks = tasks
self.daemon = True
self.start()
def run(self):
while True:
func, args, kargs = self.tasks.get()
try:
func(*args, **kargs)
except Exception, e:
print e
finally:
self.tasks.task_done()
class ThreadPool:
"""Pool of threads consuming tasks from a queue"""
def __init__(self, num_threads):
self.tasks = Queue(num_threads)
for _ in range(num_threads):
Worker(self.tasks)
def add_task(self, func, *args, **kargs):
"""Add a task to the queue"""
self.tasks.put((func, args, kargs))
def wait_completion(self):
"""Wait for completion of all the tasks in the queue"""
self.tasks.join()
if __name__ == '__main__':
from random import randrange
from time import sleep
delays = [randrange(1, 10) for i in range(100)]
def wait_delay(d):
print 'sleeping for (%d)sec' % d
sleep(d)
pool = ThreadPool(20)
for i, d in enumerate(delays):
pool.add_task(wait_delay, d)
pool.wait_completion()
To support callbacks on task completion you can just add the callback to the task tuple.
Hi to use the thread pool in Python you can use this library :
from multiprocessing.dummy import Pool as ThreadPool
and then for use, this library do like that :
pool = ThreadPool(threads)
results = pool.map(service, tasks)
pool.close()
pool.join()
return results
The threads are the number of threads that you want and tasks are a list of task that most map to the service.
Yes, there is a threading pool similar to the multiprocessing Pool, however, it is hidden somewhat and not properly documented. You can import it by following way:-
from multiprocessing.pool import ThreadPool
Just I show you simple example
def test_multithread_stringio_read_csv(self):
# see gh-11786
max_row_range = 10000
num_files = 100
bytes_to_df = [
'\n'.join(
['%d,%d,%d' % (i, i, i) for i in range(max_row_range)]
).encode() for j in range(num_files)]
files = [BytesIO(b) for b in bytes_to_df]
# read all files in many threads
pool = ThreadPool(8)
results = pool.map(self.read_csv, files)
first_result = results[0]
for result in results:
tm.assert_frame_equal(first_result, result)
Here's the result I finally ended up using. It's a modified version of the classes by dgorissen above.
File: threadpool.py
from queue import Queue, Empty
import threading
from threading import Thread
class Worker(Thread):
_TIMEOUT = 2
""" Thread executing tasks from a given tasks queue. Thread is signalable,
to exit
"""
def __init__(self, tasks, th_num):
Thread.__init__(self)
self.tasks = tasks
self.daemon, self.th_num = True, th_num
self.done = threading.Event()
self.start()
def run(self):
while not self.done.is_set():
try:
func, args, kwargs = self.tasks.get(block=True,
timeout=self._TIMEOUT)
try:
func(*args, **kwargs)
except Exception as e:
print(e)
finally:
self.tasks.task_done()
except Empty as e:
pass
return
def signal_exit(self):
""" Signal to thread to exit """
self.done.set()
class ThreadPool:
"""Pool of threads consuming tasks from a queue"""
def __init__(self, num_threads, tasks=[]):
self.tasks = Queue(num_threads)
self.workers = []
self.done = False
self._init_workers(num_threads)
for task in tasks:
self.tasks.put(task)
def _init_workers(self, num_threads):
for i in range(num_threads):
self.workers.append(Worker(self.tasks, i))
def add_task(self, func, *args, **kwargs):
"""Add a task to the queue"""
self.tasks.put((func, args, kwargs))
def _close_all_threads(self):
""" Signal all threads to exit and lose the references to them """
for workr in self.workers:
workr.signal_exit()
self.workers = []
def wait_completion(self):
"""Wait for completion of all the tasks in the queue"""
self.tasks.join()
def __del__(self):
self._close_all_threads()
def create_task(func, *args, **kwargs):
return (func, args, kwargs)
To use the pool
from random import randrange
from time import sleep
delays = [randrange(1, 10) for i in range(30)]
def wait_delay(d):
print('sleeping for (%d)sec' % d)
sleep(d)
pool = ThreadPool(20)
for i, d in enumerate(delays):
pool.add_task(wait_delay, d)
pool.wait_completion()
another way can be adding the process to thethread queue pool
import concurrent.futures
with concurrent.futures.ThreadPoolExecutor(max_workers=cpus) as executor:
for i in range(10):
a = executor.submit(arg1, arg2,....)
The overhead of creating the new processes is minimal, especially when it's just 4 of them. I doubt this is a performance hot spot of your application. Keep it simple, optimize where you have to and where profiling results point to.
There is no built in thread based pool. However, it can be very quick to implement a producer/consumer queue with the Queue class.
From:
https://docs.python.org/2/library/queue.html
from threading import Thread
from Queue import Queue
def worker():
while True:
item = q.get()
do_work(item)
q.task_done()
q = Queue()
for i in range(num_worker_threads):
t = Thread(target=worker)
t.daemon = True
t.start()
for item in source():
q.put(item)
q.join() # block until all tasks are done
If you don't mind executing other's code, here's mine:
Note: There is lot of extra code you may want to remove [added for better clarificaiton and demonstration how it works]
Note: Python naming conventions were used for method names and variable names instead of camelCase.
Working procedure:
MultiThread class will initiate with no of instances of threads by sharing lock, work queue, exit flag and results.
SingleThread will be started by MultiThread once it creates all instances.
We can add works using MultiThread (It will take care of locking).
SingleThreads will process work queue using a lock in middle.
Once your work is done, you can destroy all threads with shared boolean value.
Here, work can be anything. It can automatically import (uncomment import line) and process module using given arguments.
Results will be added to results and we can get using get_results
Code:
import threading
import queue
class SingleThread(threading.Thread):
def __init__(self, name, work_queue, lock, exit_flag, results):
threading.Thread.__init__(self)
self.name = name
self.work_queue = work_queue
self.lock = lock
self.exit_flag = exit_flag
self.results = results
def run(self):
# print("Coming %s with parameters %s", self.name, self.exit_flag)
while not self.exit_flag:
# print(self.exit_flag)
self.lock.acquire()
if not self.work_queue.empty():
work = self.work_queue.get()
module, operation, args, kwargs = work.module, work.operation, work.args, work.kwargs
self.lock.release()
print("Processing : " + operation + " with parameters " + str(args) + " and " + str(kwargs) + " by " + self.name + "\n")
# module = __import__(module_name)
result = str(getattr(module, operation)(*args, **kwargs))
print("Result : " + result + " for operation " + operation + " and input " + str(args) + " " + str(kwargs))
self.results.append(result)
else:
self.lock.release()
# process_work_queue(self.work_queue)
class MultiThread:
def __init__(self, no_of_threads):
self.exit_flag = bool_instance()
self.queue_lock = threading.Lock()
self.threads = []
self.work_queue = queue.Queue()
self.results = []
for index in range(0, no_of_threads):
thread = SingleThread("Thread" + str(index+1), self.work_queue, self.queue_lock, self.exit_flag, self.results)
thread.start()
self.threads.append(thread)
def add_work(self, work):
self.queue_lock.acquire()
self.work_queue._put(work)
self.queue_lock.release()
def destroy(self):
self.exit_flag.value = True
for thread in self.threads:
thread.join()
def get_results(self):
return self.results
class Work:
def __init__(self, module, operation, args, kwargs={}):
self.module = module
self.operation = operation
self.args = args
self.kwargs = kwargs
class SimpleOperations:
def sum(self, *args):
return sum([int(arg) for arg in args])
#staticmethod
def mul(a, b, c=0):
return int(a) * int(b) + int(c)
class bool_instance:
def __init__(self, value=False):
self.value = value
def __setattr__(self, key, value):
if key != "value":
raise AttributeError("Only value can be set!")
if not isinstance(value, bool):
raise AttributeError("Only True/False can be set!")
self.__dict__[key] = value
# super.__setattr__(key, bool(value))
def __bool__(self):
return self.value
if __name__ == "__main__":
multi_thread = MultiThread(5)
multi_thread.add_work(Work(SimpleOperations(), "mul", [2, 3], {"c":4}))
while True:
data_input = input()
if data_input == "":
pass
elif data_input == "break":
break
else:
work = data_input.split()
multi_thread.add_work(Work(SimpleOperations(), work[0], work[1:], {}))
multi_thread.destroy()
print(multi_thread.get_results())

Python Multithreading with requests

i have one scraper which initiate the "requestes" session and fetch some data, using a IPV6, i have now 10000 ip list, I have prepared it using threading, but its giving error.
Need support to find out the issue.
import requests, queue,threading, urllib3,jso,pandas as pd, os, time, datetime,inspect
num_threads = 2
root = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
with open (root+ "/ip_list.txt") as ips:
device_ip = list(ips)
class Writer_Worker(threading.Thread):
def __init__(self, queue, df, *args, **kwargs):
if not queue:
print("Device Queue not specified")
exit(1)
self.out_q = queue
self.df = df
super().__init__(*args, **kwargs)
def run(self):
while True:
try:
device_details = self.out_q.get(timeout=3)
except queue.Empty:
return
self.df[device_details[0]] = device_details
self.out_q.task_done()
class Worker(threading.Thread):
def __init__(self, queue, out_queue, device_password, *args, **kwargs):
if not queue:
print("Device Queue not specified")
exit(1)
self.queue = queue
self.pas = device_password
self.out_q = out_queue
super().__init__(*args, **kwargs)
def run(self):
while True:
try:
device_ip = self.queue.get(timeout=3)
except queue.Empty:
return
self.connect_to_device_and_process(device_ip)
self.queue.task_done()
def connect_to_device_and_process(self, device_ip):
st = str("Online")
try:
r = requests.post("https://["+device_ip+"]/?q=index.login&mimosa_ajax=1", {"username":"configure", "password":self.pas}, verify=False)
except requests.exceptions.ConnectionError:
st = str("Offline")
self.out_q.put([device_ip,st,"","","","","","","","","","","","","","","","","",""])
return
finally:
if 'Online' in st:
r = requests.get("https://["+device_ip+"]/cgi/dashboard.php", cookies=r.cookies, verify=False)
if "Response [401]" in str(r):
st2 = str("Password Error")
self.out_q.put([device_ip,st2,"","","","","","","","","","","","","","","","","",""])
else:
data = json.loads(r.content.decode())
output5 = data ['config'] ['Spectrum_Power']
self.out_q.put([device_ip,st,output5['Auto_Power'].replace('2', 'Max Power').replace('1', 'Min Power').replace('0', 'off'),output5['AutoConfig']])
def main():
start = time.time()
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
pas = input("Enter Device Password:")
df =pd.DataFrame(columns = ["IP","Status","Auto_Power","AutoChannel"])
q = queue.Queue(len(device_ip))
for ip in device_ip:
q.put_nowait(ip)
out_q = queue.Queue(len(device_ip))
Writer_Worker(out_q, df).start()
for _ in range(num_threads):
Worker(q, out_q, pas).start()
q.join()
print(df)
df.to_excel('iBridge_C5x_Audit_Report.xlsx', sheet_name='Detail', index = False)
if __name__ == "__main__":
main()
below is the error while running the script, seeps I am unable to login to this device.
Any help is appreciable.
You should use a thread pool that distributes the work between a fixed number of threads. This is a core feature of Python since version 3.2.
from concurrent.futures import ThreadPoolExecutor
Define a function perform(ip) that performs the request for one ip
Set variable numThreads to the number of desired threads
Run the thread-pool executor:
print(f'Using {numThreads} threads')
with ThreadPoolExecutor(max_workers=numThreads) as pool:
success = all(pool.map(perform, ips))
Source: https://docs.python.org/3/library/concurrent.futures.html
On that page you find an example even better tailored to your application: https://docs.python.org/3/library/concurrent.futures.html#threadpoolexecutor-example
from threading import Thread
th = Thread(target=self.fill_imdb, args=(movies_info_part, "thread " + str(count)))
th.start()
fill_imdb is my method

Break Main Calling Thread If Child Thread Throws An Exception

I'm using threading.Thread and t.start() with a List of Callables to do long-running multithreaded processing. My main thread is blocked until all threads did finish. I'd like however t.start() to immediately return if one of the Callables throw an exception and terminate the other threads.
Using t.join() to check that the thread got executed provides no information about failures due to exception.
Here is the code:
import json
import requests
class ThreadServices:
def __init__(self):
self.obj = ""
def execute_services(self, arg1, arg2):
try:
result = call_some_process(arg1, arg2) #some method
#save results somewhere
except Exception, e:
# raise exception
print e
def invoke_services(self, stubs):
"""
Thread Spanning Function
"""
try:
p1 = "" #some value
p2 = "" #some value
# Call service 1
t1 = threading.Thread(target=self.execute_services, args=(a, b,)
# Start thread
t1.start()
# Block till thread completes execution
t1.join()
thread_pool = list()
for stub in stubs:
# Start parallel execution of threads
t = threading.Thread(target=self.execute_services,
args=(p1, p2))
t.start()
thread_pool.append(t)
for thread in thread_pool:
# Block till all the threads complete execution: Wait for all
the parallel tasks to complete
thread.join()
# Start another process thread
t2 = threading.Thread(target=self.execute_services,
args=(p1, p2)
t2.start()
# Block till this thread completes execution
t2.join()
requests.post(url, data= json.dumps({status_code=200}))
except Exception, e:
print e
requests.post(url, data= json.dumps({status_code=500}))
# Don't return anything as this function is invoked as a thread from
# main calling function
class Service(ThreadServices):
"""
Service Class
"""
def main_thread(self, request, context):
"""
Main Thread:Invokes Task Execution Sequence in ThreadedService
:param request:
:param context:
:return:
"""
try:
main_thread = threading.Thread(target=self.invoke_services,
args=(request,))
main_thread.start()
return True
except Exception, e:
return False
When i call Service().main_thread(request, context) and there is some exception executing t1, I need to get it raised in main_thread and return False. How can i implement it for this structure. Thanks!!
For one thing, you are complicating matters too much. I would do it this way:
from thread import start_new_thread as thread
from time import sleep
class Task:
"""One thread per task.
This you should do with subclassing threading.Thread().
This is just conceptual example.
"""
def __init__ (self, func, args=(), kwargs={}):
self.func = func
self.args = args
self.kwargs = kwargs
self.error = None
self.done = 0
self.result = None
def _run (self):
self.done = 0
self.error = None
self.result = None
# So this is what you should do in subclassed Thread():
try: self.result = self.func(*self.args, **self.kwargs)
except Exception, e:
self.error = e
self.done = 1
def start (self):
thread(self._run,())
def wait (self, retrexc=1):
"""Used in place of threading.Thread.join(), but it returns the result of the function self.func() and manages errors.."""
while not self.done: sleep(0.001)
if self.error:
if retrexc: return self.error
raise self.error
return self.result
# And this is how you should use your pool:
def do_something (tasknr):
print tasknr-20
if tasknr%7==0: raise Exception, "Dummy exception!"
return tasknr**120/82.0
pool = []
for task in xrange(20, 50):
t = Task(do_something, (task,))
pool.append(t)
# And only then wait for each one:
results = []
for task in pool:
results.append(task.wait())
print results
This way you can make task.wait() raise the error instead. The thread would already be stopped. So all you need to do is remove their references from pool, or whole pool, after you are done. You can even:
results = []
for task in pool:
try: results.append(task.wait(0))
except Exception, e:
print task.args, "Error:", str(e)
print results
Now, do not use strictly this (I mean Task() class) as it needs a lot of things added to be used for real.
Just subclass threading.Thread() and implement the similar concept by overriding run() and join() or add new functions like wait().

Python function call in thread always returns same value

I'm boggled over why a function called in a thread always returns the same value. I've confirmed that the parameters are different for each call. If I call the function after acquiring a lock then the function returns the correct value. This obviously defeats the purpose of using threads, because then this function is just called sequentially, one thread after another. Here is what I have. The function is called "get_related_properties" and I've made a note of it in the code:
class ThreadedGetMultipleRelatedProperties():
def __init__(self, property_values, **kwargs):
self.property_values = property_values
self.kwargs = kwargs
self.timeout = kwargs.get('timeout', 20)
self.lock = threading.RLock()
def get_result_dict(self):
queue = QueueWithTimeout()
result_dictionary = {}
num_threads = len(self.property_values)
threads = []
for i in range(num_threads):
t = GetMultipleRelatedPropertiesThread(queue,
result_dictionary,
self.lock)
t.setDaemon(True)
try:
threads.append(t)
t.start()
except:
return {"Error": "Unable to process results at this time." }
for property_value in self.property_values:
kwargs_copy = dict.copy(kwargs)
kwargs_copy['property_value'] = property_value
queue.put(self.kwargs_copy)
queue.join_with_timeout(self.timeout)
# cleanup threads
for i in range(num_threads):
queue.put(None)
for t in threads: t.join()
return result_dictionary
class GetMultipleRelatedPropertiesThread(threading.Thread):
def __init__(self, queue, result_dictionary, lock):
threading.Thread.__init__(self)
self.queue = queue
self.result_dictionary = result_dictionary
self.lock = lock
def run(self):
from mixpanel_helpers import get_related_properties
while True:
kwargs = self.queue.get()
if kwargs == None:
break
current_property_value = kwargs.get('property_value')
self.lock.acquire()
# The function call below always returns the same value if called before acquire
result = get_related_properties(**kwargs)
try:
self.result_dictionary[current_property_value] = result
finally:
self.lock.release()
#signals to queue job is done
self.queue.task_done()
Here is get_related_properties, although it makes other calls, so I'm not sure the problem lives in here:
def get_related_properties(property_name,
property_value,
related_properties,
properties={},
**kwargs):
kwargs['exclude_detailed_data'] = True
properties[property_name] = property_value
result = get_multiple_mixpanel_results(properties=properties,
filter_on_values=related_properties,
**kwargs)
result_dictionary = {}
for related_property in related_properties:
try:
# grab the last result here, because it'll more likely have the most up to date properties
current_result = result[related_property][0]['__results'][0]['label']
except Exception as e:
current_result = None
try:
related_property = int(related_property)
except:
pass
result_dictionary[related_property] = current_result
return result_dictionary
An additional note, I've also tried to copy the function using Python's copy module, both a deep and shallow copy and call the function copy, but neither of those worked.

python threadpool problem (wait for something)

I wrote simple web site crowler with threadpool. The problem is: then crawler is get all over site it must finish, but in real it wait for something in the end,and script dont finished, why this happend?
from Queue import Queue
from threading import Thread
import sys
from urllib import urlopen
from BeautifulSoup import BeautifulSoup, SoupStrainer
import re
from Queue import Queue, Empty
from threading import Thread
visited = set()
queue = Queue()
class Worker(Thread):
"""Thread executing tasks from a given tasks queue"""
def __init__(self, tasks):
Thread.__init__(self)
self.tasks = tasks
self.daemon = True
self.start()
def run(self):
while True:
func, args, kargs = self.tasks.get()
print "startcall in thread",self
print args
try: func(*args, **kargs)
except Exception, e: print e
print "stopcall in thread",self
self.tasks.task_done()
class ThreadPool:
"""Pool of threads consuming tasks from a queue"""
def __init__(self, num_threads):
self.tasks = Queue(num_threads)
for _ in range(num_threads): Worker(self.tasks)
def add_task(self, func, *args, **kargs):
"""Add a task to the queue"""
self.tasks.put((func, args, kargs))
def wait_completion(self):
"""Wait for completion of all the tasks in the queue"""
self.tasks.join()
def process(pool,host,url):
try:
print "get url",url
#content = urlopen(url).read().decode(charset)
content = urlopen(url).read()
except UnicodeDecodeError:
return
for link in BeautifulSoup(content, parseOnlyThese=SoupStrainer('a')):
#print "link",link
try:
href = link['href']
except KeyError:
continue
if not href.startswith('http://'):
href = 'http://%s%s' % (host, href)
if not href.startswith('http://%s%s' % (host, '/')):
continue
if href not in visited:
visited.add(href)
pool.add_task(process,pool,host,href)
print href
def start(host,charset):
pool = ThreadPool(7)
pool.add_task(process,pool,host,'http://%s/' % (host))
pool.wait_completion()
start('simplesite.com','utf8')
The problem I see is that you never quit the while in run. So, it will block forever. You need to break that loop when the jobs are done.
You could try to :
1) insert
if not func: break
after task.get(...) in run.
2) append
pool.add_task(None, None, None)
at the end of process.
This is a way for process to notify the pool that he has no more task to process.

Categories