ProcessPoolExecutor not working for function with multiple arguments

I have a dataframe df_full that I am trying to rewrite as a dict() while doing some processing on it:

   agent    locations                        modal_choices
0  agent_1  'loc1', 'loc2', 'loc3', 'loc2'   'mode_1', 'mode_1', 'mode_2', 'mode_3'
1  agent_2  'loc1', 'loc4', 'loc2', 'loc6'   'mode_2', 'mode_3', 'mode_2', 'mode_3'
I am currently facing a problem while trying to multiprocess the function format_dict() below. I only want to iterate over the agent argument; the other three are supposed to stay the same for each iteration, so I used partial() to "freeze" df, dict_ and list_. But the code returns an empty dict and an empty list at the end, and I don't understand why.
I suppose I haven't written the executor.map() properly. I tried following the methods shown here but it still doesn't return anything.
What could be wrong with my code?
I also timed the script with time.perf_counter() and compared it with what tqdm() reports, but the two values don't match: tqdm shows the iteration part finishing in 7 seconds, while the time.perf_counter() print only appears after 2.3 minutes.
What would explain the delay before the with concurrent.futures.ProcessPoolExecutor() as executor: block finishes?
I am, unfortunately, still not an expert in Python, and this is the first time I am trying to multiprocess something (the agent list I am working with is massive and would otherwise take days to process). Any help would be greatly appreciated! And please tell me if information is missing or something is not explained properly; I will edit the post right away.
def format_dict(agent, df, dict_, list_):
    try:
        dict_[agent] = dict()
        toto_ = df.loc[df.agent_ID == agent]
        toto_mod = toto_['modal_choices'].apply(lambda x: pd.Series(x.split(',')))
        toto_loc = toto_['locations'].apply(lambda x: pd.Series(x.split(',')))
        for i in toto_mod:
            dict_[agent]['step_{}'.format(i)] = dict()
            dict_[agent]['step_{}'.format(i)]['mode'] = toto_mod[i].iloc[0]
            dict_[agent]['step_{}'.format(i)]['start'] = toto_loc[counter + 1].iloc[0]
            dict_[agent]['step_{}'.format(i)]['name'] = dict_agent_edt[agent]['step_0']['name']
    except ValueError:
        list_.append(agent)
    return dict_, list_
dict_name = dict()
list_name = list()

start = time.perf_counter()
agent = df_full['agent'][:1000]
with concurrent.futures.ProcessPoolExecutor() as executor:
    executor.map(partial(format_dict, df=df_full, dict_=dict_name, list_=list_name),
                 tqdm(agent), chunksize=50)
end = time.perf_counter()
print(f'It took {(end-start)/60} minutes.')
Following @Louis Lac's answer, I modified my script to avoid any concurrent mutation, but it still returns an empty dict.
def format_dict(agent, df, dict_):
    try:
        dict_[agent] = dict()
        toto_ = df.loc[df.agent_ID == agent]
        (same stuff here)
    except ValueError:
        pass
    return dict_

start = time.perf_counter()
agents = df_full['agent'][:1000]
dict_name = {}
with concurrent.futures.ProcessPoolExecutor() as executor:
    executor.map(partial(format_dict, df=df_full, dict_=dict_name),
                 tqdm(agents), chunksize=50)
end = time.perf_counter()
print(f'It took {(end-start)/60} minutes.')

When using concurrency such as multithreading or multiprocessing, functions executed concurrently, such as format_dict, should not mutate shared state; if they must, the mutations have to be synchronized to avoid data races.
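To see why the original attempt returns an empty dict: ProcessPoolExecutor pickles each task's arguments and sends copies to the worker processes, so mutations happen on those copies only. A minimal sketch (with a hypothetical mutate function) that reproduces the symptom:

import concurrent.futures
from functools import partial

def mutate(key, shared_dict):
    # this mutates the pickled copy living inside the worker process
    shared_dict[key] = key * 2
    return shared_dict

if __name__ == '__main__':
    shared_dict = {}
    with concurrent.futures.ProcessPoolExecutor() as executor:
        results = list(executor.map(partial(mutate, shared_dict=shared_dict), range(3)))
    print(shared_dict)  # {} -- the parent's dict was never touched
    print(results)      # each task returned its own mutated copy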
You could for instance compute all your stuff concurrently first, then sequentially reduce the result into outputs (dict_ and list_):
def format_dict(agent, df):
    list_ = None
    try:
        dict_ = dict()
        toto_ = df.loc[df.agent_ID == agent]
        toto_mod = toto_['modal_choices'].apply(lambda x: pd.Series(x.split(',')))
        toto_loc = toto_['locations'].apply(lambda x: pd.Series(x.split(',')))
        for i in toto_mod:
            dict_['step_{}'.format(i)] = dict()
            dict_['step_{}'.format(i)]['mode'] = toto_mod[i].iloc[0]
            dict_['step_{}'.format(i)]['start'] = toto_loc[counter + 1].iloc[0]
            dict_['step_{}'.format(i)]['name'] = dict_agent_edt[agent]['step_0']['name']
    except ValueError:
        list_ = agent
    return dict_, list_
start = time.perf_counter()
agents = df_full['agent'][:1000]
with concurrent.futures.ProcessPoolExecutor() as executor:
    elements = executor.map(partial(format_dict, df=df_full),
                            tqdm(agents), chunksize=50)
    dict_ = {}
    list_ = []
    for agent, (d, l) in zip(agents, elements):
        if l is not None:
            list_.append(l)
        dict_[agent] = d
end = time.perf_counter()
print(f'It took {(end-start)/60} minutes.')
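One practical note, since these snippets run at module level: on platforms that spawn worker processes rather than forking (Windows, and macOS since Python 3.8), the code that creates the ProcessPoolExecutor must sit behind an import guard, otherwise every worker re-imports the script and re-executes it. A minimal skeleton:

def main():
    ...  # create the executor and reduce the results here

if __name__ == '__main__':
    main()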

Related

Using multiprocessing to double the speed of working on a list

Let's say I have a list like this:
list_base = ['a','b','c','d']
If I use for xxx in list_base:, the loop processes the list one value at a time. To double the speed of this work, I am creating a list of pairs so two values can be worked on at once with multiprocessing.
Basic example
Code 1 (main_code.py):
import api_values

if __name__ == '__main__':
    list_base = ['a','b','c','d']
    api_values.main(list_base)
Code 2 (api_values.py):
import multiprocessing
import datetime

def add_hour(x):
    return str(x) + ' - ' + datetime.datetime.now().strftime('%d/%m/%Y %H:%M')

def main(list_base):
    a = list_base
    a_pairs = [a[i:i+2] for i in range(0, len(a)-1, 2)]
    if (len(a) % 2) != 0:
        a_pairs.append([a[-1]])
    final_list = []
    for a, b in a_pairs:
        mp_1 = multiprocessing.Process(target=add_hour, args=(a,))
        mp_2 = multiprocessing.Process(target=add_hour, args=(b,))
        mp_1.start()
        mp_2.start()
        mp_1.join()
        mp_2.join()
        final_list.append(mp_1)
        final_list.append(mp_2)
    print(final_list)
When I inspect the printed final_list, it delivers values like this:
[
<Process name='Process-1' pid=9564 parent=19136 stopped exitcode=0>,
<Process name='Process-2' pid=5400 parent=19136 stopped exitcode=0>,
<Process name='Process-3' pid=13396 parent=19136 stopped exitcode=0>,
<Process name='Process-4' pid=5132 parent=19136 stopped exitcode=0>
]
I couldn't get at the return values I want from calling the add_hour(x) function.
I found some answers in this question:
How can I recover the return value of a function passed to multiprocessing.Process?
But I couldn't adapt it to my scenario, where I need the multiprocessing inside a function and not inside if __name__ == '__main__':.
When I try to use it, it always generates errors related to how the code is structured. I would like some help visualizing how to use it for my need.
Note:
These are basic examples; my real use case is extracting data from an API that allows a maximum of two simultaneous calls.
Additional code:
Following @Timus's comment (You might want to look into a Pool and .apply_async), I came to the code below. It seems to work, but I don't know whether it is reliable or what improvements it needs; if this option is the best one, feel free to expand on it in an answer:
import multiprocessing
import datetime

final_list = []

def foo_pool(x):
    return str(x) + ' - ' + datetime.datetime.now().strftime('%d/%m/%Y %H:%M:%S')

def log_result(result):
    final_list.append(result)

def main(list_base):
    pool = multiprocessing.Pool()
    a = list_base
    a_pairs = [a[i:i+2] for i in range(0, len(a)-1, 2)]
    if (len(a) % 2) != 0:
        a_pairs.append([a[-1]])
    for a, b in a_pairs:
        pool.apply_async(foo_pool, args=(a,), callback=log_result)
        pool.apply_async(foo_pool, args=(b,), callback=log_result)
    pool.close()
    pool.join()
    print(final_list)
You don't have to use a callback: Pool.apply_async() returns an AsyncResult object, which has a .get() method to retrieve the result of the submitted call. An extension of your attempt:
import time
import multiprocessing
import datetime
from os import getpid

def foo_pool(x):
    print(getpid())
    time.sleep(2)
    return str(x) + ' - ' + datetime.datetime.now().strftime('%d/%m/%Y %H:%M:%S')

def main(list_base):
    a = list_base
    a_pairs = [a[i:i+2] for i in range(0, len(a)-1, 2)]
    if (len(a) % 2) != 0:
        a_pairs.append([a[-1]])
    final_list = []
    with multiprocessing.Pool(processes=2) as pool:
        for a, b in a_pairs:
            res_1 = pool.apply_async(foo_pool, args=(a,))
            res_2 = pool.apply_async(foo_pool, args=(b,))
            final_list.extend([res_1.get(), res_2.get()])
    print(final_list)

if __name__ == '__main__':
    list_base = ['a','b','c','d']
    start = time.perf_counter()
    main(list_base)
    end = time.perf_counter()
    print(end - start)
I have added the print(getpid()) to foo_pool to show that you're actually using different processes. And I've used time to illustrate that, despite the time.sleep(2) in foo_pool, the two calls in each pair run concurrently: the four tasks finish in roughly 4 seconds rather than the 8 a sequential run would take.
I think you need shared strings between processes. They can be obtained from multiprocessing.Manager().
Your api_values.py should look like this:
import multiprocessing
import datetime
from ctypes import c_wchar_p

def add_hour(x, ret_str):
    ret_str.value = str(x) + ' - ' + datetime.datetime.now().strftime('%d/%m/%Y %H:%M')

def main(list_base):
    a = list_base
    a_pairs = [a[i:i+2] for i in range(0, len(a)-1, 2)]
    if (len(a) % 2) != 0:
        a_pairs.append([a[-1]])
    final_list = []
    manager = multiprocessing.Manager()
    for a, b in a_pairs:
        ret_str_a = manager.Value(c_wchar_p, "")
        ret_str_b = manager.Value(c_wchar_p, "")
        mp_1 = multiprocessing.Process(target=add_hour, args=(a, ret_str_a))
        mp_2 = multiprocessing.Process(target=add_hour, args=(b, ret_str_b))
        mp_1.start()
        mp_2.start()
        mp_1.join()
        mp_2.join()
        final_list.append(ret_str_a.value)
        final_list.append(ret_str_b.value)
    print(final_list)
Source: How to share a string amongst multiple processes using Managers() in Python?
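For comparison, the same "at most two simultaneous calls" constraint can also be expressed with concurrent.futures, which hands return values back directly with no Manager or pairing logic; a sketch reusing the add_hour above:

import datetime
import concurrent.futures

def add_hour(x):
    return str(x) + ' - ' + datetime.datetime.now().strftime('%d/%m/%Y %H:%M')

def main(list_base):
    # max_workers=2 caps the number of concurrent calls at two
    with concurrent.futures.ProcessPoolExecutor(max_workers=2) as executor:
        final_list = list(executor.map(add_hour, list_base))
    print(final_list)

if __name__ == '__main__':
    main(['a', 'b', 'c', 'd'])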

How can I use functional programming to make a generic method in python?

I would like to improve the way this code is written. Right now I have six methods that are almost copy-paste, only one line is changing. How can I make a generic method and depending on the property of the data input to change the calculations? I was thinking to use functional programming to achieve that, but I am not sure how to do it properly.
The method gets a dict object, which is then transformed into JSON. The mid variable stores a JSON with the midrate for a currency from an external API; it must be fetched before the for loop, otherwise the API would be called on every iteration, which slows down the process a lot. In the for loop, I iterate through the input data. The only difference between the methods is the calculation before inserting into the list, e.g. .append(mid_current - bankMSell).
def margin_to_exchange_rate_sell(data):
    j = data.to_JSON()
    list_p = []
    mid = midrate.get_midrate(j["fromCurrency"][0])
    for idx, val in enumerate(j['toCurrency']):
        try:
            mid_current = 1/get_key(mid, j['toCurrency'][idx])
            bankMSell = float(j['sellMargin'][idx])
            list_p.append(mid_current - bankMSell)
        except Exception as e:
            list_p.append(0)
            print(str(e))
    return list_p
Another one of the methods:
def margin_to_exchange_rate_buy(data):
    j = data.to_JSON()
    list_p = []
    mid = midrate.get_midrate(j["fromCurrency"][0])
    for idx, val in enumerate(j['toCurrency']):
        try:
            mid_current = 1/get_key(mid, j['toCurrency'][idx])
            bankMSell = float(j['sellMargin'][idx])
            list_p.append(mid_current + bankMSell)
        except Exception as e:
            list_p.append(0)
            print(str(e))
    return list_p
Indeed, there is a way to reduce code here with lambdas:
def margin_to_exchange_rate_sell(data):
    return margin_to_exchange_rate(data, lambda m, b: m - b)

def margin_to_exchange_rate_buy(data):
    return margin_to_exchange_rate(data, lambda m, b: m + b)

def margin_to_exchange_rate(data, operation):
    j = data.to_JSON()
    list_p = []
    mid = midrate.get_midrate(j["fromCurrency"][0])
    for idx, val in enumerate(j['toCurrency']):
        try:
            mid_current = 1/get_key(mid, j['toCurrency'][idx])
            bankMSell = float(j['sellMargin'][idx])
            list_p.append(operation(mid_current, bankMSell))
        except Exception as e:
            list_p.append(0)
            print(str(e))
    return list_p
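As an aside, the standard library's operator module already provides these two operations as named functions, so the lambdas can be swapped for operator.sub and operator.add if you prefer named callables:

import operator

def margin_to_exchange_rate_sell(data):
    return margin_to_exchange_rate(data, operator.sub)

def margin_to_exchange_rate_buy(data):
    return margin_to_exchange_rate(data, operator.add)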

Custom method for unlimited *args with set() function usage?

I am working on a project, and we have a lot of code like this:
# filtering fields are different from each other, please ignore the similarity below
def function1(self, param):
    list_x = Model1.objects.filter(foo=bar, bla=bla).values_list('field', flat=True)
    list_y = Model2.objects.filter(foo=bar, bla=bla).values_list('field', flat=True)
    lists_to_delete = set(list_x) - set(list_y)
    # here is the line with set() that needs to become a method
    self._delete(lists_to_delete)

def function2(self, param):
    list_z = Model3.objects.filter(foo=bar, bla=bla).values_list('field', flat=True)
    list_q = Model4.objects.filter(foo=bar, bla=bla).values_list('field', flat=True).distinct()
    list_w = Model5.objects.filter(foo=bar, bla=bla).values_list('field', flat=True)
    lists_to_delete = set(list_z) - set(list_q) - set(list_w)
    # here is the line with set() that needs to become a method
    self._delete(lists_to_delete)

...  # other functions continue like the above
So, as you can see, we have the same set() usage in many places, and I need to replace it with a custom method. I tried to write one like this:
def _get_deleted_lists(self, *args):
    value = set()
    for arg in args:
        value |= set(arg)
    return value
and the usage would change to:
lists_to_delete = self._get_deleted_lists(list_x, list_y, ...)
instead of this;
lists_to_delete = set(list_x) - set(list_y)
But my custom method does not return the same value as before. How can I achieve this?
The | operation on sets returns their union. What you want is the difference (-):
def _get_deleted_lists(*lists):
    if not lists:
        return set()
    result = set(lists[0])
    for l in lists[1:]:
        result -= set(l)
    return result
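A quick check with illustrative values, plus a variant: set.difference accepts any number of iterables, so the loop can be collapsed into a single call:

def _get_deleted_lists(*lists):
    if not lists:
        return set()
    first, *rest = lists
    return set(first).difference(*rest)

print(_get_deleted_lists([1, 2, 3], [2, 4], [3]))  # {1}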

Loop a function using the previous output value as input

I'm trying to query an API, but it only provides 100 records at a time along with an offset record, which I need to use to query the next 100 records. I can write a function to run my query, but I'm having trouble looping it so that the output of each call becomes the input of the next. Here is essentially what I want my loop to do:
def query(my_offset=None):
    page = at.get('Accounts', offset=my_offset)
    a = page['records']
    return str(page['offset'])
query()
query(query())
query(query(query(query())))
query(query(query(query(query()))))
query(query(query(query(query(query())))))
...
I'm guessing res can have a special value indicating no more rows were returned; if so, a while loop can be deployed:
res = query()
while True:
    res = query(res)
    if not res:
        break
You just rebind the result of the query to res and reuse it on every iteration.
Try collecting the results externally, and then call the function again:
results = []
MAX_ITERATIONS = 20
offset = None

def query(offset=None):
    page = at.get('Accounts', offset=offset)
    return page['records'], page['offset']

while len(results) <= MAX_ITERATIONS:
    result, offset = query(offset)
    results.append(result)
How are you returning the final results? Consider:
def query():
    offset = None
    a = []
    while True:
        page = at.get('Accounts', offset=offset)
        a.extend(page['records'])
        offset = page['offset']
        if not offset:
            break
    return a
which is really just Jim's answer while collecting and returning the page results
def query(result):
    # perform query
    return result

def end_query_condition(result):
    # if you want to continue the query:
    return True
    # if you want to stop:
    return False

continue_query = True
result = None
while continue_query:
    result = query(result)
    continue_query = end_query_condition(result)
You may simply make your function recursive, without the need for any loop:
def query(my_offset=None, records=None):
    records = records if records is not None else []
    page = at.get('Accounts', offset=my_offset)
    records.extend(page['records'])
    offset = page.get('offset')  # the last page carries no further offset
    if not offset:
        return records
    return query(str(offset), records)
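One caveat with the recursive version: CPython's default recursion limit is about 1000 frames, so for very deep paginations the while-loop answers above are the safer choice.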

how to make a function return every possible values

So I am writing this method, which works fine in terms of looking up the numbers, but it only returns the last value. Is there a way to make it return all the values from each iteration?
here is my code:
def searchPFAM():
    fileAddress = '/Volumes/interpro/data/Q14591.txt'
    start = None
    end = None
    with open(fileAddress, 'rb') as f:
        root = etree.parse(f)
        for lcn in root.xpath("/protein/match[@dbname='PFAM']/lcn"):  # find dbname == PFAM
            start = int(lcn.get("start"))  # if it is PFAM then look for start value
            end = int(lcn.get("end"))  # if it is PFAM then also look for end value
            print start, end
    return start, end
Do you mean something similar to this?
def do_something(fname):
    with open(fname, 'rb') as f:
        root = etree.parse(f)
        for lcn in root.xpath("/protein/match[@dbname='PFAM']/lcn"):  # find dbname == PFAM
            # Make slightly more robust
            try:
                start = int(lcn.get("start"))  # if it is PFAM then look for start value
                end = int(lcn.get("end"))  # if it is PFAM then also look for end value
                yield start, end
            except (TypeError, ValueError):
                pass  # start/end aren't usable as numbers; decide what to do here...

for start, end in do_something(fname):
    do_something_else(start, end)
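Since do_something is a generator, the (start, end) pairs are produced lazily as the loop consumes them; wrap the call in list(...) if you need to keep all the pairs around afterwards.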
You can create a list of tuples of start and end and just return the list at the end of your function.
Just modify your function to create and return a list of start,end tuples:
def searchPFAM():
    fileAddress = '/Volumes/interpro/data/Q14591.txt'
    start = None
    end = None
    result = []
    with open(fileAddress, 'rb') as f:
        root = etree.parse(f)
        for lcn in root.xpath("/protein/match[@dbname='PFAM']/lcn"):  # find dbname == PFAM
            start = int(lcn.get("start"))  # if it is PFAM then look for start value
            end = int(lcn.get("end"))  # if it is PFAM then also look for end value
            print start, end
            result.append((start, end))
    return result
A little less readable, but more compact and efficient, way to write it would be to use what is known as a "list comprehension", like this:
def searchPFAM():
    fileAddress = '/Volumes/interpro/data/Q14591.txt'
    with open(fileAddress, 'rb') as f:
        root = etree.parse(f)
        result = [(int(lcn.get("start")), int(lcn.get("end")))
                  for lcn in root.xpath("/protein/match[@dbname='PFAM']/lcn")]
    return result
Afterwards you can process the returned list like this:
for start, end in result:
    ...  # do something with each pair of int values
or
for i in xrange(len(result)):
    start, end = result[i][0], result[i][1]
    ...  # do something with each pair of int values
