use multiprocessing to implement a function in python

I am using a function that takes too much time to finish, since it takes a large input and uses two nested for loops.
The code of the function:
def transform(self, X):
    global brands
    result = []
    for x in X:
        index = 0
        count = 0
        for brand in brands:
            all_matches = re.findall(re.escape(brand), x, flags=re.I)
            count_all_match = len(all_matches)
            if count_all_match > count:
                count = count_all_match
                index = brands.index(brand)
        result.append([index])
    return np.array(result)
So how can I change this function so that it uses multiprocessing in order to reduce the running time?

I don't see self being used in the transform method, so I made it a plain function.
import re
import numpy as np
from concurrent.futures import ProcessPoolExecutor

def transformer(x):
    global brands
    index = 0
    count = 0
    for brand in brands:
        all_matches = re.findall(re.escape(brand), x, flags=re.I)
        count_all_match = len(all_matches)
        if count_all_match > count:
            count = count_all_match
            index = brands.index(brand)
    return [index]

def transform(X):
    with ProcessPoolExecutor() as executor:
        result = executor.map(transformer, X)
        return np.array(list(result))
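One caveat, hedged: on Windows and recent macOS the default start method is spawn, so each worker re-imports the module and the global brands list must exist at import time (or be passed in explicitly), and the call should sit behind an if __name__ == '__main__': guard. A minimal sketch along those lines; the brands list and sample inputs here are hypothetical placeholders:

import re
import numpy as np
from concurrent.futures import ProcessPoolExecutor
from functools import partial

# Hypothetical example data; replace with the real brand list.
brands = ["acme", "globex", "initech"]

def transformer(x, brands):
    # Return the index of the brand with the most case-insensitive matches in x.
    index = 0
    count = 0
    for i, brand in enumerate(brands):
        matches = len(re.findall(re.escape(brand), x, flags=re.I))
        if matches > count:
            count = matches
            index = i
    return [index]

def transform(X, brands):
    with ProcessPoolExecutor() as executor:
        # partial() ships the brand list to the workers along with each x.
        return np.array(list(executor.map(partial(transformer, brands=brands), X)))

if __name__ == '__main__':
    X = ["I bought an ACME anvil", "globex acquired initech", "no brand here"]
    print(transform(X, brands))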

Related

Concurrent.futures not parallelizing loop iterations

I am trying to use concurrent.futures to run a function across multiple threads in order to speed up the code.
I have read the documentation and this guide, but I believe I may not be doing this correctly. This MRE should allow us to test a number of different string lengths and list sizes to compare performance:
import pandas as pd, tqdm, string, random
from thefuzz import fuzz, process
from concurrent.futures import ThreadPoolExecutor

def generate_string(items=10, lengths=5):
    return [''.join(random.choice(string.ascii_letters) for i in range(lengths))] * items

def matching(a, b):
    matches = {}
    scorers = {'token_sort_ratio': fuzz.token_sort_ratio, 'token_set_ratio': fuzz.token_set_ratio,
               'partial_token_sort_ratio': fuzz.partial_token_sort_ratio,
               'Quick': fuzz.QRatio, 'Unicode Quick': fuzz.UQRatio,
               'Weighted': fuzz.WRatio, 'Unweighted': fuzz.UWRatio}
    for x in tqdm.tqdm(a):
        best = 0
        for _, scorer in scorers.items():
            res = process.extractOne(x, b, scorer=scorer)
            if res[1] > best:
                best = res[1]
                matches[x] = res
            else:
                continue
    return matches

list_a = generate_string(100, 10)
list_b = generate_string(10, 5)

with ThreadPoolExecutor(max_workers=5) as executor:
    future = executor.submit(matching, list_a, list_b)
This code runs with no error; how can I use multiple workers to execute these loops in parallel so that the code will run faster?
Thanks to a hint from @Anentropic, I was able to use the following change with multiprocessing:
import os
from multiprocessing import Pool

if __name__ == '__main__':
    list_a = generate_string(500, 10)
    list_b = generate_string(500, 10)
    pool = Pool(os.cpu_count() - 2)
    # Note: this assumes matching() has been adapted to take a single (a, b) argument.
    res = pool.map(matching, zip(list_a, list_b))
    norm_res = matching([list_a, list_b])
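For completeness, a hedged sketch of one way to spread the original matching(a, b) loop over processes without changing its signature: split list_a into chunks, score each chunk against the full list_b in a worker, then merge the partial dicts. It assumes matching() and generate_string() from the snippet above are defined at module level (with the unguarded top-level calls removed); the chunk sizes and worker counts are illustrative.

import os
from functools import partial
from multiprocessing import Pool

def parallel_matching(list_a, list_b, workers=None):
    workers = workers or max(os.cpu_count() - 1, 1)
    # Split list_a into roughly equal chunks, one per worker.
    chunk_size = max(len(list_a) // workers, 1)
    chunks = [list_a[i:i + chunk_size] for i in range(0, len(list_a), chunk_size)]
    merged = {}
    with Pool(workers) as pool:
        # Each worker runs the unmodified matching() on its chunk vs. the full list_b.
        for partial_result in pool.map(partial(matching, b=list_b), chunks):
            merged.update(partial_result)
    return merged

if __name__ == '__main__':
    list_a = generate_string(100, 10)
    list_b = generate_string(10, 5)
    print(parallel_matching(list_a, list_b))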

Python, Casting a list from a set changes the order. What is the best way to avoid this?

Given this python code snippet:
import numpy as np

rng = np.random.default_rng(42)

class Agent:
    def __init__(self, id):
        self.id = id
        self.friends = set()

    def __repr__(self):
        return str(self.id)

group_list = list()
for i in range(100):
    new_obj = Agent(i)
    group_list.append(new_obj)

for person in group_list:
    pool = rng.choice([p for p in group_list if p != person], 6)
    for p in pool:
        person.friends.add(p)

def set_to_list_ordered(a_set):
    return sorted(list(a_set), key=lambda x: x.id)

print("This will change: ")
print(rng.choice(list(group_list[0].friends), 2))
print("This will not change: ")
print(rng.choice(set_to_list_ordered(group_list[0].friends), 2))
The purpose of this code is to perform a random extraction of 2 elements from a set. The problem is that the choice function does not accept a set, so you have to turn it into a list. But doing this, the order of the elements is arbitrary, and given the same seed, the result of the random extraction is not reproducible. In this case I implemented a function that sorts the elements, but it is costly.
You will rightly say: use a list instead of a set. To this I reply that sets fit my use case perfectly. For example, this structure guarantees that the Agent.friends attribute has no duplicate elements.
So, my question is: what is the most convenient way, other than the function I implemented, to use sets and still make the random extraction from a set deterministic? Is it better to use lists instead of sets? Is there any way to make the conversion deterministic?
Thanks in advance.
EDIT:
Some observe that, within a single run, the set-to-list conversion is consistent. My objective is for it to be consistent across runs as well, so that running the same script numerous times gives the same extraction from the default_rng instance.
You can use an ordered set, e.g. the ordered-set package.
From its documentation:
>>> from ordered_set import OrderedSet
>>> OrderedSet('abracadabra')
OrderedSet(['a', 'b', 'r', 'c', 'd'])
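Applied to the example above, a minimal sketch (assuming the ordered-set package is installed): making friends an OrderedSet keeps insertion order, and since insertions are driven by the seeded generator, the resulting list order is the same on every run.

import numpy as np
from ordered_set import OrderedSet

rng = np.random.default_rng(42)

class Agent:
    def __init__(self, id):
        self.id = id
        # OrderedSet keeps insertion order while still rejecting duplicates.
        self.friends = OrderedSet()

    def __repr__(self):
        return str(self.id)

group_list = [Agent(i) for i in range(100)]
for person in group_list:
    pool = rng.choice([p for p in group_list if p != person], 6)
    for p in pool:
        person.friends.add(p)

# list() now reflects insertion order, which is reproducible under the same seed.
print(rng.choice(list(group_list[0].friends), 2))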
Solved by overriding the __hash__() method. Source: https://www.youtube.com/watch?v=C4Kc8xzcA68
import numpy as np

rng = np.random.default_rng(42)

class Agent:
    def __init__(self, id):
        self.id = id
        self.friends = set()

    def __repr__(self):
        return str(self.id)

    def __hash__(self):
        return self.id

group_list = list()
for i in range(100):
    new_obj = Agent(i)
    group_list.append(new_obj)

for person in group_list:
    pool = rng.choice([p for p in group_list if p != person], 6)
    for p in pool:
        person.friends.add(p)

def set_to_list_ordered(a_set):
    return sorted(list(a_set), key=lambda x: x.id)

print("This will change: ")
print(rng.choice(list(group_list[0].friends), 2))
print("This will not change: ")
print(rng.choice(set_to_list_ordered(group_list[0].friends), 2))

multiprocessing a function that returns a list in python

Given this example:
I want to distribute the calculation over a list (map)
from multiprocessing import Pool

def fun(elem):
    lis = []
    for i in range(0, elem):
        lis.append(i)
    return lis

p = Pool(3)
p.map(fun, [2, 3, 4])
How can I make something like this work?
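For what it's worth, a minimal sketch of how this can work largely as-is: p.map already collects each worker's returned list, so the main things to add are an if __name__ == '__main__': guard (needed with the spawn start method) and capturing the return value.

from multiprocessing import Pool

def fun(elem):
    # Each worker builds and returns its own list; Pool pickles it back to the parent.
    return list(range(elem))

if __name__ == '__main__':
    with Pool(3) as p:
        result = p.map(fun, [2, 3, 4])
    print(result)  # [[0, 1], [0, 1, 2], [0, 1, 2, 3]]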

python filter + multiprocessing + iterator lazy loading

I have a 2-dimensional array which produces a huge (>300 GB) list of combinations, so I'd like to iterate lazily over the iterator produced by itertools.combinations and parallelize this operation. The problem is that I need to filter the output, and multiprocessing does not provide a parallel filter. My existing workaround requires loading the list of combinations into memory, which also doesn't work because of the size of the list.
import itertools
import numpy as np
import scipy.stats
from multiprocessing import Pool

n_nodes = np.random.randn(10, 100)
cutoff = 0.3

def node_combinations(nodes):
    return itertools.combinations(list(range(len(nodes))), 2)

def pfilter(func, candidates):
    return np.asarray([c for c, keep in zip(candidates, pool.map(func, candidates)) if keep])

def pearsonr(xy: tuple):
    correlation_coefficient = scipy.stats.pearsonr(n_nodes[xy[0]], n_nodes[xy[1]])[0]
    if correlation_coefficient >= cutoff:
        return True
    else:
        return False

pool = Pool()
edgelist = pfilter(pearsonr, node_combinations(n_nodes))
I'm looking for a way to do lazy evaluation of a large iterator using multiprocessing with filter instead of map.
The following uses a Semaphore to slow down the over-eager pool thread. It is not a proper solution, as it doesn't fix other issues, such as nested loops that share the same pool and iterate over the result of imap having all of the outer loop's jobs finish before any of the inner loop's jobs even get to start. But it does limit the memory usage:
import threading

def slowdown(n=16):
    s = threading.Semaphore(n)

    def inner(it):
        for item in it:
            s.acquire()
            yield item

    def outer(it):
        for item in it:
            s.release()
            yield item

    return outer, inner
This is used to wrap pool.imap as such:
outer, inner = slowdown()
outer(pool.imap(func, inner(candidates)))
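A hedged, self-contained sketch of how this wrapper could be wired up end to end; the func and candidates names come from the fragment above, while is_even is a hypothetical stand-in predicate and the pool size and semaphore limit are assumptions:

import threading
from multiprocessing import Pool

def slowdown(n=16):
    # At most n items sit between inner() (feeding the pool) and outer()
    # (consuming results), which is what bounds memory use.
    s = threading.Semaphore(n)

    def inner(it):
        for item in it:
            s.acquire()
            yield item

    def outer(it):
        for item in it:
            s.release()
            yield item

    return outer, inner

def is_even(x):
    # Stand-in predicate; pearsonr from the question would go here.
    return x % 2 == 0

if __name__ == '__main__':
    candidates = range(10_000)
    outer, inner = slowdown()
    with Pool(4) as pool:
        flags = outer(pool.imap(is_even, inner(candidates)))
        # imap preserves order, so each flag lines up with its candidate;
        # filtering happens lazily in the parent as results stream in.
        kept = [c for c, keep in zip(candidates, flags) if keep]
    print(len(kept))  # 5000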
Hoxha's suggestion works fine -- thanks!
@Dan the issue is that even empty lists take up memory, which times 42 billion pairings is nearly 3 TB of memory.
Here's my implementation:
import more_itertools
import itertools
import multiprocessing as mp
import numpy as np
import scipy.stats
from tqdm import tqdm

n_nodes = np.random.randn(10, 100)
num_combinations = (n_nodes.shape[0] ** 2 - n_nodes.shape[0]) // 2
cpu_count = 8
cutoff = 0.3

def node_combinations(nodes):
    return itertools.combinations(list(range(len(nodes))), 2)

def edge_gen(xy_iterator: type(itertools.islice)):
    edges = []
    for cand in tqdm(xy_iterator, total=num_combinations // cpu_count):
        if pearsonr(cand):
            edges.append(cand)
    return edges

def pearsonr(xy: tuple):
    correlation_coefficient = scipy.stats.pearsonr(n_nodes[xy[0]], n_nodes[xy[1]])[0]
    if correlation_coefficient >= cutoff:
        return True
    else:
        return False

slices = more_itertools.distribute(cpu_count, node_combinations(n_nodes))
pool = mp.Pool(cpu_count)
results = list(pool.imap(edge_gen, slices))
pool.close()
pool.join()
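An alternative sketch, my own assumption rather than something from this thread: instead of shipping every candidate pair to the workers, ship only the first index of each pair and let the worker enumerate its own partners, so the full pair list never exists in any single process and the parent only holds the surviving edges. The seeded data and pool size are illustrative.

import numpy as np
import scipy.stats
from multiprocessing import Pool

# Seeded so every worker re-creates the same array under the spawn start method.
rng = np.random.default_rng(0)
n_nodes = rng.standard_normal((10, 100))
cutoff = 0.3

def edges_for_row(i):
    # Each worker receives only an integer and generates its own (i, j) pairs.
    edges = []
    for j in range(i + 1, n_nodes.shape[0]):
        if scipy.stats.pearsonr(n_nodes[i], n_nodes[j])[0] >= cutoff:
            edges.append((i, j))
    return edges

if __name__ == '__main__':
    with Pool() as pool:
        results = pool.imap_unordered(edges_for_row, range(n_nodes.shape[0]))
        edgelist = [edge for chunk in results for edge in chunk]
    print(len(edgelist))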

Create a copy of an object rather than reinitialising inside of a new multiprocessing process

This code shows the structure of what I am trying to do.
import multiprocessing
from foo import really_expensive_to_compute_object

## Create a really complicated object that is *hard* to initialise.
T = really_expensive_to_compute_object(10)

def f(x):
    return T.cheap_calculation(x)

P = multiprocessing.Pool(processes=64)
results = P.map(f, range(1000000))
print(results)
The problem is that each process starts by spending a lot of time recalculating T instead of using the original T that was computed once. Is there a way to prevent this? T has a fast (deep) copy method, so can I get Python to use that instead of recalculating?
The multiprocessing documentation suggests:
Explicitly pass resources to child processes
So your code can be rewritten to something like this:
import multiprocessing
import time
import functools

class really_expensive_to_compute_object(object):
    def __init__(self, arg):
        print('expensive creation')
        time.sleep(3)

    def cheap_calculation(self, x):
        return x * 2

def f(T, x):
    return T.cheap_calculation(x)

if __name__ == '__main__':
    ## Create a really complicated object that is *hard* to initialise.
    T = really_expensive_to_compute_object(10)
    ## helper, to pass expensive object to function
    f_helper = functools.partial(f, T)
    # I've reduced the count for tests
    P = multiprocessing.Pool(processes=4)
    results = P.map(f_helper, range(100))
    print(results)
Why not have f take a T parameter instead of referencing the global, and do the copies yourself?
import multiprocessing, copy
from foo import really_expensive_to_compute_object

## Create a really complicated object that is *hard* to initialise.
T = really_expensive_to_compute_object(10)

def f(t, x):
    return t.cheap_calculation(x)

P = multiprocessing.Pool(processes=64)
# Pool.map only takes one iterable, so pair each deep copy with its argument
# and use starmap to unpack the (t, x) tuples.
results = P.starmap(f, zip((copy.deepcopy(T) for _ in range(1000000)), range(1000000)))
print(results)
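For what it's worth, another common pattern (not from either answer above, and assuming it is acceptable to build T once per worker rather than once per task) is a Pool initializer: each worker process constructs the object a single time and reuses it for every task it handles.

import multiprocessing
import time

class really_expensive_to_compute_object(object):
    def __init__(self, arg):
        print('expensive creation')
        time.sleep(3)

    def cheap_calculation(self, x):
        return x * 2

T = None  # set once per worker by init_worker()

def init_worker(arg):
    global T
    # Runs once in each worker process, not once per task.
    T = really_expensive_to_compute_object(arg)

def f(x):
    return T.cheap_calculation(x)

if __name__ == '__main__':
    with multiprocessing.Pool(processes=4, initializer=init_worker, initargs=(10,)) as P:
        results = P.map(f, range(100))
    print(results)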
