I am trying to use multiprocessing to speed up my data processing. I am working on a machine with 6 cores, and I want to iterate through a table of 12 million rows; for each of these rows I iterate through several time steps, doing a calculation (executing a function).
This is the line I would like to split up so that it runs in parallel on different cores:
test = [rowiteration(i, output, ini_cols, cols) for i in a] # this should run in parallel
I tried something with
from multiprocessing import Pool
but I did not manage to pass the function's arguments together with the iterator.
I would appreciate any ideas. I am new to Python.
This is what I have:
import os  # needed for getcwd / path.join below
import pyreadr
import pandas as pd
import numpy as np
import time
from datetime import timedelta
import functools
from pathlib import Path

def read_data():
    current_path = os.getcwd()
    myfile = os.path.join(str(Path(current_path).parents[0]), 'dummy.RData')
    result = pyreadr.read_r(myfile)
    pc = result["pc"]
    u = result["u"]
    return pc, u
# add one column per time
def prepare_output_structure(pc):
    ini_cols = pc.columns
    pc = pc.reindex(columns=[*pc.columns, *np.arange(0, 11), 'cat'], fill_value=0)
    pc.reset_index(level=0, inplace=True)
    # print(pc.columns, pc.shape, pc.dtypes)
    return pc, ini_cols
def conjunction(*conditions):
    return functools.reduce(np.logical_and, conditions)
def timeloop(t_final: int, count_final: int, tipo):
    if tipo == 'A':
        count_ini = 35
    else:  # B
        count_ini = 30
    yy_list = []
    for t in np.arange(0, 11):
        yy = ((count_final - count_ini) / t_final) * t + count_ini
        yy_list.append(int(yy))
    return yy_list
def rowiteration(i, output, ini_cols, cols):
    c_2: bool = pc.loc[i, 'tipo'] == u.iloc[:, 0].str[:1]  # first character of category, e.g. 'A1'
    c_5: bool = pc.loc[i, 't_final'] >= u.iloc[:, 1]  # t_min (u)
    c_6: bool = pc.loc[i, 't_final'] <= (u.iloc[:, 2])  # t_max (u)
    pc.loc[i, 'cat'] = u[conjunction(c_2, c_5, c_6)].iloc[0, 0]
    pc.iloc[i, (0 + (len(ini_cols))+1):(10 + (len(ini_cols))+2)] = timeloop(int(pc.loc[i, 't_final']), int(pc.loc[i, 'count_final']), pc.loc[i, 'tipo'])
    out = pd.DataFrame(pc.iloc[i, :])
    out = pd.DataFrame(out.transpose(), columns=cols)
    output = output.append(out.iloc[0, :])
    return output
if __name__ == '__main__':
    start_time = time.time()
    pc, u = read_data()
    nrowpc = len(pc.index)
    a = np.arange(0, nrowpc)  # rows of table pc
    # print(a, nrowpc, len(pc.index))
    pc, ini_cols = prepare_output_structure(pc)
    cols = pc.columns
    output = pd.DataFrame()
    test = [rowiteration(i, output, ini_cols, cols) for i in a]  # this should run in parallel
    pc2 = pd.concat(test, ignore_index=True)
    pc2 = pc2.iloc[:, np.r_[5, (len(ini_cols)+1):(len(pc2.columns))]]
    print(pc2.head())
    elapsed_time_secs = time.time() - start_time
    msg = "Execution took: %s secs (Wall clock time)" % timedelta(seconds=elapsed_time_secs)
    print(msg)
Replace your [rowiteration(i, output, ini_cols, cols) for i in a] with:
from multiprocessing import Pool

n_cpu = 10  # set this to the number of CPU threads you want to use
with Pool(processes=n_cpu) as pool:
    ret = pool.starmap(rowiteration,
                       [(i, output, ini_cols, cols) for i in a])
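As a follow-up (reusing the names from the question): ret is simply the list of one-row DataFrames returned by rowiteration, so it can take the place of the old test list, and cpu_count() avoids hard-coding the worker count.
from multiprocessing import cpu_count

n_cpu = cpu_count()                      # the machine's logical core count instead of a fixed 10
pc2 = pd.concat(ret, ignore_index=True)  # ret plays the role of the old `test` list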
Here is an approach that I think solves the problem and only sends what is necessary to the worker processes. I haven't tested it as is (which would be difficult without the data your code reads in), but this is the basic idea:
import multiprocessing as mp

p = mp.Pool(processes=mp.cpu_count())

# Note that you already define the static cols and ini_cols
# in global scope so you don't need to pass them to the Pool.

# ... Other functions you've defined ...

def rowiteration(row):
    c_2: bool = row['tipo'] == u.iloc[:, 0].str[:1]
    c_5: bool = row['t_final'] >= u.iloc[:, 1]
    c_6: bool = row['t_final'] <= (u.iloc[:, 2])
    row['cat'] = u[conjunction(c_2, c_5, c_6)].iloc[0, 0]
    row[(0 + (len(ini_cols))+1):(10 + (len(ini_cols))+2)] = timeloop(int(row['t_final']), int(row['count_final']), row['tipo'])
    return row
out = []
for row in p.imap_unordered(rowiteration, [r for _, r in pc.iterrows()]):
    row.index = cols
    out.append(row)  # append the processed row (not the column labels)
pc2 = pd.DataFrame(out).reset_index(drop=True)
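One more practical note: with 12 million rows, handing the rows to the workers one at a time adds a fair amount of inter-process overhead, so it is usually worth passing a chunksize to imap_unordered. A variant of the loop above with batching (untested; the value 1000 is only a starting point to tune):
out = []
rows = (r for _, r in pc.iterrows())  # generator, so the 12M rows are not built into a list first
for row in p.imap_unordered(rowiteration, rows, chunksize=1000):
    row.index = cols
    out.append(row)
pc2 = pd.DataFrame(out).reset_index(drop=True)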
I'm trying to update a dataframe (self.df) with a column from a temp df (self.df_temp['linkedin_profile']) using the following class, but it doesn't seem to update anything.
The code:
import logging
import multiprocessing
from multiprocessing import Pool, Process

import numpy as np
import pandas as pd

class NameToSocialURLScraper:
    def __init__(self, csv_file_name, person_name_column, organization_name_column):
        self.proxy_list = PROXY_LIST
        pool = Pool()
        self.csv_file_name = csv_file_name
        self.person_name_column = person_name_column
        self.organization_name_column = organization_name_column
        self.df = pd.read_csv(csv_file_name)
        self.df_temp = pd.DataFrame()

    def internal_linkedin_job(self):
        self.df['linkedin_profile'] = np.nan
        self.df_temp['linkedin_profile'] = np.nan
        self.df_temp['linkedin_profile'] = self.df.apply(
            lambda row: term_scraper(
                str(row[self.person_name_column]) + " " + str(row[self.organization_name_column]), self.proxy_list,
                'link', output_generic=False), axis=1)
        self.df['linkedin_profile'] = self.df_temp['linkedin_profile']
        print(self.df.values)

    ...

    def multiprocess_job(self):
        multiprocessing.log_to_stderr(logging.DEBUG)
        linkedin_profile_proc = Process(target=self.internal_linkedin_job, args=())
        jobs = [linkedin_profile_proc]
        # Start the processes
        for j in jobs:
            j.start()
        # Ensure all of the processes have finished
        for j in jobs:
            j.join()
When printing inside internal_linkedin_job it shows the df with the new column 'linkedin_profile', but when I print after j.join() the column isn't there.
When doing multiprocessing, each process runs in its own memory space. You would need to refactor your code so that internal_linkedin_job returns the dataframe.
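One possible shape for that refactor, as an untested sketch reusing the names from the question (it assumes the instance and the values produced by term_scraper can be pickled): let the worker build and return a new dataframe instead of mutating self, then assign the returned frame in the parent process.
class NameToSocialURLScraper:
    # ... __init__ as in the question ...

    def internal_linkedin_job(self):
        df = self.df.copy()
        df['linkedin_profile'] = df.apply(
            lambda row: term_scraper(
                str(row[self.person_name_column]) + " " + str(row[self.organization_name_column]),
                self.proxy_list, 'link', output_generic=False),
            axis=1)
        return df  # the return value travels back to the parent by pickling

    def multiprocess_job(self):
        with multiprocessing.Pool(processes=1) as pool:
            self.df = pool.apply(self.internal_linkedin_job)  # the parent's self.df is updated here
The same effect is possible with a plain Process plus a multiprocessing Queue, but a Process target cannot return a value directly, so a pool call keeps the change smallest.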
I have the following code:
import awswrangler as wr
import multiprocessing as mp
import pandas as pd

global total_pds
total_pds = []

ksplit = wr.s3.list_objects(pred_path)
ksplit = list(ksplit)

def process(x):
    dk = wr.s3.read_parquet(path=pred_path + x, dataset=False)
    return dk

def log_result(result):
    print(len(total_pds), end=' ')
    total_pds.append(result)

def error_back(error):
    print('error', error)

pool = mp.Pool(processes=4, maxtasksperchild=10)
dcms_info = [pool.apply_async(process, args=(spl,), callback=log_result, error_callback=error_back) for spl in ksplit]
for x in dcms_info:
    x.wait()
pool.close()
pool.join()
dataset = pd.concat(total_pds, ignore_index=True)
The last element throws this error:
error("'i' format requires -2147483648 <= number <= 2147483647")
Thank you
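For what it's worth, that struct error means a number did not fit into a signed 32-bit field; with multiprocessing it typically appears when a single object sent between a worker and the parent pickles to more than roughly 2 GiB. A quick diagnostic sketch (reusing the process function and ksplit list from above, and assuming the last element is indeed the one that fails) to check the pickled size of that result:
import pickle

dk = process(ksplit[-1])  # read the suspect object directly in the parent, no pipe involved
payload = len(pickle.dumps(dk, protocol=pickle.HIGHEST_PROTOCOL))
print(payload, 'bytes when pickled; over the 32-bit limit:', payload > 2**31 - 1)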
I tried to use multiprocessing on this for loop:
def segment_features(segment_pixels):
    features = []
    npixels, nbands = segment_pixels.shape
    for b in range(nbands):
        stats = scipy.stats.describe(segment_pixels[:, b])
        band_stats = list(stats.minmax) + list(stats)[2:]
        if npixels == 1:
            # in this case the variance = nan, change it to 0.0
            band_stats[3] = 0.0
        features += band_stats
    return features
segment_ids = np.unique(segments)
objects = []
object_ids = []
for id in segment_ids:
    segment_pixels = img[segments == id]
    object_features = segment_features(segment_pixels)
    objects.append(object_features)
    object_ids.append(id)
By replacing the for loop section with this:
def segment_features_calc(segment_pixels):
    segment_pixels = img[segments == id]
    object_features = segment_features(segment_pixels)
    objects.append(object_features)
    object_ids.append(id)
    print("segment " + str(id) + " features calculated")

n = mp.cpu_count()

if __name__ == '__main__':
    p = mp.Pool(processes=n)
    start = time.time()
    async_result = p.map_async(segment_features_calc, list(segment_ids))
    p.close()
    p.join()
    print("Complete")
    end = time.time()
    print('total time (s)= ' + str(end - start))
However, the multiprocessing does not get executed properly (it stops after 0.17 seconds, even though there are 270,000+ segment IDs to loop over). Any insights on how to solve the issue?
I have an application in Tkinter.
Part of this application is a method:
It basically takes long lists of random values and checks whether the random values are inside a previously defined grid. Afterwards it writes them into another variable for export.
This is a rather long process, so I would like to multiprocess it.
I read some material about how to do that. The resulting code is below.
I've read around SO for anything that might be relevant. I am running an up-to-date Spyder with Python 3.7 as part of the Anaconda suite on both machines, all (at least the included) packages are up-to-date, and I've included the
if __name__ == '__main__':
line. I've also experimented with the indentation of
p.start()
and
processes.append(p)
but I simply can't get it to work.
def ParallelStuff(myIn1, myIn2, myIn3, myIn4, anotherIn1, anotherIn2, anotherIn3, return_dict, processIterator):
    tempOut1 = np.zeros(len(myIn1))  # myIn1, myIn2, myIn3 are of the same length
    tempOut2 = np.zeros(len(myIn1))
    tempOut3 = np.zeros(len(myIn1))
    bb = 0
    for i in range(len(myIn3)):
        xx = myIn3[i]
        yy = myIn4[i]
        hits = np.isin(anotherIn1, xx)
        goodY = anotherIn3[np.where(hits == 1)]
        if np.isin(yy, goodY):
            tempOut1[bb] = myIn1[i]
            tempOut2[bb] = myIn2[i]
            tempOut3[bb] = anotherIn3
            bb += 1
    return_dict[processIterator] = [tempOut1, tempOut2, tempOut3]
nCores = multiprocessing.cpu_count()
def export_Function(self):
    out1 = np.array([])
    out2 = np.array([])
    out3 = np.array([])
    for loop_one in range(0, N):
        # ...
        # stuff that works on both systems with only one core...
        # ... and on linux with all cores
        processes = []
        nTotal = int(len(xRand))
        if nTotal % nCores == 0:
            o = int(nTotal / nCores)
        else:
            o = int(nTotal / (nCores - 1))
        manager = multiprocessing.Manager()
        return_dict = manager.dict()
        for processIterator in range(nCores):
            offset = o * processIterator
            myIn1 = in1[offset: min(nTotal, offset + o)]
            myIn2 = in2[offset: min(nTotal, offset + o)]
            myIn3 = in3[offset: min(nTotal, offset + o)]
            myIn4 = in4[offset: min(nTotal, offset + o)]
            if __name__ == '__main__':
                p = multiprocessing.Process(target=ParallelStuff, args=(myIn1, myIn2, myIn3, myIn4, anotherIn1, anotherIn2, anotherIn3, return_dict, processIterator))
                p.start()
                processes.append(p)
        for p in range(len(processes)):
            processes[p].join()
            myOut1 = return_dict[p][0]
            myOut2 = return_dict[p][1]
            myOut3 = return_dict[p][2]
            out1 = np.concatenate((out1, myOut1[np.where(myOut1 != 0)]))
            out2 = np.concatenate((out2, myOut2[np.where(myOut2 != 0)]))
            out3 = np.concatenate((out3, myOut3[np.where(myOut3 != 0)]))
When I run my program on my Linux machine it does exactly what it's supposed to do: it distributes the work to all 8 cores, computes, concatenates the 3 results into the respective arrays, and exports.
When I run my program on my Windows machine, the application's window freezes, the process becomes inactive, a new kernel automatically opens, and a new window appears.