I have a dataframe and want to add a column by taking the first 3 digits of a base column, using multiprocessing.
Please see the Python code below:
import multiprocessing as mp
import pandas as pd
import numpy as np
data = pd.DataFrame({'employee':['Donald','Douglas','Jennifer','Michael','Pat','Susan','Hermann','Shelley','William',
'Steven','Neena','Lex','Alexander','Bruce','David','Valli','Diana','Nancy','Daniel','John'],
'PHONE_NUMBER':['650.507.9833','650.507.9844','515.123.4444','515.123.5555','603.123.6666',
'515.123.7777','515.123.8888','515.123.8080','515.123.8181','515.123.4567','515.123.4568',
'515.123.4569','590.423.4567','590.423.4568','590.423.4569','590.423.4560','590.423.5567',
'515.124.4569','515.124.4169','515.124.4269']})
# Part 3 - multiprocessing
def strip_digits(x):
    return str(x)[:3]

def city_code(x):
    x['start_digits'] = x['PHONE_NUMBER'].apply(strip_digits)
    return x

def parallelize(df, func):
    df_split = np.array_split(df, partitions)
    pool = mp.Pool(cores)
    df_retun = pd.concat(pool.map(func, df_split), ignore_index=True)
    pool.close
    return df_retun

if __name__ == '__main__':
    mp.set_start_method('spawn')
    cores = mp.cpu_count()
    partitions = cores
    df = parallelize(data, city_code)
    group_data = df.groupby(['start_digits'])
    group_size = group_data.size()
    print(group_data.get_group('515'))
I am getting various AttributeErrors. Please help me identify the error in the code. This is a sample dataframe; I want to do the same task on a large dataframe using multiprocessing.
Thanks in advance.
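For reference, a minimal sketch of the same pipeline that should run as a standalone .py script; the assumptions are that pool.close() is actually called, that cores and partitions are passed into parallelize rather than looked up as globals, and that the sample data is shortened. This is a sketch, not a definitive diagnosis of the AttributeError:
import multiprocessing as mp
import numpy as np
import pandas as pd

def strip_digits(x):
    # keep only the area code (first three characters)
    return str(x)[:3]

def city_code(chunk):
    chunk['start_digits'] = chunk['PHONE_NUMBER'].apply(strip_digits)
    return chunk

def parallelize(df, func, partitions, cores):
    df_split = np.array_split(df, partitions)
    with mp.Pool(cores) as pool:
        result = pd.concat(pool.map(func, df_split), ignore_index=True)
    return result

if __name__ == '__main__':
    data = pd.DataFrame({'employee': ['Donald', 'Jennifer', 'Nancy', 'Daniel'],
                         'PHONE_NUMBER': ['650.507.9833', '515.123.4444',
                                          '515.124.4569', '515.124.4169']})
    cores = mp.cpu_count()
    df = parallelize(data, city_code, partitions=cores, cores=cores)
    print(df.groupby('start_digits').size())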
Related question: How can I pass the dataframe's columns into a function for each row using multiprocessing or concurrent.futures?
Details:
For each row in df, I want to pass its columns leader and years into the function print_sentences(). I want to use the function in a parallel way where each row is printed asynchronously. For example, I want to make use of concurrent.futures.Executor.map.
It needs to be in Python 3.6.
Reprex: My actual problem is computationally demanding, so here is a simplified reprex:
import pandas as pd
import numpy as np
import concurrent.futures
df = pd.DataFrame(np.array([["Larry", 3, "Germany"], ["Jerry", 5, "Sweden"], ["George", 12, "UK"]]),
columns=['leader', 'years', 'score'])
def print_sentences(df):
    print(df["leader"] + " has been leader for " + df["years"] + " years")

print_sentences(df)
Background:
Other questions related to this issue seem to deal with object types other than a dataframe.
My specific issue begins when I read in a .csv as a dataframe. I want to pass this dataframe's columns, for each of its rows, into some function. My actual function (dramatically simplified for the reprex) is computationally demanding: it scrapes data and saves it to a .json. Each row therefore acts as a different query (inputting a different leader's name and score, for example).
To optimise this, I want the rows to map into the function in a parallel way.
I have simplified my problem with the reprex above.
Thanks for your help in advance.
Try this,
# Edits made to reflect your use case.
import pandas as pd
import numpy as np
from multiprocessing import cpu_count, Pool

cores = cpu_count()    # number of CPU cores on your system
partitions = cores     # define as many partitions as you want

def parallelize(data, func):
    data_split = np.array_split(data, partitions)
    pool = Pool(cores)
    data = pd.concat(pool.map(func, data_split))
    pool.close()
    pool.join()
    return data

def print_sentences(row):
    # index by column label so the column order does not matter
    print(row["leader"] + " has been leader for " + row["years"] + " years")

def process_chunk(chunk):
    # apply the row-wise function to one chunk of the dataframe
    chunk.apply(print_sentences, axis=1)
    return chunk

if __name__ == '__main__':
    df = pd.DataFrame(np.array([["Larry", 3, "Germany"], ["Jerry", 5, "Sweden"],
                                ["George", 12, "UK"]]),
                      columns=['leader', 'years', 'score'])
    data = parallelize(df.copy(), process_chunk)
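Since the question specifically mentions concurrent.futures.Executor.map (available in Python 3.6), here is a hedged sketch of the same idea using ProcessPoolExecutor, passing the two columns as parallel iterables; this is my own variant, not part of the answer above:
import concurrent.futures
import numpy as np
import pandas as pd

def print_sentence(leader, years):
    print(leader + " has been leader for " + years + " years")

if __name__ == '__main__':
    df = pd.DataFrame(np.array([["Larry", 3, "Germany"], ["Jerry", 5, "Sweden"],
                                ["George", 12, "UK"]]),
                      columns=['leader', 'years', 'score'])
    with concurrent.futures.ProcessPoolExecutor() as executor:
        # executor.map accepts several iterables; each call receives one row's values
        list(executor.map(print_sentence, df['leader'], df['years']))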
I need to generate 3000+ ndjson files from a pandas dataframe based on certain criteria. I tried running the following code; it works, but it takes a lot of time to finish.
def p_generate_files(result_df: pd.DataFrame, p_code: str) -> None:
    print(result_df.shape)
    tmp_df = result_df.filter(like=str(p_code), axis=0)
    start_date = tmp_df.index.unique(level='date').min().to_pydatetime().strftime('%b').upper()
    end_date = tmp_df.index.unique(level='date').max().to_pydatetime().strftime('%b').upper()
    file_name_path = f'data/CR-{p_code}-{start_date}-{end_date}-2000.json'
    tmp_df.reset_index(inplace=True)
    tmp_df.to_json(
        file_name_path,
        orient="records",
        index=True,
        lines=True)
    result_df.drop(labels=p_code, inplace=True)
I tried the following implementation of parallel processing but it doesn't seem to work. I have no experience with concurrent programming. Any help to speed up the processing is appreciated.
p_generate_files = partial(generate_files, result_df=big_df)
with concurrent.futures.ProcessPoolExecutor() as executor:
    executor.map(p_generate_files, p_codes)
Try multiprocessing; you have to set up the inputs as a list of tuples:
import multiprocessing as mp

def generate_files(result_df: pd.DataFrame, codes: list) -> None:
    # Your function
    ...

if __name__ == '__main__':
    cores = mp.cpu_count()
    args = [(df1, lst1), (df2, lst2), (df3, lst3) ...]
    with mp.Pool(processes=cores) as pool:
        results = pool.starmap(generate_files, args)
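For this particular case, a hedged sketch of how args could be built from p_codes; note that this ships a full copy of the dataframe to every worker, so it is only a starting point, and the data source and file name below are assumptions, simplified relative to the question:
import multiprocessing as mp
import pandas as pd

def generate_files(result_df: pd.DataFrame, p_code: str) -> None:
    # same idea as in the question: filter on p_code and write one ndjson file
    tmp_df = result_df.filter(like=str(p_code), axis=0)
    tmp_df.reset_index().to_json(f'data/CR-{p_code}.json', orient="records", lines=True)

if __name__ == '__main__':
    big_df = pd.read_pickle('big_df.pkl')        # assumed source of the dataframe
    p_codes = ['A1', 'B2', 'C3']                 # assumed list of codes
    args = [(big_df, code) for code in p_codes]  # one (dataframe, code) tuple per task
    with mp.Pool(processes=mp.cpu_count()) as pool:
        pool.starmap(generate_files, args)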
I am trying to return the results of a multiprocessing execution as a pandas DataFrame, so that I can see the values as a nice DataFrame grid (one separate grid per result) in a Jupyter notebook, like the example below.
import pandas as pd

def f1():
    t = [{"arrival_date": "2019-12-01", "asof_date": "2019-08-01"},
         {"arrival_date": "2019-12-01", "asof_date": "2019-08-01"}]
    return pd.DataFrame(t)

f1()
However, when I try to achieve the same thing with multiprocessing, the dataframe is not printed as a nice grid. It seems I am bound to print the return value (with a Python print statement), which splits up the columns while printing, and I have 30 rows to print.
Here is what I tried: using a shared variable to communicate and finally just evaluating 'df' to get the nice separate output. I get the error below:
My objective is to display it by evaluating 'df', not print(df). Is that not possible? I am not sure why it is returned properly in the first example.
import multiprocessing
import pandas as pd

def f1(i, t):
    t = [{"arrival_date": "2019-12-01", "asof_date": "2019-08-01"},
         {"arrival_date": "2019-12-01", "asof_date": "2019-08-01"}]
    return t

for i in range(2):
    manager = multiprocessing.Manager()
    t = manager.list()
    t_sub = multiprocessing.Process(target=f1, args=(i, t))
    t_sub.start()
    t_sub.join()

df = pd.DataFrame(t)
df
Also, for the code below, the dataframe is not returned from the multiprocessing call:
import multiprocessing
import pandas as pd

def f1(i, t):
    t = [{"arrival_date": "2019-12-01", "asof_date": "2019-08-01"},
         {"arrival_date": "2019-12-01", "asof_date": "2019-08-01"}]
    return pd.DataFrame(t)

for i in range(2):
    manager = multiprocessing.Manager()
    t = manager.list()
    t_sub = multiprocessing.Process(target=f1, args=(i, t))
    t_sub.start()
    t_sub.join()
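One hedged way to get the grid back is to let the workers return plain data through pool.map and then call IPython's display() on each assembled DataFrame; only the last expression in a cell is rendered automatically, so display() is needed inside a loop. This is a sketch (and, as noted further down in this thread, multiprocessing may need to live in a .py file rather than a notebook cell):
import multiprocessing
import pandas as pd
from IPython.display import display   # renders a DataFrame as the usual grid

def f1(i):
    # the worker returns plain data; the DataFrame is built in the parent
    return [{"arrival_date": "2019-12-01", "asof_date": "2019-08-01"},
            {"arrival_date": "2019-12-01", "asof_date": "2019-08-01"}]

if __name__ == '__main__':
    with multiprocessing.Pool() as pool:
        results = pool.map(f1, range(2))
    for t in results:
        display(pd.DataFrame(t))   # one grid per result, like evaluating df in a cell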
With a basic pandas df of financial market OHLCV data, I am trying to add numerous calculated columns to the df. The large number of columns and calculations is making this SLOW SLOW SLOW!
I'm trying to multiprocess with pool.map, but getting nowhere.
Ideally, each iteration of the loop should be sent to a discrete thread. Simplified moving averages are shown in the code below.
The simple dictionary and rolling mean shown work, but slowly; the pool.map attempt fails with:
TypeError: map() missing 1 required positional argument: 'iterable'
All help appreciated, thanks.
import pandas as pd
from multiprocessing.dummy import Pool as ThreadPool

#####################################################
# DJIA_OHLCV_test.csv has format:
# Date,Open,High,Low,Close,Adj Close,Volume
# 1/2/2015,17823.07031,17951.7793,17731.30078,17832.99023,17832.99023,76270000
# 1/3/2015,17823.07031,17951.7793,17731.30078,17832.99023,17832.99023,76270000
DJIA = pd.read_csv('DJIA_OHLCV_test.csv')

"""
#####################################################
# This works! Please comment out to switch.
MAdict = {'MA50': 50, 'MA100': 100, 'MA200': 200}  # define moving-average windows
for MAkey in MAdict:
    DJIA[('ma' + MAkey)] = pd.Series.rolling(DJIA['Adj Close'], window=MAdict[MAkey]).mean()
#####################################################
"""

# This doesn't work! Please comment out to switch.
MAdict = {'MA50': 50, 'MA100': 100, 'MA200': 200}
pool = ThreadPool(3)

def moving_average(MAkey):
    return pd.Series.rolling(DJIA['Adj Close'], window=MAdict[MAkey]).mean()

for MAkey in MAdict:
    DJIA[('ma' + MAkey)] = pool.map(moving_average(MAkey))
#####################################################
print(DJIA.tail())
pool.map is a blocking call that expects both a function and an iterable; in the loop above, moving_average(MAkey) is called directly and only its result is passed in, so map is missing its iterable argument. Instead of iterating over MAdict and calling pool.map each time, pass the function and the iterable of window sizes directly to pool.map:
import pandas as pd
from multiprocessing.dummy import Pool

def moving_average(ma):
    return pd.Series.rolling(djia['Adj Close'], window=ma).mean()

if __name__ == '__main__':
    N_WORKERS = 3
    MA_DICT = {'MA50': 50, 'MA100': 100, 'MA200': 200}

    djia = pd.read_csv('DJIA_OHLCV_test.csv')

    with Pool(N_WORKERS) as pool:
        results = pool.map(moving_average, iterable=MA_DICT.values())

    # concatenate results and rename columns
    results = pd.concat(results, axis=1)
    results.columns = ['ma' + key for key in MA_DICT]

    djia = pd.concat([djia, results], axis=1)
    print(djia.tail())
I'm trying to use multiprocessing with a pandas dataframe: split the dataframe into 8 parts and apply some function to each part using apply (with each part processed in a different process).
EDIT:
Here's the solution I finally found:
import multiprocessing as mp
import numpy as np
import pandas as pd
import pandas.util.testing as pdt

def process_apply(x):
    # do some stuff to data here
    return x

def process(df):
    res = df.apply(process_apply, axis=1)
    return res

if __name__ == '__main__':
    p = mp.Pool(processes=8)
    split_dfs = np.array_split(big_df, 8)
    pool_results = p.map(process, split_dfs)
    p.close()
    p.join()

    # merging parts processed by different processes
    parts = pd.concat(pool_results, axis=0)

    # merging newly calculated parts to big_df
    big_df = pd.concat([big_df, parts], axis=1)

    # checking if the dfs were merged correctly
    pdt.assert_series_equal(parts['id'], big_df['id'])
You can use https://github.com/nalepae/pandarallel, as in the following example:
from pandarallel import pandarallel
from math import sin

pandarallel.initialize()

def func(x):
    return sin(x**2)

df.parallel_apply(func, axis=1)
A more generic version, based on the author's solution, that allows you to run it with any function and dataframe:
from multiprocessing import Pool
from functools import partial
import numpy as np
import pandas as pd

def parallelize(data, func, num_of_processes=8):
    data_split = np.array_split(data, num_of_processes)
    pool = Pool(num_of_processes)
    data = pd.concat(pool.map(func, data_split))
    pool.close()
    pool.join()
    return data

def run_on_subset(func, data_subset):
    return data_subset.apply(func, axis=1)

def parallelize_on_rows(data, func, num_of_processes=8):
    return parallelize(data, partial(run_on_subset, func), num_of_processes)
So the following line:
df.apply(some_func, axis=1)
Will become:
parallelize_on_rows(df, some_func)
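A hypothetical usage example, assuming the definitions above live in the same module (the column names and the row function are made up for illustration):
import pandas as pd

def add_total(row):
    # assumed columns "a" and "b"; returns the modified row
    row["total"] = row["a"] + row["b"]
    return row

if __name__ == '__main__':
    df = pd.DataFrame({"a": range(8), "b": range(8)})
    df = parallelize_on_rows(df, add_total, num_of_processes=4)
    print(df.head())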
This is some code that I found useful. It automatically splits the dataframe into as many chunks as you have CPU cores.
import pandas as pd
import numpy as np
import multiprocessing as mp

def parallelize_dataframe(df, func):
    num_processes = mp.cpu_count()
    df_split = np.array_split(df, num_processes)
    with mp.Pool(num_processes) as p:
        df = pd.concat(p.map(func, df_split))
    return df

def parallelize_function(df):
    # replace 'column_output' and 'column_input' with your own column names
    df['column_output'] = df['column_input'].apply(example_function)
    return df

def example_function(x):
    x = x * 2
    return x
To run:
df_output = parallelize_dataframe(df, parallelize_function)
This worked well for me:
rows_iter = (row for _, row in df.iterrows())

with multiprocessing.Pool() as pool:
    df['new_column'] = pool.map(process_apply, rows_iter)
Since I don't have much of your data script, this is a guess, but I'd suggest using p.map instead of apply_async with the callback.
p = mp.Pool(8)
pool_results = p.map(process, np.array_split(big_df, 8))
p.close()
p.join()

results = []
for result in pool_results:
    results.extend(result)
To use all (physical or logical) cores, you could try mapply as an alternative to swifter and pandarallel.
You can set the number of cores (and the chunking behaviour) upon init:
import pandas as pd
import mapply

mapply.init(n_workers=-1)

def process_apply(x):
    # do some stuff to data here
    return x

def process(df):
    # spawns a pathos.multiprocessing.ProcessPool if sensible
    res = df.mapply(process_apply, axis=1)
    return res
By default (n_workers=-1), the package uses all physical CPUs available on the system. If your system uses hyper-threading (usually twice the number of physical CPUs will show up), mapply will spawn one extra worker to prioritise the multiprocessing pool over other processes on the system.
You could also use all logical cores instead (beware that CPU-bound processes will then be fighting for physical CPUs, which might slow down your operation):
import multiprocessing
n_workers = multiprocessing.cpu_count()
# or more explicit
import psutil
n_workers = psutil.cpu_count(logical=True)
I also ran into the same problem when I used multiprocessing.map() to apply a function to different chunks of a large dataframe.
I just want to add a couple of points in case other people run into the same problem:
remember to add if __name__ == '__main__':
execute the code in a .py file; if you use IPython/Jupyter notebook, you cannot run multiprocessing (this was true in my case, though I have no clue why). A minimal skeleton is sketched below.
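A minimal skeleton of that setup, as a sketch (the chunk function is a placeholder):
# my_script.py -- run with `python my_script.py`, not inside a notebook cell
import multiprocessing as mp
import numpy as np
import pandas as pd

def process(chunk):
    # placeholder: do your per-chunk work here
    return chunk

if __name__ == '__main__':   # required so worker processes can re-import this module safely
    big_df = pd.DataFrame({'x': range(100)})
    with mp.Pool(mp.cpu_count()) as pool:
        big_df = pd.concat(pool.map(process, np.array_split(big_df, mp.cpu_count())))
    print(big_df.shape)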
Install pyxtension, which simplifies using parallel map, and use it like this:
from pyxtension.streams import stream
big_df = pd.concat(stream(np.array_split(df, multiprocessing.cpu_count())).mpmap(process))
I ended up using concurrent.futures.ProcessPoolExecutor.map in place of multiprocessing.Pool.map, which took 316 microseconds for some code that took 12 seconds in serial.
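A hedged sketch of that pattern applied to dataframe chunks (the function and column name are illustrative, not from the original answer):
import concurrent.futures
import numpy as np
import pandas as pd

def process(chunk):
    # example per-chunk work; replace with your own logic
    chunk = chunk.copy()
    chunk['doubled'] = chunk['value'] * 2
    return chunk

if __name__ == '__main__':
    df = pd.DataFrame({'value': range(1_000)})
    chunks = np.array_split(df, 8)
    with concurrent.futures.ProcessPoolExecutor() as executor:
        df = pd.concat(executor.map(process, chunks))
    print(df.head())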
Python's pool.starmap() method can be used to succinctly introduce parallelism also to apply use cases where column values are passed as arguments, i.e. to cases like:
df.apply(lambda row: my_func(row["col_1"], row["col_2"], ...), axis=1)
Full example and benchmarking:
import time
from multiprocessing import Pool

import numpy as np
import pandas as pd

def mul(a, b, c):
    # For illustration, could obviously be vectorized
    return a * b * c

df = pd.DataFrame(np.random.randint(0, 100, size=(10_000_000, 3)), columns=list('ABC'))

# Standard apply
start = time.time()
df["mul"] = df.apply(lambda row: mul(row["A"], row["B"], row["C"]), axis=1)
print(f"Standard apply took {time.time() - start:.0f} seconds.")

# Starmap apply
start = time.time()
with Pool(10) as pool:
    df["mul_pool"] = pool.starmap(mul, zip(df["A"], df["B"], df["C"]))
print(f"Starmap apply took {time.time() - start:.0f} seconds.")

pd.testing.assert_series_equal(df["mul"], df["mul_pool"], check_names=False)
>>> Standard apply took 72 seconds.
>>> Starmap apply took 5 seconds.
This has the benefit of not relying on external libraries, plus being very readable.
Tom Raz's answer (https://stackoverflow.com/a/53135031/11847090) misses an edge case where there are fewer rows in the dataframe than processes.
Use this parallelize method instead:
def parallelize(data, func, num_of_processes=8):
    # Check whether the number of rows is less than the number of processes,
    # to avoid the following error:
    # ValueError: Expected a 1D array, got an array with shape
    num_rows = len(data)
    if num_rows == 0:
        return None
    elif num_rows < num_of_processes:
        num_of_processes = num_rows
    data_split = np.array_split(data, num_of_processes)
    pool = Pool(num_of_processes)
    data = pd.concat(pool.map(func, data_split))
    pool.close()
    pool.join()
    return data
I also used a dask bag to multithread this instead of this custom code; a minimal sketch of that follows.
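A minimal sketch of the dask.bag variant under the same assumptions (a placeholder chunk-wise function applied to dataframe splits, run on the threaded scheduler):
import multiprocessing
import dask.bag as db
import numpy as np
import pandas as pd

def func(chunk):
    # placeholder per-chunk work
    return chunk

df = pd.DataFrame({'id': range(100)})
chunks = np.array_split(df, multiprocessing.cpu_count())
big_df = pd.concat(db.from_sequence(chunks).map(func).compute(scheduler='threads'))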