How to preserve changes to pandas DataFrame when using multiprocessing module? - python

I have a pandas DataFrame and I need to modify this based on some other data that I'm reading off of many files. To speed this up, I'm trying to do this in parallel using the built-in multiprocessing module.
I have a function that modifies the DataFrame. When called manually, this works fine and the DataFrame changes are preserved. When called in parallel, the changes are not preserved.
I know this is somehow related to the SettingWithCopyWarning, but I can't figure out how to work around the issue.
How can I preserve the changes when using the multiprocessing module?
I've created an MWE that illustrates the problem that I'm having.
import multiprocessing

import pandas as pd


class Data:
    def __init__(self):
        self.data = pd.DataFrame(
            {
                "ZAID": ["1001.00c", "92235.00nc"],
                "ZA": [1001, 92235],
                "T(K)": [293.6, 293.6]
            }
        )
        # This is the data that I don't know when creating the DataFrame
        # The content will be filled later using addData
        columns = {"NE": int, "length": int}
        for name, dtype in columns.items():
            self.data[name] = pd.Series(dtype=dtype)

    def addData(self, index):
        """
        add data to an index of the data frame
        """
        # read file and extract data
        NE = 1200
        length = 1234
        self.data.loc[index, "NE"] = NE
        self.data.loc[index, "length"] = length


if __name__ == "__main__":
    print("Learning pandas")
    dd = Data()
    dd.addData(0)  # This call *does* save the addition
    with multiprocessing.Pool(2) as pool:
        # This call *does not* save the additions
        pool.map(dd.addData, dd.data.index)
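For context on why the parallel additions are lost: multiprocessing.Pool pickles dd and hands a copy to each worker process, so addData mutates the copy inside the child and the parent's DataFrame never sees the change (it is not a SettingWithCopyWarning problem). A minimal sketch of one common workaround, assuming the Data class above: have the workers return their computed values and write them back in the parent process. compute_row is a hypothetical helper, not part of the original MWE.
import multiprocessing

# Hypothetical worker: compute the values for one index and return them
# instead of mutating the DataFrame inside the worker process.
def compute_row(index):
    # read file and extract data (placeholder values, as in the MWE)
    NE = 1200
    length = 1234
    return index, NE, length

if __name__ == "__main__":
    dd = Data()
    with multiprocessing.Pool(2) as pool:
        for index, NE, length in pool.map(compute_row, dd.data.index):
            # Assign in the parent process, so the changes are preserved
            dd.data.loc[index, "NE"] = NE
            dd.data.loc[index, "length"] = length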

Related

Pandas dataframe as a property of a class: setter not called when columns changed

I want to have a pandas DataFrame as a property of my class. I want users to be able to interact with the DataFrame in a normal way, but when it changes I want to run a few checks and balances over the resultant data to make sure it remains valid. I thought I would be able to achieve this using the @property decorator, however this doesn't appear to work when a column of the dataframe (as opposed to the entire dataframe) is changed. An example demonstrating what I mean is below.
My question: is there an alternative way to implement what I want? The only thing I can think of is creating a new class inheriting from DataFrame... which I'd rather not do. Also, the desire to monitor a dataframe whenever it gets updated seems like a broader issue than just my own case.
import pandas as pd


class DemoClass:
    def __init__(self, data_frame):
        self.data = data_frame
        self._check_data_frame_is_valid()

    @property
    def data(self):
        return self._data

    @data.setter
    def data(self, new_data):
        self._data = new_data
        self._check_data_frame_is_valid()

    def _check_data_frame_is_valid(self):
        # example check:
        allowed_columns = ['one', 'two']
        for column in self.data:
            assert column in allowed_columns
            assert self.data[column].sum() < 10
        print('data passed checks')


if __name__ == '__main__':
    example_data = pd.DataFrame({'one': [1,2,3], 'two': [1,2,3]})
    class_instance = DemoClass(example_data)
    class_instance.data = example_data        # this works; @data.setter is triggered
    class_instance.data['three'] = [4,5,6]    # this does not work, and the data is not checked
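For completeness, a minimal sketch of the inheritance route the question mentions (and would rather avoid): a DataFrame subclass whose __setitem__ re-runs the checks after every column assignment. CheckedFrame and _check_valid are invented names, and the checks are just the ones from the example above.
import pandas as pd

class CheckedFrame(pd.DataFrame):
    @property
    def _constructor(self):
        # Keep pandas operations returning CheckedFrame rather than DataFrame
        return CheckedFrame

    def __setitem__(self, key, value):
        # Column assignment goes through here, so validate after every change
        super().__setitem__(key, value)
        self._check_valid()

    def _check_valid(self):
        allowed_columns = ['one', 'two']
        for column in self.columns:
            assert column in allowed_columns
            assert self[column].sum() < 10
        print('data passed checks')

if __name__ == '__main__':
    cf = CheckedFrame({'one': [1, 2, 3], 'two': [1, 2, 3]})
    cf['two'] = [1, 1, 1]    # triggers the checks
    cf['three'] = [4, 5, 6]  # raises AssertionError: 'three' is not an allowed column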

Create and append pandas dummy variables with pipe

I am trying to create a Pandas pipeline that creates dummy variables and appends the columns to the existing dataframe.
Unfortunately I can't get the appended columns to stick when the pipeline is finished.
Example:
def function(df):
    pass

def create_dummy(df):
    a = pd.get_dummies(df['col'])
    b = df.append(a)
    return b

def mah_pipe(df):
    (df.pipe(function)
       .pipe(create_dummy)
       .pipe(print))
    return df

print(mah_pipe(df))
First - I have no idea if this is good practice.
What's weird is that the .pipe(print) prints the dataframe with appended columns. Yay.
But the statement print(mah_pipe(df)) does not. I thought they would behave the same way.
I have tried to read the documentation about pd.pipe but I couldn't figure it out.
Hoping someone could help shed some light on what's going on.
This is because print in Python returns None. Since you are not making a copy of df in your pipes, your df dies after the print.
pipes in Pandas
In pandas we expect (df) -> [pipe1] -> (df_1) -> [pipe2] -> (df_2) -> ... -> [pipeN] -> (df_N). By having print as the last pipe, the output is None.
Solution
...
def start_pipe(dataf):
    # make a copy to avoid modifying the original
    dataf = dataf.copy()
    return dataf

def create_dummies(dataf, column_name):
    dummies = pd.get_dummies(dataf[column_name])
    dataf[dummies.columns] = dummies
    return dataf

def print_dataf(dataf, n_rows=5):
    print(dataf.head(n_rows))
    return dataf  # this is important

# usage
...
dt = (df
      .pipe(start_pipe)
      .pipe(create_dummies, column_name='a')
      .pipe(print_dataf, n_rows=10)
      )

def mah_pipe(df):
    df = (df
          .pipe(start_pipe)
          .pipe(create_dummies, column_name='a')
          .pipe(print_dataf, n_rows=10)
          )
    return df

print(mah_pipe(df))

Python: Joblib for multiprocessing

So I have these given functions:
def make_event_df(match_id, path):
    '''
    Function for making event dataframe.
    Arguments:
        match_id -- int, the required match id for which event data will be constructed.
        path -- str, path to .json file containing event data.
    Returns:
        df -- pandas dataframe, the event dataframe for the particular match.
    '''
    ## read in the json file
    event_json = json.load(open(path, encoding='utf-8'))
    ## normalize the json data
    df = json_normalize(event_json, sep='_')
    return df

def full_season_events(comp_name, match_df, match_ids, path):
    '''
    Function to make event dataframe for a full season.
    Arguments:
        comp_name -- str, competition name + season name
        match_df -- pandas dataframe, containing match-data
        match_ids -- list, list of match ids.
        path -- str, path to directory where the .json files are located,
                e.g. '../input/Statsbomb/data/events'
    Returns:
        event_df -- pandas dataframe, containing event data for the whole season.
    '''
    ## init an empty dataframe
    event_df = pd.DataFrame()
    for match_id in tqdm(match_ids, desc=f'Making Event Data For {comp_name}'):
        ## .json file
        temp_path = path + f'/{match_id}.json'
        temp_df = make_event_df(match_id, temp_path)
        event_df = pd.concat([event_df, temp_df], sort=True)
    return event_df
Now I am running this piece of code to get the dataframe:
comp_id = 11
season_id = 1
path = f'../input/Statsbomb/data/matches/{comp_id}/{season_id}.json'
match_df = get_matches(comp_id, season_id, path)
comp_name = match_df['competition_name'].unique()[0] + '-' + match_df['season_name'].unique()[0]
match_ids = list(match_df['match_id'].unique())
path = f'../input/Statsbomb/data/events'
event_df = full_season_events(comp_name, match_df, match_ids, path)
The above code snippet is giving me this output:
Making Event Data For La Liga-2017/2018: 100%|██████████| 36/36 [00:29<00:00, 1.20it/s]
How can I make use of multiprocessing to make the process faster, i.e. how can I use the match_ids in full_season_events() to grab the data from the JSON files in parallel? I am very new to joblib and the multiprocessing concept. Can someone tell me what changes I have to make in these functions to get the required results?
You don't need joblib here, just plain multiprocessing will do.
I'm using imap_unordered since it's faster than imap or map, but it doesn't retain order (each worker can receive and submit jobs out of order). Not retaining order doesn't matter here, since you're passing sort=True to concat anyway.
Because I'm using imap_unordered, there's some extra job finagling needed: there's no istarmap_unordered that would unpack parameters, so we need to do it ourselves.
If you have many match_ids, things can be sped up by passing e.g. chunksize=10 to imap_unordered; it means each worker process will be fed 10 jobs at a time, and they will also return 10 results at a time. It's faster since less time is spent on process synchronization and serialization, but on the other hand the TQDM progress bar will update less often (see the variation after the code below).
As usual, the code below is dry-coded and might not work OOTB.
import json
import multiprocessing

import pandas as pd
from pandas import json_normalize  # import location depends on your pandas version
from tqdm import tqdm

def make_event_df(job):
    # Unpack parameters from job tuple
    match_id, path = job
    with open(path) as f:
        event_json = json.load(f)
    # Return the match id (if required) and the result.
    return (match_id, json_normalize(event_json, sep="_"))

def full_season_events(comp_name, match_df, match_ids, path):
    event_df = pd.DataFrame()
    with multiprocessing.Pool() as p:
        # Generate job tuples
        jobs = [(match_id, path + f"/{match_id}.json") for match_id in match_ids]
        # Run & get results from multiprocessing generator
        for match_id, temp_df in tqdm(
            p.imap_unordered(make_event_df, jobs),
            total=len(jobs),
            desc=f"Making Event Data For {comp_name}",
        ):
            event_df = pd.concat([event_df, temp_df], sort=True)
    return event_df
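As a small variation, this is how the chunksize suggestion from earlier would look (chunksize is a real imap_unordered parameter; the value 10 is just the example from the text above):
# Drop-in replacement for the loop inside the `with multiprocessing.Pool() as p:`
# block above, feeding each worker 10 jobs at a time.
for match_id, temp_df in tqdm(
    p.imap_unordered(make_event_df, jobs, chunksize=10),
    total=len(jobs),
    desc=f"Making Event Data For {comp_name}",
):
    event_df = pd.concat([event_df, temp_df], sort=True)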

Chaining output between different functions

I'm looking for the name of the pattern where the output of one function is passed on through several others (I'm struggling to find better words for my problem). Some pseudo/actual code would be really helpful.
I have written the following code:
def read_data():
    read data from a file
    create df
    return df

def parse_data():
    sorted_df = read_data()
    count lines
    sort by date
    return sorted_df

def add_new_column():
    new_column_df = parse_data()
    add new column
    return new_column_df

def create_plot():
    plot_data = add_new_column()
    create a plot
    display chart
What I'm trying to understand is how to skip a function, e.g. to create the following chain: read_data() -> parse_data() -> create_plot().
As the code looks right now (because of how the return values are passed between functions), skipping a step requires me to change the input data in the last function, create_plot().
I suspect that I'm structuring the code incorrectly.
Any thoughts?
Original code:
import pandas as pd
import matplotlib.pyplot as plt

# Read csv files into a data frame
def read_data():
    raw_data = pd.read_csv('C:/testdata.csv', sep=',', engine='python', encoding='utf-8-sig').replace({'{':'', '}':'', '"':'', ',':' '}, regex=True)
    return raw_data

def parse_data(raw_data):
    ...
    # Convert CreationDate column into datetime
    raw_data['CreationDate'] = pd.to_datetime(raw_data['CreationDate'], format='%Y-%m-%d %H:%M:%S', errors='coerce')
    raw_data.sort_values(by=['CreationDate'], inplace=True, ascending=True)
    parsed_data = raw_data
    return parsed_data

raw_data = read_data()
parsed = parse_data(raw_data)
Pass the data in instead of just effectively "nesting" everything. Any data that a function requires should ideally be passed in to the function as a parameter:
def read_data():
    read data from a file
    create df
    return df

def parse_data(sorted_df):
    count lines
    sort by date
    return sorted_df

def add_new_column(new_column_df):
    add new column
    return new_column_df

def create_plot(plot_data):
    create a plot
    display chart

df = read_data()
parsed = parse_data(df)
added = add_new_column(parsed)
create_plot(added)
Try to make sure functions are only handling what they're directly responsible for. It isn't parse_data's job to know where the data is coming from or to produce the data, so it shouldn't be worrying about that. Let the caller handle that.
The way I have things set up here is often referred to as "piping" or "threading". Information "flows" from one function into the next. In a language like Clojure, this could be written as:
(-> (read-data)
    (parse-data)
    (add-new-column)
    (create-plot))
This uses the threading macro ->, which frees you from having to handle the data passing manually. Unfortunately, Python doesn't have anything built in to do this, although it can be achieved using external modules.
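As a rough illustration of what such a helper could look like in plain Python, here is a hand-rolled sketch using functools.reduce (not taken from any particular external module; thread_first is an invented name):
from functools import reduce

def thread_first(value, *funcs):
    # Pass value through each function in turn, like Clojure's -> macro
    return reduce(lambda acc, f: f(acc), funcs, value)

# usage, assuming the functions defined above:
# create_plot(add_new_column(parse_data(read_data()))) becomes
# thread_first(read_data(), parse_data, add_new_column, create_plot)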
Also note that since DataFrames are mutable, you don't actually need to return the altered ones from the functions. If you're just mutating the argument directly, you could pass the same data frame to each of the functions in order instead of placing it in intermediate variables like parsed and added. The way I'm showing things here is a general way to set them up, but it can be altered depending on your exact use case.
Use a class to contain your code:
class DataManipulation:
    def __init__(self, path):
        self.df = pd.DataFrame()
        self.read_data(path)

    @staticmethod
    def new(file_path):
        return DataManipulation(file_path)

    def read_data(self, path):
        read data from a file
        self.df = create df

    def parse_data(self):
        use self.df
        count lines
        sort by date
        return self

    def add_new_column(self):
        use self.df
        add new column
        return self

    def create_plot(self):
        use self.df
        create a plot
        display chart
        return self
And then,
d = DataManipulation.new(filepath).parse_data().add_new_column().create_plot()
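A tiny runnable version of that fluent pattern, with placeholder column logic invented purely for illustration (each method returns self, so the calls chain left to right):
import pandas as pd

class DataManipulation:
    def __init__(self, df):
        self.df = df

    def parse_data(self):
        # placeholder: sort by date
        self.df = self.df.sort_values('date')
        return self

    def add_new_column(self):
        # placeholder: add a running counter column
        self.df['count'] = range(len(self.df))
        return self

d = DataManipulation(pd.DataFrame({'date': ['2021-02-01', '2021-01-01']}))
print(d.parse_data().add_new_column().df)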

sending each looped pandas calculation to a different thread (python3.6.5) with pool.map

With a basic pandas df of financial market OHLCV data, I am trying to add numerous calculated columns to the df. The large number of columns and calculations is making this SLOW SLOW SLOW!
I'm trying to multiprocess with pool.map, but getting nowhere.
Ideally, each iteration of the loop should be sent to a discrete thread. Simplified moving averages are used in the code below.
The simple dictionary and rolling-mean version shown works, but slowly; the pool.map version fails with:
TypeError: map() missing 1 required positional argument: 'iterable'
All help appreciated-thx
import pandas as pd
from multiprocessing.dummy import Pool as ThreadPool

#####################################################
# DJIA_OHLCV_test.csv has format:
# Date,Open,High,Low,Close,Adj Close,Volume
# 1/2/2015,17823.07031,17951.7793,17731.30078,17832.99023,17832.99023,76270000
# 1/3/2015,17823.07031,17951.7793,17731.30078,17832.99023,17832.99023,76270000

DJIA = pd.read_csv('DJIA_OHLCV_test.csv')

"""
#####################################################
# # This works! please comment out to switch
# MAdict = {'MA50':50, 'MA100':100, 'MA200':200}  # Define Moving Average Windows
# for MAkey in MAdict:
#     DJIA[('ma' + MAkey)] = pd.Series.rolling(DJIA['Adj Close'], window=MAdict[MAkey]).mean()
#####################################################
"""

# This doesn't work! please comment out to switch
MAdict = {'MA50':50, 'MA100':100, 'MA200':200}
pool = ThreadPool(3)

def moving_average(MAkey):
    return pd.Series.rolling(DJIA['Adj Close'], window=MAdict[MAkey]).mean()

for MAkey in MAdict:
    DJIA[('ma' + MAkey)] = pool.map(moving_average(MAkey))

#####################################################
print(DJIA.tail())
pool.map is a blocking call and expects both a function and an iterable (hence the TypeError), so instead of iterating over MAdict and calling pool.map once per key, pass the iterable directly as an argument to pool.map:
import pandas as pd
from multiprocessing.dummy import Pool

def moving_average(ma):
    return pd.Series.rolling(djia['Adj Close'], window=ma).mean()

if __name__ == '__main__':
    N_WORKERS = 3
    MA_DICT = {'MA50':50, 'MA100':100, 'MA200':200}

    djia = pd.read_csv('DJIA_OHLCV_test.csv')

    with Pool(N_WORKERS) as pool:
        results = pool.map(moving_average, iterable=MA_DICT.values())

    # concatenate results and rename columns
    results = pd.concat(results, axis=1)
    results.columns = ['ma' + key for key in MA_DICT]

    djia = pd.concat([djia, results], axis=1)
    print(djia.tail())
