I am calculating a grouped row-wise moving average on a large data set, but the process takes too long on a single thread. How can I speed it up efficiently?
Please find a reproducible example below:
dataframe = pd.DataFrame({'id': range(2),
                          'group_id': range(2),
                          'Date_1_F1': [1, 2],
                          'Date_2_F1': [2, 4],
                          'Date_3_F1': [3, 6],
                          'Date_4_F1': [4, 8],
                          'Date_1_F2': [2, 11],
                          'Date_2_F2': [6, 13],
                          'Date_3-F2': [10, 15],
                          'Date_4_F2': [14, 17]})
dataframe
   id  group_id  Date_1_F1  ...  Date_2_F2  Date_3-F2  Date_4_F2
0   0         0          1  ...          6         10         14
1   1         1          2  ...         13         15         17
I have a function that returns the (row-wise) smoothed version of the dataset.
def smooth_ts(dataframe, ma_parameter=2):
    dataframe = (dataframe
                 .set_index(["id", "group_id"])
                 .groupby(lambda x: x.split("_")[-1], axis=1, group_keys=False)
                 .apply(lambda x: x.rolling(ma_parameter, axis=1)
                                   .mean()
                                   .dropna(axis=1, how='all')))
    dataframe.reset_index(inplace=True)
    return dataframe
smoothed_df = smooth_ts(dataframe)
Thank you very much
You could (1) melt your data frame with pd.melt, (2) create the grouping variable, (3) sort, group, and aggregate with rolling(2).mean(), and then use df.pivot to get the data back into the required shape. In this approach there is a single apply step, which can be parallelized using swifter. Here is an example:
import pandas as pd
import numpy as np
import swifter
dataframe = pd.DataFrame({'id': range(2),
                          'group_id': range(2),
                          'Date_1_F1': [1, 2],
                          'Date_2_F1': [2, 4],
                          'Date_3_F1': [3, 6],
                          'Date_4_F1': [4, 8],
                          'Date_1_F2': [2, 11],
                          'Date_2_F2': [6, 13],
                          'Date_3-F2': [10, 15],
                          'Date_4_F2': [14, 17]})
df_melted = pd.melt(dataframe, id_vars=['id', 'group_id'])
# Use next line if you want to parallelize the apply method
# df_melted['groups'] = df_melted['variable'].str.split('_').swifter.apply(lambda v: v[-1])
df_melted['groups'] = df_melted['variable'].str.split('_').apply(lambda v: v[-1])
df_melted = df_melted.sort_values(['id', 'group_id', 'groups'])
df_tmp = df_melted.copy()
df_tmp['rolling_val'] = df_tmp.groupby(['id', 'group_id', 'groups'])['value'].rolling(2).mean().values
df_tmp.pivot(index=['id', 'group_id'], columns='variable', values='rolling_val').dropna(axis=1).reset_index().rename_axis(None, axis=1)
If you want to stick to your approach, you can accelerate it with the Pool object from the multiprocessing library, which parallelizes mapping a function over an iterable.
import pandas as pd
import numpy as np
from multiprocessing import Pool
dataframe = pd.DataFrame({'id': range(2),
                          'group_id': range(2),
                          'Date_1_F1': [1, 2],
                          'Date_2_F1': [2, 4],
                          'Date_3_F1': [3, 6],
                          'Date_4_F1': [4, 8],
                          'Date_1_F2': [2, 11],
                          'Date_2_F2': [6, 13],
                          'Date_3-F2': [10, 15],
                          'Date_4_F2': [14, 17]})
dataframe
def smooth_ts(dataframe, ma_parameter=2):
    dataframe = (dataframe
                 .set_index(["id", "group_id"])
                 .groupby(lambda x: x.split("_")[-1], axis=1, group_keys=False)
                 .apply(lambda x: x.rolling(ma_parameter, axis=1)
                                   .mean()
                                   .dropna(axis=1, how='all')))
    dataframe.reset_index(inplace=True)
    return dataframe
# 2 splits => number of chunks
id_chunks = np.array_split(dataframe.id.unique(), 2)
# list containing the chunked data frames
df_chunks = [dataframe[dataframe['id'].isin(i)] for i in id_chunks]
# apply smooth_ts to each chunk in parallel; two processes because there are
# only two chunks here -- for more chunks, increase the pool size
with Pool(2) as p:
    dfs_chunks = p.map(smooth_ts, df_chunks)
pd.concat(dfs_chunks).reset_index(drop=True)
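One caveat worth noting (an assumption about how you run this, not part of the original code): if this is executed as a script on a platform that uses the spawn start method (Windows, and macOS by default on recent Python versions), the Pool section has to live under an if __name__ == "__main__": guard, roughly like this:

# everything that touches the Pool goes under the main guard
if __name__ == "__main__":
    id_chunks = np.array_split(dataframe.id.unique(), 2)
    df_chunks = [dataframe[dataframe['id'].isin(i)] for i in id_chunks]
    with Pool(2) as p:
        dfs_chunks = p.map(smooth_ts, df_chunks)
    result = pd.concat(dfs_chunks).reset_index(drop=True)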
I have a piece of code like this:
import pandas as pd
data = {
    'col1': [17, 2, 3, 4, 5, 5, 10, 22, 31, 11, 65, 86],
    'col2': [6, 7, 8, 9, 10, 31, 46, 12, 20, 37, 91, 32],
    'col3': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
}
df = pd.DataFrame(data)
sampling_period = 3
abnormal_data = set()
for i in range(sampling_period):
    # get indexes [0, 3, 6, 9, ...], [1, 4, 7, 10, ...], and [2, 5, 8, 11, ...]
    df_sampled = df[i::sampling_period]
    diff = df_sampled - df_sampled.shift(1)
    # a column with any diff >= 5 is considered abnormal
    abnormal_df = df_sampled[
        diff >= 5
    ].dropna(how="all", axis=1)
    abnormal_data = abnormal_data.union(set(abnormal_df.columns))
print(f"abnormal_data: {abnormal_data}")
The code above does the following:
Sample all the columns of df based on sampling_period.
If the difference between two consecutive elements of df_sampled is greater than or equal to 5, mark that column as abnormal.
Return the abnormal columns.
Is there any way to avoid the for loop in this code? It takes a long time to run when sampling_period and df become large, and I would like it to run faster.
For example, when my sampling_period is 60, and df.shape is (20040, 3562), it takes about 683 seconds to run the above code.
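For reference, one way to drop the Python loop entirely is a single grouped diff keyed on the row position modulo sampling_period (a sketch of the idea, not benchmarked on data of your size):

import numpy as np

# rows i, i + sampling_period, i + 2*sampling_period, ... fall into the same group,
# so grouping by position modulo sampling_period reproduces df[i::sampling_period]
grouped_diff = df.groupby(np.arange(len(df)) % sampling_period).diff()

# a column is abnormal if any within-group difference is >= 5
abnormal_data = set(df.columns[(grouped_diff >= 5).any()])
print(f"abnormal_data: {abnormal_data}")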
I have a dataset that contains the history of a specific tag between a start and end date. I am trying to compare rows based on a date column: if rows share the same month, day, and year, I add the value from the next column to a temporary list; once I have all the items for that date, I take the min and max of the list, subtract them, append the result to another list, and empty the temp list to start over.
For the sake of time and simplicity, I am presenting an example as a 2D list. Here is my example data:
dataset = [[1,5],[1,6],[1,10],[1,23],[2,4],[2,8],[2,12],[3,10],[3,20],[3,40],[4,50],[4,500]]
where the first column acts as the date and the second as the value.
The issue I am having:
I can't seem to compare every row based on its first column so that the value in the second column is collected into the temp list for the min/max operations.
Based on the above 2D list I would expect to get [18, 8, 30, 450], but the result is [5, 4, 10].
dataset = [[1,5],[1,6],[1,10],[1,23],[2,4],[2,8],[2,12],[3,10],[3,20],[3,40],[4,50],[4,500]]
temp_list = []
daily_total = []
for i in range(len(dataset) - 1):
    if dataset[i][0] == dataset[i + 1][0]:
        temp_list.append(dataset[i][1])
    else:
        max_ = max(temp_list)
        min_ = min(temp_list)
        total = max_ - min_
        daily_total.append(total)
        temp_list = []
print([x for x in daily_total])
Try:
tmp = {}
for d, v in dataset:
    # collect all values sharing the same date key
    tmp.setdefault(d, []).append(v)
# spread (max - min) per date
out = [max(v) - min(v) for v in tmp.values()]
print(out)
Prints:
[18, 8, 30, 450]
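This works because plain dicts preserve insertion order (Python 3.7+), so the spreads come out in date order. The same idea can be sketched with collections.defaultdict:

from collections import defaultdict

tmp = defaultdict(list)
for d, v in dataset:
    # group values by their date key
    tmp[d].append(v)
out = [max(v) - min(v) for v in tmp.values()]
print(out)  # [18, 8, 30, 450]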
Here is a solution using pandas:
import pandas as pd
dataset = [
[1, 5],
[1, 6],
[1, 10],
[1, 23],
[2, 4],
[2, 8],
[2, 12],
[3, 10],
[3, 20],
[3, 40],
[4, 50],
[4, 500],
]
df = pd.DataFrame(dataset)
df.columns = ["date", "value"]
df = df.groupby("date").agg(min_value=("value", "min"), max_value=("value", "max"))
df["res"] = df["max_value"] - df["min_value"]
df["res"].to_list()
Output:
[18, 8, 30, 450]
I can use np.select to insert a new column and set its values for a single DataFrame. But when I combine both DataFrames, np.select no longer works; it looks like an index/length error.
import pandas as pd
import numpy as np
df = pd.DataFrame([[3, 2, 1],[4, 5, 6]], columns=['col1','col2','col3'], index=['a','b'])
df2 = pd.DataFrame([[14, 15, 16],[17, 16, 15]], columns=['col1','col2','col3'], index=['c','e'])
count = df.append(df2)
print(count)
conditions = [
    (df["col1"] >= df["col2"]) & (df["col2"] >= df["col3"]),
]
choices = [100]
count["col4"] = np.select(conditions,choices, default='WHAT')
count
On the single DataFrame this succeeds. After combining the frames, it fails with:
ValueError: Length of values does not match length of index
I think there is a typo in your code when it comes to count vs. df. The following code works fine:
import pandas as pd
import numpy as np
df = pd.DataFrame([[3, 2, 1],[4, 5, 6]], columns=['col1','col2','col3'], index=['a','b'])
df2 = pd.DataFrame([[14, 15, 16],[17, 16, 15]], columns=['col1','col2','col3'], index=['c','e'])
count = df.append(df2)
print(count)
conditions = [
    (count["col1"] >= count["col2"]) & (count["col2"] >= count["col3"]),
]
print(conditions)
choices = [100]
count["col4"] = np.select(conditions,choices, default='WHAT')
count
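A side note, going beyond the original answer: DataFrame.append was removed in pandas 2.0, so on newer versions the same fix can be written with pd.concat, roughly like this:

import pandas as pd
import numpy as np

df = pd.DataFrame([[3, 2, 1], [4, 5, 6]], columns=['col1', 'col2', 'col3'], index=['a', 'b'])
df2 = pd.DataFrame([[14, 15, 16], [17, 16, 15]], columns=['col1', 'col2', 'col3'], index=['c', 'e'])
# concatenate instead of the removed DataFrame.append
count = pd.concat([df, df2])
# build the conditions on the combined frame so the mask length matches its index
conditions = [
    (count["col1"] >= count["col2"]) & (count["col2"] >= count["col3"]),
]
choices = [100]
count["col4"] = np.select(conditions, choices, default='WHAT')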
Suppose we have a data frame (df) with a large number of rows (1,600,000 x 4). We also have a list of lists such as this one:
inx = [[1,2],[4,5], [8,9,10], [15,16]]
For every list in inx, we need to calculate the mean of the first and third columns of the data frame and the median of the second and fourth columns. For example, for the first list in inx we do this for the first and second rows and replace both rows with a single new row containing the results of these calculations. What is the fastest way to do this?
import numpy as np
import pandas as pd
df = pd.DataFrame(np.array([[1, 2, 3, 3], [4, 5, 6, 1], [7, 8, 9, 3], [1, 1, 1, 1]]), columns=['a', 'b', 'c', 'd'])
   a  b  c  d
0  1  2  3  3
1  4  5  6  1
2  7  8  9  3
3  1  1  1  1
The output for just the first list inside of inx ([1,2]) will be something like this:
   a    b    c    d
0  1    2    3    3
1  5.5  6.5  7.5  2
3  1    1    1    1
As you can see, we don't change the first row (0) because it's not in the list. After that, we do the same for [4,5]. We don't change anything in row 3 either, because it's not in the list. inx is a large list of lists (more than 100,000 elements).
EDIT: NEW APPROACH AVOIDING LOOPS
Below you find an approach that relies on pandas and avoids loops.
After generating some fake data of the same size as yours, I basically create a list of group labels from your inx list of rows; i.e., with your inx being:
[[2,3], [5,6,7], [10,11], ...]
the created list is:
[[1,1], [2,2,2], [3,3], ...]
After that, this list is flattened and added to the original dataframe to mark the groups of rows to operate on.
After the calculations, the resulting dataframe is joined back with the original rows that don't need any calculation (in my example above, rows [0, 1, 4, 8, 9, ...]).
You will find more comments in the code.
At the end of the answer I also leave my previous approach, for the record.
On my box, the old algorithm involving a loop takes more than 18 minutes... unbearable! Using pandas only, it takes less than half a second!! Pandas is great!
import pandas as pd
import numpy as np
import random
# Prepare some fake data to test
data = np.random.randint(0, 9, size=(160000, 4))
df = pd.DataFrame(data, columns=['a', 'b', 'c', 'd'])
inxl = random.sample(range(1, 160000), 140000)
inxl.sort()
inx=[]
while len(inxl) > 3:
    i = random.randint(2, 3)
    l = inxl[0:i]
    inx.append(l)
    inxl = inxl[i:]
inx.append(inxl)
# flatten inx (used below)
flat_inx = [item for sublist in inx for item in sublist]
# for each element (list) in inx create equivalent list (same length)
# of increasing ints. They'll be used to group corresponding rows
gr=[len(sublist) for sublist in inx]
t = list(zip(gr, range(1, len(inx)+1)))
group_list = [a*[b] for (a,b) in t]
# the group labels are flattened as well
flat_group_list = [item for sublist in group_list for item in sublist]
# create a new dataframe to mark rows to group retaining
# original index for each row
df_groups = pd.DataFrame({'groups': flat_group_list}, index=flat_inx)
# and join the group dataframe to the original df
df['groups'] = df_groups
# rows not belonging to a group are marked with 0
df['groups']=df['groups'].fillna(0)
# save rows not belonging to a group for later
df_untouched = df[df['groups'] == 0]
df_untouched = df_untouched.drop('groups', axis=1)
# new dataframe containg only rows belonging to a group
df_to_operate = df[df['groups']>0]
df_to_operate = df_to_operate.assign(ind=df_to_operate.index)
# at last, we group the rows according to original inx
df_grouped = df_to_operate.groupby('groups')
# calculate mean and median
# for each group we retain the index of first row of group
df_operated = df_grouped.agg({'a': 'mean',
                              'b': 'median',
                              'c': 'mean',
                              'd': 'median',
                              'ind': 'first'})
# set correct index on dataframe
df_operated = df_operated.set_index('ind')
# finally, join the previous dataframe with the saved
# dataframe of rows which don't need calculations
df_final = df_operated.combine_first(df_untouched)
OLD ALGO, TOO SLOW FOR SO MUCH DATA
This algorithm, though it gives a correct result, involves a loop and takes too long for such a large amount of data:
import numpy as np
import pandas as pd

df = pd.DataFrame(np.array([[1, 2, 3, 3], [4, 5, 6, 1], [7, 8, 9, 3], [1, 1, 1, 1]]), columns=['a', 'b', 'c', 'd'])
inx = [[1,2]]
for l in inx:
    means = df.iloc[l][['a', 'c']].mean()
    medians = df.iloc[l][['b', 'd']].median()
    df.iloc[l[0]] = pd.DataFrame([means, medians]).fillna(method='bfill').iloc[0]
    df.drop(index=l[1:], inplace=True)
Just saw this:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
Apparently the .ix indexer is now deprecated. I am wondering how to do something like this:
df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}, index=pd.DatetimeIndex(['2017-01-01', '2017-01-03', '2017-01-05']))
wanted_int_index = df.index.get_loc('2017-01-04', method='ffill') # index_id = 1
wanted_str_column = 'a'
value = df.ix[wanted_int_index, wanted_str_column] # value = 2
print(value)
# 2
My understanding is that .loc[] expects labels (str) for both index and columns, while .iloc[] expects positions (int) for both. Am I missing a usage here?
.loc works fine with a plain integer index, because there the integers are the labels:
import pandas as pd
import numpy as np
data = np.random.rand(10)
df = pd.DataFrame(data, index=range(10),columns=['A'])
print(df.loc[1,'A']) #this works
For a DatetimeIndex, like you have, you need to index using the actual datetime labels, i.e.:
df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]},
index=pd.DatetimeIndex(['2017-01-01', '2017-01-03', '2017-01-05']))
wanted_int_index = df.index.get_loc('2017-01-04', method='ffill') # index_id = 1
wanted_str_column = 'a'
value = df.loc[df.index[wanted_int_index], wanted_str_column] # value = 2
print(value) #this works
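If you would rather stay fully positional, an equivalent sketch (for this example) translates the column label into a position as well:

column_pos = df.columns.get_loc(wanted_str_column)  # 0
value = df.iloc[wanted_int_index, column_pos]        # value = 2
print(value)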