I want to sort a nested dict in Python via pandas.
import pandas as pd
# Data structure (dict of nested lists):
# {
# category_name: [[rank, id], ...],
# ...
# }
all_categories = {
"category_name1": [[2, 12345], [1, 32512], [3, 32382]],
"category_name2": [[3, 12345], [9, 25318], [1, 24623]]
}
df = pd.DataFrame(all_categories.items(), columns=['Category', 'Rank'])
df.sort_values(['Rank'], ascending=True, inplace=True) # this only sorts the rows by the whole nested list, not inside each list
Can anyone tell me how to get to my goal? With pandas it's possible to sort_values() by the second column, but I can't figure out how to sort the nested dict/list itself.
I want to sort ascending by the rank, not the id.
The fastest option is to apply sort() (note that the sorting occurs in place, so don't assign back to df.Rank in this case):
df.Rank.apply(list.sort)
Or apply sorted() with a custom key and assign back to df.Rank:
df.Rank = df.Rank.apply(lambda row: sorted(row, key=lambda x: x[0]))
Output in either case:
>>> df
Category Rank
0 category_name1 [[1, 32512], [2, 12345], [3, 32382]]
1 category_name2 [[1, 24623], [3, 12345], [9, 25318]]
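If the end goal is just the sorted dict rather than a DataFrame, a plain dict comprehension is a minimal alternative (a sketch that skips pandas entirely):
sorted_categories = {k: sorted(v, key=lambda pair: pair[0])
                     for k, v in all_categories.items()}
# {'category_name1': [[1, 32512], [2, 12345], [3, 32382]],
#  'category_name2': [[1, 24623], [3, 12345], [9, 25318]]}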
Here is a perfplot benchmark of sort() vs sorted() vs explode():
import perfplot

def explode(df):
    df = df.explode('Rank')
    df['rank_num'] = df.Rank.str[0]
    df = df.sort_values(['Category', 'rank_num']).groupby('Category', as_index=False).agg(list)
    return df

def apply_sort(df):
    df.Rank.apply(list.sort)
    return df

def apply_sorted(df):
    df.Rank = df.Rank.apply(lambda row: sorted(row, key=lambda x: x[0]))
    return df

perfplot.show(
    setup=lambda n: pd.concat([df] * n),
    n_range=[2 ** k for k in range(25)],
    kernels=[explode, apply_sort, apply_sorted],
    equality_check=None,
)
To filter rows by list length before sorting, mask the rows with str.len() and loc[] (the lists are still mutated in place, so df picks up the change):
mask = df.Rank.str.len().ge(10)
df.loc[mask, 'Rank'].apply(list.sort)
Try:
df = pd.DataFrame(all_categories.items(), columns=['Category', 'Rank']).explode('Rank')
df = df.sort_values('Rank', key=lambda s: s.str[0])  # sort the exploded rows by their rank value
df = df.groupby('Category').agg(list).reset_index()
To convert back to a dict:
dict(df.agg(list, axis=1).values)
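Equivalently, the two columns can be zipped directly (a minimal alternative):
dict(zip(df['Category'], df['Rank']))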
Try:
df = pd.DataFrame(all_categories.items(), columns=['Category', 'Rank'])
df.set_index('Rank', inplace=True)
df.sort_index(inplace=True)
df.reset_index(inplace=True)
Or:
df = pd.DataFrame(all_categories.items(), columns=['Category', 'Rank'])
df = df.set_index('Rank').sort_index().reset_index()
It is much more efficient to use df.explode and then sort the values. It will be vectorized.
df = df.explode('Rank')
df['rank_num'] = df.Rank.str[0]
(df.sort_values(['Category', 'rank_num'])
   .groupby('Category', as_index=False)
   .agg(list))
Output
Category Rank rank_num
0 category_name1 [[1, 32512], [2, 12345], [3, 32382]] [1, 2, 3]
1 category_name2 [[1, 24623], [3, 12345], [9, 25318]] [1, 3, 9]
I am calculating a grouped row-wise moving average on a large data set. However, the process takes too long on a single thread. How can I efficiently speed up the process?
Please find a reproducible example below:
dataframe = pd.DataFrame({'id': range(2),
                          'group_id': range(2),
                          'Date_1_F1': [1, 2],
                          'Date_2_F1': [2, 4],
                          'Date_3_F1': [3, 6],
                          'Date_4_F1': [4, 8],
                          'Date_1_F2': [2, 11],
                          'Date_2_F2': [6, 13],
                          'Date_3-F2': [10, 15],
                          'Date_4_F2': [14, 17]})
dataframe
id group_id Date_1_F1 ... Date_2_F2 Date_3-F2 Date_4_F2
0 0 0 1 ... 6 10 14
1 1 1 2 ... 13 15 17
I have a function that returns the (row-wise) smoothed version of the dataset.
def smooth_ts(dataframe, ma_parameter=2):
    dataframe = (dataframe
                 .set_index(["id", "group_id"])
                 .groupby(lambda x: x.split("_")[-1], axis=1, group_keys=False)
                 .apply(lambda x: x.rolling(ma_parameter, axis=1)
                                   .mean()
                                   .dropna(axis=1, how='all')))
    dataframe.reset_index(inplace=True)
    return dataframe
smoothed_df = smooth_ts(dataframe)
Thank you very much
You could (1) melt your data frame using pd.melt, (2) create your grouping variable, and (3) sort, group, and aggregate with rolling(2).mean(). Then you can use df.pivot to get back to the required shape. In this approach, there is an apply step that can be parallelized using swifter. Here is an example:
import pandas as pd
import numpy as np
import swifter
dataframe = pd.DataFrame({'id': range(2),
                          'group_id': range(2),
                          'Date_1_F1': [1, 2],
                          'Date_2_F1': [2, 4],
                          'Date_3_F1': [3, 6],
                          'Date_4_F1': [4, 8],
                          'Date_1_F2': [2, 11],
                          'Date_2_F2': [6, 13],
                          'Date_3-F2': [10, 15],
                          'Date_4_F2': [14, 17]})
df_melted = pd.melt(dataframe, id_vars=['id', 'group_id'])
# Use next line if you want to parallelize the apply method
# df_melted['groups'] = df_melted['variable'].str.split('_').swifter.apply(lambda v: v[-1])
df_melted['groups'] = df_melted['variable'].str.split('_').apply(lambda v: v[-1])
df_melted = df_melted.sort_values(['id', 'group_id', 'groups'])
df_tmp = df_melted.copy()
df_tmp['rolling_val'] = df_tmp.groupby(['id', 'group_id', 'groups'])['value'].rolling(2).mean().values
(df_tmp.pivot(index=['id', 'group_id'], columns='variable', values='rolling_val')
       .dropna(axis=1)
       .reset_index()
       .rename_axis(None, axis=1))
If you want to stick to your approach, you can accelerate it using the Pool object from the multiprocessing library, which parallelizes the mapping of a function to an iterator.
import pandas as pd
import numpy as np
from multiprocessing import Pool
dataframe = pd.DataFrame({'id': range(2),
                          'group_id': range(2),
                          'Date_1_F1': [1, 2],
                          'Date_2_F1': [2, 4],
                          'Date_3_F1': [3, 6],
                          'Date_4_F1': [4, 8],
                          'Date_1_F2': [2, 11],
                          'Date_2_F2': [6, 13],
                          'Date_3-F2': [10, 15],
                          'Date_4_F2': [14, 17]})
dataframe
def smooth_ts(dataframe, ma_parameter=2):
    dataframe = (dataframe
                 .set_index(["id", "group_id"])
                 .groupby(lambda x: x.split("_")[-1], axis=1, group_keys=False)
                 .apply(lambda x: x.rolling(ma_parameter, axis=1)
                                   .mean()
                                   .dropna(axis=1, how='all')))
    dataframe.reset_index(inplace=True)
    return dataframe
id_chunks = np.array_split(dataframe.id.unique(), 2)  # 2 = number of splits, i.e. number of chunks
df_chunks = [dataframe[dataframe['id'].isin(i)] for i in id_chunks]  # list of chunked data frames
# Apply smooth_ts to each chunk in parallel; two processes here because there are only two chunks.
# For more chunks, the number of processes can be increased.
with Pool(2) as p:
    dfs_chunks = p.map(smooth_ts, df_chunks)
pd.concat(dfs_chunks).reset_index(drop=True)
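One caveat: on platforms that spawn worker processes (e.g. Windows, or macOS with the default start method), the Pool call should sit behind a main guard; a minimal sketch (the smoothed name is just for illustration):
if __name__ == "__main__":
    with Pool(2) as p:
        dfs_chunks = p.map(smooth_ts, df_chunks)
    smoothed = pd.concat(dfs_chunks).reset_index(drop=True)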
I am very new to pandas. Can anybody tell me how to map the items of lists in a DataFrame column to unique integers?
Data
[phone, laptop]
[life, death, mortal]
[happy]
Expected output:
[1,2]
[3,4,5]
[6]
I used map() and enumerate() but both give me errors.
For efficiency, use a list comprehension.
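For reference, a minimal frame matching the example data (the construction itself is an assumption; the code below only relies on a list column named Data):
df = pd.DataFrame({'Data': [['phone', 'laptop'],
                            ['life', 'death', 'mortal'],
                            ['happy']]})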
For simple counts:
from itertools import count
c = count(1)
df['new'] = [[next(c) for x in l] for l in df['Data']]
For unique identifiers in case of duplicates:
from itertools import count
c = count(1)
d = {}
df['new'] = [[d[x] if x in d else d.setdefault(x, next(c)) for x in l] for l in df['Data']]
Output:
Data new
0 [phone, laptop] [1, 2]
1 [life, death, mortal] [3, 4, 5]
2 [happy] [6]
You could explode, replace, and groupby to undo the explode operation:
df = pd.DataFrame({"data": [["phone", "laptop"],
["life", "death", "mortal"],
["happy", "phone"]]})
df_expl = df.explode("data")
df["data_mapped"] = (
df_expl.assign(data=lambda df: range(1, len(df) + 1))
.groupby(df_expl.index).data.apply(list))
print(df)
data data_mapped
0 [phone, laptop] [1, 2]
1 [life, death, mortal] [3, 4, 5]
2 [happy, phone] [6, 7]
This always increments the counter, even if list items are duplicates.
In case duplicates should have unique integer values, use factorize instead of range:
df_expl = df.explode("data")
df["data_mapped"] = (
df_expl.assign(data=df_expl.data.factorize()[0] + 1)
.groupby(df_expl.index).data.apply(list))
print(df)
# output:
data data_mapped
0 [phone, laptop] [1, 2]
1 [life, death, mortal] [3, 4, 5]
2 [happy, phone] [6, 1]
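If the label-to-integer mapping itself is needed later, factorize also returns the uniques in order of first appearance, so a lookup table can be built from the same call (a small sketch):
codes, uniques = df_expl["data"].factorize()
mapping = {label: i + 1 for i, label in enumerate(uniques)}
# {'phone': 1, 'laptop': 2, 'life': 3, 'death': 4, 'mortal': 5, 'happy': 6}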
I have some DataFrames:
d = {'colA': [1, 2], 'colB': [3, 4]}
df = pd.DataFrame(data=d)
df2 = pd.DataFrame(data=d)
df3 = pd.DataFrame(data=d)
I want to return a list of columns containing the string 'A', e.g. for one DataFrame:
[column for column in df.columns if 'A' in column]
How can I do this for multiple DataFrames (e.g., df, df2, df3)?
The desired output in this example would be ['colA', 'colA', 'colA']
Here is a way:
l = [','.join(i.columns[i.columns.str.contains('A')]) for i in [df,df2,df3]]
You can create a function for that:
def find_char(list_of_df , char):
result = []
for df in list_of_df:
for c in df.columns:
if char in c:
result.append(c)
return result
Usage:
res = find_char([df,df2,df3] , 'A')
print(res)
['colA', 'colA', 'colA']
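If the comma-joined strings of the first approach are not wanted, a flat list comprehension over all frames gives the same result directly (a minimal sketch):
res = [c for frame in [df, df2, df3] for c in frame.columns if 'A' in c]
# ['colA', 'colA', 'colA']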
I want to find duplicates in a selection of columns of a DataFrame:
from collections import defaultdict
import numpy as np

# convert the sub-DataFrame into a matrix
mat = df[['idx', 'a', 'b']].values
str_dict = defaultdict(set)
for x in np.ndindex(mat.shape[0]):
    concat = ''.join(str(v) for v in mat[x][1:])
    # collect idx values under the key a + b
    str_dict[concat].update([mat[x][0]])
dups = {}
for key in str_dict.keys():
    dup = str_dict[key]
    if len(dup) < 2:
        continue
    dups[key] = dup
The code finds duplicates of the concatenation of a and b. It uses the concatenation as the key of a defaultdict of sets (str_dict) and updates each key with the corresponding idx values; finally it uses a dict (dups) to store any concatenation whose value (a set of indices) has length >= 2.
I am wondering if there is a better way to do that in terms of efficiency.
You can just concatenate and convert to set:
res = set(df['a'].astype(str) + df['b'].astype(str))
Example:
df = pd.DataFrame({'idx': [1, 2, 3],
                   'a': [4, 4, 5],
                   'b': [5, 5, 6]})
res = set(df['a'].astype(str) + df['b'].astype(str))
print(res)
# {'56', '45'}
If you need to map indices too:
df = pd.DataFrame({'idx': [1, 2, 3],
                   'a': [41, 4, 5],
                   'b': [3, 13, 6]})
df['conc'] = (df['a'].astype(str) + df['b'].astype(str))
df = df.reset_index()
res = df.groupby('conc')['index'].apply(set).to_dict()
print(res)
# {'413': {0, 1}, '56': {2}}
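One caveat with plain concatenation: different pairs can collide (here a=41, b=3 and a=4, b=13 both become '413'). If those should count as distinct, a separator avoids the collision (a sketch):
df['conc'] = df['a'].astype(str) + '|' + df['b'].astype(str)
res = df.groupby('conc')['index'].apply(set).to_dict()
# {'41|3': {0}, '4|13': {1}, '5|6': {2}}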
You can filter the columns you need before drop_duplicates():
df[['a', 'b']].drop_duplicates().astype(str).apply(np.sum, axis=1).tolist()
# ['45', '56']
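If the goal is the duplicated rows themselves rather than the unique combinations, pandas' built-in duplicated() can also be used (a sketch on the first example frame with a=[4, 4, 5] and b=[5, 5, 6]):
dup_rows = df[df.duplicated(subset=['a', 'b'], keep=False)]
# keeps the two rows with a=4, b=5 (idx 1 and 2)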
For a relatively big pandas DataFrame (a few hundred thousand rows), I'd like to create a Series that is the result of an apply function. The problem is that the function is not very fast, and I was hoping it could be sped up somehow.
import math
import numpy as np
import pandas as pd

df = pd.DataFrame({
    'value-1': [1, 2, 3, 4, 5],
    'value-2': [0.1, 0.2, 0.3, 0.4, 0.5],
    'value-3': somenumbers...,
    'value-4': more numbers...,
    'choice-index': [1, 1, np.nan, 2, 1]
})
def func(row):
    i = row['choice-index']
    return np.nan if math.isnan(i) else row['value-%d' % i]

df['value'] = df.apply(func, axis=1, reduce=True)
# expected value = [1, 2, np.nan, 0.4, 5]
Any suggestions are welcome.
Update
A very small speedup (~1.1) can be achieved by pre-caching the selected columns. func would change to:
cached_columns = [None, 'value-1', 'value-2', 'value-3', 'value-4']
def func(row):
    i = row['choice-index']
    # cast to int: choice-index is float because of the NaN
    return np.nan if math.isnan(i) else row[cached_columns[int(i)]]
But I was hoping for greater speedups...
I think I got a good solution (speedup ~150).
The trick is not to use apply, but to do smart selections.
choice_indices = [1, 2, 3, 4]
for idx in choice_indices:
    mask = df['choice-index'] == idx
    result_column = 'value-%d' % idx
    df.loc[mask, 'value'] = df.loc[mask, result_column]
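For completeness, the same selection can also be done without a Python-level loop via NumPy fancy indexing; a sketch under the assumption that choice-index only holds 1..4 or NaN and that the column order matches the index numbering:
values = df[['value-1', 'value-2', 'value-3', 'value-4']].to_numpy()
idx = df['choice-index'].to_numpy()
valid = ~np.isnan(idx)
result = np.full(len(df), np.nan)
# pick, per valid row, the column given by its (1-based) choice index
result[valid] = values[np.flatnonzero(valid), idx[valid].astype(int) - 1]
df['value'] = result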