For a relatively big Pandas DataFrame (a few 100k rows), I'd like to create a series that is a result of an apply function. The problem is that the function is not very fast and I was hoping that it can be sped up somehow.
df = pd.DataFrame({
'value-1': [1, 2, 3, 4, 5],
'value-2': [0.1, 0.2, 0.3, 0.4, 0.5],
'value-3': somenumbers...,
'value-4': more numbers...,
'choice-index': [1, 1, np.nan, 2, 1]
})
def func(row):
    """Select this row's value from the 'value-<k>' column named by 'choice-index'.

    Returns NaN when the choice index itself is NaN.
    """
    choice = row['choice-index']
    if math.isnan(choice):
        return np.nan
    # '%d' truncates the float index (1.0 -> '1'), matching the column names.
    return row['value-%d' % choice]
df['value'] = df.apply(func, axis=1, reduce=True)
# expected value = [1, 2, np.nan, 0.4, 5]
Any suggestions are welcome.
Update
A very small speedup (~1.1) can be achieved by pre-caching the selected columns. func would change to:
cached_columns = [None, 'value-1', 'value-2', 'value-3', 'value-4']

def func(row):
    """Select this row's value via the pre-cached column-name list.

    Returns NaN when 'choice-index' is NaN.  Because the 'choice-index'
    column contains NaN, pandas stores it as float, so the index must be
    cast to int before it can be used as a list index (a float index
    raises TypeError).
    """
    i = row['choice-index']
    if math.isnan(i):
        return np.nan
    return row[cached_columns[int(i)]]
But I was hoping for greater speedups...
I think I got a good solution (speedup ~150).
The trick is not to use apply, but to do smart selections.
choice_indices = [1, 2, 3, 4]
for idx in choice_indices:
mask = df['choice-index'] == idx
result_column = 'value-%d' % (idx)
df.loc[mask, 'value'] = df.loc[mask, result_column]
Related
I am trying to apply the following function over each column in a dataframe:
def hurst_lag(x):
    """Estimate (twice) the slope of log(tau) vs log(lag) for series *x*.

    tau[k] is sqrt(std(lagged differences)) for each lag in [minlag, maxlag);
    the returned value is 2 * slope of the log-log fit (a Hurst-exponent
    estimate).  NA values in *x* are dropped first.
    """
    minlag = 200
    maxlag = 300
    lags = range(minlag, maxlag)
    # Drop NA once (the original recomputed x.dropna() twice per lag) and
    # work on a plain ndarray so the two slices subtract positionally --
    # on modern pandas, a ufunc over two differently-indexed Series would
    # label-align instead of computing the intended lagged difference.
    clean = x.dropna().to_numpy()
    tau = [sqrt(std(subtract(clean[lag:], clean[:-lag]))) for lag in lags]
    m = polyfit(log(lags), log(tau), 1)
    return m[0] * 2
The function only works on non NA values. In my dataframe, the lengths of my columns differ after applying dropna(). e.g.
df = pd.DataFrame({
'colA':[None, None, 1, 2],
'colB': [None, 2, 6, 4],
'colC': [None, None, 2, 8],
'colD': [None, 2.0, 3.0, 4.0],
})
Any ideas how to run the function over each column individually, excluding the NA values for that specific column? Many thanks
Use apply to run it on the dataframe
df = df.apply(hurst_lag)
I want to sort a nested dict in Python via pandas.
import pandas as pd
# Data structure (nested list):
# {
# category_name: [[rank, id], ...],
# ...
# }
all_categories = {
"category_name1": [[2, 12345], [1, 32512], [3, 32382]],
"category_name2": [[3, 12345], [9, 25318], [1, 24623]]
}
df = pd.DataFrame(all_categories.items(), columns=['Category', 'Rank'])
df.sort_values(['Rank'], ascending=True, inplace=True) # this only sorts the list of lists
Can anyone tell me how I can get to my goal? I can't figure it out. Via pandas it's possible to sort_values() by the second column, but I can't figure out how to sort the nested dict/list.
I want to sort ascending by the rank, not the id.
The fastest option is to apply sort() (note that the sorting occurs in place, so don't assign back to df.Rank in this case):
df.Rank.apply(list.sort)
Or apply sorted() with a custom key and assign back to df.Rank:
df.Rank = df.Rank.apply(lambda row: sorted(row, key=lambda x: x[0]))
Output in either case:
>>> df
Category Rank
0 category_name1 [[1, 32512], [2, 12345], [3, 32382]]
1 category_name2 [[1, 24623], [3, 12345], [9, 25318]]
This is the perfplot of sort() vs sorted() vs explode():
import perfplot
def explode(df):
    """Sort each category's [rank, id] pairs by rank via explode + groupby."""
    exploded = df.explode('Rank')
    # First element of each [rank, id] pair is the sort key.
    exploded['rank_num'] = exploded.Rank.str[0]
    ordered = exploded.sort_values(['Category', 'rank_num'])
    return ordered.groupby('Category', as_index=False).agg(list)
def apply_sort(df):
    """Sort every Rank list in place (lexicographic, so rank comes first)."""
    for pairs in df.Rank:
        pairs.sort()
    return df
def apply_sorted(df):
    """Replace each Rank list with a copy ordered by the rank element."""
    def order_by_rank(pairs):
        # Sort on the first element only, not the whole [rank, id] pair.
        return sorted(pairs, key=lambda pair: pair[0])

    df.Rank = df.Rank.apply(order_by_rank)
    return df
perfplot.show(
setup=lambda n: pd.concat([df] * n),
n_range=[2 ** k for k in range(25)],
kernels=[explode, apply_sort, apply_sorted],
equality_check=None,
)
To filter rows by list length, mask the rows with str.len() and loc[]:
mask = df.Rank.str.len().ge(10)
df.loc[mask, 'Rank'].apply(list.sort)
Try
df = pd.DataFrame(all_categories.items(), columns=['Category', 'Rank']).explode('Rank')
df['Rank'] = df['Rank'].apply(lambda x: sorted(x))
df = df.groupby('Category').agg(list).reset_index()
to dict
dict(df.agg(list, axis=1).values)
Try:
df = pd.DataFrame(all_categories.items(), columns=['Category', 'Rank'])
df.set_index('Rank', inplace=True)
df.sort_index(inplace=True)
df.reset_index(inplace=True)
Or:
df = pd.DataFrame(all_categories.items(), columns=['Category', 'Rank'])
df = df.set_index('Rank').sort_index().reset_index()
It is much more efficient to use df.explode and then sort the values. It will be vectorized.
df = df.explode('Rank')
df['rank_num'] = df.Rank.str[0]
df.sort_values(['Category', 'rank_num'])
.groupby('Category', as_index=False)
.agg(list)
Output
Category Rank rank_num
0 category_name1 [[1, 32512], [2, 12345], [3, 32382]] [1, 2, 3]
1 category_name2 [[1, 24623], [3, 12345], [9, 25318]] [1, 3, 9]
I want to calculate how many columns for each row have greater than zero values. So if two out of the three columns have the required values then the score is 2.
I can build this using a for loop but it seems to be slow, so I am looking for faster alternatives. How do I do that?
df = pd.DataFrame({'intro': [1, 2, 3], 'quote': [0, 1, 0],'sample': [0, 1, 4]},
columns=['intro', 'quote','sample'])
df['score']=0
cols=['intro', 'quote', 'sample']
for i in range(len(df)):
print(i)
for col in cols:
if df.iloc[i][col] >= 1:
df['score'][i]= df['score'][i]+1
df_expected = pd.DataFrame({'intro': [1, 2, 3], 'quote': [0, 1, 0],'sample': [0, 1, 4],'score': [1, 3, 2]},
columns=['intro', 'quote','sample','score'])
df_expected
this will do the trick:
df['score']=(df>0).sum(axis=1)
You can create a True/False frame of values > 0 like this:
df > 0
You can count the True values in each row using
(df > 0).sum(axis=1)
and create a column like this:
df['score'] = (df > 0).sum(axis=1)
I have a pandas dataframe,
[[1, 3],
[4, 4],
[2, 8]...
]
I want to create a column that has this:
1*(a)^(3) # = x
1*(a)^(3 + 4) + 4 * (a)^4 # = y
1*(a)^(3 + 4 + 8) + 4 * (a)^(4 + 8) + 2 * (a)^8 # = z
...
Where "a" is some value.
The stuff: 1, 4, 2, is from column one, the repeated 3, 4, 8 is column 2
Is this possible using some form of transform/apply?
Essentially getting:
[[1, 3, x],
[4, 4, y],
[2, 8, z]...
]
Where x, y, z is the respective sums from the new column (I want them next to each other)
There is a "groupby" that is being done on the dataframe, and this is what I want to do for a given group
If I'm understanding your question correctly, this should work:
# Build the cumulative weighted-power column described in the question:
# row n gets sum over i<=n of a[i] * base**(sum of b[i..n]).
df = pd.DataFrame([[1, 3], [4, 4], [2, 8]], columns=['a', 'b'])
a = 42  # the base "a" from the question

new_lst = []
# Row n needs values from every row 0..n, so apply/transform (one row at
# a time) cannot express this directly; a plain loop is the simple route.
for n in range(len(df)):  # was range(len(lst)): 'lst' was undefined
    z = 0
    for i in range(n + 1):
        # Coefficient from column 'a'; exponent is the sum of column 'b'
        # from row i through row n (inclusive).
        z += df['a'][i] * a ** (sum(df['b'][i:n + 1]))
    new_lst.append(z)
df['new'] = new_lst
Update:
Saw that you are using pandas and updated with dataframe methods. Not sure there's an easy way to do this with apply since you need a mix of values from different rows. I think this for loop is still the best route.
I want to find duplicates in a selection of columns of a df,
# converts the sub df into matrix
mat = df[['idx', 'a', 'b']].values
str_dict = defaultdict(set)
for x in np.ndindex(mat.shape[0]):
concat = ''.join(str(x) for x in mat[x][1:])
# take idx as values of each key a + b
str_dict[concat].update([mat[x][0]])
dups = {}
for key in str_dict.keys():
dup = str_dict[key]
if len(dup) < 2:
continue
dups[key] = dup
The code finds duplicates of the concatenation of a and b. Uses the concatenation as key for a set defaultdict (str_dict), updates the key with idx values; finally uses a dict (dups) to store any concatenation if the length of its value (set) is >= 2.
I am wondering if there is a better way to do that in terms of efficiency.
You can just concatenate and convert to set:
res = set(df['a'].astype(str) + df['b'].astype(str))
Example:
df = pd.DataFrame({'idx': [1, 2, 3],
'a': [4, 4, 5],
'b': [5, 5,6]})
res = set(df['a'].astype(str) + df['b'].astype(str))
print(res)
# {'56', '45'}
If you need to map indices too:
df = pd.DataFrame({'idx': [1, 2, 3],
'a': [41, 4, 5],
'b': [3, 13, 6]})
df['conc'] = (df['a'].astype(str) + df['b'].astype(str))
df = df.reset_index()
res = df.groupby('conc')['index'].apply(set).to_dict()
print(res)
# {'413': {0, 1}, '56': {2}}
You can filter the columns you need before drop_duplicates
df[['a','b']].drop_duplicates().astype(str).apply(np.sum,1).tolist()
Out[1027]: ['45', '56']