Dataframe classification and sorting optimization problem 2 - python

I asked a sorting problem before, and someone solved it using DataFrame.sort_values on both columns first and then GroupBy.head:
Dataframe classification and sorting optimization problem
Now I have encountered a more complicated sort. I need to group the dataframe by category. Within each category, I take the data1 value of the row where data2 is largest, keep only rows whose data1 is less than or equal to that value, and then sort.
The code is as follows; how can it be optimized?
import numpy as np
import pandas as pd
df = pd.DataFrame()
n = 200
df['category'] = np.random.choice(('A', 'B'), n)
df['data1'] = np.random.rand(len(df))*100
df['data2'] = np.random.rand(len(df))*100
a = df[df['category'] == 'A']
c = a[a['data2'] == a.data2.max()].data1.max()
a = a[a['data1'] <= c]
a = a.sort_values(by='data1', ascending=False).head(4)
b = df[df['category'] == 'B']
c = b[b['data2'] == b.data2.max()].data1.max()
b = b[b['data1'] <= c]
b = b.sort_values(by='data1', ascending=False).head(4)
df = pd.concat([a, b]).sort_values(by=['category', 'data1'], ascending=[True, False]).reset_index(drop=True)
print(df)
category data1 data2
0 A 28.194042 98.813271
1 A 26.635099 82.768130
2 A 24.345177 80.558532
3 A 24.222105 89.596726
4 B 60.883981 98.444699
5 B 49.934815 90.319787
6 B 10.751913 86.124271
7 B 4.029914 89.802120
I tried groupby, but the code feels too complicated. Can it be optimized?
import numpy as np
import pandas as pd
df = pd.DataFrame()
n = 200
df['category'] = np.random.choice(('A', 'B'), n)
df['data1'] = np.random.rand(len(df))*100
df['data2'] = np.random.rand(len(df))*100
a = df[df['category'] == 'A']
c = a[a['data2'] == a.data2.max()].data1.max()
a = a[a['data1'] <= c]
a = a.sort_values(by='data1', ascending=False).head(4)
b = df[df['category'] == 'B']
c = b[b['data2'] == b.data2.max()].data1.max()
b = b[b['data1'] <= c]
b = b.sort_values(by='data1', ascending=False).head(4)
df2 = pd.concat([a, b]).sort_values(by=['category', 'data1'], ascending=[True, False]).reset_index(drop=True)
df3 = (df.groupby('category')
         .apply(lambda x: x[x['data1'].isin(
             x[x['data1'] <= x[x['data2'] == x['data2'].max()].data1.max()]['data1']
             .nlargest(4))])
         .reset_index(drop=True))
df3 = df3.sort_values(by=['category', 'data1'], ascending=[True, False]).reset_index(drop=True)
print((df2.data1-df3.data1).max())
print((df2.data2-df3.data2).max())
0.0
0.0
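As an aside, pandas ships an exact frame comparison that checks all columns and dtypes at once and raises with a diff on any mismatch; a minimal sketch:
from pandas.testing import assert_frame_equal

# df2 and df3 were sorted and re-indexed identically above, so this should pass silently
assert_frame_equal(df2, df3)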

For comparison, first the original approach run on fresh sample data:
df = pd.DataFrame()
n = 200
df['category'] = np.random.choice(('A', 'B'), n)
df['data1'] = np.random.rand(len(df))*100
df['data2'] = np.random.rand(len(df))*100
a = df[df['category'] == 'A']
c = a[a['data2'] == a.data2.max()].data1.max()
a = a[a['data1'] <= c]
a = a.sort_values(by='data1', ascending=False).head(4)
b = df[df['category'] == 'B']
c = b[b['data2'] == b.data2.max()].data1.max()
b = b[b['data1'] <= c]
b = b.sort_values(by='data1', ascending=False).head(4)
df1 = pd.concat([a, b]).sort_values(by=['category', 'data1'], ascending=[True, False]).reset_index(drop=True)
print(df1)
category data1 data2
0 A 87.560430 99.262452
1 A 85.798945 99.200321
2 A 68.614311 97.796274
3 A 41.641961 95.544980
4 B 69.937691 99.711156
5 B 56.932784 99.227111
6 B 19.903620 94.389186
7 B 12.701288 98.455274
Here, first get the data1 value at the maximal data2 per group, filter with <=, and finally use GroupBy.head:
s = (df.sort_values('data2')
       .drop_duplicates('category', keep='last')
       .set_index('category')['data1'])
df = df[df['data1'] <= df['category'].map(s)]
df1 = (df.sort_values(by=['category', 'data1'], ascending=[True, False])
         .groupby('category')
         .head(4)
         .reset_index(drop=True))
print(df1)
category data1 data2
0 A 87.560430 99.262452
1 A 85.798945 99.200321
2 A 68.614311 97.796274
3 A 41.641961 95.544980
4 B 69.937691 99.711156
5 B 56.932784 99.227111
6 B 19.903620 94.389186
7 B 12.701288 98.455274
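For reference, the per-group threshold can also be looked up with GroupBy.idxmax instead of sort_values/drop_duplicates; a sketch, starting again from the original unfiltered df. One caveat: on exact ties in data2, idxmax keeps the first such row, while the original code takes the max data1 among the tied rows (with random floats, ties are practically impossible):
# data1 at the row with maximal data2, per category
thr = (df.loc[df.groupby('category')['data2'].idxmax(), ['category', 'data1']]
         .set_index('category')['data1'])
out = (df[df['data1'] <= df['category'].map(thr)]
         .sort_values(['category', 'data1'], ascending=[True, False])
         .groupby('category')
         .head(4)
         .reset_index(drop=True))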

Related

How to filter dataframe based on varying thresholds for indexes

I have a data frame and a dictionary like this:
thresholds = {'column':{'A':10,'B':11,'C':9}}
df:
Column
A 13
A 7
A 11
B 12
B 14
B 14
C 7
C 8
C 11
For every index group, I want to calculate the count of values less than the threshold and greater than the threshold value.
So my output looks like this:
df:
Values<Thr Values>Thr
A 1 2
B 0 3
C 2 1
Can anyone help me with this?
You can use:
import numpy as np
import pandas as pd

t = df.index.to_series().map(thresholds['column'])
out = (pd.crosstab(df.index, np.where(df['Column'].gt(t), 'Values>Thr', 'Values≤Thr'))
         .rename_axis(index=None, columns=None)
       )
Output:
Values>Thr Values≤Thr
A 2 1
B 3 0
C 1 2
A syntax variant:
out = (pd.crosstab(df.index, df['Column'].gt(t))
         .rename_axis(index=None, columns=None)
         .rename(columns={False: 'Values≤Thr', True: 'Values>Thr'})
       )
To apply this on many columns, based on the keys in the dictionary:
def count(s):
    t = s.index.to_series().map(thresholds.get(s.name, {}))
    return (pd.crosstab(s.index, s.gt(t))
              .rename_axis(index=None, columns=None)
              .rename(columns={False: 'Values≤Thr', True: 'Values>Thr'})
            )
out = pd.concat({c: count(df[c]) for c in df})
NB. The keys of the dictionary must match the column names exactly; I changed the case for the demo.
Output:
Values≤Thr Values>Thr
Column A 1 2
B 0 3
C 2 1
Here is another option:
import pandas as pd
df = pd.DataFrame({'Column': [13, 7, 11, 12, 14, 14, 7, 8, 11]})
df.index = ['A', 'A', 'A', 'B', 'B', 'B', 'C', 'C', 'C']
thresholds = {'column':{'A':10,'B':11,'C':9}}
df['smaller'] = df['Column'].groupby(df.index).transform(lambda x: x < thresholds['column'][x.name]).astype(int)
df['greater'] = df['Column'].groupby(df.index).transform(lambda x: x > thresholds['column'][x.name]).astype(int)
df.drop(columns=['Column'], inplace=True)
# group by index summing the greater and smaller columns
sums = df.groupby(df.index).sum()
sums
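For reference, the two transform lambdas can also be avoided by mapping the thresholds onto the index once; a sketch, assuming df still has its original Column column (i.e. run before the drop above):
t = df.index.to_series().map(thresholds['column'])
sums = (pd.DataFrame({'smaller': df['Column'].lt(t).astype(int),
                      'greater': df['Column'].gt(t).astype(int)})
          .groupby(level=0)
          .sum())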

split multiple columns in pandas dataframe by delimiter

I have survey data which, annoyingly, has returned multiple-choice questions in the following way (it's in an Excel sheet). There are about 60 columns, with responses ranging from single to multiple, split by '/'. This is what I have so far; is there any way to do this more quickly, without having to repeat it for each individual column?
data = {'q1': ['one', 'two', 'three'],
        'q2': ['one/two/three', 'a/b/c', 'd/e/f'],
        'q3': ['a/b/c', 'd/e/f', 'g/h/i']}
df = pd.DataFrame(data)
df[['q2a', 'q2b', 'q2c']] = df['q2'].str.split('/', expand=True)
df[['q3a', 'q3b', 'q3c']] = df['q3'].str.split('/', expand=True)  # was df['q2']: a typo
clean_df = df.drop(['q2', 'q3'], axis=1)
We can use list comprehension with add_prefix, then we use pd.concat to concatenate everything to your final df:
splits = [df[col].str.split(pat='/', expand=True).add_prefix(col) for col in df.columns]
clean_df = pd.concat(splits, axis=1)
q10 q20 q21 q22 q30 q31 q32
0 one one two three a b c
1 two a b c d e f
2 three d e f g h i
If you actually want your column names to be suffixed by a letter, you can do the following with string.ascii_lowercase:
from string import ascii_lowercase
dfs = []
for col in df.columns:
    d = df[col].str.split('/', expand=True)
    c = d.shape[1]
    d.columns = [col + l for l in ascii_lowercase[:c]]
    dfs.append(d)
clean_df = pd.concat(dfs, axis=1)
q1a q2a q2b q2c q3a q3b q3c
0 one one two three a b c
1 two a b c d e f
2 three d e f g h i
You can create a dict d that transforms numbers to letters. Then loop through the columns and dynamically change their names:
input:
import pandas as pd
df = pd.DataFrame({'q1': ['one', 'two', 'three'],
                   'q2': ['one/two/three', 'a/b/c', 'd/e/f'],
                   'q3': ['a/b/c', 'd/e/f', 'g/h/i']})
code:
ltrs = list('abcdefghijklmnopqrstuvwxyz')
nmbrs = [i[0] for i in enumerate(ltrs)]
d = dict(zip(nmbrs, ltrs))
cols = df.columns[1:]
for col in cols:
    df1 = df[col].str.split('/', expand=True)
    df1.columns = df1.columns.map(d)  # 0 -> 'a', 1 -> 'b', ...
    df1 = df1.add_prefix(f'{col}')
    df = pd.concat([df, df1], axis=1)
df = df.drop(cols, axis=1)
df
output:
Out[1]:
q1 q2a q2b q2c q3a q3b q3c
0 one one two three a b c
1 two a b c d e f
2 three d e f g h i

How to rename the duplicated MultiIndex column names?

I have a dataframe with a two-level column index.
Reproducible Dataset.
df = pd.DataFrame(
    [['Gaz', 'Gaz', 'Gaz', 'Gaz'],
     ['X', 'X', 'X', 'X'],
     ['Y', 'Y', 'Y', 'Y'],
     ['Z', 'Z', 'Z', 'Z']],
    columns=pd.MultiIndex.from_arrays([['A', 'A', 'C', 'D'],
                                       ['Name', 'Name', 'Company', 'Company']]))
I want to rename the duplicated MultiIndex columns, but only when level-0 and level-1 combined are duplicated, by adding a number suffix to the end.
Below is a solution I found, but it only works for single level column index.
class renamer():
    def __init__(self):
        self.d = dict()

    def __call__(self, x):
        if x not in self.d:
            self.d[x] = 0
            return x
        else:
            self.d[x] += 1
            return "%s_%d" % (x, self.d[x])

df = df.rename(columns=renamer())
I think the above method can be modified to support the multi level situation, but I am too new to pandas/python.
Thanks in advance.
@Datanovice
This is to clarify to you about the output what I need.
I have the snippet below.
import pandas as pd
import numpy as np
df = pd.DataFrame(
    [['Gaz', 'Gaz', 'Gaz', 'Gaz'],
     ['X', 'X', 'X', 'X'],
     ['Y', 'Y', 'Y', 'Y'],
     ['Z', 'Z', 'Z', 'Z']],
    columns=pd.MultiIndex.from_arrays([
        ['A', 'A', 'C', 'A'],
        ['A', 'A', 'C', 'A'],
        ['Company', 'Company', 'Company', 'Name']]))
s = pd.DataFrame(df.columns.tolist())
cond = s.groupby(0).cumcount()
s = [np.where(cond.gt(0), s[i] + '_' + cond.astype(str), s[i])
     for i in range(df.columns.nlevels)]
s = pd.DataFrame(s)
# print(s)
df.columns = pd.MultiIndex.from_arrays(s.values.tolist())
print(df)
The current result suffixes the last column too (with groupby(0) the count only looks at level 0, so the unique A-A-Name column comes out as A_2-A_2-Name_2).
What I need is for the last column not to be counted as a duplicate, since "A-A-Name" is not the same as the first two.
Thank you again.
There might be a better way to do this, but you could build a dataframe from your columns, apply a conditional operation on it, and re-assign:
import numpy as np
import pandas as pd

df = pd.DataFrame(
    [['Gaz', 'Gaz', 'Gaz', 'Gaz'],
     ['X', 'X', 'X', 'X'],
     ['Y', 'Y', 'Y', 'Y'],
     ['Z', 'Z', 'Z', 'Z']],
    columns=pd.MultiIndex.from_arrays([['A', 'A', 'C', 'A'],
                                       ['Name', 'Name', 'Company', 'Company']]))
s = pd.DataFrame(df.columns.tolist())
cond = s.groupby([0,1]).cumcount()
s[0] = np.where(cond.gt(0),s[0] + '_' + cond.astype(str),s[0])
s[1] = np.where(cond.gt(0),s[1] + '_' + cond.astype(str),s[1])
df.columns = pd.MultiIndex.from_frame(s)
print(df)
0 A A_1 C A
1 Name Name_1 Company Company
0 Gaz Gaz Gaz Gaz
1 X X X X
2 Y Y Y Y
3 Z Z Z Z
Try this -
arrays = [['A', 'A', 'A', 'A', 'B', 'B', 'B', 'B'],['A', 'A', 'A', 'B', 'C', 'C', 'D', 'D']]
tuples = list(zip(*arrays))
index = pd.MultiIndex.from_tuples(tuples)
df = pd.DataFrame(np.random.randn(3, 8), columns=index)
A B
A A A B C C D D
0 0 0 1 3 1 2 1 4
1 0 1 1 1 1 3 0 1
2 1 1 4 2 3 2 1 4
suffix = pd.DataFrame(df.columns)
suffix['count'] = suffix.groupby(0).cumcount()
suffix['new'] = [((i[0]+'_'+str(j)),(i[1]+'_'+str(j))) for i,j in zip(suffix[0],suffix['count'])]
new_index = pd.MultiIndex.from_tuples(list(suffix['new']))
df.columns = new_index
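As an editorial aside, the counting idea behind the renamer class in the question carries over to MultiIndex columns if you count full column tuples instead of single labels; a minimal sketch (dedupe_columns is a made-up helper name, and it suffixes every level on a repeat):
import pandas as pd
from collections import defaultdict

def dedupe_columns(columns):
    # count each full (level-0, ..., level-n) tuple; suffix all levels when a tuple repeats
    seen = defaultdict(int)
    out = []
    for tup in columns:
        if seen[tup]:
            out.append(tuple('%s_%d' % (lvl, seen[tup]) for lvl in tup))
        else:
            out.append(tup)
        seen[tup] += 1
    return pd.MultiIndex.from_tuples(out)

df.columns = dedupe_columns(df.columns)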

enumerate equal elements within dataframe column

I would like to enumerate elements in a column which appear more than once. Elements that appear only once should not be modified.
I have come up with two solutions, but they seem to be very inelegant, and I am hoping that there is a better solution.
Input:
X
0 A
1 B
2 C
3 A
4 C
5 C
6 D
Output:
new_name
X
A A1
A A2
B B
C C1
C C2
C C3
D D
Here are two possible ways of achieving this, one using .expanding().count() and the other using .cumcount(), but both are pretty ugly:
import pandas as pd

def solution_1(df):
    pvt = (df.groupby(by='X')
             .expanding()
             .count()
             .rename(columns={'X': 'Counter'})
             .reset_index()
             .drop('level_1', axis=1)
             .assign(name=lambda s: s['X'] + s['Counter'].astype(int).astype(str))
             .set_index('X')
           )
    pvt2 = (df.reset_index()
              .groupby(by='X')
              .count()
              .rename(columns={'index': 'C'})
            )
    df2 = pd.merge(left=pvt, right=pvt2, left_index=True, right_index=True)
    ind = df2['C'] > 1
    df2.loc[ind, 'new_name'] = df2.loc[ind, 'name']
    df2.loc[~ind, 'new_name'] = df2.loc[~ind].index
    df2 = df2.drop(['Counter', 'C', 'name'], axis=1)
    return df2

def solution_2(df):
    pvt = pd.DataFrame(df.groupby(by='X')
                         .agg({'X': 'cumcount'})
                       ).rename(columns={'X': 'Counter'})
    pvt2 = pd.DataFrame(df.groupby(by='X')
                          .agg({'X': 'count'})
                        ).rename(columns={'X': 'Total Count'})
    # print(pvt2)
    df2 = df.merge(pvt, left_index=True, right_index=True)
    df3 = df2.merge(pvt2, left_on='X', right_index=True)
    ind = df3['Total Count'] > 1
    df3['Counter'] = df3['Counter'] + 1
    df3.loc[ind, 'new_name'] = df3.loc[ind, 'X'] + df3.loc[ind, 'Counter'].astype(int).astype(str)
    df3.loc[~ind, 'new_name'] = df3.loc[~ind, 'X']
    df3 = df3.drop(['Counter', 'Total Count'], axis=1).set_index('X')
    return df3

if __name__ == '__main__':
    s = ['A', 'B', 'C', 'A', 'C', 'C', 'D']
    df = pd.DataFrame(s, columns=['X'])
    print(df)
    sol_1 = solution_1(df)
    print(sol_1)
    sol_2 = solution_2(df)
    print(sol_2)
Any suggestions? Thanks a lot.
First we use GroupBy.cumcount to get a cumulative count for each unique value in X.
Then we add 1 and convert the numeric values to string with Series.astype.
Finally we concatenate the values to our original column with Series.str.cat:
df['new_name'] = df['X'].str.cat(df.groupby('X').cumcount().add(1).astype(str))
X new_name
0 A A1
1 A A2
2 B B1
3 C C1
4 C C2
5 C C3
6 D D1
If you actually don't want a number on the values which only appear once, we can use:
df['new_name'] = np.where(df.groupby('X')['X'].transform('size').eq(1),
                          df['new_name'].str.replace(r'\d', '', regex=True),
                          df['new_name'])
X new_name
0 A A1
1 A A2
2 B B
3 C C1
4 C C2
5 C C3
6 D D
All in one line:
df['new_name'] = np.where(df.groupby('X')['X'].transform('size').ne(1),
                          df['X'].str.cat(df.groupby('X').cumcount().add(1).astype(str)),
                          df['X'])
IIUC
(df.X + (df.groupby('X').cumcount() + 1)
        .mask(df.groupby('X').X.transform('count').eq(1), '')
        .astype(str))
Out[18]:
0 A1
1 B
2 C1
3 A2
4 C2
5 C3
6 D
dtype: object
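Spelled out over a few lines, the same idea as the one-liner above; a sketch:
counter = df.groupby('X').cumcount().add(1).astype(str)
is_unique = df.groupby('X')['X'].transform('size').eq(1)
# blank the counter wherever the value appears only once
df['new_name'] = df['X'] + counter.mask(is_unique, '')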

Combining Series in Pandas

I need to combine multiple pandas Series containing string values. The series are messages that result from multiple validation steps. I am trying to combine these messages into one Series to attach to the DataFrame. The problem is that the result is empty.
This is an example:
import pandas as pd
df = pd.DataFrame({'a': ['a', 'b', 'c', 'd'], 'b': ['aa', 'bb', 'cc', 'dd']})
index1 = df[df['a'] == 'b'].index
index2 = df[df['a'] == 'a'].index
series = df.iloc[index1].apply(lambda x: x['b'] + '-bbb', axis=1)
series += df.iloc[index2].apply(lambda x: x['a'] + '-aaa', axis=1)
print(series)
# >>> series
# 0 NaN
# 1 NaN
Update
import pandas as pd
df = pd.DataFrame({'a': ['a', 'b', 'c', 'd'], 'b': ['aa', 'bb', 'cc', 'dd']})
index1 = df[df['a'] == 'b'].index
index2 = df[df['a'] == 'a'].index
series1 = df.iloc[index1].apply(lambda x: x['b'] + '-bbb', axis=1)
series2 = df.iloc[index2].apply(lambda x: x['a'] + '-aaa', axis=1)
series3 = df.iloc[index2].apply(lambda x: x['a'] + '-ccc', axis=1)
# series3 causes a ValueError: cannot reindex from a duplicate axis
series = pd.concat([series1, series2, series3])
df['series'] = series
print(df)
Update 2
In this example the indices seem to get mixed up.
import pandas as pd
df = pd.DataFrame({'a': ['a', 'b', 'c', 'd'], 'b': ['aa', 'bb', 'cc', 'dd']})
index1 = df[df['a'] == 'a'].index
index2 = df[df['a'] == 'b'].index
index3 = df[df['a'] == 'c'].index
series1 = df.iloc[index1].apply(lambda x: x['a'] + '-aaa', axis=1)
series2 = df.iloc[index2].apply(lambda x: x['a'] + '-bbb', axis=1)
series3 = df.iloc[index3].apply(lambda x: x['a'] + '-ccc', axis=1)
print(series1)
print()
print(series2)
print()
print(series3)
print()
df['series'] = pd.concat([series1, series2, series3], ignore_index=True)
print(df)
print()
df['series'] = pd.concat([series2, series1, series3], ignore_index=True)
print(df)
print()
df['series'] = pd.concat([series3, series2, series1], ignore_index=True)
print(df)
print()
This results in this output:
0 a-aaa
dtype: object
1 b-bbb
dtype: object
2 c-ccc
dtype: object
a b series
0 a aa a-aaa
1 b bb b-bbb
2 c cc c-ccc
3 d dd NaN
a b series
0 a aa b-bbb
1 b bb a-aaa
2 c cc c-ccc
3 d dd NaN
a b series
0 a aa c-ccc
1 b bb b-bbb
2 c cc a-aaa
3 d dd NaN
I would expect only a's in row0, only b's in row1 and only c's in row2, but that's not the case...
Update 3
Here's a better example which should demonstrate the expected behaviour. As I said, the use case is that for a given DataFrame, a function evaluates each row and possibly returns an error message for some of the rows as a Series (some indexes are contained, some are not; if no error returns, the error series is empty).
In [12]:
s1 = pd.Series(['b', 'd'], index=[1, 3])
s2 = pd.Series(['a', 'b'], index=[0, 1])
s3 = pd.Series(['c', 'e'], index=[2, 4])
s4 = pd.Series([], index=[], dtype=object)
pd.concat([s1, s2, s3, s4]).sort_index()
# I'd like to get:
#
# 0 a
# 1 b b
# 2 c
# 3 d
# 4 e
Out[12]:
0 a
1 b
1 b
2 c
3 d
4 e
dtype: object
When concatenating, the default is to use the existing indices; if those collide when you assign the result back to the DataFrame, you get the ValueError you've found, so you need to set ignore_index=True:
In [33]:
series = pd.concat([series1, series2, series3], ignore_index=True)
df['series'] = series
print(df)
a b series
0 a aa bb-bbb
1 b bb a-aaa
2 c cc a-ccc
3 d dd NaN
EDIT
I think I know what you want now; you can achieve it by converting the series into a dataframe and then merging on both indices:
In [96]:
df = pd.DataFrame({'a': ['a', 'b', 'c', 'd'], 'b': ['aa', 'bb', 'cc', 'dd']})
index1 = df[df['a'] == 'b'].index
index2 = df[df['a'] == 'a'].index
series1 = df.iloc[index1].apply(lambda x: x['b'] + '-bbb', axis=1)
series2 = df.iloc[index2].apply(lambda x: x['a'] + '-aaa', axis=1)
series3 = df.iloc[index2].apply(lambda x: x['a'] + '-ccc', axis=1)
# we now don't ignore the index in order to preserve the identity of the row we want to merge back to later
series = pd.concat([series1, series2, series3])
# construct a dataframe from the series and give the column a name
df1 = pd.DataFrame({'series':series})
# perform an outer merge on both df's indices
df.merge(df1, left_index=True, right_index=True, how='outer')
Out[96]:
a b series
0 a aa a-aaa
0 a aa a-ccc
1 b bb bb-bbb
2 c cc NaN
3 d dd NaN
How about concat?
s1 = df.iloc[index1].apply(lambda x: x['b'] + '-bbb', axis=1)
s2 = df.iloc[index2].apply(lambda x: x['a'] + '-aaa', axis=1)
s = pd.concat([s1,s2])
print(s)
1 bb-bbb
0 a-aaa
dtype: object
I might have found a solution. I hope someone can comment on it...
s1 = pd.Series(['b', 'd'], index=[1, 3])
s2 = pd.Series(['a', 'b'], index=[0, 1])
s3 = pd.Series(['c', 'e'], index=[2, 4])
s4 = pd.Series([], index=[], dtype=object)
pd.concat([s1, s2, s3, s4]).sort_index()
df1 = pd.DataFrame(s1)
df2 = pd.DataFrame(s2)
df3 = pd.DataFrame(s3)
df4 = pd.DataFrame(s4)
d = pd.DataFrame({0:[]})
d = pd.merge(df1, d, how='outer', left_index=True, right_index=True)
d = d.fillna('')
d = pd.DataFrame(d['0_x'] + d['0_y'])
d = pd.merge(df2, d, how='outer', left_index=True, right_index=True)
d = d.fillna('')
d = pd.DataFrame(d['0_x'] + d['0_y'])
d = pd.merge(df3, d, how='outer', left_index=True, right_index=True)
d = d.fillna('')
d = pd.DataFrame(d['0_x'] + d['0_y'])
d = pd.merge(df4, d, how='outer', left_index=True, right_index=True)
d = d.fillna('')
d = pd.DataFrame(d['0_x'] + d['0_y'])
print(d)
which returns
0
0 a
1 bb
2 c
3 d
4 e
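For the record, the chain of merge/fillna steps above can be collapsed: concatenate everything once, then string-join the values that share an index label; a sketch that reproduces the output above:
combined = (pd.concat([s1, s2, s3, s4])
              .groupby(level=0)
              .agg(''.join))
print(combined)
# 0     a
# 1    bb
# 2     c
# 3     d
# 4     e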
