enumerate equal elements within dataframe column - python

I would like to enumerate elements in a column which appear more than once. Elements that appear only once should not be modified.
I have come up with two solutions, but they seem to be very inelegant, and I am hoping that there is a better solution.
Input:
X
0 A
1 B
2 C
3 A
4 C
5 C
6 D
Output:
new_name
X
A A1
A A2
B B
C C1
C C2
C C3
D D
Here are two possible ways of achieving this, one using .expanding().count(), the other using .cumcount(), but both are pretty ugly:
import pandas as pd
def solution_1(df):
    pvt = (df.groupby(by='X')
             .expanding()
             .count()
             .rename(columns={'X': 'Counter'})
             .reset_index()
             .drop('level_1', axis=1)
             .assign(name=lambda s: s['X'] + s['Counter'].astype(int).astype(str))
             .set_index('X')
           )
    pvt2 = (df.reset_index()
              .groupby(by='X')
              .count()
              .rename(columns={'index': 'C'})
            )
    df2 = pd.merge(left=pvt, right=pvt2, left_index=True, right_index=True)
    ind = df2['C'] > 1
    df2.loc[ind, 'new_name'] = df2.loc[ind, 'name']
    df2.loc[~ind, 'new_name'] = df2.loc[~ind].index
    df2 = df2.drop(['Counter', 'C', 'name'], axis=1)
    return df2
def solution_2(df):
    pvt = pd.DataFrame(df.groupby(by='X')
                         .agg({'X': 'cumcount'})
                       ).rename(columns={'X': 'Counter'})
    pvt2 = pd.DataFrame(df.groupby(by='X')
                          .agg({'X': 'count'})
                        ).rename(columns={'X': 'Total Count'})
    # print(pvt2)
    df2 = df.merge(pvt, left_index=True, right_index=True)
    df3 = df2.merge(pvt2, left_on='X', right_index=True)
    ind = df3['Total Count'] > 1
    df3['Counter'] = df3['Counter'] + 1
    df3.loc[ind, 'new_name'] = df3.loc[ind, 'X'] + df3.loc[ind, 'Counter'].astype(int).astype(str)
    df3.loc[~ind, 'new_name'] = df3.loc[~ind, 'X']
    df3 = df3.drop(['Counter', 'Total Count'], axis=1).set_index('X')
    return df3
if __name__ == '__main__':
    s = ['A', 'B', 'C', 'A', 'C', 'C', 'D']
    df = pd.DataFrame(s, columns=['X'])
    print(df)
    sol_1 = solution_1(df)
    print(sol_1)
    sol_2 = solution_2(df)
    print(sol_2)
Any suggestions? Thanks a lot.

First we use GroupBy.cumcount to get a cumulative count for each unique value in X.
Then we add 1 and convert the numeric values to string with Series.astype.
Finally we concatenate the values to our original column with Series.str.cat:
df['new_name'] = df['X'].str.cat(df.groupby('X').cumcount().add(1).astype(str))
X new_name
0 A A1
1 A A2
2 B B1
3 C C1
4 C C2
5 C C3
6 D D1
If you don't want a number on the values which appear only once, we can use:
import numpy as np

df['new_name'] = np.where(df.groupby('X')['X'].transform('size').eq(1),
                          df['new_name'].str.replace(r'\d', '', regex=True),
                          df['new_name'])
X new_name
0 A A1
1 A A2
2 B B
3 C C1
4 C C2
5 C C3
6 D D
All in one line:
df['new_name'] = np.where(df.groupby('X')['X'].transform('size').ne(1),
                          df['X'].str.cat(df.groupby('X').cumcount().add(1).astype(str)),
                          df['X'])

IIUC
df.X+(df.groupby('X').cumcount()+1).mask(df.groupby('X').X.transform('count').eq(1),'').astype(str)
Out[18]:
0 A1
1 B
2 C1
3 A2
4 C2
5 C3
6 D
dtype: object
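If you also want the output shaped like in the question (indexed by X), one possible way to assign the result back could be the following sketch (kind='stable' keeps A1 before A2 when sorting):
counter = (df.groupby('X').cumcount() + 1).mask(
    df.groupby('X')['X'].transform('count').eq(1), '').astype(str)
df['new_name'] = df['X'] + counter
# sort by X without reordering equal keys, then use X as the index
print(df.sort_values('X', kind='stable').set_index('X')[['new_name']])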

Related

How to groupby a column but keep all rows as columns

I have a dataframe that was the result of a join operation. This operation had multiple matches, resulting in multiple rows. I want the resulting match rows to be moved into columns. Here is an example:
import pandas as pd
a = pd.DataFrame([[111,2,3]], columns=['id', 'var1', 'var2'])
b = pd.DataFrame([[111, '999', 'some data'],
                  [111, '999888', 'some more data']],
                 columns=['id', 'B', 'C'])
c = pd.merge(a, b, on='id')
I get:
id var1 var2 B C
0 111 2 3 999 some data
1 111 2 3 999888 some more data
but really I want:
id var1 var2 B C B C
0 111 2 3 999 some data 999888 some more data
I was thinking pivot was what I wanted, but it makes the values the columns, which is not what I want. How can I achieve this, and what is this operation called?
EDIT: To clarify, I don't care about the column names, could be b1 and b2 etc.
EDIT2: Many of the solutions do not work if there are more matches. Here is another example:
a = pd.DataFrame([[111,2,3], [222,3,4]], columns=['id', 'var1', 'var2'])
b = pd.DataFrame([[111, '999', 'some data'],
                  [111, '999888', 'some more data'],
                  [111, '999888777', 'and some more data'],
                  [222, '111222', 'some extra data'],
                  [222, '222333', 'and more extra data']],
                 columns=['id', 'B', 'C'])
c = pd.merge(a, b, on='id')
IIUC, you can first reshape your dataframe b to force having duplicated columns, then join to a:
b2 = (b
      .assign(col=b.groupby('id').cumcount())
      .pivot(index='id', columns='col')
      .sort_index(level='col', axis=1, sort_remaining=False)
      .droplevel('col', axis=1)
      )
# B C B C
# id
# 111 999 some data 999888 some more data
c = a.join(b2, on='id')
# id var1 var2 B C B C
# 0 111 2 3 999 some data 999888 some more data
with non-duplicated column names:
b2 = (b.assign(col=b.groupby('id').cumcount().add(1))
       .pivot(index='id', columns='col')
       .sort_index(level='col', axis=1, sort_remaining=False)
       .pipe(lambda d: d.set_axis(d.columns.map(lambda x: '_'.join(map(str, x))),
                                  axis=1))
      )
# B_1 C_1 B_2 C_2
# id
# 111 999 some data 999888 some more data
c = a.join(b2, on='id')
# id var1 var2 B_1 C_1 B_2 C_2
# 0 111 2 3 999 some data 999888 some more data
With duplicate column names:
Solution allowing duplicate columns for multiple results per id:
import pandas as pd
a = pd.DataFrame([[111,2,3], [222,3,4]], columns=['id', 'var1', 'var2'])
b = pd.DataFrame([[111, '999', 'some data'],
                  [111, '999888', 'some more data'],
                  [111, '999888777', 'and some more data'],
                  [222, '111222', 'some extra data'],
                  [222, '222333', 'and more extra data']],
                 columns=['id', 'B', 'C'])
c = pd.merge(a, b, on='id')
print(c)
resultColNames = ['B', 'C']
nResCols = len(resultColNames)
otherColNames = list(set(c.columns) - set(resultColNames))
maxResultSets = c.groupby('id').size().max()
deDup = False
intToResultCol = {i: resultColNames[i % nResCols] + ('_' + str(i // nResCols) if deDup else '')
                  for i in range(maxResultSets * nResCols)}
rhs = pd.concat([
    v[resultColNames].unstack().to_frame().swaplevel().sort_index()
                     .droplevel(1).reset_index(drop=True).T
    for _, v in c.groupby(otherColNames)
]).reset_index(drop=True).rename(columns=intToResultCol)
c = pd.concat([a, rhs], axis=1)
print(c)
Input:
id var1 var2 B C
0 111 2 3 999 some data
1 111 2 3 999888 some more data
2 111 2 3 999888777 and some more data
3 222 3 4 111222 some extra data
4 222 3 4 222333 and more extra data
Output:
id var1 var2 B C B C B C
0 111 2 3 999 some data 999888 some more data 999888777 and some more data
1 222 3 4 111222 some extra data 222333 and more extra data NaN NaN
Without duplicate column names:
Change deDup boolean in code above to True:
deDup = True
Output:
id var1 var2 B_0 C_0 B_1 C_1 B_2 C_2
0 111 2 3 999 some data 999888 some more data 999888777 and some more data
1 222 3 4 111222 some extra data 222333 and more extra data NaN NaN
Old solution (prior to question update by OP):
A solution in brief:
c = pd.concat([
    c.groupby('id').nth(0).drop(columns=['B', 'C']).reset_index(),
    c[['B', 'C']].unstack().to_frame().swaplevel().sort_index().T
], axis=1)
c = c.rename(columns={col: col[1] for col in c.columns if isinstance(col, tuple)})
Output:
id var1 var2 B C B C
0 111 2 3 999 some data 999888 some more data
Avoiding duplicate names:
Changing the final line renaming columns will ensure column names are not duplicated:
c = c.rename(columns={col:f'{col[1]}_{col[0]}' for col in c.columns if isinstance(col, tuple)})
Result:
id var1 var2 B_0 C_0 B_1 C_1
0 111 2 3 999 some data 999888 some more data
Using structured column names for each match:
To get a result without duplicate column names, instead assigning the columns of each match's result set a structured name in the form of a tuple of the result set number (0, 1, ...) and the column name (B, C), we can do this:
import pandas as pd
a = pd.DataFrame([[111,2,3]], columns=['id', 'var1', 'var2'])
b = pd.DataFrame([[111,'999','some data'], [111,'999888','some more data']], columns=['id', 'B', 'C'])
c = pd.merge(a, b, on='id')
print(c)
x = c.groupby('id').nth(0).drop(columns=['B','C']).reset_index()
y = c[['B','C']].unstack().to_frame().swaplevel().sort_index().T
c = pd.concat([x, y], axis=1)
print(c)
Output:
id var1 var2 (0, B) (0, C) (1, B) (1, C)
0 111 2 3 999 some data 999888 some more data
Let us stack and unstack to reshape the dataframe:
k = ['id', 'var1', 'var2']
c = c.set_index([*k, c.groupby(k).cumcount().add(1)]).stack().unstack([-1, -2])
c.columns = c.columns.map('{0[0]}_{0[1]}'.format)
Result
print(c)
B_1 C_1 B_2 C_2
id var1 var2
111 2 3 999 some data 999888 some more data
Use:
res = c.loc[:0, ['id', 'var1', 'var2']]
temp = c[['B', 'C']].values.flatten()
res[[f'new{i}' for i in range(len(temp))]]=temp
Output:

how to groupby and join multiple rows from multiple columns at a time?

I want to know how to groupby a single column and join the strings of multiple columns for each row.
Here's an example dataframe:
import numpy as np
import pandas as pd

df = pd.DataFrame(np.array([['a', 'a', 'b', 'b'], [1, 1, 2, 2],
                            ['k', 'l', 'm', 'n']]).T,
                  columns=['a', 'b', 'c'])
print(df)
a b c
0 a 1 k
1 a 1 l
2 b 2 m
3 b 2 n
I've tried something like,
df.groupby(['b', 'a'])['c'].apply(','.join).reset_index()
b a c
0 1 a k,l
1 2 b m,n
But that is not my required output,
Desired output:
a b c
0 1 a,a k,l
1 2 b,b m,n
How can I achieve this? I need a scalable solution because I'm dealing with millions of rows.
I think you need to group by column b only and then, if necessary, create a list of columns to aggregate with GroupBy.agg:
df1 = df.groupby('b')[['a','c']].agg(','.join).reset_index()
#alternative if you want to join all columns except b
#df1 = df.groupby('b').agg(','.join).reset_index()
print (df1)
b a c
0 1 a,a k,l
1 2 b,b m,n
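On newer pandas versions an equivalent spelling uses a per-column aggregation dict, which reads a bit more explicitly (just a sketch of the same idea):
#same result, spelling out the join per column
df1 = df.groupby('b', as_index=False).agg({'a': ','.join, 'c': ','.join})
print (df1)
   b    a    c
0  1  a,a  k,l
1  2  b,b  m,n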

Pandas: Sort before aggregate within a group

I have the following Pandas dataframe:
A B C
A A Test1
A A Test2
A A XYZ
A B BA
A B AB
B A AA
I want to group this dataset twice: first by A and B to concatenate the groups within C, and afterwards only on A to get the groups defined solely by column A. The result looks like this:
A A Test1,Test2,XYZ
A B AB, BA
B A AA
And the final result should be:
A A,A:(Test1,Test2,XYZ), A,B:(AB, BA)
B B,A:(AA)
Concatenating itself works; however, the sorting does not seem to work.
Can anyone help me with this problem?
Kind regards.
Using groupby + join
s1=df.groupby(['A','B']).C.apply(','.join)
s1
Out[421]:
A B
A A Test1,Test2,XYZ
B BA,AB
B A AA
Name: C, dtype: object
s1.reset_index().groupby('A').apply(lambda x : x.set_index(['A','B'])['C'].to_dict())
Out[420]:
A
A {('A', 'A'): 'Test1,Test2,XYZ', ('A', 'B'): 'B...
B {('B', 'A'): 'AA'}
dtype: object
First sort_values by all 3 columns, then groupby and join C, then join the A and B columns, and last groupby again to build a dictionary per group:
df1 = df.sort_values(['A','B','C']).groupby(['A','B'])['C'].apply(','.join).reset_index()
#if the DataFrame has only these 3 columns
#df1 = df.sort_values(list(df.columns)).groupby(['A','B'])['C'].apply(','.join).reset_index()
df1['D'] = df1['A'] + ',' + df1['B']
print (df1)
A B C D
0 A A Test1,Test2,XYZ A,A
1 A B AB,BA A,B
2 B A AA B,A
s = df1.groupby('A').apply(lambda x: dict(zip(x['D'], x['C']))).reset_index(name='val')
print (s)
A val
0 A {'A,A': 'Test1,Test2,XYZ', 'A,B': 'AB,BA'}
1 B {'B,A': 'AA'}
If you need tuples, only change the first part of the code:
df1 = df.sort_values(['A','B','C']).groupby(['A','B'])['C'].apply(tuple).reset_index()
df1['D'] = df1['A'] + ',' + df1['B']
print (df1)
A B C D
0 A A (Test1, Test2, XYZ) A,A
1 A B (AB, BA) A,B
2 B A (AA,) B,A
s = df1.groupby('A').apply(lambda x: dict(zip(x['D'], x['C']))).reset_index(name='val')
print (s)
A val
0 A {'A,A': ('Test1', 'Test2', 'XYZ'), 'A,B': ('AB...
1 B {'B,A': ('AA',)}
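If the final result in the question is meant as one formatted string per value of A rather than a dict, a possible sketch reusing the comma-joined df1 from the first block (not the tuple variant); the exact output format here is assumed:
# build "A,B:(values)" items and join them per group of A
final = (df1.assign(item=lambda x: x['D'] + ':(' + x['C'] + ')')
            .groupby('A')['item']
            .agg(', '.join))
print (final)
A
A    A,A:(Test1,Test2,XYZ), A,B:(AB,BA)
B                               B,A:(AA)
Name: item, dtype: object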

Pandas - Interleave / Zip two DataFrames by row

Suppose I have two dataframes:
>> df1
0 1 2
0 a b c
1 d e f
>> df2
0 1 2
0 A B C
1 D E F
How can I interleave the rows? i.e. get this:
>> interleaved_df
0 1 2
0 a b c
1 A B C
2 d e f
3 D E F
(Note my real DFs have identical columns, but not the same number of rows).
What I've tried
inspired by this question (very similar, but it asks about columns):
import pandas as pd
from itertools import chain, zip_longest
df1 = pd.DataFrame([['a','b','c'], ['d','e','f']])
df2 = pd.DataFrame([['A','B','C'], ['D','E','F']])
concat_df = pd.concat([df1,df2])
new_index = chain.from_iterable(zip_longest(df1.index, df2.index))
# new_index now holds the interleaved row indices
interleaved_df = concat_df.reindex(new_index)
ValueError: cannot reindex from a duplicate axis
The last call fails because df1 and df2 have some identical index values (which is also the case with my real DFs).
Any ideas?
You can sort the index after concatenating and then reset the index i.e
import pandas as pd
df1 = pd.DataFrame([['a','b','c'], ['d','e','f']])
df2 = pd.DataFrame([['A','B','C'], ['D','E','F']])
concat_df = pd.concat([df1,df2]).sort_index().reset_index(drop=True)
Output :
0 1 2
0 a b c
1 A B C
2 d e f
3 D E F
EDIT (OmerB): In case you want to keep the order regardless of the index values:
import pandas as pd
df1 = pd.DataFrame([['a','b','c'], ['d','e','f']]).reset_index()
df2 = pd.DataFrame([['A','B','C'], ['D','E','F']]).reset_index()
concat_df = pd.concat([df1,df2]).sort_index().set_index('index')
Use toolz.interleave
In [1024]: from toolz import interleave
In [1025]: pd.DataFrame(interleave([df1.values, df2.values]))
Out[1025]:
0 1 2
0 a b c
1 A B C
2 d e f
3 D E F
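Note that interleave works on the raw .values, so the result above loses the original column labels and dtypes; if your real DataFrames have named columns you can pass them back explicitly, for example:
pd.DataFrame(interleave([df1.values, df2.values]), columns=df1.columns)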
Here's an extension of @Bharath's answer that can be applied to DataFrames with user-defined indexes without losing them, using pd.MultiIndex.
Define Dataframes with the full set of column/ index labels and names:
df1 = pd.DataFrame([['a','b','c'], ['d','e','f']], index=['one', 'two'], columns=['col_a', 'col_b','col_c'])
df1.columns.name = 'cols'
df1.index.name = 'rows'
df2 = pd.DataFrame([['A','B','C'], ['D','E','F']], index=['one', 'two'], columns=['col_a', 'col_b','col_c'])
df2.columns.name = 'cols'
df2.index.name = 'rows'
Add DataFrame ID to MultiIndex:
df1.index = pd.MultiIndex.from_product([[1], df1.index], names=["df_id", df1.index.name])
df2.index = pd.MultiIndex.from_product([[2], df2.index], names=["df_id", df2.index.name])
Then use @Bharath's concat() and sort_index():
data = pd.concat([df1, df2], axis=0, sort=True)
data.sort_index(axis=0, level=data.index.names[::-1], inplace=True)
Output:
cols col_a col_b col_c
df_id rows
1 one a b c
2 one A B C
1 two d e f
2 two D E F
You could also preallocate a new DataFrame, and then fill it using a slice.
import numpy as np

def interleave(dfs):
    # preallocate one empty column per dtype, large enough to hold all rows
    data = np.transpose(np.array([np.empty(dfs[0].shape[0] * len(dfs), dtype=dt)
                                  for dt in dfs[0].dtypes]))
    out = pd.DataFrame(data, columns=dfs[0].columns)
    # fill every len(dfs)-th row from each source frame
    for ix, df in enumerate(dfs):
        out.iloc[ix::len(dfs), :] = df.values
    return out
The preallocation code is taken from this question.
While there's a chance it could outperform the index method for certain data types / sizes, it won't behave gracefully if the DataFrames have different sizes.
Note - for ~200000 rows with 20 columns of mixed string, integer and floating types, the index method is around 5x faster.
You can try this way:
In [31]: import pandas as pd
...: from itertools import chain, zip_longest
...:
...: df1 = pd.DataFrame([['a','b','c'], ['d','e','f']])
...: df2 = pd.DataFrame([['A','B','C'], ['D','E','F']])
In [32]: concat_df = pd.concat([df1,df2]).sort_index()
...:
In [33]: interleaved_df = concat_df.reset_index(drop=1)
In [34]: interleaved_df
Out[34]:
0 1 2
0 a b c
1 A B C
2 d e f
3 D E F

Combining Series in Pandas

I need to combine multiple Pandas Series that contain string values. The series are messages that result from multiple validation steps. I'm trying to combine these messages into one Series to attach it to the DataFrame. The problem is that the result is empty.
This is an example:
import pandas as pd
df = pd.DataFrame({'a': ['a', 'b', 'c', 'd'], 'b': ['aa', 'bb', 'cc', 'dd']})
index1 = df[df['a'] == 'b'].index
index2 = df[df['a'] == 'a'].index
series = df.iloc[index1].apply(lambda x: x['b'] + '-bbb', axis=1)
series += df.iloc[index2].apply(lambda x: x['a'] + '-aaa', axis=1)
print series
# >>> series
# 0 NaN
# 1 NaN
Update
import pandas as pd
df = pd.DataFrame({'a': ['a', 'b', 'c', 'd'], 'b': ['aa', 'bb', 'cc', 'dd']})
index1 = df[df['a'] == 'b'].index
index2 = df[df['a'] == 'a'].index
series1 = df.iloc[index1].apply(lambda x: x['b'] + '-bbb', axis=1)
series2 = df.iloc[index2].apply(lambda x: x['a'] + '-aaa', axis=1)
series3 = df.iloc[index2].apply(lambda x: x['a'] + '-ccc', axis=1)
# series3 causes a ValueError: cannot reindex from a duplicate axis
series = pd.concat([series1, series2, series3])
df['series'] = series
print df
Update2
In this example the indices seem to get mixed up.
import pandas as pd
df = pd.DataFrame({'a': ['a', 'b', 'c', 'd'], 'b': ['aa', 'bb', 'cc', 'dd']})
index1 = df[df['a'] == 'a'].index
index2 = df[df['a'] == 'b'].index
index3 = df[df['a'] == 'c'].index
series1 = df.iloc[index1].apply(lambda x: x['a'] + '-aaa', axis=1)
series2 = df.iloc[index2].apply(lambda x: x['a'] + '-bbb', axis=1)
series3 = df.iloc[index3].apply(lambda x: x['a'] + '-ccc', axis=1)
print series1
print
print series2
print
print series3
print
df['series'] = pd.concat([series1, series2, series3], ignore_index=True)
print df
print
df['series'] = pd.concat([series2, series1, series3], ignore_index=True)
print df
print
df['series'] = pd.concat([series3, series2, series1], ignore_index=True)
print df
print
This results in this output:
0 a-aaa
dtype: object
1 b-bbb
dtype: object
2 c-ccc
dtype: object
a b series
0 a aa a-aaa
1 b bb b-bbb
2 c cc c-ccc
3 d dd NaN
a b series
0 a aa b-bbb
1 b bb a-aaa
2 c cc c-ccc
3 d dd NaN
a b series
0 a aa c-ccc
1 b bb b-bbb
2 c cc a-aaa
3 d dd NaN
I would expect only a's in row0, only b's in row1 and only c's in row2, but that's not the case...
Update 3
Here's a better example which should demonstrate the expected behaviour. As I said, the use case is that for a given DataFrame, a function evaluates each row and possibly returns an error message for some of the rows as a Series (some indexes are contained, some are not; if no errors are returned, the error series is empty).
In [12]:
s1 = pd.Series(['b', 'd'], index=[1, 3])
s2 = pd.Series(['a', 'b'], index=[0, 1])
s3 = pd.Series(['c', 'e'], index=[2, 4])
s4 = pd.Series([], index=[])
pd.concat([s1, s2, s3, s4]).sort_index()
# I'd like to get:
#
# 0 a
# 1 b b
# 2 c
# 3 d
# 4 e
Out[12]:
0 a
1 b
1 b
2 c
3 d
4 e
dtype: object
When concatenating, the default is to use the existing indices; however, if they collide, assigning the result back to the DataFrame will raise a ValueError as you've found, so you need to set ignore_index=True:
In [33]:
series = pd.concat([series1, series2, series3], ignore_index=True)
df['series'] = series
print (df)
a b series
0 a aa bb-bbb
1 b bb a-aaa
2 c cc a-ccc
3 d dd NaN
EDIT
I think I know what you want now: you can achieve it by converting the series into a dataframe and then merging on the indices:
In [96]:
df = pd.DataFrame({'a': ['a', 'b', 'c', 'd'], 'b': ['aa', 'bb', 'cc', 'dd']})
index1 = df[df['a'] == 'b'].index
index2 = df[df['a'] == 'a'].index
series1 = df.iloc[index1].apply(lambda x: x['b'] + '-bbb', axis=1)
series2 = df.iloc[index2].apply(lambda x: x['a'] + '-aaa', axis=1)
series3 = df.iloc[index2].apply(lambda x: x['a'] + '-ccc', axis=1)
# we now don't ignore the index in order to preserve the identity of the row we want to merge back to later
series = pd.concat([series1, series2, series3])
# construct a dataframe from the series and give the column a name
df1 = pd.DataFrame({'series':series})
# perform an outer merge on both df's indices
df.merge(df1, left_index=True, right_index=True, how='outer')
Out[96]:
a b series
0 a aa a-aaa
0 a aa a-ccc
1 b bb bb-bbb
2 c cc NaN
3 d dd NaN
how about concat?
s1 = df.iloc[index1].apply(lambda x: x['b'] + '-bbb', axis=1)
s2 = df.iloc[index2].apply(lambda x: x['a'] + '-aaa', axis=1)
s = pd.concat([s1,s2])
print s
1 bb-bbb
0 a-aaa
dtype: object
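To attach the combined messages back onto df, plain index alignment is enough here (a small sketch building on the snippet above; rows without a message stay NaN):
df['series'] = pd.concat([s1, s2]).sort_index()
print(df)
   a   b  series
0  a  aa   a-aaa
1  b  bb  bb-bbb
2  c  cc     NaN
3  d  dd     NaN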
I might have found a solution. I hope someone can comment on it...
s1 = pd.Series(['b', 'd'], index=[1, 3])
s2 = pd.Series(['a', 'b'], index=[0, 1])
s3 = pd.Series(['c', 'e'], index=[2, 4])
s4 = pd.Series([], index=[])
pd.concat([s1, s2, s3, s4]).sort_index()
df1 = pd.DataFrame(s1)
df2 = pd.DataFrame(s2)
df3 = pd.DataFrame(s3)
df4 = pd.DataFrame(s4)
d = pd.DataFrame({0:[]})
d = pd.merge(df1, d, how='outer', left_index=True, right_index=True)
d = d.fillna('')
d = pd.DataFrame(d['0_x'] + d['0_y'])
d = pd.merge(df2, d, how='outer', left_index=True, right_index=True)
d = d.fillna('')
d = pd.DataFrame(d['0_x'] + d['0_y'])
d = pd.merge(df3, d, how='outer', left_index=True, right_index=True)
d = d.fillna('')
d = pd.DataFrame(d['0_x'] + d['0_y'])
d = pd.merge(df4, d, how='outer', left_index=True, right_index=True)
d = d.fillna('')
d = pd.DataFrame(d['0_x'] + d['0_y'])
print d
which returns
0
0 a
1 bb
2 c
3 d
4 e
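For the Update 3 example, a shorter route to the same combined values is to concatenate everything and then join the strings that share an index label, for instance with groupby on the index level (a sketch, not the original answer's approach):
s1 = pd.Series(['b', 'd'], index=[1, 3])
s2 = pd.Series(['a', 'b'], index=[0, 1])
s3 = pd.Series(['c', 'e'], index=[2, 4])
s4 = pd.Series([], index=[], dtype=object)  # explicit dtype avoids a warning for the empty series
# join all strings that share an index label
combined = pd.concat([s1, s2, s3, s4]).groupby(level=0).agg(''.join)
print(combined)
0     a
1    bb
2     c
3     d
4     e
dtype: object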
