I have the following dataframe:
Col
0 A,B,C
1 B,A,D
2 C
3 A,D,E,F
4 B,C,F
df = pd.DataFrame({'Col': ['A,B,C', 'B,A,D', 'C', 'A,D,E,F', 'B,C,F']})
which needs to be turned into:
A B C D E F
0 A B C
1 A B D
2 C
3 A D E F
4 B C F
You could use str.get_dummies to get the dummy variables, then multiply with the columns:
tmp = df['Col'].str.get_dummies(sep=',')
out = tmp * tmp.columns
One-liner as suggested by @piRSquared:
out = df.Col.str.get_dummies(',').pipe(lambda d: d*[*d])
Output:
A B C D E F
0 A B C
1 A B D
2 C
3 A D E F
4 B C F
Benchmark:
On data created by duplicating the data in the OP:
@piRSquared's first method using numpy methods is the fastest solution here.
On randomly generated DataFrames of increasing sizes:
Code to reproduce the plot:
import perfplot
import pandas as pd
import numpy as np
def enke(df):
    """Expand comma-separated labels in 'Col': one-hot encode, then turn
    each 1 into its column's own label text (0 * 'A' == '')."""
    indicators = df['Col'].str.get_dummies(sep=',')
    return indicators.mul(indicators.columns)
def mozway(df):
    """Build one Series per row whose values double as its index (the labels),
    then stack them side by side and transpose; missing labels become ''."""
    per_row = []
    for text in df['Col']:
        labels = text.split(',')
        per_row.append(pd.Series(labels, index=labels))
    return pd.concat(per_row, axis=1).T.fillna('')
def piRSquared(df):
    """Vectorised expansion of comma-separated 'Col' labels into a label matrix.

    Fix: the original used ``np.full((n, m), '')``, whose dtype is ``'<U1'``
    (one-character unicode), silently truncating any label longer than one
    character; ``dtype=object`` keeps labels of arbitrary length.
    """
    n = df.shape[0]
    # Row index for each flattened label: row r contributes (commas in r) + 1 labels.
    i = np.repeat(np.arange(n), df.Col.str.count(',')+1)
    # c: sorted unique labels; j: for every flattened label, its index into c.
    c, j = np.unique(df.Col.str.cat(sep=',').split(','), return_inverse=True)
    m = c.shape[0]
    a = np.full((n, m), '', dtype=object)  # object dtype avoids '<U1' truncation
    a[i, j] = c[j]                         # scatter each label into its (row, col) slot
    return pd.DataFrame(a, df.index, c)
def piRSquared2(df):
    """Like piRSquared, but flattens with numpy string ops instead of pandas .str.

    Fix: ``dtype=object`` on the output array — ``np.full((n, m), '')`` is
    ``'<U1'`` and truncates multi-character labels on assignment.
    """
    n = df.shape[0]
    base = df.Col.to_numpy().astype(str)
    commas = np.char.count(base, ',')       # separators per row
    sepped = ','.join(base).split(',')      # flat list of every label
    i = np.repeat(np.arange(n), commas+1)   # row index for each flat label
    c, j = np.unique(sepped, return_inverse=True)
    m = c.shape[0]
    a = np.full((n, m), '', dtype=object)   # object dtype avoids '<U1' truncation
    a[i, j] = c[j]
    return pd.DataFrame(a, df.index, c)
def constructor1(n):
    """Benchmark input: the OP's 5-row sample frame repeated n times."""
    sample = pd.DataFrame({'Col': ['A,B,C', 'B,A,D', 'C', 'A,D,E,F', 'B,C,F']})
    return pd.concat([sample] * n, ignore_index=True)
def constructor2(n):
    """Benchmark input: n rows, each a comma-join of 1-9 distinct uppercase letters.

    Fix: the benchmark snippet above only imports perfplot/pandas/numpy, so
    ``ascii_uppercase`` was an unresolved name; import it locally.
    """
    from string import ascii_uppercase  # not imported by the surrounding snippet
    uc = np.array([*ascii_uppercase])
    k = [','.join(np.random.choice(uc, x, replace=False))
         for x in np.random.randint(1, 10, size=n)]
    return pd.DataFrame({'Col': k})
# Benchmark the four implementations against each other with perfplot.
kernels = [enke, piRSquared, piRSquared2, mozway]
df = pd.DataFrame({'Col': ['A,B,C', 'B,A,D', 'C', 'A,D,E,F', 'B,C,F']})
perfplot.plot(
    setup=constructor1,                  # constructor1(n) builds a 5*n-row frame
    kernels=kernels,
    labels=[func.__name__ for func in kernels],
    n_range=[2**k for k in range(15)],   # n = 1, 2, 4, ..., 16384
    xlabel='len(df)',
    logx=True,
    logy=True,
    relative_to=0,                       # times shown relative to the first kernel (enke)
    equality_check=pd.DataFrame.equals)  # verify all kernels produce identical frames
Using pandas.concat:
pd.concat([pd.Series((idx:=x.split(',')), index=idx)
for x in df['Col']], axis=1).T
For python < 3.8:
pd.concat([pd.Series(val, index=val)
for x in df['Col']
for val in [x.split(',')]], axis=1).T
Output:
A B C D E F
0 A B C NaN NaN NaN
1 A B NaN D NaN NaN
2 NaN NaN C NaN NaN NaN
3 A NaN NaN D E F
4 NaN B C NaN NaN F
NB. add fillna('') to have empty strings for missing values
A B C D E F
0 A B C
1 A B D
2 C
3 A D E F
4 B C F
This comes from my Project Overkill stash of tricks.
I'll use Numpy to identify where the labels are to be dropped in the 2-d array.
n = df.shape[0]                       # Get number of rows
base = df.Col.to_numpy().astype(str)  # Turn `'Col'` to Numpy array
commas = np.char.count(base, ',')     # Count commas in each row
sepped = ','.join(base).split(',')    # Flat array of each element
i = np.repeat(np.arange(n), commas+1) # Row indices for flat array
# Note that I could've used `pd.factorize` here but I actually wanted
# a sorted array of labels so `np.unique` was the way to go.
# Otherwise I'd have used `j, c = pd.factorize(sepped)`
c, j = np.unique(sepped, return_inverse=True)  # `j` col indices for flat array
# `c` will be the column labels
m = c.shape[0]                        # Get number of unique labels
# NOTE(review): np.full((n, m), '') has dtype '<U1' (one character), so any
# label longer than one character would be truncated below; the sample data
# only uses single letters, but pass dtype=object to generalise.
a = np.full((n, m), '')               # Default array of empty strings
a[i, j] = c[j]                        # Use row/col indices to insert
                                      # the column labels in right spots
pd.DataFrame(a, df.index, c)          # Construct new dataframe
A B C D E F
0 A B C
1 A B D
2 C
3 A D E F
4 B C F
Time Testing
The Functions
import pandas as pd
import numpy as np
from string import ascii_uppercase
def pir(s):
    """Expand a Series of comma-joined labels into one column per unique label.

    Fix: ``np.full((n, m), '')`` is ``'<U1'`` and truncates labels longer than
    one character; ``dtype=object`` keeps labels of arbitrary length.
    """
    n = s.shape[0]
    base = s.to_numpy().astype(str)
    commas = np.char.count(base, ',')      # separators per row
    sepped = ','.join(base).split(',')     # flat list of every label
    i = np.repeat(np.arange(n), commas+1)  # row index for each flat label
    c, j = np.unique(sepped, return_inverse=True)
    m = c.shape[0]
    a = np.full((n, m), '', dtype=object)  # object dtype avoids '<U1' truncation
    a[i, j] = c[j]
    return pd.DataFrame(a, s.index, c)
def pir2(s):
    """Variant of ``pir`` that flattens via pandas ``str.cat``/``str.count``.

    Fix: ``dtype=object`` on the output array — ``np.full((n, m), '')`` is
    ``'<U1'`` and truncates multi-character labels on assignment.
    """
    n = s.shape[0]
    sepped = s.str.cat(sep=',').split(',')  # flat list of every label
    commas = s.str.count(',')               # separators per row
    i = np.repeat(np.arange(n), commas+1)   # row index for each flat label
    c, j = np.unique(sepped, return_inverse=True)
    m = c.shape[0]
    a = np.full((n, m), '', dtype=object)   # object dtype avoids '<U1' truncation
    a[i, j] = c[j]
    return pd.DataFrame(a, s.index, c)
def mozway(s):
    """Stack one Series per entry (values == index == the entry's labels),
    transpose so entries become rows, and blank out missing labels."""
    pieces = []
    for entry in s:
        labels = entry.split(',')
        pieces.append(pd.Series(labels, index=labels))
    return pd.concat(pieces, axis=1).T.fillna('')
def enke(s):
    """One-hot encode the labels, then replace each 1 with its column's label."""
    indicators = s.str.get_dummies(',')
    return indicators * indicators.columns
The test data constructor
def constructor(n, m):
    """Random test Series: n comma-joined strings of 1..m-1 distinct uppercase
    letters each (m is capped at 26 so replace=False stays valid)."""
    letters = np.array(list(ascii_uppercase))
    m = min(26, m)
    lengths = np.random.randint(1, m, size=n)
    rows = [','.join(np.random.choice(letters, size=length, replace=False))
            for length in lengths]
    return pd.Series(rows)
The results dataframe
# Wall-time results: one row per implementation, one column per input size.
res = pd.DataFrame(
    index=['enke', 'mozway', 'pir', 'pir2'],
    columns=[10, 30, 100, 300, 1000, 3000, 10000, 30000],
    dtype=float
)
Run the test
from IPython.display import clear_output
# For each input size, time each implementation 50 times.
# NOTE(review): assumes `from timeit import timeit` ran earlier — the import
# is not shown in this snippet; also assumes an IPython/Jupyter session.
for j in res.columns:
    s = constructor(j, 10)
    for i in res.index:
        stmt = f'{i}(s)'                              # e.g. "enke(s)"
        setp = f'from __main__ import s, {i}'         # timeit setup statement
        res.at[i, j] = timeit(stmt, setp, number=50)
    print(res)            # live progress display
    clear_output(True)    # overwrite the previous printout in the notebook
Show the results
res.T.plot(loglog=True)   # absolute timings on log-log axes
res.div(res.min()).T      # each size's timings scaled by its fastest implementation
enke mozway pir pir2
10 8.634105 19.416376 1.000000 2.300573
30 7.626107 32.741218 1.000000 2.028423
100 5.071308 50.539772 1.000000 1.533791
300 3.475711 66.638151 1.000000 1.184982
1000 2.616885 79.032159 1.012205 1.000000
3000 2.518983 91.521389 1.094863 1.000000
10000 2.536735 98.172680 1.131758 1.000000
30000 2.603489 99.756007 1.149734 1.000000
So I made this dataframe
# Build a 6x6 character grid: "carl" first, then the remaining alphanumerics.
alp = "abcdefghijklmnopqrstuvwxyz0123456789"
s = "carl"
# Drop the letters of "carl" from the alphabet so they are not repeated.
for ch in s:
    alp = alp.replace(ch, "")
jaa = s + alp                       # 4 + 32 = 36 characters total
x = list(jaa)
array = np.array(x)
grid = np.reshape(array, (6, 6))    # 36 chars -> 6x6 matrix
dt = pd.DataFrame(grid)
dt.columns = [1, 2, 3, 4, 5, 6]     # 1-based labels on both axes
dt.index = [1, 2, 3, 4, 5, 6]
dt
1 2 3 4 5 6
1 c a r l b d
2 e f g h i j
3 k m n o p q
4 s t u v w x
5 y z 0 1 2 3
6 4 5 6 7 8 9
I want to search a value , and print its row(index) and column.
For example, 'h', the output i want is 2,4.
Is there any way to get that output?
row, col = np.where(dt == "h")  # positional (row, col) arrays of every match
print(dt.index[row[0]], dt.columns[col[0]])  # translate the first hit to labels
I have a list of pairs--stored in a DataFrame--each pair having an 'a' column and a 'b' column. For each pair I want to return the 'b's that have the same 'a'. For example, given the following set of pairs:
a b
0 c d
1 e f
2 c g
3 e h
4 i j
5 e k
I would like to end up with:
a b equivalents
0 c d [g]
1 e f [h, k]
2 c g [d]
3 e h [f, k]
4 i j []
5 e k [f, h]
I can do this with the following:
def equivalents(x):
    """For row x, list the 'b' values of all rows sharing x's 'a', minus the
    first match (the OP's original slicing — per-row full scan, hence slow)."""
    matches = pairs[pairs["a"] == x["a"]]["b"].tolist()
    if not matches:
        return matches
    return matches[1:]
pairs["equivalents"] = pairs.apply(equivalents, axis = 1)
But it is painfully slow on larger sets (e.g. 1 million plus pairs). Any suggestions how I could do this faster?
I think this ought to be a bit faster. First, just add them up.
df['equiv'] = df.groupby('a')['b'].transform(sum)
a b equiv
0 c d dg
1 e f fhk
2 c g dg
3 e h fhk
4 i j j
5 e k fhk
Now convert to a list and remove whichever letter is already in column 'b'.
df.apply( lambda x: [ y for y in list( x.equiv ) if y != x.b ], axis=1 )
0 [g]
1 [h, k]
2 [d]
3 [f, k]
4 []
5 [f, h]
If I have the following:
df = pd.DataFrame(np.random.random((4, 8)))
# Fix: the original second comprehension had an unbalanced bracket
# ("[y for y in ['iijjkkll')"), a syntax error; zip the two strings directly.
tupleList = list(zip('abcdefgh', 'iijjkkll'))
ind = pd.MultiIndex.from_tuples(tupleList)  # two-level column index
df.columns = ind
In [71]: df
Out[71]:
a b c d e f g \
i i j j k k l
0 0.968112 0.809183 0.144320 0.518120 0.820079 0.648237 0.971552
1 0.959022 0.721705 0.139588 0.408940 0.230956 0.907192 0.467016
2 0.335085 0.537437 0.725119 0.486447 0.114048 0.150150 0.894322
3 0.051249 0.186547 0.779814 0.905914 0.024298 0.002489 0.339714
h
l
0 0.438330
1 0.225447
2 0.331413
3 0.530789
[4 rows x 8 columns]
what is the easiest way to select the columns that have a second level label of "j" or "k"?
c d e f
j j k k
0 0.948030 0.243993 0.627497 0.729024
1 0.087703 0.874968 0.581875 0.996466
2 0.802155 0.213450 0.375096 0.184569
3 0.164278 0.646088 0.201323 0.022498
I can do this:
df.loc[:, df.columns.get_level_values(1).isin(['j', 'k'])]
But that seems pretty verbose for something that feels like it should be simple. Any better approaches?
See here for multiindex using slicers, introduced in 0.14.0
In [36]: idx = pd.IndexSlice
In [37]: df.loc[:, idx[:, ['j', 'k']]]
Out[37]:
c d e f
j j k k
0 0.750582 0.877763 0.262696 0.226005
1 0.025902 0.967179 0.125647 0.297304
2 0.463544 0.104973 0.154113 0.284820
3 0.631695 0.841023 0.820907 0.938378
Taking the idea from From this answer: pandas: apply function to DataFrame that can return multiple rows
In my case, I have something like this, but larger:
df = pd.DataFrame({'Restaurants': ['A', 'B', 'C'],
'Tables':['D', 'E', 'F'],
'Chairs': ['G', 'H', 'I'],
'Menus': ['J', 'K', 'L'],
'Fridges': ['M', 'N', 'O'],
'Etc...': ['P', 'Q', 'R'], 'count':[3, 2, 3]})
Restaurants Tables Chairs Menus Fridges Etc... count
0 A D G J M P 3
1 B E H K N Q 2
2 C F I L O R 3
and I would like to modify this:
def f(group):
    """Return the group's 'class' value repeated 'count' times, one row each
    (template from the linked answer, to be applied per groupby group).

    Modernised: ``DataFrame.irow()`` was removed from pandas long ago —
    ``.iloc[0]`` is the current positional equivalent — and the bare
    ``DataFrame`` name is qualified as ``pd.DataFrame``.
    """
    row = group.iloc[0]  # first row of the group; carries 'class' and 'count'
    return pd.DataFrame({'class': [row['class']] * row['count']})
df.groupby('class', group_keys=False).apply(f)
so I could get
Restaurants Tables Chairs Menus Fridges Etc...
0 A D G J M P
1 A D G J M P
2 A D G J M P
0 B E H K N Q
1 B E H K N Q
0 C F I L O R
1 C F I L O R
2 C F I L O R
Is there an easy way to do it without typing every column's name?
#!/usr/bin/env python
import pandas as pd
from collections import defaultdict

# Repeat every row of `df` 'count' times without naming individual columns.
# Fix: DataFrame.ix was removed in pandas 1.0; use positional .iloc instead.
d = defaultdict(list)
for n in range(len(df)):
    count = df.iloc[n]['count']              # repetitions for this row
    for c in df.columns.tolist()[:-1]:       # every column except 'count'
        d[c].extend([df.iloc[n][c]] * count)
    d['index'].extend(range(count))          # per-row index 0..count-1
new_df = pd.DataFrame(d, index=d['index']).drop(['index'], axis=1)
new_df
Restaurants Tables Chairs Menus Fridges Etc...
0 A D G J M P
1 A D G J M P
2 A D G J M P
0 B E H K N Q
1 B E H K N Q
0 C F I L O R
1 C F I L O R
2 C F I L O R