Split column into multiple columns with unique values in pandas - python

I have the following dataframe:
Col
0 A,B,C
1 B,A,D
2 C
3 A,D,E,F
4 B,C,F
df = pd.DataFrame({'Col': ['A,B,C', 'B,A,D', 'C', 'A,D,E,F', 'B,C,F']})
which needs to be turned into:
A B C D E F
0 A B C
1 A B D
2 C
3 A D E F
4 B C F

You could use str.get_dummies to get the dummy variables, then multiply by the column names:
tmp = df['Col'].str.get_dummies(sep=',')
out = tmp * tmp.columns
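For reference, str.get_dummies returns a 0/1 indicator frame, and multiplying an integer by a string in Python repeats the string (1 * 'A' is 'A', 0 * 'A' is ''), which is why the product recovers the labels. A sketch of the intermediate tmp:
#    A  B  C  D  E  F
# 0  1  1  1  0  0  0
# 1  1  1  0  1  0  0
# 2  0  0  1  0  0  0
# 3  1  0  0  1  1  1
# 4  0  1  1  0  0  1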
One-liner as suggested by @piRSquared (here [*d] unpacks the dummy frame's column labels into a list):
out = df.Col.str.get_dummies(',').pipe(lambda d: d*[*d])
Output:
A B C D E F
0 A B C
1 A B D
2 C
3 A D E F
4 B C F
Benchmark:
On data created by duplicating the data in the OP:
@piRSquared's first method, using numpy, is the fastest solution here.
On randomly generated DataFrames of increasing sizes:
Code to reproduce the plot:
import perfplot
import pandas as pd
import numpy as np
from string import ascii_uppercase  # needed by constructor2

def enke(df):
    tmp = df['Col'].str.get_dummies(sep=',')
    return tmp * tmp.columns

def mozway(df):
    return pd.concat([pd.Series((idx:=x.split(',')), index=idx)
                      for x in df['Col']], axis=1).T.fillna('')

def piRSquared(df):
    n = df.shape[0]
    i = np.repeat(np.arange(n), df.Col.str.count(',')+1)
    c, j = np.unique(df.Col.str.cat(sep=',').split(','), return_inverse=True)
    m = c.shape[0]
    a = np.full((n, m), '')
    a[i, j] = c[j]
    return pd.DataFrame(a, df.index, c)

def piRSquared2(df):
    n = df.shape[0]
    base = df.Col.to_numpy().astype(str)
    commas = np.char.count(base, ',')
    sepped = ','.join(base).split(',')
    i = np.repeat(np.arange(n), commas+1)
    c, j = np.unique(sepped, return_inverse=True)
    m = c.shape[0]
    a = np.full((n, m), '')
    a[i, j] = c[j]
    return pd.DataFrame(a, df.index, c)

def constructor1(n):
    df = pd.DataFrame({'Col': ['A,B,C', 'B,A,D', 'C', 'A,D,E,F', 'B,C,F']})
    return pd.concat([df]*n, ignore_index=True)

def constructor2(n):
    uc = np.array([*ascii_uppercase])
    k = [','.join(np.random.choice(uc, x, replace=False))
         for x in np.random.randint(1, 10, size=n)]
    return pd.DataFrame({'Col': k})

kernels = [enke, piRSquared, piRSquared2, mozway]
df = pd.DataFrame({'Col': ['A,B,C', 'B,A,D', 'C', 'A,D,E,F', 'B,C,F']})

perfplot.plot(
    setup=constructor1,
    kernels=kernels,
    labels=[func.__name__ for func in kernels],
    n_range=[2**k for k in range(15)],
    xlabel='len(df)',
    logx=True,
    logy=True,
    relative_to=0,
    equality_check=pd.DataFrame.equals)

Using pandas.concat:
pd.concat([pd.Series((idx:=x.split(',')), index=idx)
           for x in df['Col']], axis=1).T
For Python < 3.8 (no walrus operator):
pd.concat([pd.Series(val, index=val)
           for x in df['Col']
           for val in [x.split(',')]], axis=1).T
Output:
A B C D E F
0 A B C NaN NaN NaN
1 A B NaN D NaN NaN
2 NaN NaN C NaN NaN NaN
3 A NaN NaN D E F
4 NaN B C NaN NaN F
NB. add fillna('') to have empty strings for missing values
A B C D E F
0 A B C
1 A B D
2 C
3 A D E F
4 B C F

This comes from my Project Overkill stash of tricks.
I'll use Numpy to identify where the labels should be placed in the 2-d array.
n = df.shape[0] # Get number of rows
base = df.Col.to_numpy().astype(str) # Turn `'Col'` to Numpy array
commas = np.char.count(base, ',') # Count commas in each row
sepped = ','.join(base).split(',') # Flat array of each element
i = np.repeat(np.arange(n), commas+1) # Row indices for flat array
# Note that I could've used `pd.factorize` here but I actually wanted
# a sorted array of labels so `np.unique` was the way to go.
# Otherwise I'd have used `j, c = pd.factorize(sepped)`
c, j = np.unique(sepped, return_inverse=True) # `j` col indices for flat array
# `c` will be the column labels
m = c.shape[0] # Get number of unique labels
a = np.full((n, m), '') # Default array of empty strings
a[i, j] = c[j] # Use row/col indices to insert
# the column labels in right spots
pd.DataFrame(a, df.index, c) # Construct new dataframe
A B C D E F
0 A B C
1 A B D
2 C
3 A D E F
4 B C F
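To make the indexing concrete, here is what the intermediate arrays hold for the sample data (a sketch; the values follow directly from the definitions above):
# sepped: ['A','B','C', 'B','A','D', 'C', 'A','D','E','F', 'B','C','F']
# i (row index per element):    [0,0,0, 1,1,1, 2, 3,3,3,3, 4,4,4]
# c (sorted unique labels):     ['A','B','C','D','E','F']
# j (column index per element): [0,1,2, 1,0,3, 2, 0,3,4,5, 1,2,5]
# a[i, j] = c[j] then writes each label into (its row, its column).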
Time Testing
The Functions
import pandas as pd
import numpy as np
from string import ascii_uppercase
from timeit import timeit  # used in the test loop below

def pir(s):
    n = s.shape[0]
    base = s.to_numpy().astype(str)
    commas = np.char.count(base, ',')
    sepped = ','.join(base).split(',')
    i = np.repeat(np.arange(n), commas+1)
    c, j = np.unique(sepped, return_inverse=True)
    m = c.shape[0]
    a = np.full((n, m), '')
    a[i, j] = c[j]
    return pd.DataFrame(a, s.index, c)

def pir2(s):
    n = s.shape[0]
    sepped = s.str.cat(sep=',').split(',')
    commas = s.str.count(',')
    i = np.repeat(np.arange(n), commas+1)
    c, j = np.unique(sepped, return_inverse=True)
    m = c.shape[0]
    a = np.full((n, m), '')
    a[i, j] = c[j]
    return pd.DataFrame(a, s.index, c)

def mozway(s):
    return pd.concat([
        pd.Series((idx:=x.split(',')), index=idx)
        for x in s
    ], axis=1).T.fillna('')

def enke(s):
    return s.str.get_dummies(',').pipe(lambda d: d*d.columns)
The test data constructor
def constructor(n, m):
    uc = np.array([*ascii_uppercase])
    m = min(26, m)
    k = [','.join(np.random.choice(uc, x, replace=False))
         for x in np.random.randint(1, m, size=n)]
    return pd.Series(k)
The results dataframe
res = pd.DataFrame(
    index=['enke', 'mozway', 'pir', 'pir2'],
    columns=[10, 30, 100, 300, 1000, 3000, 10000, 30000],
    dtype=float
)
Run the test
from IPython.display import clear_output

for j in res.columns:
    s = constructor(j, 10)
    for i in res.index:
        stmt = f'{i}(s)'
        setp = f'from __main__ import s, {i}'
        res.at[i, j] = timeit(stmt, setp, number=50)
    print(res)
    clear_output(True)
Show the results
res.T.plot(loglog=True)
res.div(res.min()).T
enke mozway pir pir2
10 8.634105 19.416376 1.000000 2.300573
30 7.626107 32.741218 1.000000 2.028423
100 5.071308 50.539772 1.000000 1.533791
300 3.475711 66.638151 1.000000 1.184982
1000 2.616885 79.032159 1.012205 1.000000
3000 2.518983 91.521389 1.094863 1.000000
10000 2.536735 98.172680 1.131758 1.000000
30000 2.603489 99.756007 1.149734 1.000000

Related

Convert 2 column dataframe into multi-level hierarchical dataframe

I have a pandas dataframe
From  To
A     B
A     C
D     E
F     F
B     G
B     H
B     I
G     J
G     K
L     L
M     M
N     N
I want to convert it into a multi-column hierarchy. The expected hierarchy will look like:
Level_1  Level_2  Level_3  Level_4
A        B        G        J
A        B        G        K
A        B        H
A        B        I
A        C
D        E
F        F
L        L
M        M
N        N
Is there a built-in way in pandas to achieve this? I know I can use recursion; is there any other, simpler way?
You can easily get what you expect using networkx
# Python env: pip install networkx
# Anaconda env: conda install networkx
import networkx as nx
import pandas as pd

df = pd.DataFrame({'From': ['A', 'A', 'D', 'F', 'B', 'B', 'B', 'G', 'G', 'L', 'M', 'N'],
                   'To': ['B', 'C', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N']})

G = nx.from_pandas_edgelist(df, source='From', target='To', create_using=nx.DiGraph)

roots = [v for v, d in G.in_degree() if d == 0]
leaves = [v for v, d in G.out_degree() if d == 0]

all_paths = []
for root in roots:
    for leaf in leaves:
        paths = nx.all_simple_paths(G, root, leaf)
        all_paths.extend(paths)

# all_simple_paths yields no trivial (single-node) paths, so self-loop
# nodes such as F, L, M, N are added separately:
for node in nx.nodes_with_selfloops(G):
    all_paths.append([node, node])
Output:
>>> pd.DataFrame(sorted(all_paths)).add_prefix('Level_').fillna('')
Level_0 Level_1 Level_2 Level_3
0 A B G J
1 A B G K
2 A B H
3 A B I
4 A C
5 D E
6 F F
7 L L
8 M M
9 N N
Documentation: networkx.algorithms.simple_paths.all_simple_paths
Solution without networkx:
def path(df, parent, cur_path=None):
    if cur_path is None:
        cur_path = []
    x = df[df.From.eq(parent)]
    if len(x) == 0:
        yield cur_path
        return
    elif len(x) == 1:
        yield cur_path + x["To"].to_list()
        return
    for _, row in x.iterrows():
        yield from path(df, row["To"], cur_path + [row["To"]])

def is_sublist(l1, l2):
    # checks if l1 is a sublist of l2
    if len(l1) > len(l2):
        return False
    for i in range(len(l2)):
        if l1 == l2[i : i + len(l1)]:
            return True
    return False

unique_paths = []
for v in df["From"].unique():
    for p in path(df, v, [v]):
        if not any(is_sublist(p, up) for up in unique_paths):
            unique_paths.append(p)

df = pd.DataFrame(
    [{f"level_{i}": v for i, v in enumerate(p, 1)} for p in unique_paths]
).fillna("")
print(df)
Prints:
level_1 level_2 level_3 level_4
0 A B G J
1 A B G K
2 A B H
3 A B I
4 A C
5 D E
6 F F
7 L L
8 M M
9 N N
Here's an alternative method that uses repeated pandas merges to get the result:
def create_multi_level(
    df: pd.DataFrame, from_col: str = "From", to_col: str = "To"
) -> pd.DataFrame:
    # Separate root nodes from non-root
    mask = df[from_col].isin(df[to_col].unique()) & (df[from_col] != df[to_col])
    root = df[~mask].copy()
    non_root = df[mask].copy()
    # Define starting columns
    root.columns = ["level_0", "level_1"]
    non_root.columns = ["level_1", "level_2"]
    i = 1
    while True:
        # merge
        root = root.merge(non_root, on=f"level_{i}", how="left")
        # Return if we're done
        non_missing = root[f"level_{i + 1}"].notna()
        if non_missing.mean() == 0:
            return root.drop(columns=f"level_{i + 1}")
        # Increment and rename columns
        i += 1
        non_root.columns = [f"level_{i}", f"level_{i + 1}"]
Output:
>>> create_multi_level(df)
level_0 level_1 level_2 level_3
0 A B G J
1 A B G K
2 A B H NaN
3 A B I NaN
4 A C NaN NaN
5 D E NaN NaN
6 F F NaN NaN
7 L L NaN NaN
8 M M NaN NaN
9 N N NaN NaN
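If empty strings are preferred over NaN, as in the earlier answers, an optional cleanup step can be chained on (a small sketch):
create_multi_level(df).fillna('')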

string operation on pandas df

I have a pandas df with 11 columns. I need to modify the first 3 columns using regex and add a new column built from these modified columns, then use it for downstream concatenation. I need to keep the elements of these columns as-is and combine them into a unique string:
column1 column2 column3 column4 ... column11
What I need is new_col = column1:column2-column3(column4),
and to make this the new column:
column1 column2 column3 newcol column4 ... column11
I could do this using a simple Python one-liner, but I'm not sure what the syntax is for pandas:
l = cols[0] + ":" + cols[1] + "-" + cols[2] + "(" + cols[5] + ")"
You should just be able to do it with the same syntax you posted as long as all of the columns contain strings.
You can also use the Series.str.cat method.
df['new_col'] = cols[0].str.cat(':' + cols[1] + '-' + cols[2] + '(' + cols[5] + ')')
Consider the dataframe df:
from string import ascii_uppercase
a = np.array([*ascii_uppercase])  # assumption: 'a' was undefined in the post; uppercase letters match the output below
np.random.seed([3,1415])
df = pd.DataFrame(np.random.choice(a, (5, 10))).add_prefix('col ')
print(df)
col 0 col 1 col 2 col 3 col 4 col 5 col 6 col 7 col 8 col 9
0 Q L C K P X N L N T
1 I X A W Y M W A C A
2 U Z H T N S M E D T
3 N W H X N U F D X F
4 Z L Y H M G E H W S
Construct a custom format function
f = lambda row: '{col 1}:{col 2}-{col 3}({col 4})'.format(**row)
And apply to df
df.astype(str).apply(f, 1)
0    L:C-K(P)
1    X:A-W(Y)
2    Z:H-T(N)
3    W:H-X(N)
4    L:Y-H(M)
dtype: object
Add a new column with assign
df.assign(New=df.astype(str).apply(f, 1))
# assign in place with
# df['New'] = df.astype(str).apply(f, 1)
col 0 col 1 col 2 col 3 col 4 col 5 col 6 col 7 col 8 col 9 New
0 Q L C K P X N L N T L:C-K(P)
1 I X A W Y M W A C A X:A-W(Y)
2 U Z H T N S M E D T Z:H-T(N)
3 N W H X N U F D X F W:H-X(N)
4 Z L Y H M G E H W S L:Y-H(M)
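The same format function can be written as an f-string on Python 3.6+ (a sketch, not part of the original answer):
f2 = lambda row: f"{row['col 1']}:{row['col 2']}-{row['col 3']}({row['col 4']})"
df.astype(str).apply(f2, 1)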
Or you can wrap this up into another function that operates on pd.Series. This requires that you pass the columns in the correct order.
def u(a, b, c, d):
    return a + ':' + b + '-' + c + '(' + d + ')'
df.assign(New=u(df['col 1'], df['col 2'], df['col 3'], df['col 4']))
# assign in place with
# df['New'] = u(df['col 1'], df['col 2'], df['col 3'], df['col 4'])
col 0 col 1 col 2 col 3 col 4 col 5 col 6 col 7 col 8 col 9 New
0 Q L C K P X N L N T L:C-K(P)
1 I X A W Y M W A C A X:A-W(Y)
2 U Z H T N S M E D T Z:H-T(N)
3 N W H X N U F D X F W:H-X(N)
4 Z L Y H M G E H W S L:Y-H(M)
Based on an answer that was recently deleted, this works fine:
df1 = pd.DataFrame({
    'chrom': ['a', 'b', 'c'],
    'start': ['d', 'e', 'f'],
    'end': ['g', 'h', 'i'],
    'strand': ['j', 'k', 'l']}
)
df1['unique_col'] = df1.chrom + ':' + df1.start + '-' + df1.end + '(' + df1.strand + ')'
It sounds like your original dataframe may not contain strings. If it contains numbers, you need something like this:
df1 = pd.DataFrame({
    'chrom': [1.0, 2.0],
    'start': [3.0, 4.0],
    'end': [5.0, 6.0],
    'strand': [7.0, 8.0]}
)
df1['unique_col'] = (
    df1.chrom.astype(str) + ':'
    + df1.start.astype(str) + '-' + df1.end.astype(str)
    + '(' + df1.strand.astype(str) + ')'
)
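With more columns involved, one option (a sketch; it assumes the same column names as the example above) is to cast once and build the string with a row-wise apply:
cols = ['chrom', 'start', 'end', 'strand']
s = df1[cols].astype(str)
df1['unique_col'] = s.apply(
    lambda r: f"{r['chrom']}:{r['start']}-{r['end']}({r['strand']})", axis=1)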

How to convert column with list of values into rows in Pandas DataFrame

Hi I have a dataframe like this:
A B
0: some value [[L1, L2]]
I want to change it into:
A B
0: some value L1
1: some value L2
How can I do that?
Pandas >= 0.25
df1 = pd.DataFrame({'A': ['a', 'b'],
                    'B': [[['1', '2']], [['3', '4', '5']]]})
print(df1)
A B
0 a [[1, 2]]
1 b [[3, 4, 5]]
# the values are doubly nested ([[...]]), so explode twice:
df1 = df1.explode('B')
df1.explode('B')
A B
0 a 1
0 a 2
1 b 3
1 b 4
1 b 5
I don't know how good this approach is but it works when you have a list of items.
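For comparison, if the values were flat lists (e.g. ['1', '2'] rather than [['1', '2']]), a single explode would be enough (a sketch):
df2 = pd.DataFrame({'A': ['a', 'b'], 'B': [['1', '2'], ['3', '4', '5']]})
df2.explode('B')  # one explode suffices for un-nested lists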
you can do it this way:
In [84]: df
Out[84]:
A B
0 some value [[L1, L2]]
1 another value [[L3, L4, L5]]
In [85]: (df['B'].apply(lambda x: pd.Series(x[0]))
....: .stack()
....: .reset_index(level=1, drop=True)
....: .to_frame('B')
....: .join(df[['A']], how='left')
....: )
Out[85]:
B A
0 L1 some value
0 L2 some value
1 L3 another value
1 L4 another value
1 L5 another value
UPDATE: a more generic solution
Faster solution with chain.from_iterable and numpy.repeat:
from itertools import chain
import numpy as np
import pandas as pd
df = pd.DataFrame({'A': ['a', 'b'],
                   'B': [[['A1', 'A2']], [['A1', 'A2', 'A3']]]})
print (df)
A B
0 a [[A1, A2]]
1 b [[A1, A2, A3]]
df1 = pd.DataFrame({ "A": np.repeat(df.A.values,
[len(x) for x in (chain.from_iterable(df.B))]),
"B": list(chain.from_iterable(chain.from_iterable(df.B)))})
print (df1)
A B
0 a A1
1 a A2
2 b A1
3 b A2
4 b A3
Timings:
import string
import random

A = np.unique(np.random.randint(0, 1000, 1000))
B = [[list(string.ascii_letters[:random.randint(3, 10)])] for _ in range(len(A))]
df = pd.DataFrame({"A":A, "B":B})
print (df)
A B
0 0 [[a, b, c, d, e, f, g, h]]
1 1 [[a, b, c]]
2 3 [[a, b, c, d, e, f, g, h, i]]
3 5 [[a, b, c, d, e]]
4 6 [[a, b, c, d, e, f, g, h, i]]
5 7 [[a, b, c, d, e, f, g]]
6 8 [[a, b, c, d, e, f]]
7 10 [[a, b, c, d, e, f]]
8 11 [[a, b, c, d, e, f, g]]
9 12 [[a, b, c, d, e, f, g, h, i]]
10 13 [[a, b, c, d, e, f, g, h]]
...
...
In [67]: %timeit pd.DataFrame({ "A": np.repeat(df.A.values, [len(x) for x in (chain.from_iterable(df.B))]),"B": list(chain.from_iterable(chain.from_iterable(df.B)))})
1000 loops, best of 3: 818 µs per loop
In [68]: %timeit ((df['B'].apply(lambda x: pd.Series(x[0])).stack().reset_index(level=1, drop=True).to_frame('B').join(df[['A']], how='left')))
10 loops, best of 3: 103 ms per loop
I can't find an elegant way to handle this, but the following code works...
import pandas as pd
import numpy as np

df = pd.DataFrame([{"a": 1, "b": [[1, 2]]}, {"a": 4, "b": [[3, 4, 5]]}])
z = []
for k, row in df.iterrows():
    for j in list(np.array(row.b).flat):
        z.append({'a': row.a, 'b': j})
result = pd.DataFrame(z)
I think this is the fastest and simplest way:
df = pd.DataFrame({'A': ['a', 'b'],
                   'B': [[['A1', 'A2']], [['A1', 'A2', 'A3']]]})
df.set_index('A')['B'].apply(lambda x: pd.Series(x[0]))
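Note that on this sample frame the result is wide (one column per list position, indexed by A), so a stack would still be needed to reach the long format asked for:
    0   1    2
A
a  A1  A2  NaN
b  A1  A2   A3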
Here's another option
unpacked = (pd.melt(df.B.apply(pd.Series).reset_index(), id_vars='index')
            .merge(df, left_on='index', right_index=True))
unpacked = (unpacked.loc[unpacked.value.notnull(), :]
            .drop(columns=['index', 'variable', 'B'])
            .rename(columns={'value': 'B'}))
Apply pd.Series to column B --> splits each list entry into a separate column
Melt this, so that each entry is a separate row (preserving the index)
Merge this back on the original dataframe
Tidy up - drop unnecessary columns and rename the values column

Optimizing pandas filter inside apply function

I have a list of pairs--stored in a DataFrame--each pair having an 'a' column and a 'b' column. For each pair I want to return the 'b's that have the same 'a'. For example, given the following set of pairs:
a b
0 c d
1 e f
2 c g
3 e h
4 i j
5 e k
I would like to end up with:
a b equivalents
0 c d [g]
1 e f [h, k]
2 c g [d]
3 e h [f, k]
4 i j []
5 e k [f, h]
I can do this with the following:
def equivalents(x):
    l = pairs[pairs["a"] == x["a"]]["b"].tolist()
    return l[1:] if l else l

pairs["equivalents"] = pairs.apply(equivalents, axis=1)
But it is painfully slow on larger sets (e.g. 1 million plus pairs). Any suggestions how I could do this faster?
I think this ought to be a bit faster. First, just add them up.
df['equiv'] = df.groupby('a')['b'].transform(sum)
a b equiv
0 c d dg
1 e f fhk
2 c g dg
3 e h fhk
4 i j j
5 e k fhk
Now convert to a list and remove whichever letter is already in column 'b'.
df.apply( lambda x: [ y for y in list( x.equiv ) if y != x.b ], axis=1 )
0 [g]
1 [h, k]
2 [d]
3 [f, k]
4 []
5 [f, h]
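If the row-wise apply is still slow at scale, a hedged alternative (a sketch, not from the original answer) is to build the per-key lists once with groupby and filter each row against its own group; like the answer above, it removes every occurrence of the row's own 'b' value:
groups = df.groupby('a')['b'].apply(list)
df['equivalents'] = [
    [y for y in groups[a] if y != b]
    for a, b in zip(df['a'], df['b'])
]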

how to select multiple values from a single level of a dataframe multiindex

If I have the following:
df = pd.DataFrame(np.random.random((4,8)))
tupleList = list(zip([x for x in 'abcdefgh'], [y for y in 'iijjkkll']))
ind = pd.MultiIndex.from_tuples(tupleList)
df.columns = ind
In [71]: df
Out[71]:
a b c d e f g \
i i j j k k l
0 0.968112 0.809183 0.144320 0.518120 0.820079 0.648237 0.971552
1 0.959022 0.721705 0.139588 0.408940 0.230956 0.907192 0.467016
2 0.335085 0.537437 0.725119 0.486447 0.114048 0.150150 0.894322
3 0.051249 0.186547 0.779814 0.905914 0.024298 0.002489 0.339714
h
l
0 0.438330
1 0.225447
2 0.331413
3 0.530789
[4 rows x 8 columns]
what is the easiest way to select the columns that have a second level label of "j" or "k"?
c d e f
j j k k
0 0.948030 0.243993 0.627497 0.729024
1 0.087703 0.874968 0.581875 0.996466
2 0.802155 0.213450 0.375096 0.184569
3 0.164278 0.646088 0.201323 0.022498
I can do this:
df.loc[:, df.columns.get_level_values(1).isin(['j', 'k'])]
But that seems pretty verbose for something that feels like it should be simple. Any better approaches?
See the documentation on multiindex slicers, introduced in pandas 0.14.0:
In [36]: idx = pd.IndexSlice
In [37]: df.loc[:, idx[:, ['j', 'k']]]
Out[37]:
c d e f
j j k k
0 0.750582 0.877763 0.262696 0.226005
1 0.025902 0.967179 0.125647 0.297304
2 0.463544 0.104973 0.154113 0.284820
3 0.631695 0.841023 0.820907 0.938378
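The same selection can also be written with a plain tuple of slices, without the IndexSlice helper (equivalent; a matter of taste):
df.loc[:, (slice(None), ['j', 'k'])]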
