I have a dataframe like this:
import pandas as pd
df = pd.DataFrame({"c0": list('ABC'),
"c1": [" ".join(list('ab')), " ".join(list('def')), " ".join(list('s'))],
"c2": list('DEF')})
  c0     c1 c2
0  A    a b  D
1  B  d e f  E
2  C      s  F
I want to create a pivot table that looks like this:
      c2
c0 c1
A  a   D
   b   D
B  d   E
   e   E
   f   E
C  s   F
So, the entries in c1 are split and each element becomes its own row in the c1 level of a MultiIndex.
I do this as follows:
newdf = pd.DataFrame()
for indi, rowi in df.iterrows():
    # get all single elements in the string
    n_elements = rowi['c1'].split()
    # only one element, so we can just add the entire row
    if len(n_elements) == 1:
        newdf = newdf.append(rowi)
    # more than one element
    else:
        for eli in n_elements:
            # this allows adding new elements using loc; without it we would have identical index values
            if not newdf.empty:
                newdf = newdf.reset_index(drop=True)
                newdf.index = -1 * newdf.index - 1
            # add the entire row
            newdf = newdf.append(rowi)
            # replace the entire string by the single element
            newdf.loc[indi, 'c1'] = eli
print(newdf.reset_index(drop=True))
which yields
  c0 c1 c2
0  A  a  D
1  A  b  D
2  B  d  E
3  B  e  E
4  B  f  E
5  C  s  F
Then I can just call
pd.pivot_table(newdf, index=['c0', 'c1'], aggfunc=lambda x: ' '.join(set(str(v) for v in x)))
which gives me the desired output (see above).
For huge dataframes this can be quite slow, so I am wondering whether there is a more efficient way of doing this.
Option 1
import numpy as np, pandas as pd
s = df.c1.str.split()   # list of elements per row
l = s.str.len()         # how many times to repeat each row
newdf = df.loc[df.index.repeat(l)].assign(c1=np.concatenate(s)).set_index(['c0', 'c1'])
newdf
      c2
c0 c1
A  a   D
   b   D
B  d   E
   e   E
   f   E
C  s   F
Option 2
This should be faster:
import numpy as np, pandas as pd
# split each string into an array of elements
s = np.core.defchararray.split(df.c1.values.astype(str), ' ')
# element count per row
l = [len(x) for x in s.tolist()]
# row positions, repeated once per element
r = np.arange(len(s)).repeat(l)
i = pd.MultiIndex.from_arrays([
    df.c0.values[r],
    np.concatenate(s)
], names=['c0', 'c1'])
newdf = pd.DataFrame({'c2': df.c2.values[r]}, i)
newdf
      c2
c0 c1
A  a   D
   b   D
B  d   E
   e   E
   f   E
C  s   F
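On pandas 0.25 or newer, the same repeat-and-flatten idea can be written with DataFrame.explode; a minimal sketch, assuming the df from the question:
newdf = (df.assign(c1=df.c1.str.split())  # replace the c1 strings with lists of elements
           .explode('c1')                 # one row per list element
           .set_index(['c0', 'c1']))
newdf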
This is how I get the result; in R this operation is called unnest.
df.c1 = df.c1.str.split(' ')
df.set_index(['c0', 'c2'])['c1'].apply(pd.Series).stack()\
  .reset_index().drop('level_2', axis=1)\
  .rename(columns={0: 'c1'}).set_index(['c0', 'c1'])
Out[208]:
      c2
c0 c1
A  a   D
   b   D
B  d   E
   e   E
   f   E
C  s   F
Related
I have a pandas dataframe like this:
Id  A  B  C  D
1   a  b  c  d
2   a  b     d
2   a     c  d
3   a        d
3   a  b  c
I want to fill the empty values in columns B, C and D using the values contained in the other rows with the same Id.
The resulting data frame should be the following:
Id  A  B  C  D
1   a  b  c  d
2   a  b  c  d
3   a  b  c  d
It is also possible to have different values in the first column (A) for the same Id. In that case, instead of keeping the first instance, I prefer to put another value indicating this event.
For example:
Id  A  B  C  D
1   a  b  c  d
2   a  b     d
2   x     c  d
It becomes:
Id  A  B  C  D
1   a  b  c  d
2   f  b  c  d
IIUC, you can use groupby with agg:
>>> df.groupby('Id')
.agg({'A': lambda x: x.iloc[0] if len(x.unique()) == 1 else 'f',
'B': 'first', 'C': 'first', 'D': 'first'})
A B C D
Id
1 a b c d
2 f b c d
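If there are many columns, the agg dictionary does not have to be written out by hand; a sketch under the same assumptions ('first' for every column except A, 'f' as the conflict marker):
# build the aggregation spec programmatically
# note: 'first' skips NaN but not empty strings
agg_spec = {c: 'first' for c in df.columns if c not in ('Id', 'A')}
agg_spec['A'] = lambda x: x.iloc[0] if x.nunique() == 1 else 'f'
result = df.groupby('Id').agg(agg_spec)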
The best way I can think of to do this is to iterate through each unique Id, slice its rows out of the original dataframe, and construct a new row by merging the relevant rows:
def aggregate(df):
    ids = df['Id'].unique()
    rows = []
    for id in ids:
        # slice out all rows that belong to this Id
        relevant = df[df['Id'] == id]
        newrow = {c: "" for c in df.columns}
        for _, row in relevant.iterrows():
            for col in newrow:
                if row[col]:
                    # skip a value we have just taken for this column
                    if len(newrow[col]):
                        if newrow[col][-1] == row[col]:
                            continue
                    newrow[col] += row[col]
        rows.append(newrow)
    return pd.DataFrame(rows)
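One caveat: the truthiness test row[col] treats NaN as truthy (bool(float('nan')) is True), so this sketch assumes the blanks are empty strings; if they are NaN, normalizing them first is one option:
# assumption: the real data may hold NaN instead of "" for missing cells
df = df.fillna("")
result = aggregate(df)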
I am trying to apply a function to the dataframe while checking for NULL values in each row of a specific column.
I have created the function, but I am not sure how to apply it only to the rows that have values.
Input:
   A  B  C  D  E  F
0  f  e  b  a  d  a
1  c  b  a  c  b
2  f  f  a  b  c  c
3  d  c  c  d  c  d
4  f  b  b  b  e  b
5  b  a  f  c  d  a
Expected Output
   A  B  C  D  E  F  MATCHES           Comments
0  f  e  b  a  d  a  AD, BC Unmatched
1  c  b  a  c  b     BC Unmatched      F is having blank values
2  f  f  a  b  c  c  AD, BC Unmatched
3  d  c  c  d  c  d  ALL MATCHED
4  f  b  b  b  e  b  AD Unmatched
5  b  a  f  c  d  a  AD, BC Unmatched
The script works when we don't have to check for NaN values in the df['F'] column, but when we check for the empty rows in df['F'], it gives an error.
Code I have been trying:
def test(x):
    try:
        for idx in df.index:
            unmatch_list = []
            if not df.loc[idx, 'A'] == df.loc[idx, 'D']:
                unmatch_list.append('AD')
            if not df.loc[idx, 'B'] == df.loc[idx, 'C']:
                unmatch_list.append('BC')
            # etcetera...
            if len(unmatch_list):
                unmatch_string = ', '.join(unmatch_list) + ' Unmatched'
            else:
                unmatch_string = 'ALL MATCHED'
            df.loc[idx, 'MATCHES'] = unmatch_string
    except ValueError:
        print('error')
    return df

## df = df.apply(lambda x: test(x) if(pd.notna(df['F'])) else x)
for row in df:
    if row['F'].isna() == True:
        row['Comments'] = "F is having blank values"
    else:
        df = test(df)
Please suggest how I can use the function.
You could try something like this:
# get combis
df1 = df.copy().reset_index().melt(id_vars=['index'])
df1 = df1.merge(df1, on=['index', 'value'], how='inner')
df1 = df1[df1['variable_x'] != df1['variable_y']]
df1['combis'] = df1['variable_x'] + ':' + df1['variable_y']
df1 = df1.groupby(['index'])['combis'].apply(list)
# get empty rows
df2 = df.copy().reset_index().melt(id_vars=['index'])
df2 = df2[df2['value'].isna()]
df2 = df2.groupby(['index'])['variable'].apply(list)
# combine
df.join(df1).join(df2)
# A B C ... F combis variable
# 0 f e b ... a [D:F, F:D] NaN
# 1 c b a ... None [A:D, D:A, B:E, E:B] [F]
# 2 f f a ... c [A:B, B:A, E:F, F:E] NaN
# 3 d c c ... d [A:D, A:F, D:A, D:F, F:A, F:D, B:C, B:E, C:B, ... NaN
# 4 f b b ... b [B:C, B:D, B:F, C:B, C:D, C:F, D:B, D:C, D:F, ... NaN
# 5 b a f ... a [B:F, F:B] NaN
# [6 rows x 8 columns]
If you are only interested in the unmatched combinations you can use this:
import itertools
combis = [x+':'+y for x,y in itertools.permutations(df.columns, 2)]
df.join(df1).join(df2)['combis'].map(lambda lst: list(set(combis) - set(lst)))
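To also get the Comments text from the expected output, the lists in df2 can be mapped to strings; a sketch, assuming the joined frame from above (the wording of the comment is taken from the question):
out = df.join(df1).join(df2)
# rows without blanks have NaN in 'variable', so guard with isinstance
out['Comments'] = out['variable'].map(
    lambda v: ', '.join(v) + ' is having blank values' if isinstance(v, list) else '')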
I have a pandas dataframe that looks like:
c1 c2 c3 c4 result
a b c d 1
b c d a 1
a e d b 1
g a f c 1
but I want to randomly select 50% of the rows, swap their column order, and flip the result column from 1 to 0 (as shown below):
c1 c2 c3 c4 result
a b c d 1
d a b c 0 (we swapped c3 and c4 with c1 and c2)
a e d b 1
f c g a 0 (we swapped c3 and c4 with c1 and c2)
What's the idiomatic way to accomplish this?
You had the general idea. Shuffle the DataFrame and split it in half. Then modify one half and join back.
import numpy as np
np.random.seed(410112)
dfs = np.array_split(df.sample(frac=1), 2) # Shuffle then split in 1/2
# On one half set result to 0 and swap the columns
dfs[1]['result'] = 0
dfs[1] = dfs[1].rename(columns={'c1': 'c2', 'c2': 'c1', 'c3': 'c4', 'c4': 'c3'})
# Join Back
df = pd.concat(dfs).sort_index()
c1 c2 c3 c4 result
0 a b c d 1
1 c b a d 0
2 e a b d 0
3 g a f c 1
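An alternative that avoids splitting and re-concatenating the frame is to draw a random boolean mask and reassign in place; a minimal sketch performing the same c1/c2 and c3/c4 swap as above:
import numpy as np
rng = np.random.default_rng(410112)
mask = rng.random(len(df)) < 0.5  # roughly 50% of the rows
# .to_numpy() drops the column labels so the swapped values are not realigned
df.loc[mask, ['c1', 'c2', 'c3', 'c4']] = df.loc[mask, ['c2', 'c1', 'c4', 'c3']].to_numpy()
df.loc[mask, 'result'] = 0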
What is the fastest way to find duplicates where the value from Column A has been swapped with the value from Column B?
For example, if I have a DataFrame with:
Column A Column B
0 C P
1 D C
2 L G
3 A D
4 B P
5 B G
6 P B
7 J T
8 P C
9 J T
The result will be:
Column A Column B
0 C P
8 P C
4 B P
6 P B
I tried :
df1 = df
df2 = df
for i in df2.index:
    res = df1[(df1['Column A'] == df2['Column A'][i]) & (df1['Column B'] == df2['Column B'][i])]
But this is very slow and it iterates over the same values...
Use merge with a DataFrame whose column names have been swapped:
d = {'Column A':'Column B','Column B':'Column A'}
df = df.merge(df.rename(columns=d))
print (df)
Column A Column B
0 C P
1 B P
2 P B
3 P C
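If you want to keep the original index (the merge above resets it), a set-based membership test is another option; a sketch:
# all (A, B) pairs in the frame
pairs = set(zip(df['Column A'], df['Column B']))
# keep rows whose reversed pair also occurs; same-order duplicates like (J, T) stay excluded
mask = [(b, a) in pairs for a, b in zip(df['Column A'], df['Column B'])]
out = df[mask]
# note: a row with Column A == Column B would match itself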
You could try using reindex to invert the column order.
column_Names=["B","A"]
df=df.reindex(columns=column_Names)
Or you could just do this:
col_list = list(df) # get a list of the columns
col_list[0], col_list[1] = col_list[1], col_list[0]
df.columns = col_list # assign back
I have a dataframe which has a single column like this:
a;d;c;d;e;r;w;e;o
--------------------
0 h;j;r;d;w;f;g;t;r
1 a;f;c;x;d;e;r;t;y
2 b;h;g;t;t;t;y;u;f
3 g;t;u;n;b;v;d;s;e
When I split it I am getting like this:
0 1 2 3 4 5 6 7 8
------------------------------
0 h j r d w f g t r
1 a f c x d e r t y
2 b h g t t t y u f
3 g t u n b v d s e
I need to assign a d c d e r w e o instead of 0 1 2 3 4 5 6 7 8 as column names.
I tried :
df = dataframe
df = df.iloc[:,0].str.split(';')
res = pd.DataFrame(df.columns.tolist())
res = pd.DataFrame(df.values.tolist())
I am getting values assigned to each column, but not the column headers. What should I do?
I think you need to create a new DataFrame with the expand=True parameter and then assign the new column names:
res = df.iloc[:,0].str.split(';', expand=True)
res.columns = df.columns[0].split(';')
print (res)
a d c d e r w e o
0 h j r d w f g t r
1 a f c x d e r t y
2 b h g t t t y u f
3 g t u n b v d s e
But maybe you need sep=';' in read_csv if there is only one column of data:
res = pd.read_csv(file, sep=';')
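One caveat with the read_csv route: the header here contains duplicate names (d and e each appear twice), and read_csv mangles those to d.1 and e.1 by default, whereas assigning res.columns directly keeps the duplicates; a quick check:
from io import StringIO
data = "a;d;c;d;e;r;w;e;o\nh;j;r;d;w;f;g;t;r"
print(pd.read_csv(StringIO(data), sep=';').columns.tolist())
# ['a', 'd', 'c', 'd.1', 'e', 'r', 'w', 'e.1', 'o']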