I have a Python dictionary:
adict = {
'col1': [
{'id': 1, 'tag': '#one#two'},
{'id': 2, 'tag': '#two#'},
{'id': 1, 'tag': '#one#three#'}
]
}
I want the result as follows:
Id tag
1 one,two,three
2 two
Could someone please tell me how to do this?
Try this
import pandas as pd

d = {'col1': [{'id': 1, 'tag': '#one#two'}, {'id': 2, 'tag': '#two#'}, {'id': 1, 'tag': '#one#three#'}]}

# Build the frame once from every record in the dict instead of concatenating
# one-row frames inside a loop (each concat re-copies all rows gathered so far).
df = pd.DataFrame([record for records in d.values() for record in records])
df["tag"] = df["tag"].str.replace("#", ",", regex=False)


def merge_tags(tags):
    """Merge comma-separated tag strings into one de-duplicated string.

    Each string is split on its own and empty pieces (produced by leading or
    trailing separators) are skipped, so no empty tag can leak into the
    result. ``dict.fromkeys`` removes duplicates while keeping the order in
    which tags were first seen, which makes the output deterministic.
    """
    pieces = (piece for tag in tags for piece in tag.split(",") if piece)
    return ",".join(dict.fromkeys(pieces))


tf = df.groupby(["id"])["tag"].apply(merge_tags)
Here is a simple code
import pandas as pd

d = {'col1': [{'id': 1, 'tag': '#one#two'}, {'id': 2, 'tag': '#two#'}, {'id': 1, 'tag': '#one#three#'}]}
df = pd.DataFrame(d)
df['Id'] = df.col1.apply(lambda x: x['id'])
# Strip '#' from both ends before turning the remaining separators into
# commas; slicing off only the first character left a trailing comma for
# tags such as '#two#'.
df['tag'] = df.col1.apply(lambda x: x['tag'].strip('#').replace('#', ','))
df.drop(columns='col1', inplace=True)
Output:
Id Tag
1 one, two
2 two
1 one, three
If the order of tags is important, first strip the leading and trailing # and split by #; then, within each group, remove duplicates and join:
df = pd.DataFrame(d['col1'])
df['tag'] = df['tag'].str.strip('#').str.split('#')
f = lambda x: ','.join(dict.fromkeys([z for y in x for z in y]).keys())
df = df.groupby('id')['tag'].apply(f).reset_index()
print (df)
id tag
0 1 one,two,three
1 2 two
If the order of tags is not important, use sets to remove the duplicates:
df = pd.DataFrame(d['col1'])
df['tag'] = df['tag'].str.strip('#').str.split('#')
f = lambda x: ','.join(set([z for y in x for z in y]))
df = df.groupby('id')['tag'].apply(f).reset_index()
print (df)
id tag
0 1 three,one,two
1 2 two
I tried as below
import pandas as pd

a = {'col1': [{'id': 1, 'tag': '#one#two'}, {'id': 2, 'tag': '#two#'}, {'id': 1, 'tag': '#one#three#'}]}
df = pd.DataFrame(a)

# Expand the record dicts and pick the fields by name. Assigning the expanded
# frame to df[["col1", "col2"]] matches columns by position, which would put
# the integer ids into col1 and break the .str call below.
records = pd.DataFrame(df.col1.values.tolist(), index=df.index)
df["col1"] = records["tag"].str.replace('#', ',', regex=False)
df["col2"] = records["id"]

# Split every tag string on its own and drop empty pieces so that adjacent
# separators cannot produce an empty tag; dict.fromkeys de-duplicates while
# keeping first-seen order, so the result is deterministic.
df = df.groupby(["col2"])["col1"].apply(
    lambda tags: ",".join(
        dict.fromkeys(piece for tag in tags for piece in tag.split(",") if piece)
    )
)
O/P:
col2
1 one,two,three
2 two
dic = [{'col1': [{'id': 1, 'tag': '#one#two'}, {'id': 2, 'tag': '#two#'}, {'id': 1, 'tag': '#one#three#'}]}]

# Gather every record stored under 'col1' across all entries of the list,
# then build the frame from the flat list of records in one call.
row = [record for entry in dic for record in entry['col1']]
df = pd.DataFrame(row)
print(df)
o
Related
In the column 'id', I would like to split the string into every 6 digits and add a comma.
df = pd.pivot_table(df, values=['id'], index=['eu_crm'], aggfunc='sum')
df.loc[df[:, 1] for i in range(0, len(['id'], 6)
Code:
import pandas as pd

data = {'Column 1': ['a', 'b', 'c'],
        'id': [2468938493843983, 345642232, 23343433]}
df = pd.DataFrame(data)
df['id'] = df['id'].astype(str)


def chunk_from_left(text, size=6):
    """Insert a comma after every ``size`` characters, counting from the left.

    The final chunk may be shorter than ``size``; an empty string stays empty.
    """
    return ','.join(text[start:start + size] for start in range(0, len(text), size))


# Map over the values directly instead of indexing df['id'][i] by position:
# that lookup is label-based and only works with the default 0..n-1 index.
df['fromleft'] = df['id'].map(chunk_from_left)
print(df)
Output:
Column 1 id fromleft
0 a 2468938493843983 246893,849384,3983
1 b 345642232 345642,232
2 c 23343433 233434,33
Assuming you want to split from the left:
df['id'] = df['id'].astype(str).str.replace(r'(.{6})(?=.)', r'\1,', regex=True)
Output:
id
0 280530,284442,284690
I have a df
index col1
0 a,c
1 d,f
2 o,k
I need a df like this
index col1
0 {"col1":"a,c"}
1 {"col1":"d,f"}
2 {"col1":"o,k"}
This needs to be applied for all columns in the df.
Tried with orient, but not as expected.
For all columns, use a double apply: the column name is available as x.name, and each value is wrapped in a dictionary:
df = df.apply(lambda x: x.apply(lambda y: {x.name: y}))
For json use:
import json
df = df.apply(lambda x: x.apply(lambda y: json.dumps({x.name: y})))
print (df)
col1
0 {"col1": "a,c"}
1 {"col1": "d,f"}
2 {"col1": "o,k"}
Alternative solution for dictionaries:
df = pd.DataFrame({c: [{c: x} for x in df[c]] for c in df.columns}, index=df.index)
Alternative 2 for json (works well if all columns contain strings):
df = '{"' + df.columns + '": "' + df.astype(str) + '"}'
If you want strings exactly as shown, use:
df['col1'] = '{col1:'+df['col1']+'}'
# or
c = 'col1'
df[c] = f'{{{c}:'+df[c]+'}'
output:
0 {col1:a,c}
1 {col1:d,f}
2 {col1:o,k}
Name: col1, dtype: object
or, with quotes:
df['col1'] = '{"col1":"'+df['col1']+'"}'
# or
c = 'col1'
df[c] = f'{{"{c}":"'+df[c]+'"}'
output:
index col1
0 0 {"col1":"a,c"}
1 1 {"col1":"d,f"}
2 2 {"col1":"o,k"}
for all columns:
df = df.apply(lambda c: f'{{"{c.name}":"'+c.astype(str)+'"}')
NB. ensure "index" is the index
for dictionaries:
df['col1'] = [{'col1': x} for x in df['col1']]
output:
index col1
0 0 {'col1': 'a,c'}
1 1 {'col1': 'd,f'}
2 2 {'col1': 'o,k'}
I have a dataframe with column a. I need to get data after second _.
a
0 abc_def12_0520_123
1 def_ghij123_0120_456
raw_data = {'a': ['abc_def12_0520_123', 'def_ghij123_0120_456']}
df = pd.DataFrame(raw_data, columns = ['a'])
Output:
a b
0 abc_def12_0520_123 0520_123
1 def_ghij123_0120_456 0120_456
What I have tried:
df['b'] = df.number.str.replace('\D+', '')
I tried removing the letters first, but it's getting complex. Any suggestions?
Here is how:
df['b'] = ['_'.join(s.split('_')[2:]) for s in df['a']]
print(df)
Output:
a b
0 abc_def12_0520_123 0520_123
1 def_ghij123_0120_456 0120_456
Explanation:
lst = ['_'.join(s.split('_')[2:]) for s in df['a']]
is the equivalent of:
lst = []
for s in df['a']:
    # Split on '_' and keep every piece except the first two
    a = s.split('_')[2:]
    # Re-join the remaining pieces so inner underscores are preserved
    lst.append('_'.join(a))
Try:
# Split at most twice and keep the last piece (everything after the second
# '_'). The split limit is passed by keyword: pandas 2.x accepts only `pat`
# positionally for str.split.
df['b'] = df['a'].str.split('_', n=2).str[-1]
a b
0 abc_def12_0520_123 0520_123
1 def_ghij123_0120_456 0120_456
I have a dataframe which has a dictionary inside a nested list, and I want to split the column 'C':
A B C
1 a [ {"id":2,"Col":{"x":3,"y":4}}]
2 b [ {"id":5,"Col":{"x":6,"y":7}}]
expected output :
A B C_id Col_x Col_y
1 a 2 3 4
2 b 5 6 7
From the comments, json_normalize might help you.
After extracting id and col columns with:
df[["Col", "id"]] = df["C"].apply(lambda x: pd.Series(x[0]))
You can explode the dictionary in Col with json_normalize and use concat to merge with existing dataframe:
df = pd.concat([df, json_normalize(df.Col)], axis=1)
Also, use drop to remove old columns.
Full code:
# Import modules
import pandas as pd
# json_normalize lives in the top-level pandas namespace since pandas 1.0;
# importing it from pandas.io.json is deprecated.
from pandas import json_normalize
# from flatten_json import flatten

# Create dataframe
df = pd.DataFrame([[1, "a", [{"id": 2, "Col": {"x": 3, "y": 4}}]],
                   [2, "b", [{"id": 5, "Col": {"x": 6, "y": 7}}]]],
                  columns=["A", "B", "C"])

# Add "id" and "Col" columns (taken from the single dict in each "C" list)
# and remove the old "C" column
df = pd.concat(
    [df, df["C"].apply(lambda x: pd.Series(x[0]))],
    axis=1,
).drop("C", axis=1)
print(df)
# (new columns follow the key order of the dicts)
#    A  B  id               Col
# 0  1  a   2  {'x': 3, 'y': 4}
# 1  2  b   5  {'x': 6, 'y': 7}

# Show json_normalize behavior
print(json_normalize(df.Col))
#    x  y
# 0  3  4
# 1  6  7

# Explode the dict in the "Col" column + remove the "Col" column
df = pd.concat(
    [df, json_normalize(df.Col)],
    axis=1,
).drop(["Col"], axis=1)
print(df)
#    A  B  id  x  y
# 0  1  a   2  3  4
# 1  2  b   5  6  7
You can try .apply method
df['C_id'] = df['C'].apply(lambda x: x[0]['id'])
df['C_x'] = df['C'].apply(lambda x: x[0]['Col']['x'])
df['C_y'] = df['C'].apply(lambda x: x[0]['Col']['y'])
Code
import pandas as pd
A = [1, 2]
B = ['a', 'b']
C = [{"id":2,"Col":{"x":3,"y":4}}, {"id":5,"Col":{"x":6,"y":7}}]
df = pd.DataFrame({"A": A, "B": B, "C_id": [element["id"] for element in C],
"Col_x": [element["Col"]["x"] for element in C],
"Col_y": [element["Col"]["y"] for element in C]})
Output:
This question already has answers here:
How to explode a list inside a Dataframe cell into separate rows
(12 answers)
Closed 3 years ago.
I have a dataframe in pandas like this:
id info
1 [1,2]
2 [3]
3 []
And I want to split it into different rows like this:
id info
1 1
1 2
2 3
3 NaN
How can I do this?
You can try this out:
>>> import pandas as pd
>>> df = pd.DataFrame({'id': [1,2,3], 'info': [[1,2],[3],[]]})
>>> s = df.apply(lambda x: pd.Series(x['info']), axis=1).stack().reset_index(level=1, drop=True)
>>> s.name = 'info'
>>> df2 = df.drop('info', axis=1).join(s)
>>> df2['info'] = pd.Series(df2['info'], dtype=object)
>>> df2
id info
0 1 1
0 1 2
1 2 3
2 3 NaN
A similar question is posted here.
This is a rather convoluted way, which drops empty cells:
import pandas as pd
df = pd.DataFrame({'id': [1,2,3],
'info': [[1,2], [3], [ ]]})
unstack_df = df.set_index(['id'])['info'].apply(pd.Series)\
.stack()\
.reset_index(level=1, drop=True)
unstack_df = unstack_df.reset_index()
unstack_df.columns = ['id', 'info']
unstack_df
>>
id info
0 1 1.0
1 1 2.0
2 2 3.0
Here's one way using np.repeat and itertools.chain. Converting empty lists to {np.nan} is a trick to fool Pandas into accepting an iterable as a value. This allows chain.from_iterable to work error-free.
import numpy as np
from itertools import chain
df.loc[~df['info'].apply(bool), 'info'] = {np.nan}
res = pd.DataFrame({'id': np.repeat(df['id'], df['info'].map(len).values),
'info': list(chain.from_iterable(df['info']))})
print(res)
id info
0 1 1.0
0 1 2.0
1 2 3.0
2 3 NaN
Try these methods too...
Method 1
def split_dataframe_rows(df, column_selectors):
    """Expand list-valued columns into one output row per list element.

    For every row of ``df`` the lists found in ``column_selectors`` are walked
    in lockstep: output row ``i`` carries element ``i`` of each selected list,
    and a list that is shorter than the longest one in that row is padded
    with ``''``. All other columns are copied unchanged into each output row.
    A row whose selected lists are all empty produces no output rows.

    Args:
        df: Source DataFrame. It is not modified; the lists stored in its
            cells are copied before being consumed.
        column_selectors: Names of the columns whose cells hold lists.

    Returns:
        A new DataFrame with the same columns, in the same order, as ``df``.
    """
    new_rows = []
    for _, row in df.iterrows():
        # Copy each list so the caller's cell contents are left intact
        # (popping from the originals would empty them in place).
        split_rows = {selector: list(row[selector]) for selector in column_selectors}
        max_split = max((len(values) for values in split_rows.values()), default=0)
        for position in range(max_split):
            new_row = row.to_dict()
            for selector, values in split_rows.items():
                # Pad exhausted lists with '' so every selected column is filled
                new_row[selector] = values[position] if position < len(values) else ''
            new_rows.append(new_row)
    # Passing columns= keeps the original column ordering
    new_df = pd.DataFrame(new_rows, columns=df.columns)
    return new_df
Method 2
def flatten_data(json=None):
    """Flatten records whose fields hold lists into a flat DataFrame.

    A column is treated as a list column when the value in its first row is a
    list. Each such column is expanded in turn with ``pd.json_normalize``:
    its elements become rows (columns prefixed with ``<column>_``) and the
    remaining columns are carried along as metadata. A final normalisation
    pass flattens any dicts that are left in the cells.

    Args:
        json: Anything ``pd.DataFrame`` accepts, typically a list of record
            dicts. The name shadows the stdlib ``json`` module inside this
            function; it is kept so keyword callers continue to work.

    Returns:
        The flattened DataFrame. Input with no rows yields an empty frame.
    """
    df = pd.DataFrame(json)
    if df.empty:
        # No first row to inspect; nothing can be expanded.
        return pd.json_normalize(df.to_dict('records'))

    first_row = df.iloc[0]
    list_cols = [col for col in df.columns if isinstance(first_row[col], list)]
    for i, col in enumerate(list_cols):
        if df.empty:
            # A previous expansion produced no rows, so there is nothing left
            # to inspect or expand.
            break
        first_row = df.iloc[0]
        # Carry along every non-list column plus the list columns that are
        # still waiting to be expanded.
        meta_cols = [
            name for name in df.columns if not isinstance(first_row[name], list)
        ] + list_cols[i + 1:]
        json_data = df.to_dict('records')
        df = pd.json_normalize(
            data=json_data,
            record_path=col,
            meta=meta_cols,
            record_prefix=col + '_',
            sep='_',
        )
    return pd.json_normalize(df.to_dict('records'))