Pandas dataframe to nested dictionary - python

Let's say my dataframe looks like this.
date app_id country val1 val2 val3 val4
2016-01-01 123 US 50 70 80 90
2016-01-02 123 US 60 80 90 100
2016-01-03 123 US 70 88 99 11
I want to dump this into a nested dictionary or even a JSON object as follows:
{
country:
{
app_id:
{
date: [val1, val2, val3, val4]
}
}
}
So that way if I called my_dict['US'][123]['2016-01-01'], I would get the list [50,70,80,90]
Is there an elegant way to go about doing this? I'm aware of Pandas's to_dict() function but I can't seem to get around nesting dictionaries.

1st create the dataframe you need. then using recur_dictify from DSM.
dd=df.groupby(['country','app_id','date'],as_index=False)['val1', 'val2', 'val3', 'val4'].apply(lambda x : x.values.tolist()[0]).to_frame()
def recur_dictify(frame):
    """Recursively convert a DataFrame into nested dicts.

    Each column (left to right) becomes one level of dict keys; the
    final column supplies the leaf value(s).
    """
    # Base case: one column left -> return the leaf value (scalar if
    # there is a single cell, otherwise a squeezed array of values).
    if len(frame.columns) == 1:
        vals = frame.values
        if vals.size == 1:
            return vals[0][0]
        return vals.squeeze()
    # Recursive case: group on the leftmost column and descend into
    # the remaining columns for each group.
    nested = {}
    for key, sub in frame.groupby(frame.columns[0]):
        nested[key] = recur_dictify(sub.iloc[:, 1:])
    return nested
recur_dictify(dd.reset_index())
Out[711]:
{'US': {123: {'2016-01-01': [50, 70, 80, 90],
'2016-01-02': [60, 80, 90, 100],
'2016-01-03': [70, 88, 99, 11]}}}

update
Actually this might work with a simple nested dictionary:
import pandas as pd
from collections import defaultdict
nested_dict = lambda: defaultdict(nested_dict)
output = nested_dict()
for lst in df.values:
output[lst[1]][lst[0]][lst[2]] = lst[3:].tolist()
Or:
output = defaultdict(dict)
for lst in df.values:
try:
output[lst[1]][lst[0]].update({lst[2]:lst[3:].tolist()})
except KeyError:
output[lst[1]][lst[0]] = {}
finally:
output[lst[1]][lst[0]].update({lst[2]:lst[3:].tolist()})
Or:
output = defaultdict(dict)
for lst in df.values:
if output.get(lst[1], {}).get(lst[0]) == None:
output[lst[1]][lst[0]] = {}
output[lst[1]][lst[0]].update({lst[2]:lst[3:].tolist()})
output
Here is my old solution: we make use of df.groupby to group the dataframe by country and app_id. From here we collect the data (excluding country and app_id) and use defaultdict(dict) to add data to the output dictionary in a nested way.
import pandas as pd
from collections import defaultdict
output = defaultdict(dict)
groups = ["country","app_id"]
cols = [i for i in df.columns if i not in groups]
for i,subdf in df.groupby(groups):
data = subdf[cols].set_index('date').to_dict("split") #filter away unwanted cols
d = dict(zip(data['index'],data['data']))
output[i[0]][i[1]] = d # assign country=level1, app_id=level2
output
return:
{'FR': {123: {'2016-01-01': [10, 20, 30, 40]}},
'US': {123: {'2016-01-01': [50, 70, 80, 90],
'2016-01-02': [60, 80, 90, 100],
'2016-01-03': [70, 88, 99, 11]},
124: {'2016-01-01': [10, 20, 30, 40]}}}
and output['US'][123]['2016-01-01'] return:
[50, 70, 80, 90]
if:
df = pd.DataFrame.from_dict({'app_id': {0: 123, 1: 123, 2: 123, 3: 123, 4: 124},
'country': {0: 'US', 1: 'US', 2: 'US', 3: 'FR', 4: 'US'},
'date': {0: '2016-01-01',
1: '2016-01-02',
2: '2016-01-03',
3: '2016-01-01',
4: '2016-01-01'},
'val1': {0: 50, 1: 60, 2: 70, 3: 10, 4: 10},
'val2': {0: 70, 1: 80, 2: 88, 3: 20, 4: 20},
'val3': {0: 80, 1: 90, 2: 99, 3: 30, 4: 30},
'val4': {0: 90, 1: 100, 2: 11, 3: 40, 4: 40}})

Related

Python Color Dataframe cells depending on values

I am trying to color the cells
I have the following Dataframe:
pd.DataFrame({'Jugador': {1: 'M. Sanchez',
2: 'L. Ovalle',
3: 'K. Soto',
4: 'U. Kanu',
5: 'K. Abud'},
'Equipo': {1: 'Houston Dash',
2: 'Tigres UANL',
3: 'Guadalajara',
4: 'Tigres UANL',
5: 'Cruz Azul'},
'Edad': {1: 26, 2: 22, 3: 26, 4: 24, 5: 29},
'Posición específica': {1: 'RAMF, RW',
2: 'LAMF, LW',
3: 'RAMF, RW, CF',
4: 'RAMF, CF, RWF',
5: 'RW, RAMF, LW'},
'Minutos jugados': {1: 2053, 2: 3777, 3: 2287, 4: 1508, 5: 1436},
'Offence': {1: 84, 2: 90, 3: 69, 4: 80, 5: 47},
'Defense': {1: 50, 2: 36, 3: 64, 4: 42, 5: 86},
'Passing': {1: 78, 2: 81, 3: 72, 4: 73, 5: 71},
'Total': {1: 72, 2: 71, 3: 69, 4: 66, 5: 66}})
How can I color the Offence, Defense and Passing cells green if > 60, red < 40 and yellow the rest?
Use Styler.applymap with custom function:
def styler(v):
    """Map a numeric cell value to a CSS background colour.

    > 60 -> green, < 40 -> red, anything in between -> yellow.
    """
    if v < 40:
        return 'background-color:red'
    return 'background-color:green' if v > 60 else 'background-color:yellow'
df.style.applymap(styler, subset=['Offence','Defense','Passing'])
Alternative solution:
styler = lambda v: 'background-color:green' if v > 60 else 'background-color:red' if v < 40 else 'background-color:yellow'
df.style.applymap(styler, subset=['Offence','Defense','Passing'])
Another approach:
def hightlight(x):
    """Build a same-shape DataFrame of CSS styles for df.style.apply(axis=None).

    Offence/Defense/Passing cells are styled green (> 60), red (< 40)
    or yellow (otherwise); every other cell keeps an empty style string.
    """
    target = ['Offence', 'Defense', 'Passing']
    green = 'background-color:green'
    red = 'background-color:red'
    yellow = 'background-color:yellow'
    # Start with no styling anywhere, then overwrite only the target columns.
    styles = pd.DataFrame('', index=x.index, columns=x.columns)
    styles[target] = np.select([x[target] > 60, x[target] < 40],
                               [green, red], default=yellow)
    return styles
df.style.apply(hightlight, axis=None)

python - Update default values in dictionary from dataframe rows

From a dataframe, I build a dictionary that has as keys each distinct value from a given column.
The value of each key is a nested dictionary, being the key the distinct values from another column.
The Values in the nested dictionary will be updated by iterating a dataframe (third column).
Example:
import pandas as pd
data = [['computer',1, 10]
,['computer',2,20]
,['computer',4, 40]
,['laptop',1, 100]
,['laptop',3, 30]
,['printer',2, 200]
]
df = pd.DataFrame(data,columns=['Product','id', 'qtt'])
print (df)
Product
id
qtt
computer
1
10
computer
2
20
computer
4
40
laptop
1
100
laptop
3
30
printer
2
200
kdf_key_dic = {key: None for key in df['id'].unique().tolist()}
product_key_dic = {key: kdf_key_dic for key in df['Product'].unique().tolist()}
print ("product_key_dic: ", product_key_dic)
product_key_dic: {
'computer': {1: None, 2: None, 4: None, 3: None},
'laptop': {1: None, 2: None, 4: None, 3: None},
'printer': {1: None, 2: None, 4: None, 3: None}
}
Now I'd like to update the product_key_dic dictionary, but I can't get it right, it always uses the same key-dict for each key in the main dictionary!
for index, row in df.iterrows():
product_key_dic[row['Product']].update({row['id']:row['qtt']})
print("\n product_key_dic:\n", product_key_dic)
I get:
product_key_dic:
{ 'computer': {1: 100, 2: 200, 4: 40, 3: 30},
'laptop': {1: 100, 2: 200, 4: 40, 3: 30},
'printer': {1: 100, 2: 200, 4: 40, 3: 30}
}
I expect:
{ 'computer': {1: 10, 2: 20, 4: 40, 3: None},
'laptop': {1: 100, 2: None, 4: None, 3: 30},
'printer': {1: None, 2: 200, 4: None, 3: None}
}
I can't understand the problem; somehow it's as if every key shares the same nested dictionary.
We can try a different approach creating a MultiIndex.from_product based on the unique values from Product and Id then reshaping so we can call DataFrame.to_dict directly:
cols = ['Product', 'id']
product_key_dic = (
df.set_index(cols).reindex(
pd.MultiIndex.from_product(
[df[col].unique() for col in cols],
names=cols
)
) # Reindex to ensure all pairs are present in the DF
.replace({np.nan: None}) # Replace nan with None
.unstack('Product') # Create Columns from Product
.droplevel(0, axis=1) # Remove qtt from column MultiIndex
.to_dict()
)
product_key_dic:
{
'computer': {1: 10.0, 2: 20.0, 3: None, 4: 40.0},
'laptop': {1: 100.0, 2: None, 3: 30.0, 4: None},
'printer': {1: None, 2: 200.0, 3: None, 4: None}
}
Methods Used:
DataFrame.set_index
DataFrame.reindex
MultiIndex.from_product
Series.unique
DataFrame.replace
DataFrame.unstack
DataFrame.droplevel
DataFrame.to_dict
Setup and imports:
import numpy as np
import pandas as pd
data = [['computer', 1, 10], ['computer', 2, 20], ['computer', 4, 40],
['laptop', 1, 100], ['laptop', 3, 30], ['printer', 2, 200]]
df = pd.DataFrame(data, columns=['Product', 'id', 'qtt'])
The initial solution could be modified by adding a copy call to the dictionary in the comprehension to make them separate dictionaries rather than multiple references to the same one (How to copy a dictionary and only edit the copy). However, iterating over DataFrames is discouraged (Does pandas iterrows have performance issues?):
kdf_key_dic = {key: None for key in df['id'].unique().tolist()}
product_key_dic = {key: kdf_key_dic.copy()
for key in df['Product'].unique().tolist()}
for index, row in df.iterrows():
product_key_dic[row['Product']].update({row['id']: row['qtt']})
product_key_dic:
{
'computer': {1: 10.0, 2: 20.0, 3: None, 4: 40.0},
'laptop': {1: 100.0, 2: None, 3: 30.0, 4: None},
'printer': {1: None, 2: 200.0, 3: None, 4: None}
}
This is because you are reusing the same dict object. Let's take these two statements.
kdf_key_dic = {key: None for key in df['id'].unique().tolist()}
product_key_dic = {key: kdf_key_dic for key in df['Product'].unique().tolist()}
You are passing kdf_key_dic as value(in the second statement) which is same object in each iteration.
So instead of this you can pass a copy of kdf_key_dic while constructing product_key_dic
product_key_dic = {key: kdf_key_dic.copy() for key in df['Product'].unique().tolist()}

Nested Dictionary to excel with color formatting

My dictionary looks like this :
dict1 = { '2020-10-11' : {
'group1':{
1 : 2356,
21 : 10001,
34 : 234
},
'group2':{
11 : 999,
2 : 101,
13 : 1234
}
},
'2020-10-12' : {
'group1':{
11 : 236,
21 : 100,
34 : 34
},
'group2':{
1 : 99,
3 : 121,
2 : 12
}
}
}
I wanted my output to look something like this :
The requirement is : for every date, the color should be different.
I have tried this using this method:
reform = {(level1_key, level2_key, level3_key): values
for level1_key, level2_dict in dict1.items()
for level2_key, level3_dict in level2_dict.items()
for level3_key, values in level3_dict.items()}
out = pd.DataFrame(reform,index = ['amount']).T
names=['date', 'group', 'id']
out.index.set_names(names, inplace=True)
out in xls :
After this how am I supposed to proceed for the color formatting in excel using python?
The first step is to entirely flatten the structure so that a 2-dimensional representation of the nested values emerges:
dict1 = {'2020-10-11': {'group1': {1: 2356, 21: 10001, 34: 234}, 'group2': {11: 999, 2: 101, 13: 1234}}, '2020-10-12': {'group1': {11: 236, 21: 100, 34: 34}, 'group2': {1: 99, 3: 121, 2: 12}}}
def flatten(d, c=None):
    """Flatten a nested dict into rows of [level1, level2, ..., key, value].

    A repeated prefix label is replaced with '' so each group label
    appears only on its first row (mimicking merged spreadsheet cells).

    NOTE: the original used a mutable default argument (c=[]); replaced
    with the None-sentinel idiom. `c` is never mutated in place (only
    rebuilt via c + [...]), so behaviour is unchanged.
    """
    c = [] if c is None else c
    flag = True  # True only while on the first item of this nesting level
    for a, b in d.items():
        if isinstance(b, dict):
            # First item (or top level) keeps the full prefix; later
            # siblings blank out the parent label.
            yield from flatten(b, c=c + [a] if flag or not c else [*c[:-2], '', a])
        else:
            # Leaf row: full prefix for the first item, blanks afterwards.
            yield c + [a, b] if flag or not c else [*([''] * (len(c))), a, b]
        flag = False
data = list(flatten(dict1))
#[['2020-10-11', 'group1', 1, 2356], ['', '', 21, 10001], ['', '', 34, 234], ['', 'group2', 11, 999], ['', '', 2, 101], ['', '', 13, 1234], ['2020-10-12', 'group1', 11, 236], ['', '', 21, 100], ['', '', 34, 34], ['', 'group2', 1, 99], ['', '', 3, 121], ['', '', 2, 12]]
Next, create a pd.DataFrame from the results and apply the coloring:
import pandas as pd
df = pd.DataFrame(data, columns=['Date', 'Group', 'ID', 'Amount'])
writer = pd.ExcelWriter('test_rsults12.xls', engine='xlsxwriter')
df.to_excel(writer, sheet_name='Sheet1', index=False)
workbook = writer.book
worksheet = writer.sheets['Sheet1']
c_pool = iter([workbook.add_format({'bg_color': '#fff0c1'}), workbook.add_format({'bg_color': '#d5e6f5'})])
fmt = None
for i in range(len(data)):
if data[i][0]:
fmt = next(c_pool)
worksheet.set_row(i+1, cell_format=fmt)
writer.save()
Result:

How to convert pandas dataframe to hierarchical dictionary

I have the following pandas dataframe:
df1 = pd.DataFrame({'date': [200101,200101,200101,200101,200102,200102,200102,200102],'blockcount': [1,1,2,2,1,1,2,2],'reactiontime': [350,400,200,250,100,300,450,400]})
I am trying to create a hierarchical dictionary, with the values of the embedded dictionary as lists, that looks like this:
{200101: {1:[350, 400], 2:[200, 250]}, 200102: {1:[100, 300], 2:[450, 400]}}
How would I do this? The closest I get is using this code:
df1.set_index('date').groupby(level='date').apply(lambda x: x.set_index('blockcount').squeeze().to_dict()).to_dict()
Which returns:
{200101: {1: 400, 2: 250}, 200102: {1: 300, 2: 400}}
Here is another way using pivot_table:
d = df1.pivot_table(index='blockcount',columns='date',
values='reactiontime',aggfunc=list).to_dict()
print(d)
{200101: {1: [350, 400], 2: [200, 250]},
200102: {1: [100, 300], 2: [450, 400]}}
IIUC
df1.groupby(['date','blockcount']).reactiontime.agg(list).unstack(0).to_dict()
{200101: {1: [350, 400], 2: [200, 250]}, 200102: {1: [100, 300], 2: [450, 400]}}
You can do the following,
df2 = df1.groupby(['date', 'blockcount']).agg(lambda x: pd.Series(x).tolist())
# Formatting the result to the correct format
dct = {}
for k, v in df2["reactiontime"].items():
if k[0] not in dct:
dct[k[0]] = {}
dct[k[0]].update({k[1]: v})
Which produces,
>>> {200101: {1: [350, 400], 2: [200, 250]}, 200102: {1: [100, 300], 2: [450, 400]}}
dct holds the result you need.

Efficient grouping into dict

I have a list of tuples:
[('Player1', 'A', 1, 100),
('Player1', 'B', 15, 100),
('Player2', 'A', 7, 100),
('Player2', 'B', 65, 100),
('Global Total', None, 88, 100)]
Which I wish to convert to a dict in the following format:
{
'Player1': {
'A': [1, 12.5],
'B': [15, 18.75],
'Total': [16, 18.18]
},
'Player2': {
'A': [7, 87.5],
'B': [65, 81.25],
'Total': [72, 81.81]
},
'Global Total': {
'A': [8, 100],
'B': [80, 100]
}
}
So each Player dict has its local total value and its percentage relative to the global total value.
Currently, I do it like this:
fixed_vals = {}
for name, status, qtd, prct in data_set: # This is the list of tuples var
if name in fixed_vals:
fixed_vals[name].update({status: [qtd, prct]})
else:
fixed_vals[name] = {status: [qtd, prct]}
fixed_vals['Global Total']['Total'] = fixed_vals['Global Total'].pop(None)
total_a = 0
for k, v in fixed_vals.items():
if k != 'Global Total':
total_a += v['A'][0]
fixed_vals['Global Total']['A'] = [
total_a, total_a * 100 / fixed_vals['Global Total']['Total'][0]
]
fixed_vals['Global Total']['B'] = [
fixed_vals['Global Total']['Total'][0] - total_a,
fixed_vals['Global Total']['Total'][0] - fixed_vals['Global Total']['A'][1]
]
for player, vals in fixed_vals.items():
if player != 'Global Total':
vals['A'][1] = vals['A'][0] * 100 / fixed_vals['Global Total']['A'][0]
vals['B'][1] = fixed_vals['Global Total']['A'][1] - vals['B'][1]
the problem being that this is not very flexible since I have to do something similar to this,
but with almost 12 categories (A, B, ...)
Is there a better approach to this? Perhaps this is trivial with pandas?
Edit for clarification:
There are no duplicate categories for each Player, everyone of them has the same sequence (some might have 0 but the category is unique)
Everyone seems attracted to a dict-only solution, but why not try converting to pandas?
import pandas as pd
# given
tuple_list = [('Player1', 'A', 1, 100),
('Player1', 'B', 15, 100),
('Player2', 'A', 7, 100),
('Player2', 'B', 65, 100),
('Global Total', None, 88, 100)]
# make a dataframe
df = pd.DataFrame(tuple_list , columns = ['player', 'game','score', 'pct'])
del df['pct']
df = df[df.player!='Global Total']
df = df.pivot(index='player', columns='game', values='score')
df.columns.name=''
df.index.name=''
# just a check
assert df.to_dict() == {'A': {'Player1': 1, 'Player2': 7},
'B': {'Player1': 15, 'Player2': 65}}
# A B
#player
#Player1 1 15
#Player2 7 65
print('Obtained dataset:\n', df)
Basically, all you need is 'df' dataframe, and the rest you can
compute and add later, no need to save it to dictionary.
Below is updated on OP request:
# the sum across columns is this - this was the 'Grand Total' in the dicts
# A 8
# B 80
sum_col = df.sum(axis=0)
# lets calculate the share of each player score:
shares = df / df.sum(axis=0) * 100
assert shares.transpose().to_dict() == {'Player1': {'A': 12.5, 'B': 18.75},
'Player2': {'A': 87.5, 'B': 81.25}}
# in 'shares' the columns add to 100%:
# A B
#player
#Player1 12.50 18.75
#Player2 87.50 81.25
# lets mix up a dataframe close to original dictionary structure
mixed_df = pd.concat([df.A, shares.A, df.B, shares.B], axis=1)
totals = mixed_df.sum(axis=0)
totals.name = 'Total'
mixed_df = mixed_df.append(totals.transpose())
mixed_df.columns = ['A', 'A_pct', 'B', 'B_pct']
print('\nProducing some statistics\n', mixed_df)
one solution would be to use groupby to group consecutive Player scores from the same player
tup = [('Player1', 'A', 1, 100),('Player1', 'B', 15, 100),('Player2', 'A', 7, 100), ('Player2', 'B', 65, 100), ('Global Total', None, 88, 100)]
then import our groupby
from itertools import groupby
result = dict((name,dict((x[1],x[2:]) for x in values)) for name,values in groupby(tup,lambda x:x[0]))
then just go and update all the totals
for key in result:
if key == "Global Total": continue # skip this one ...
# sum up our player scores
result[key]['total'] = [sum(col) for col in zip(*result[key].values())]
# you can print the results too
print result
# {'Player2': {'A': (7, 100), 'total': [72, 200], 'B': (65, 100)}, 'Player1': {'A': (1, 100), 'total': [16, 200], 'B': (15, 100)}, 'Global Total': {'total': [88, 100], None: (88, 100)}}
NOTE This solution !REQUIRES! that all of player1's scores are grouped together in your tuple, and all of player2's scores are grouped etc
A) Break your code up into manageable chunks:
from collections import defaultdict
result = defaultdict(dict)
for (cat, sub, num, percent) in input_list:
result[cat][sub] = [num, percent]
Now we have a dict with the player counts, but the only valid percentages are for total and we don't have global counts.
from collections import Counter
def build_global(dct):
    """Fill the 'Global Total' entry with per-category counts.

    Sums the count (index 0) of each category across every player and
    stores [total, 100] under 'Global Total' for that category.
    """
    totals = Counter()
    for player, cats in dct.items():
        if player == "Global Total":
            continue  # don't fold the totals row into itself
        for cat, vals in cats.items():
            totals[cat] += vals[0]
    for cat, count in totals.items():
        dct["Global Total"][cat] = [count, 100]
build_global(result) now yields valid global counts for each event.
Finally:
def calc_percent(dct):
    """Rewrite each player's percentages as shares of the global totals.

    For every non-global player, each category's percentage (index 1)
    becomes count / global count * 100, and a 'Total' entry
    [local_total, local_total / grand_total * 100] is appended.
    Assumes dct['Global Total'][None] holds [grand_total, 100].
    """
    totals = dct["Global Total"]
    for player, cats in dct.items():
        if player == "Global Total":
            continue
        local_total = 0
        for cat in cats:
            count = cats[cat][0]
            local_total += count
            # Share of the global count for this category.
            cats[cat][1] = (count / float(totals[cat][0])) * 100
        # Appended after the loop so iteration over cats is not disturbed.
        cats['Total'] = [local_total,
                         (local_total / float(dct['Global Total'][None][0])) * 100]
calc_percent(result) goes through and builds the percentages.
result is then:
defaultdict(<type 'dict'>,
{'Player2': {'A': [7, 87.5], 'B': [65, 81.25], 'Total': [72, 81.81818181818183]},
'Player1': {'A': [1, 12.5], 'B': [15, 18.75], 'Total': [16, 18.181818181818183]},
'Global Total': {'A': [8, 100], None: [88, 100], 'B': [80, 100]}})
If you need it exactly as specified, you can delete the None entry in global total and dict(result) to convert the defaultdict into a vanilla dict.
Using a remapping tool from more_itertools in Python 3.6+:
Given
import copy as cp
import collections as ct
import more_itertools as mit
data = [
("Player1", "A", 1, 100),
("Player1", "B", 15, 100),
("Player2", "A", 7, 100),
("Player2", "B", 65, 100),
('Global Total', None, 88, 100)
]
# Discard the last entry
data = data[:-1]
# Key functions
kfunc = lambda tup: tup[0]
vfunc = lambda tup: tup[1:]
rfunc = lambda x: {item[0]: [item[1]] for item in x}
Code
# Step 1
remapped = mit.map_reduce(data, kfunc, vfunc, rfunc)
# Step 2
intermediate = ct.defaultdict(list)
for d in remapped.values():
for k, v in d.items():
intermediate[k].extend(v)
# Step 3
remapped["Global Total"] = {k: [sum(v)] for k, v in intermediate.items()}
final = cp.deepcopy(remapped)
for name, d in remapped.items():
for lbl, v in d.items():
stat = (v[0]/remapped["Global Total"][lbl][0]) * 100
final[name][lbl].append(stat)
Details
Step 1 - build a new dict of remapped groups.
This is done by defining key functions that dictate how to process the keys and values. The reducing function processes the values into sub-dictionaries. See also docs for more details on more_itertools.map_reduce.
>>> remapped
defaultdict(None,
{'Player1': {'A': [1], 'B': [15]},
'Player2': {'A': [7], 'B': [65]}})
Step 2 - build an intermediate dict for lookups
>>> intermediate
defaultdict(list, {'A': [1, 7], 'B': [15, 65]})
Step 3 - build a final dict from the latter dictionaries
>>> final
defaultdict(None,
{'Player1': {'A': [1, 12.5], 'B': [15, 18.75]},
'Player2': {'A': [7, 87.5], 'B': [65, 81.25]},
'Global Total': {'A': [8, 100.0], 'B': [80, 100.0]}})

Categories