Creating a flare json to be used in D3 from pandas dataframe - python

I have a dataframe that I want to convert to a hierarchical flare JSON to be used in a D3 visualization like this: D3 sunburst
My dataframe contains hierarchical data such as this:
And the output I want should look like this:
{"name": "flare","children":
[
{"name": "Animal", "children":
[
{"name": "Mammal", "children":
[
{"name": "Fox","value":35000},
{"name": "Lion","value":25000}
]
},
{"name": "Fish", "children":
[
{"name": "Cod","value":15000}
]
}
]
},
{"name": "Plant", "children":
[
{"name": "Tree", "children":
[
{"name": "Oak","value":1500}
]
}
]
}
]
}
I have tried several approaches, but can't get it right. Here is my non-working code, inspired by this post: Pandas to D3. Serializing dataframes to JSON
# Attempt: build the nested structure with a self-nesting defaultdict.
from collections import defaultdict
import pandas as pd
df = pd.DataFrame({'group1':["Animal", "Animal", "Animal", "Plant"],'group2':["Mammal", "Mammal", "Fish", "Tree"], 'group3':["Fox", "Lion", "Cod", "Oak"],'value':[35000,25000,15000,1500] })
# tree() returns a defaultdict whose missing keys are created as further trees
tree = lambda: defaultdict(tree)
d = tree()
# NOTE(review): five names are unpacked per row, but df is built with four
# columns (group1, group2, group3, value)
for _, (group0,group1, group2, group3, value) in df.iterrows():
# each assignment stores a plain value under 'children', not a list of child nodes
d['name'][group0]['children'] = group1
d['name'][group1]['children'] = group2
d['name'][group2]['children'] = group3
d['name'][group3]['children'] = value
# NOTE(review): json is not imported in this snippet
json.dumps(d)

I am working on a similar visualization project that requires moving data from a Pandas DataFrame to a JSON file that works with D3.
I came across your post while looking for a solution and ended up writing something based on this GitHub repository and with input from the link you provided in this post.
The code is not pretty and is a bit hacky and slow. But based on my project, it seems to work just fine for any amount of data as long as it has three levels and a value field. You should be able to simply fork the D3 Sunburst notebook and replace the flare.json file with this code's output.
The modification that I made here, based on the original GitHub post, is to provide consideration for three levels of data. So, if the name of the level 0 node exists, then append from level 1 and on. Likewise, if the name of the level 1 node exists, then append the level 2 node (the third level). Otherwise, append the full path of data. If you need more, some kind of recursion might do the trick, or just keep hacking it to add more levels
# Code snippet: format a Pandas DataFrame as flare-style JSON for a D3 Sunburst chart
# libraries
import json
import pandas as pd

# example data with three levels and a single value field
data = {'group1': ['Animal', 'Animal', 'Animal', 'Plant'],
        'group2': ['Mammal', 'Mammal', 'Fish', 'Tree'],
        'group3': ['Fox', 'Lion', 'Cod', 'Oak'],
        'value': [35000, 25000, 15000, 1500]}
df = pd.DataFrame.from_dict(data)
print(df)
""" The sample dataframe
group1 group2 group3 value
0 Animal Mammal Fox 35000
1 Animal Mammal Lion 25000
2 Animal Fish Cod 15000
3 Plant Tree Oak 1500
"""


def build_flare(frame, root_name="flare"):
    """Convert a DataFrame of hierarchy levels into a flare-style nested dict.

    Every column except the last is read as one hierarchy level (outermost
    first); the last column holds the leaf value.  Rows may arrive in any
    order: a branch is created the first time its path is seen and reused
    afterwards.  One leaf is appended per row.

    Args:
        frame: DataFrame with at least one level column plus a value column.
        root_name: Value of the root node's "name" key.

    Returns:
        A dict of the form {"name": root_name, "children": [...]} in which
        branch nodes carry "children" and leaf nodes carry "value".

    Raises:
        ValueError: If ``frame`` has fewer than two columns.
    """
    if frame.shape[1] < 2:
        raise ValueError("need at least one level column and one value column")

    root = {"name": root_name, "children": []}
    # Branch nodes created so far, keyed by their full path from the root.
    # Keying on the path (not the bare name) keeps equal names that live
    # under different parents apart.
    branches = {}
    # tolist() hands back plain Python lists, one per row
    for *levels, value in frame.values.tolist():
        parent = root
        path = ()
        for name in levels[:-1]:
            path += (name,)
            node = branches.get(path)
            if node is None:
                node = {"name": name, "children": []}
                parent["children"].append(node)
                branches[path] = node
            parent = node
        parent["children"].append({"name": levels[-1], "value": value})
    return root


# build the flare dictionary from the dataframe
flare = build_flare(df)
# uncomment next three lines to save as json file
# save to some file
# with open('filename_here.json', 'w') as outfile:
#     json.dump(flare, outfile)
print(json.dumps(flare, indent=2))
""" the expected output of this json data
{
"name": "flare",
"children": [
{
"name": "Animal",
"children": [
{
"name": "Mammal",
"children": [
{
"name": "Fox",
"value": 35000
},
{
"name": "Lion",
"value": 25000
}
]
},
{
"name": "Fish",
"children": [
{
"name": "Cod",
"value": 15000
}
]
}
]
},
{
"name": "Plant",
"children": [
{
"name": "Tree",
"children": [
{
"name": "Oak",
"value": 1500
}
]
}
]
}
]
}
"""

Related

Set values for empty list within nested dictionaries

I have a cte query that returns me results of values that are linked (i.e. child -> parent).
Then in Python I am trying to create a nested dictionary that would represent something like this:
{
"name": "Child_example",
"parents": [
{
"name": "child_parent_1",
"parents": [{"name": "child_parent_1_parent", "parents": [{"name": "end", "parents": []}]}]
},
{
"name": "child_parent_2",
"parents": [{"name": "end", "parents": []}]
},
{
"name": "child_parent_3",
"parents": [{"name": "child_parent_3_parent", "parents": [{"name": "end", "parents": []}]}]
}
]
}
My input data looks something like so (it can have more data):
child_col
parent_col
name
depth
Child_example
child_parent_1_col
child_parent_1
0
Child_example
child_parent_2_col
child_parent_2
0
Child_example
child_parent_3_col
child_parent_3
0
child_parent_1_col
child_parent_1_parent
1_parent
1
child_parent_2_col
end
1_parent
1
child_parent_3_col
child_parent_3_parent
3_parent
1
child_parent_3_parent
end
end_3
2
child_parent_1_parent
end
end_1
2
However with my code so far:
# Attempt: collect the depth-0 rows as direct parents, then try to fill in
# the deeper levels for each of them.
# NOTE(review): `rows` and `defaultdict` are not defined/imported in this snippet.
r_dict = defaultdict(list)
depth_zero = [x for x in rows if x.depth == 0]
for row in depth_zero:
r_dict['name'] = row.path_key
r_dict['parents'].append({'name': row.path_parent_key, 'parents': []})
depth_not_zero = [x for x in rows if x.depth != 0]
# Set inner levels
for parent in r_dict['parents']:
name = parent['name']
# one flat list per depth-0 parent; every match below is appended to this same list
inner_parent = parent['parents'].copy()
for row in depth_not_zero:
if row.path_key == name:
# the new entry becomes a sibling in inner_parent; it is not placed inside
# the 'parents' list of the entry appended before it
inner_parent.append({'name': row.path_parent_key, 'parents': []})
# continue the search from the entry just added
name = row.path_parent_key
parent['parents'] = inner_parent
I only manage to append it to the initial "parents" list, instead of setting the innermost nested "parents". I know it has to do with this line of code:
inner_parent.append({'name': row.path_parent_key, 'parents': []})
But I cannot work out how to essentially get and set it. Would this be a case for recursion instead of the way I am doing it?
Below is an example of the first nested dictionary output that I am currently creating with my code:
{
"name": "Child_example",
"parents": [
{
"name": "child_parent_1",
"parents": [
{"name": "child_parent_1", "parents": []}, {"name": "end", "parents": []}
]
}
]
}
I'm a bit baffled by the way you are assigning the "name" value: "Child_example" comes from child_col, "child_parent_1" from name, "child_parent_3_parent" from parent_col. So I simplified it a bit: I put in the second column of the child row the same value as in the first column of its parents rows. That said, if you really need to take the names from different columns it's just a matter of adding some ifs.
My proposal is to loop over the rows in reverse order, creating the inner dicts and then moving them into the outer ones:
def build_parent_tree(rows, terminal="end"):
    """Nest child -> parent rows into {"name", "parents"} dictionaries.

    Each row is a sequence whose first item is the child name and whose
    second item is the parent name; any further items are ignored.  A
    parent equal to ``terminal`` marks the end of a chain and adds nothing.

    Every node is kept in one index, so a parent shared by several children
    is attached to each of them and rows need not be ordered by depth.

    Args:
        rows: Iterable-reversible sequence of (child, parent, ...) rows.
        terminal: Parent name that marks the end of a chain.

    Returns:
        Dict mapping each root name (a node never used as a parent) to its
        nested node.
    """
    nodes = {}      # every node seen so far, by name
    nested = set()  # names that were attached beneath some child
    # Walk the rows back to front so that, for depth-ordered input, parents
    # are listed in the same order as in the original example output.
    for row in reversed(rows):
        child, parent = row[0], row[1]
        node = nodes.setdefault(child, {"name": child, "parents": []})
        if parent == terminal:
            continue
        node["parents"].append(
            nodes.setdefault(parent, {"name": parent, "parents": []}))
        nested.add(parent)
    return {name: node for name, node in nodes.items() if name not in nested}


rows = [["c1", "p1c1", 0],
        ["c1", "p2c1", 0],
        ["c1", "p3c1", 0],
        ["p1c1", "p1p1c1", 1],
        ["p2c1", "end", 1],
        ["p3c1", "p1p3c1", 1],
        ["p1p3c1", "end", 2],
        ["p1p1c1", "end", 2]]
r_dict = build_parent_tree(rows)
print(r_dict)
{'c1': {'name': 'c1', 'parents': [{'name': 'p3c1', 'parents': [{'name': 'p1p3c1', 'parents': []}]}, {'name': 'p2c1', 'parents': []}, {'name': 'p1c1', 'parents': [{'name': 'p1p1c1', 'parents': []}]}]}}

Remove duplicate dictionary from a list on the basis of key value by priority

Suppose I have the following type of list of dictionaries:
iterlist = [
{"Name": "Mike", "Type": "Admin"},
{"Name": "Mike", "Type": "Writer"},
{"Name": "Mike", "Type": "Reader"},
{"Name": "Zeke", "Type": "Writer"},
{"Name": "Zeke", "Type": "Reader"}
]
I want to remove duplicates of "Name" on the basis of "Type" by the following priority (Admin > Writer > Reader), so the end result should be:
iterlist = [
{"Name": "Mike", "Type": "Admin"},
{"Name": "Zeke", "Type": "Writer"}
]
I found a similar question but it removes duplicates for one explicit type of key-value: Link
Can someone please guide me on how to move forward with this?
This is a modified form of the solution suggested by @azro. Their solution and the other solution do not take into account the priority you mentioned; you can get around that with the following code, which uses a priority dict.
from itertools import groupby

iterlist = [
    {"Name": "Mike", "Type": "Writer"},
    {"Name": "Mike", "Type": "Reader"},
    {"Name": "Mike", "Type": "Admin"},
    {"Name": "Zeke", "Type": "Reader"},
    {"Name": "Zeke", "Type": "Writer"}
]
# Rank of each type: a lower number means a higher priority
# (Admin > Writer > Reader).
priorities = {role: rank for rank, role in enumerate(['Admin', 'Writer', 'Reader'])}


def sort_key(entry):
    """Order entries by name, then by priority rank within a name."""
    return (entry['Name'], priorities[entry['Type']])


def groupby_key(entry):
    """Group entries by name."""
    return entry['Name']


# groupby only merges adjacent items, so sort first; after sorting, the first
# entry of each group is the one with the highest priority for that name.
result = [next(group)
          for _, group in groupby(sorted(iterlist, key=sort_key), key=groupby_key)]
print(result)
Output
[{'Name': 'Mike', 'Type': 'Admin'}, {'Name': 'Zeke', 'Type': 'Writer'}]
You can also use pandas in the following way:
transform the list of dictionary to data frame:
import pandas as pd
df = pd.DataFrame(iterlist)
create a mapping dict:
m = {'Admin': 3, 'Writer': 2, 'Reader': 1}
create a priority column using replace:
df['pri'] = df['Type'].replace(m)
sort_values by pri and groupby by Name and get the first element only:
df = df.sort_values('pri', ascending=False).groupby('Name').first().reset_index()
drop the pri column and return to dictionary using to_dict:
df.drop('pri', axis='columns').to_dict(orient='records')
This will give you the following:
[{'Name': 'Mike', 'Type': 'Admin'}, {'Name': 'Zeke', 'Type': 'Writer'}]
Here is a solution you can try out:
# Rank of each type: a lower number wins (Admin > Writer > Reader).
rank = {'Admin': 0, 'Writer': 1, 'Reader': 2}
unique = {}
for v in iterlist:
    kept = unique.get(v['Name'])
    # store the entry when the name is new, or when it outranks the entry
    # stored so far -- this makes the result independent of the input order
    if kept is None or rank[v['Type']] < rank[kept['Type']]:
        unique[v['Name']] = v
print(unique.values())
dict_values([{'Name': 'Mike', 'Type': 'Admin'}, {'Name': 'Zeke', 'Type': 'Writer'}])

csv to complex nested json

So, I have a huge CSV file that looks like:
PN,PCA Code,MPN Code,DATE_CODE,Supplier Code,CM Code,Fiscal YEAR,Fiscal MONTH,Usage,Defects
13-1668-01,73-2590,MPN148,1639,S125,CM1,2017,5,65388,0
20-0127-02,73-2171,MPN170,1707,S125,CM1,2017,9,11895,0
19-2472-01,73-2302,MPN24,1711,S119,CM1,2017,10,4479,0
20-0127-02,73-2169,MPN170,1706,S125,CM1,2017,9,7322,0
20-0127-02,73-2296,MPN170,1822,S125,CM1,2018,12,180193,0
15-14399-01,73-2590,MPN195,1739,S133,CM6,2018,11,1290,0
What I want to do is group up all the data by PCA Code. So, a PCA Code will have certain number for parts, those parts would be manufactured by certain MPN Code and the final nested JSON structure that I want looks like:
[
{
PCA: {
"code": "73-2590",
"CM": ["CM1", "CM6"],
"parts": [
{
"number": "13-1668-01",
"manufacturer": [
{
"id": "MPN148",
"info": [
{
"date_code": 1639,
"supplier": {
"id": "S125",
"FYFM": "2017-5",
"usage": 65388,
"defects": 0,
}
}
]
},
]
}
]
}
}
]
So, I want this structure for multiple part numbers (PNs) having different MPNs with different Date Codes and so on.
I am currently using Pandas to do this but I'm stuck on how to proceed with the nesting.
My code so far:
import json
import pandas as pd

dataframe = pd.read_csv('files/dppm_wc.csv')
data = {'PCAs': []}
for key, group in dataframe.groupby('PCA Code'):
    # iterrows() yields (index, row) pairs
    for index, row in group.iterrows():
        # rebuilt for every row and not yet added to data -- this is where
        # the nesting still has to be filled in
        temp_dict = {'PCA Code': key, 'CM Code': row['CM Code'], 'parts': []}
with open('output.txt', 'w') as file:
    file.write(json.dumps(data, indent=4))
How do I proceed to achieve the nested JSON format that I want? Is there a better way to do this than what I am doing?
I don't really understand what you wish to do with that structure, but I guess it could be achieved with something like this
# Build one entry per PCA code; inside it, parts are grouped by part number
# and each part's manufacturers are grouped by MPN code.
data = {'PCAs': []}
for pca_code, pca_group in df.groupby('PCA Code'):
    parts = []
    for part_number, part_group in pca_group.groupby('PN'):
        manufacturers = []
        for mpn_code, mpn_group in part_group.groupby('MPN Code'):
            # one info record per CSV row of this part/manufacturer pair
            info = [
                {
                    'date_code': row['DATE_CODE'],
                    'supplier': {'id': row['Supplier Code'],
                                 'FYFM': '%s-%s' % (row['Fiscal YEAR'], row['Fiscal MONTH']),
                                 'usage': row['Usage'],
                                 'defects': row['Defects']}
                }
                for _, row in mpn_group.iterrows()
            ]
            manufacturers.append({'id': mpn_code, 'info': info})
        parts.append({'number': part_number, 'manufacturer': manufacturers})
    data['PCAs'].append({
        'PCA Code': pca_code,
        # unique() keeps each CM code once, in order of first appearance
        'CM Code': pca_group['CM Code'].unique().tolist(),
        'parts': parts,
    })

python transform data from csv to array of dictionaries and group by field value

I have csv like this:
id,company_name,country,country_id
1,batstop,usa, xx
2,biorice,italy, yy
1,batstop,italy, yy
3,legstart,canada, zz
I want an array of dictionaries to import to firebase. I need to group the different country information for the same company in a nested list of dictionaries. This is the desired output:
[ {'id':'1', 'agency_name':'batstop', 'countries': [{'country':'usa','country_id':'xx'}, {'country':'italy','country_id':'yy'}]} ,
{'id':'2', 'agency_name':'biorice', 'countries': [{'country':'italy','country_id':'yy'}]},
{'id':'3', 'agency_name':'legstart', 'countries': [{'country':'canada','country_id':'zz'}]} ]
Recently I had a similar task, the groupby function from itertools and the itemgetter function from operator - both standard python libraries - helped me a lot. Here's the code considering your csv, note how defining the primary keys of your csv dataset is important.
import csv
import json
from operator import itemgetter
from itertools import groupby

primary_keys = ['id', 'company_name']
# One getter serves both the sort and the grouping so the two always agree
group_key = itemgetter(*primary_keys)

# Start extraction
# newline='' leaves line-ending handling to the csv module
with open('input.csv', 'r', newline='') as file:
    # Read data from csv and sort it by the primary keys; sorting consumes
    # the reader, so every row is loaded before the file is closed.
    # NOTE: fields keep any blank that follows a comma in the file (the
    # sample data yields ' xx'); csv.DictReader(file, skipinitialspace=True)
    # would drop it.
    rows = sorted(csv.DictReader(file), key=group_key)

# Create formatted dicts to be converted into firebase objects.
# groupby only merges adjacent rows, hence the sort by the same key above.
group_dicts = []
for (company_id, company_name), members in groupby(rows, key=group_key):
    group_dict = {
        "id": company_id,
        "agency_name": company_name,
        "countries": [
            dict(country=row['country'], country_id=row['country_id'])
            for row in members
        ],
    }
    group_dicts.append(group_dict)
print("\n".join(json.dumps(group, indent=2) for group in group_dicts))
Here's the output:
{
"id": "1",
"agency_name": "batstop",
"countries": [
{
"country": "usa",
"country_id": " xx"
},
{
"country": "italy",
"country_id": " yy"
}
]
}
{
"id": "2",
"agency_name": "biorice",
"countries": [
{
"country": "italy",
"country_id": " yy"
}
]
}
{
"id": "3",
"agency_name": "legstart",
"countries": [
{
"country": "canada",
"country_id": " zz"
}
]
}
There's no external library,
Hope it suits you well!
You can try this, you may have to change a few parts to get it working with your csv, but hope it's enough to get you started:
# Sample rows (header omitted); named csv_lines so the csv module is not shadowed
csv_lines = [
    "1,batstop,usa, xx",
    "2,biorice,italy, yy",
    "1,batstop,italy, yy",
    "3,legstart,canada, zz"
]
output = {}  # keyed by id, which avoids searching a list for existing ids
# Parse each row
for line in csv_lines:
    # strip the blanks that follow the commas (e.g. ' xx' -> 'xx')
    cols = [col.strip() for col in line.split(',')]
    agency_id = int(cols[0])
    agency_name = cols[1]
    country_entry = {'country': cols[2], 'country_id': cols[3]}
    if agency_id in output:
        # append the dict itself so 'countries' stays a flat list of dicts
        output[agency_id]['countries'].append(country_entry)
    else:
        output[agency_id] = {'id': agency_id,
                             'agency_name': agency_name,
                             'countries': [country_entry]}
# Put into list
json_output = list(output.values())
# Check output
for row in json_output:
    print(row)

Particular nested dictionary from a Pandas DataFrame for circle packing

I am trying to create a particular nested dictionary from a Pandas DataFrame, in order to then visualize it.
# Sample data: parent category, child category, skill and its value
# (the values are stored as strings)
dat = pd.DataFrame({'cat_1': ['marketing', 'marketing', 'marketing', 'communications'],
                    'child_cat': ['marketing', 'social media', 'marketing', 'communications'],
                    'skill': ['digital marketing', 'media marketing', 'research', 'seo'],
                    'value': ['80', '101', '35', '31']})
and I would like to turn this into a dictionary that looks a bit like this:
{
"name": "general skills",
"children": [
{
"name": "marketing",
"children": [
{
"name": "marketing",
"children": [
{
"name": "digital marketing",
"value": 80
},
{
"name": "research",
"value": 35
}
]
},
{
"name": "social media", // notice that this is a sibling of the parent marketing
"children": [
{
"name": "media marketing",
"value": 101
}
]
}
]
},
{
"name": "communications",
"children": [
{
"name": "communications",
"children": [
{
"name": "seo",
"value": 31
}
]
}
]
}
]
}
So cat_1 is the parent node, child_cat is its children, and skill is its child too. I am having trouble with creating the additional children lists. Any help?
With a lot of inefficiencies I came up with this solution. Probably highly sub-optimal
# Build `final`, a nested {'name', 'children'} dictionary, from the rows of df_dict.
# NOTE(review): df_dict is not defined in this snippet; the code reads its
# `broad_categories` and `broad_2` columns and treats row positions 2 and 3
# as the leaf name and value -- confirm against the actual frame.
final = {}
# control dict: remembers which broad categories were already handled
contrl_dict = {}
contrl_dict['dummy'] = None
# NOTE(review): the root is named 'variants' here, while the question's
# example uses "general skills"
final['name'] = 'variants'
final['children'] = []
# line is the array of values of one row
for idx, line in enumerate(df_dict.values):
# dict for the parent (broad) category of this row
broad_dict_1 = {}
print(line)
# walk over every value of the row except the last one
for jdx, col in enumerate(line[:-1]):
# only the first column (the broad category) is acted on
if jdx == 0:
# has this broad category been handled before?
if not col in contrl_dict.keys():
# no: mark it as handled
contrl_dict[col] = 'added'
# the parent dict takes the category name
broad_dict_1['name'] = col
# its children (the second-level categories) are filled in below
broad_dict_1['children'] = []
# every distinct broad_2 value found under this broad category
for ydx, broad_2 in enumerate(list(df_dict[df_dict.broad_categories == col].broad_2.unique())):
# dict for the second-level category
prov_dict = {}
prov_dict['name'] = broad_2
# children is again a list
prov_dict['children'] = []
# collect the leaf rows of this broad_2 category
# NOTE(review): this filter uses broad_2 only, not the parent category, so a
# broad_2 name used under two parents would presumably pull in rows of both -- verify
for row in df_dict[df_dict.broad_2 == broad_2].values:
prov_d_3 = {}
# go through the values of the row
for xdx, direct in enumerate(row):
# positions 2 and 3 are stored as the leaf's name and size
if xdx == 2:
prov_d_3['name'] = direct
# NOTE(review): stored under 'size', while the question's example uses "value"
if xdx == 3:
prov_d_3['size'] = direct
prov_dict['children'].append(prov_d_3)
broad_dict_1['children'].append(prov_dict)
# the category was already handled: move on
else:
continue
final['children'].append(broad_dict_1)

Categories