Particular nested dictionary from a Pandas DataFrame for circle packing - python

I am trying to create a particular nested dictionary from a Pandas DataFrame, in order to then visualize it.
dat = pd.DataFrame({'cat_1' : ['marketing', 'marketing', 'marketing', 'communications'],
                    'child_cat' : ['marketing', 'social media', 'marketing', 'communications'],
                    'skill' : ['digital marketing', 'media marketing', 'research', 'seo'],
                    'value' : ['80', '101', '35', '31']})
and I would like to turn this into a dictionary that looks a bit like this:
{
    "name": "general skills",
    "children": [
        {
            "name": "marketing",
            "children": [
                {
                    "name": "marketing",
                    "children": [
                        {
                            "name": "digital marketing",
                            "value": 80
                        },
                        {
                            "name": "research",
                            "value": 35
                        }
                    ]
                },
                {
                    "name": "social media", // notice that this is a sibling of the parent marketing
                    "children": [
                        {
                            "name": "media marketing",
                            "value": 101
                        }
                    ]
                }
            ]
        },
        {
            "name": "communications",
            "children": [
                {
                    "name": "communications",
                    "children": [
                        {
                            "name": "seo",
                            "value": 31
                        }
                    ]
                }
            ]
        }
    ]
}
So cat_1 is the parent node, child_cat holds its children, and skill holds the children of child_cat. I am having trouble creating the additional children lists. Any help?

With a lot of inefficiency I came up with the solution below. It is probably highly sub-optimal:
final = {}
# control dict to get only one broad category
contrl_dict = {}
contrl_dict['dummy'] = None
final['name'] = 'variants'
final['children'] = []
# line is the values of each row
for idx, line in enumerate(df_dict.values):
    # parent categories dict
    broad_dict_1 = {}
    print(line)
    # this takes every value of the row minus the value in the end
    for jdx, col in enumerate(line[:-1]):
        # look into the broad category first
        if jdx == 0:
            # check in our control dict - does this category exist? if not add it and continue
            if not col in contrl_dict.keys():
                # if it doesn't it appends it
                contrl_dict[col] = 'added'
                # then the broad dict parent takes the name
                broad_dict_1['name'] = col
                # the children are the children broad categories which will be populated further
                broad_dict_1['children'] = []
                # go to broad categories 2
                for ydx, broad_2 in enumerate(list(df_dict[df_dict.broad_categories == col].broad_2.unique())):
                    # sub categories dict
                    prov_dict = {}
                    prov_dict['name'] = broad_2
                    # children is again a list
                    prov_dict['children'] = []
                    # now isolate the skills and values of each broad_2 category and append them
                    for row in df_dict[df_dict.broad_2 == broad_2].values:
                        prov_d_3 = {}
                        # go to each row
                        for xdx, direct in enumerate(row):
                            # in each row, values 2 and 3 are name and value respectively add them
                            if xdx == 2:
                                prov_d_3['name'] = direct
                            if xdx == 3:
                                prov_d_3['size'] = direct
                        prov_dict['children'].append(prov_d_3)
                    broad_dict_1['children'].append(prov_dict)
            # if it already exists in the control dict then it moves on
            else:
                continue
    final['children'].append(broad_dict_1)
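For comparison, a much shorter route is two nested groupby passes. This is only a sketch built against the sample columns from the question (cat_1, child_cat, skill, value), not a drop-in replacement for the df_dict columns used above:
import pandas as pd

dat = pd.DataFrame({'cat_1': ['marketing', 'marketing', 'marketing', 'communications'],
                    'child_cat': ['marketing', 'social media', 'marketing', 'communications'],
                    'skill': ['digital marketing', 'media marketing', 'research', 'seo'],
                    'value': [80, 101, 35, 31]})

tree = {'name': 'general skills', 'children': []}
# one node per parent category, then one node per child category, then the skill leaves
for cat, cat_rows in dat.groupby('cat_1', sort=False):
    cat_node = {'name': cat, 'children': []}
    for child, child_rows in cat_rows.groupby('child_cat', sort=False):
        cat_node['children'].append({
            'name': child,
            'children': [{'name': s, 'value': int(v)}
                         for s, v in zip(child_rows['skill'], child_rows['value'])]})
    tree['children'].append(cat_node)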

Related

python - collect full path till leaf on organization tree

I have an organization tree stored as JSON:
{
    "name": "amos",
    "direct_reports": [
        {
            "name": "bart",
            "direct_reports": [
                {
                    "name": "colin",
                    "direct_reports": []
                },
                {
                    "name": "clara",
                    "direct_reports": []
                }
            ]
        },
        {
            "name": "bravo",
            "direct_reports": [
                {
                    "name": "cupid",
                    "direct_reports": []
                },
                {
                    "name": "clever",
                    "direct_reports": []
                }
            ]
        }
    ]
}
I need to store full "management path" for each employee, such as:
management_chain["clever"]={bravo,amos}
management_chain["bart"]={amos}
Currently I manage to reach all the edges and classify them as employees and managers with the following code:
def get_herarchy(org):
    tmp_obj = {}
    tmp_obj['managers'] = []
    for emp in org['direct_reports']:
        tmp_obj['managers'].append(org['name'])
        print("manager " + org['name'])
        if len(emp['direct_reports']) > 0:
            get_herarchy(emp)
        tmp_obj['name'] = emp['name']
        print(emp['name'])
    return tmp_obj
But the dictionary doesn't hold the right values.
Like this, maybe:
def get_chain(org, name):
    if org['name'] == name:
        return [name]
    for emp in org['direct_reports']:
        chain = get_chain(emp, name)
        if chain:
            return [org['name']] + chain
    return None

print(get_chain(org, 'bart'))    # ['amos', 'bart']
print(get_chain(org, 'clever'))  # ['amos', 'bravo', 'clever']
UPD: This is how to make a dictionary:
def nested_iter(org):
    yield org['name']
    for emp in org['direct_reports']:
        yield from nested_iter(emp)

print({name: get_chain(org, name)[0:-1] for name in nested_iter(org)})
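For the sample tree above, that dictionary comprehension should print something like the following (expected result shown only for reference):
# {'amos': [], 'bart': ['amos'], 'colin': ['amos', 'bart'], 'clara': ['amos', 'bart'],
#  'bravo': ['amos'], 'cupid': ['amos', 'bravo'], 'clever': ['amos', 'bravo']}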

How to write Python code to get all the data from a particular date?

I have a JSON file which has a list of IDs and dates. How do I write a Python program to print all the IDs for a particular month from the JSON file?
Below is the sample JSON data:
{
    "entities": [
        {
            "Fields": [
                {
                    "Name": "version",
                    "values": [
                        {"value": "Cycle 1"}
                    ]
                },
                {
                    "Name": "subject",
                    "values": [
                        {"value": "1008"}
                    ]
                },
                {
                    "Name": "project",
                    "values": [
                        {}
                    ]
                },
                {
                    "Name": "linkage",
                    "values": [
                        {"value": "N"}
                    ]
                },
                {
                    "Name": "cycle-id",
                    "values": []
                },
                {
                    "Name": "creation-time",
                    "values": [
                        {"value": "2016-07-12"}
                    ]
                },
                {
                    "Name": "id",
                    "values": [
                        {"value": "1"}
                    ]
                }
            ]
        }
    ]
}
So far I have just tried to load the JSON file with the code below.
import json

f = open('defects-export-0-100.json')
data = json.load(f)
print(data)

# month = str("MM")
month = '09'

defect_items = []
defectIDs = []

for item in data["entities"]:
    for container in item["Fields"]:
        if container["Name"] == "creation-time":
            if container["values"][0]["value"].split("-")[1] == month:
                defect_items.append(item)

for item in defect_items:
    for container in item["Fields"]:
        if container["Name"] == "id":
            defectIDs.append(container["values"][0]["value"])
My desired output: all the IDs whose creation date falls within one particular month.
The biggest issue is how you're referencing keys in a dictionary. You can get the value at a particular key with:
x = {"key": value}
x["key"]
# value
I've made some assumptions about your data set, but this code works with the sample you gave.
import json

with open("data.txt", "r") as f:
    data = json.load(f)

# month = str("MM")
month = "07"

defect_items = []
defectIDs = []

# Loop through each entity
for item in data["entities"]:
    # Loop through each field
    for container in item["Fields"]:
        # Find the field with the name "creation-time"
        if container["Name"] == "creation-time":
            # Check if the value matches the desired date
            # Assuming there can only be one value
            if container["values"][0]["value"].split("-")[1] == month:
                defect_items.append(item)

# Loop through the defective items
for item in defect_items:
    # Loop through the fields
    for container in item["Fields"]:
        # Find the field with the name "id"
        if container["Name"] == "id":
            # Grab the value
            # Assuming there can only be one value
            defectIDs.append(container["values"][0]["value"])
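As a quick sanity check against the sample JSON above (assuming the file contains only that one entity):
print(defectIDs)  # ['1'] - the single entity's creation-time 2016-07-12 matches month "07"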
Once the data is loaded, you can interact with it as you would any Python object. Get all the items with:
items = data['entities']
For the code below to work, create a variable month and set it to a string with the format MM (where M is a digit of the month: e.g. month='01' for January) so it exactly matches the correct month format of the data.
Then, run the following loop to collect the IDs:
ids = []
for item in items:
    id = None
    time = False
    for container in item['Fields']:
        if container['Name'] == 'creation-time':
            if container['values'][0]['value'].split('-')[1] == month:
                time = True
        if container['Name'] == 'id':
            id = container['values'][0]['value']
    if time and id:
        ids.append(id)

Set values for empty list within nested dictionaries

I have a CTE query that returns values that are linked (i.e. child -> parent).
Then in Python I am trying to create a nested dictionary that would represent something like this:
{
    "name": "Child_example",
    "parents": [
        {
            "name": "child_parent_1",
            "parents": [{"name": "child_parent_1_parent", "parents": [{"name": "end", "parents": []}]}]
        },
        {
            "name": "child_parent_2",
            "parents": [{"name": "end", "parents": []}]
        },
        {
            "name": "child_parent_3",
            "parents": [{"name": "child_parent_3_parent", "parents": [{"name": "end", "parents": []}]}]
        }
    ]
}
My input data looks something like this (there can be more rows):
child_col              parent_col             name            depth
Child_example          child_parent_1_col     child_parent_1  0
Child_example          child_parent_2_col     child_parent_2  0
Child_example          child_parent_3_col     child_parent_3  0
child_parent_1_col     child_parent_1_parent  1_parent        1
child_parent_2_col     end                    1_parent        1
child_parent_3_col     child_parent_3_parent  3_parent        1
child_parent_3_parent  end                    end_3           2
child_parent_1_parent  end                    end_1           2
However with my code so far:
from collections import defaultdict

r_dict = defaultdict(list)

depth_zero = [x for x in rows if x.depth == 0]
for row in depth_zero:
    r_dict['name'] = row.path_key
    r_dict['parents'].append({'name': row.path_parent_key, 'parents': []})

depth_not_zero = [x for x in rows if x.depth != 0]
# Set inner levels
for parent in r_dict['parents']:
    name = parent['name']
    inner_parent = parent['parents'].copy()
    for row in depth_not_zero:
        if row.path_key == name:
            inner_parent.append({'name': row.path_parent_key, 'parents': []})
            name = row.path_parent_key
    parent['parents'] = inner_parent
I only manage to append to the initial "parents" list, instead of setting the innermost nested "parents". I know it comes down to this line of code:
inner_parent.append({'name': row.path_parent_key, 'parents': []})
But I cannot work out how to get and set the right level. Would this be a case for recursion instead of the way I am doing it?
Below is an example of the first nested dictionary output that I am currently creating with my code:
{
    "name": "Child_example",
    "parents": [
        {
            "name": "child_parent_1",
            "parents": [
                {"name": "child_parent_1", "parents": []},
                {"name": "end", "parents": []}
            ]
        }
    ]
}
I'm a bit baffled by the way you are assigning the "name" value: "Child_example" comes from child_col, "child_parent_1" from name, "child_parent_3_parent" from parent_col. So I simplified it a bit: I put in the second column of the child row the same value as in the first column of its parents rows. That said, if you really need to take the names from different columns it's just a matter of adding some ifs.
My proposal is to loop over the rows in reverse order, creating the inner dicts and then moving them into the outer ones:
rows = [["c1", "p1c1", 0],
        ["c1", "p2c1", 0],
        ["c1", "p3c1", 0],
        ["p1c1", "p1p1c1", 1],
        ["p2c1", "end", 1],
        ["p3c1", "p1p3c1", 1],
        ["p1p3c1", "end", 2],
        ["p1p1c1", "end", 2]]

r_dict = {}
for row in reversed(rows):
    if row[1] == "end":
        r_dict[row[0]] = {"name": row[0], "parents": []}
    else:
        if not row[0] in r_dict:
            r_dict[row[0]] = {"name": row[0], "parents": []}
        r_dict[row[0]]["parents"].append(r_dict[row[1]])
        del r_dict[row[1]]
r_dict
{'c1': {'name': 'c1', 'parents': [{'name': 'p3c1', 'parents': [{'name': 'p1p3c1', 'parents': []}]}, {'name': 'p2c1', 'parents': []}, {'name': 'p1c1', 'parents': [{'name': 'p1p1c1', 'parents': []}]}]}}
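Since the question asks whether recursion would be a better fit: here is a minimal recursive sketch of the same idea, building a child -> parents lookup first and then descending from the root. It uses the simplified three-column rows above and, like the answer, drops the "end" marker:
from collections import defaultdict

def build(name, parents_of):
    # recursively attach every parent of `name`, stopping at the "end" marker
    return {"name": name,
            "parents": [build(p, parents_of) for p in parents_of.get(name, []) if p != "end"]}

parents_of = defaultdict(list)
for child, parent, _ in rows:
    parents_of[child].append(parent)

print(build("c1", parents_of))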

Creating a flare json to be used in D3 from pandas dataframe

I have a dataframe that I want to convert to a hierarchical flare json to be used in a D3 visualization like this: D3 sunburst
My dataframe contains hierarchical data such as this:
And the output I want should look like this:
{"name": "flare","children":
[
{"name": "Animal", "children":
[
{"name": "Mammal", "children":
[
{"name": "Fox","value":35000},
{"name": "Lion","value":25000}
]
},
{"name": "Fish", "children":
[
{"name": "Cod","value":35000}
]
}
]
},
{"name": "Plant", "children":
[
{"name": "Tree", "children":
[
{"name": "Oak","value":35000}
]
}
]
}
]
}
I have tried several approaches, but can't get it right. Here is my non-working code, inspired by this post: Pandas to D3. Serializing dataframes to JSON
from collections import defaultdict
import pandas as pd
import json

df = pd.DataFrame({'group1': ["Animal", "Animal", "Animal", "Plant"],
                   'group2': ["Mammal", "Mammal", "Fish", "Tree"],
                   'group3': ["Fox", "Lion", "Cod", "Oak"],
                   'value': [35000, 25000, 15000, 1500]})

tree = lambda: defaultdict(tree)
d = tree()

for _, (group0, group1, group2, group3, value) in df.iterrows():
    d['name'][group0]['children'] = group1
    d['name'][group1]['children'] = group2
    d['name'][group2]['children'] = group3
    d['name'][group3]['children'] = value

json.dumps(d)
I am working on a similar visualization project that requires moving data from a Pandas DataFrame to a JSON file that works with D3.
I came across your post while looking for a solution and ended up writing something based on this GitHub repository and with input from the link you provided in this post.
The code is not pretty and is a bit hacky and slow. But based on my project, it seems to work just fine for any amount of data as long as it has three levels and a value field. You should be able to simply fork the D3 Starburst notebook and replace the flare.json file with this code's output.
The modification that I made here, based on the original GitHub post, is to provide consideration for three levels of data. So, if the name of the level 0 node exists, then append from level 1 on. Likewise, if the name of the level 1 node exists, then append the level 2 node (the third level). Otherwise, append the full path of data. If you need more levels, some kind of recursion might do the trick, or you can just keep hacking it to add more.
# code snip to format Pandas DataFrame to json for D3 Starburst Chart

# libraries
import json
import pandas as pd

# example data with three levels and a single value field
data = {'group1': ['Animal', 'Animal', 'Animal', 'Plant'],
        'group2': ['Mammal', 'Mammal', 'Fish', 'Tree'],
        'group3': ['Fox', 'Lion', 'Cod', 'Oak'],
        'value': [35000, 25000, 15000, 1500]}

df = pd.DataFrame.from_dict(data)
print(df)

""" The sample dataframe
  group1  group2 group3  value
0 Animal  Mammal    Fox  35000
1 Animal  Mammal   Lion  25000
2 Animal    Fish    Cod  15000
3  Plant    Tree    Oak   1500
"""

# initialize a flare dictionary
flare = {"name": "flare", "children": []}

# iterate through dataframe values
for row in df.values:
    level0 = row[0]
    level1 = row[1]
    level2 = row[2]
    value = row[3]

    # create a dictionary with all the row data
    d = {'name': level0,
         'children': [{'name': level1,
                       'children': [{'name': level2,
                                     'value': value}]}]}

    # initialize key lists
    key0 = []
    key1 = []

    # iterate through first level node names
    for i in flare['children']:
        key0.append(i['name'])

        # iterate through next level node names
        key1 = []
        for _, v in i.items():
            if isinstance(v, list):
                for x in v:
                    key1.append(x['name'])

    # add the full row of data if the root is not in key0
    if level0 not in key0:
        d = {'name': level0,
             'children': [{'name': level1,
                           'children': [{'name': level2,
                                         'value': value}]}]}
        flare['children'].append(d)

    elif level1 not in key1:
        # if the root exists, then append only the next level children
        d = {'name': level1,
             'children': [{'name': level2,
                           'value': value}]}
        flare['children'][key0.index(level0)]['children'].append(d)

    else:
        # if the root exists, then only append the next level children
        d = {'name': level2,
             'value': value}
        flare['children'][key0.index(level0)]['children'][key1.index(level1)]['children'].append(d)

# uncomment next three lines to save as json file
# save to some file
# with open('filename_here.json', 'w') as outfile:
#     json.dump(flare, outfile)

print(json.dumps(flare, indent=2))
""" the expected output of this json data
{
"name": "flare",
"children": [
{
"name": "Animal",
"children": [
{
"name": "Mammal",
"children": [
{
"name": "Fox",
"value": 35000
},
{
"name": "Lion",
"value1": 25000
}
]
},
{
"name": "Fish",
"children": [
{
"name": "Cod",
"value": 15000
}
]
}
]
},
{
"name": "Plant",
"children": [
{
"name": "Tree",
"children": [
{
"name": "Oak",
"value": 1500
}
]
}
]
}
]
}
"""

Robust way to sum all values corresponding to a particular object's property?

I have an array like this:
items = [
    {
        "title": "title1",
        "category": "category1",
        "value": 200
    },
    {
        "title": "title2",
        "category": "category2",
        "value": 450
    },
    {
        "title": "title3",
        "category": "category1",
        "value": 100
    }
]
This array consists of many dictionaries, each with category and value properties.
What is a robust way of getting an array of category objects with their values summed, like this:
data = [
    {
        "category": "category1",
        "value": 300
    },
    {
        "category": "category2",
        "value": 450
    }
]
I'm looking for the best possible algorithm or approach, for small arrays as well as huge ones. If there is an existing algorithm, please point me to the source.
What I tried:
data = []
for each item in items:
    if data has a dictionary with dictionary.category == item.category:
        data's dictionary.value = data's dictionary.value + item.value
    else:
        data.push({"category": item.category, "value": item.value})
Note: Any programming language is welcome. Please comment before downvoting.
In JavaScript, you can use reduce to group the array into an object, using the category as the property. Then use Object.values to convert the object into an array.
var items = [{
    "title": "title1",
    "category": "category1",
    "value": 200
  },
  {
    "title": "title2",
    "category": "category2",
    "value": 450
  },
  {
    "title": "title3",
    "category": "category1",
    "value": 100
  }
];

var data = Object.values(items.reduce((c, v) => {
  c[v.category] = c[v.category] || {category: v.category, value: 0};
  c[v.category].value += v.value;
  return c;
}, {}));

console.log(data);
What you need is a SQL GROUP BY-like operation. Such group-by operations are usually handled with hashing; if all your data fits in memory (small to large data structures), you can implement it very easily.
If your data structure is huge, you will need intermediate storage (such as the hard drive or a database).
An easy Python approach would be:
data_tmp = {}
for item in items:
    if item['category'] not in data_tmp:
        data_tmp[item['category']] = 0
    data_tmp[item['category']] += item['value']

data = []
for k, v in data_tmp.items():
    data.append({
        'category': k,
        'value': v
    })
# done
If you want more Pythonic code, you can use a defaultdict:
from collections import defaultdict

data_tmp = defaultdict(int)
for item in items:
    data_tmp[item['category']] += item['value']

data = []
for k, v in data_tmp.items():
    data.append({
        'category': k,
        'value': v
    })
# done
In Python, Pandas is likely to be a more convenient and efficient way of doing this.
import pandas as pd

df = pd.DataFrame(items)
sums = df.groupby("category", as_index=False)["value"].sum()
data = sums.to_dict("records")
For the final step, it may be more convenient to leave sums as a dataframe and work with it like that instead of converting back to a list of dictionaries.
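For the sample items above, this groupby produces the requested shape (expected result shown only for reference):
print(data)
# [{'category': 'category1', 'value': 300}, {'category': 'category2', 'value': 450}]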
Using itertools.groupby
from itertools import groupby

d = []
lista = sorted(items, key=lambda x: x['category'])
for k, g in groupby(lista, key=lambda x: x['category']):
    temp = {}
    temp['category'] = k
    temp['value'] = sum([i['value'] for i in list(g)])
    d.append(temp)
print(d)
# [{'category': 'category1', 'value': 300}, {'category': 'category2', 'value': 450}]
