Convert Pandas Dataframe to nested dictionary - python

I am trying to convert a dataframe to a nested dictionary but no success so far.
Dataframe: clean_data['Model', 'Problem', 'Size']
Here's what my data looks like:
Model Problem Size
lenovo a6020 screen broken 1
lenovo a6020a40 battery 60
bluetooth 60
buttons 60
lenovo k4 wi-fi 3
bluetooth 3
My desired output:
{
"name": "Brand",
"children": [
{
"name": "Lenovo",
"children": [
{
"name": "lenovo a6020",
"children": {
"name": "screen broken",
"size": 1
}
},
{
"name": "lenovo a6020a40",
"children": [
{
"name": "battery",
"size": 60
},
{
"name": "bluetooth",
"size": 60
},
{
"name": "buttons",
"size": 60
}
]
},
{
"name": "lenovo k4",
"children": [
{
"name": "wi-fi",
"size": 3
},
{
"name": "bluetooth",
"size": 3
}
]
}
]
}
]
}
I have tried the pandas.DataFrame.to_dict method, but it returns a simple (flat) dictionary, whereas I want a nested one like the structure shown above.

Use:
print (df)
Model Problem size
0 lenovo a6020 screen broken 1
1 lenovo a6020a40 battery 60
2 NaN bluetooth 60
3 NaN buttons 60
4 lenovo k4 wi-fi 3
5 NaN bluetooth 3
# Replace missing values by forward filling, so every row carries its Model
df = df.ffill()
# Split the Model column on the first whitespace into two columns (a, b)
df[['a', 'b']] = df['Model'].str.split(n=1, expand=True)
# Convert each level to a list of dictionaries;
# rename columns along the way so every level gets the expected keys
L = (df.rename(columns={'Problem': 'name'})
       # Select the columns with a list: tuple-style ['name', 'size'] on a
       # groupby is no longer accepted by current pandas
       .groupby(['a', 'b'])[['name', 'size']]
       # 'records' spelled out: the short alias 'r' is no longer accepted
       .apply(lambda x: x.to_dict('records'))
       .rename('children')
       .reset_index()
       .rename(columns={'b': 'name'})
       .groupby('a')[['name', 'children']]
       .apply(lambda x: x.to_dict('records'))
       .rename('children')
       .reset_index()
       .rename(columns={'a': 'name'})
       .to_dict('records')
     )
# print(L)
# Create the outer level with a plain dict literal
d = {"name": "Brand", "children": L}
print(d)
{
'name': 'Brand',
'children': [{
'name': 'lenovo',
'children': [{
'name': 'a6020',
'children': [{
'name': 'screen broken',
'size': 1
}]
}, {
'name': 'a6020a40',
'children': [{
'name': 'battery',
'size': 60
}, {
'name': 'bluetooth',
'size': 60
}, {
'name': 'buttons',
'size': 60
}]
}, {
'name': 'k4',
'children': [{
'name': 'wi-fi',
'size': 3
}, {
'name': 'bluetooth',
'size': 3
}]
}]
}]
}

Related

Extract data from JSON index loaded file

My JSON file looks like:
{
"numAccounts": xxxx,
"filtersApplied": {
"accountIds": "All",
"checkIds": "All",
"categories": [
"cost_optimizing"
],
"statuses": "All",
"regions": "All",
"organizationalUnitIds": [
"yyyyy"
]
},
"categoryStatusMap": {
"cost_optimizing": {
"statusMap": {
"RULE_ERROR": {
"name": "Blue",
"count": 11
},
"ERROR": {
"name": "Red",
"count": 11
},
"OK": {
"name": "Green",
"count": 11
},
"WARN": {
"name": "Yellow",
"count": 11
}
},
"name": "Cost Optimizing",
"monthlySavings": 1111
}
},
"accountStatusMap": {
"xxxxxxxx": {
"cost_optimizing": {
"statusMap": {
"OK": {
"name": "Green",
"count": 1111
},
"WARN": {
"name": "Yellow",
"count": 111
}
},
"name": "Cost Optimizing",
"monthlySavings": 1111
}
},
Which I load into memory using pandas:
# orient='index' makes each top-level JSON key a row label rather than a column
df = pd.read_json('file.json', orient='index')
I find the index orient the most suitable because it gives me:
print(df)
0
numAccounts 125
filtersApplied {'accountIds': 'All', 'checkIds': 'All', 'cate...
categoryStatusMap {'cost_optimizing': {'statusMap': {'RULE_ERROR...
accountStatusMap {'xxxxxxx': {'cost_optimizing': {'statusM...
Now, how can I access the accountStatusMap entry?
I tried account_status_map = df['accountStatusMap'] which gives me a
KeyError: 'accountStatusMap'
Is there something specific to the index orientation in how to access specific entries in a dataframe?

How to combine two csv files to create one JSON object with specific header names with pandas

I currently have two csv tables. One is for Pharmaceuticals and the other is for Nutraceuticals:
here is Pharmaceuticals
Drug Percentile
0 SZ Antipsychotic 57
1 AMG 811 57
2 Colchicine 57
3 Ibuprofen 29
4 Sleep Deprivation 29
5 Rofecoxib 29
and here is Nutraceuticals
Drug Percentile
0 Canova 57
1 Omega-3 fatty acids 43
2 Luteolin 29
3 Ethanol Extract of Aurantiochytrium sp. (EEA) 14
4 Osthole 14
5 Arisaema amurense var. serratum 14
I basically need to combine these two csv files into one JSON object such that each table is an array of dictionaries paired with their respective label. The expected output is this (the names are different, only structure is important here):
{
"nutraceuticals": [
{
"name": "Omega-3 fatty acids",
"percentile": 38
},
{
"name": "Luteolin",
"percentile": 25
},
{
"name": "Vitamin D",
"percentile": 25
},
{
"name": "Probiotics supplements",
"percentile": 13
}
],
"pharmaceuticals": [
{
"name": "Dexamethasone",
"percentile": 25
},
{
"name": "Rofecoxib",
"percentile": 25
},
{
"name": "Ibuprofen",
"percentile": 25
},
{
"name": "Laquinimod",
"percentile": 25
},
{
"name": "Lumiliximab",
"percentile": 25
}
]
}
I currently have this code:
# Serialize the frame as a JSON array with one object per row
result = Drug_Scoring.to_json(orient="records")
# Round-trip through the json module to get a pretty-printed string
parsed = json.dumps(json.loads(result), indent=4)
which produces the correct format in dictionary code, here:
[
{
"Drug": " Canova",
"Percentile": 57
},
{
"Drug": " Omega-3 fatty acids",
"Percentile": 43
},
{
"Drug": " Luteolin",
"Percentile": 29
},
{
"Drug": " Ethanol Extract of Aurantiochytrium sp. (EEA)",
"Percentile": 14
}
]
and so on, but does not combine the two and does not have the proper labeling such as "pharmaceuticals": [ and "nutraceuticals": [ I basically need the two csv file names to be the array titles in the JSON object and then combine the two. Thank you.
first create a dictionary object from your two dataframes.
# Pair each output label with the DataFrame it describes
d = {
    'Pharmaceuticals': Pharmaceuticals,
    'Nutraceuticals': Nutraceuticals,
}
# Column names as they should appear as keys in the JSON records
column_names = {'Drug': 'name', 'Percentile': 'percentile'}
out = {}
for label, frame in d.items():
    # One list of row dictionaries per label
    out[label] = frame.rename(columns=column_names).to_dict(orient='records')
print(out)
{'Pharmaceuticals': [{'name': 'SZ Antipsychotic', 'percentile': 57},
{'name': 'AMG 811', 'percentile': 57},
{'name': 'Colchicine', 'percentile': 57},
{'name': 'Ibuprofen', 'percentile': 29},
{'name': 'Sleep Deprivation', 'percentile': 29},
{'name': 'Rofecoxib', 'percentile': 29}],
'Nutraceuticals': [{'name': 'Canova', 'percentile': 57},
{'name': 'Omega-3 fatty acids', 'percentile': 43},
{'name': 'Luteolin', 'percentile': 29},
{'name': 'Ethanol Extract of Aurantiochytrium sp. (EEA)', 'percentile': 14},
{'name': 'Osthole', 'percentile': 14},
{'name': 'Arisaema amurense var. serratum', 'percentile': 14}]}
then dump to json.
# Write the combined dictionary to disk as indented JSON
with open('yourfile.json', 'w') as fp:
    fp.write(json.dumps(out, indent=4))
yourfile.json
{
"Pharmaceuticals": [
{
"name": "SZ Antipsychotic",
"percentile": 57
},
{
"name": "AMG 811",
"percentile": 57
},
{
"name": "Colchicine",
"percentile": 57
},
{
"name": "Ibuprofen",
"percentile": 29
},
{
"name": "Sleep Deprivation",
"percentile": 29
},
{
"name": "Rofecoxib",
"percentile": 29
}
],
"Nutraceuticals": [
{
"name": "Canova",
"percentile": 57
},
{
"name": "Omega-3 fatty acids",
"percentile": 43
},
{
"name": "Luteolin",
"percentile": 29
},
{
"name": "Ethanol Extract of Aurantiochytrium sp. (EEA)",
"percentile": 14
},
{
"name": "Osthole",
"percentile": 14
},
{
"name": "Arisaema amurense var. serratum",
"percentile": 14
}
]
}

Merge lists of complex dicts with arbitrary keys

I have this code:
dotteds = ["apple.orange.banana", "a.b.c", "a.b.d"]
# Key names used in every generated dictionary
name = "name"
avtype = "type"
fields = "fields"
main_dictionary_list = []
for x in dotteds:
    split_name = x.split('.')
    # The last component is always a plain string field
    leaf = {name: split_name[-1], avtype: 'string'}
    # Every preceding component becomes a record wrapper with an empty field list
    dicts = [
        {name: y, avtype: {name: y, avtype: "record", fields: []}}
        for y in split_name[:-1]
    ]
    dicts.append(leaf)
    # Hang each entry inside the field list of the entry before it
    for parent, child in zip(dicts, dicts[1:]):
        parent[avtype][fields].append(child)
    # A single-component name has no wrappers, so dicts[0] is the leaf itself
    main_dictionary_list.append(dicts[0])
print(main_dictionary_list)
Which gives me an output like this:
[{
'name': 'apple',
'type': {
'name': 'apple',
'type': 'record',
'fields': [{
'name': 'orange',
'type': {
'name': 'orange',
'type': 'record',
'fields': [{
'name': 'banana',
'type': 'string'
}
]
}
}
]
}
}, {
'name': 'a',
'type': {
'name': 'a',
'type': 'record',
'fields': [{
'name': 'b',
'type': {
'name': 'b',
'type': 'record',
'fields': [{
'name': 'c',
'type': 'string'
}
]
}
}
]
}
}, {
'name': 'a',
'type': {
'name': 'a',
'type': 'record',
'fields': [{
'name': 'b',
'type': {
'name': 'b',
'type': 'record',
'fields': [{
'name': 'd',
'type': 'string'
}
]
}
}
]
}
}
]
Ideally I need:
[{
'name': 'apple',
'type': {
'name': 'apple',
'type': 'record',
'fields': [{
'name': 'orange',
'type': {
'name': 'orange',
'type': 'record',
'fields': [{
'name': 'banana',
'type': 'string'
}
]
}
}
]
}
}, {
'name': 'a',
'type': {
'name': 'a',
'type': 'record',
'fields': [{
'name': 'b',
'type': {
'name': 'b',
'type': 'record',
'fields': [{
'name': 'c',
'type': 'string'
},
{
'name': 'd',
'type': 'string'
}
]
}
}
]
}
}
]
I need to be able to do this with any number of combinations:
apple.orange.banana, a.b.c, a.b.d, a.b.q.e.a.s.d, etc.
I cannot figure out how to combine the similar 'name: key' combinations. It's intended to be avro format.
I have also tried making the dotted values into a dictionary which is a bit of trouble on its own.
You can use recursion with collections.defaultdict:
from collections import defaultdict
def group(vals, last=None):
    """Merge split dotted names into nested name/type/fields dictionaries.

    vals is a list of component lists (e.g. ``'a.b.c'.split('.')``) and
    last is the name of the component the current level hangs under
    (``None`` at the top level). Always returns a list of dictionaries.
    """
    # At least one path ends at this level: emit a single record named
    # `last` whose fields hold the leaves and any deeper sub-paths.
    if any(len(path) == 1 for path in vals):
        members = []
        for path in vals:
            if len(path) == 1:
                members.append({'name': path[0], 'type': 'string'})
            else:
                members.append(group([path], path[0])[0])
        record = {'name': last, 'type': 'record', 'fields': members}
        return [{'name': last, 'type': record}]

    # Otherwise bucket the remaining components by their first element
    tails_by_head = defaultdict(list)
    for path in vals:
        tails_by_head[path[0]].append(path[1:])

    nodes = []
    for head, tails in tails_by_head.items():
        nested = group(tails, last=head)
        if last is None:
            # Top level: the nested list is stored directly under 'type'
            nodes.append({'name': head, 'type': nested})
        else:
            nodes.append({'name': last, 'type': 'record', 'fields': nested})
    return nodes
import json
vals = ['apple.orange.banana', 'a.b.c', 'a.b.d']
# Split every dotted name into its components, group them, and pretty-print the result as JSON
print(json.dumps(group([i.split('.') for i in vals]), indent=4))
Output:
[
{
"name": "apple",
"type": [
{
"name": "apple",
"type": "record",
"fields": [
{
"name": "orange",
"type": {
"name": "orange",
"type": "record",
"fields": [
{
"name": "banana",
"type": "string"
}
]
}
}
]
}
]
},
{
"name": "a",
"type": [
{
"name": "a",
"type": "record",
"fields": [
{
"name": "b",
"type": {
"name": "b",
"type": "record",
"fields": [
{
"name": "c",
"type": "string"
},
{
"name": "d",
"type": "string"
}
]
}
}
]
}
]
}
]
# Second sample: names of different depths that share the first component
vals = ['asd.2', 'asd.3', 'asd.5.3.4']
print(json.dumps(group([i.split('.') for i in vals]), indent=4))
Output:
[
{
"name": "asd",
"type": [
{
"name": "asd",
"type": {
"name": "asd",
"type": "record",
"fields": [
{
"name": "2",
"type": "string"
},
{
"name": "3",
"type": "string"
},
{
"name": "5",
"type": "record",
"fields": [
{
"name": "5",
"type": "record",
"fields": [
{
"name": "3",
"type": {
"name": "3",
"type": "record",
"fields": [
{
"name": "4",
"type": "string"
}
]
}
}
]
}
]
}
]
}
}
]
}
]

Adding new pairs to a json file

I have a json file I need to add pairs to, I convert it into a dict, but now I need to put my new values in a specific place.
This is some of the json file I convert:
"rootObject": {
"id": "6ff0010c-00fe-485b-b695-4ddd6aca4dcd",
"type": "IDO_GEAR",
"children": [
{
"id": "1dd94d1a-e52d-40b3-a82b-6db02a8fbbab",
"type": "IDO_SYSTEM_LOADCASE",
"children": [],
"childList": "SYSTEMLOADCASE",
"properties": [
{
"name": "IDCO_IDENTIFICATION",
"value": "1dd94d1a-e52d-40b3-a82b-6db02a8fbbab"
},
{
"name": "IDCO_DESIGNATION",
"value": "Lastfall 1"
},
{
"name": "IDSLC_TIME_PORTION",
"value": 100
},
{
"name": "IDSLC_DISTANCE_PORTION",
"value": 100
},
{
"name": "IDSLC_OPERATING_TIME_IN_HOURS",
"value": 1
},
{
"name": "IDSLC_OPERATING_TIME_IN_SECONDS",
"value": 3600
},
{
"name": "IDSLC_OPERATING_REVOLUTIONS",
"value": 1
},
{
"name": "IDSLC_OPERATING_DISTANCE",
"value": 1
},
{
"name": "IDSLC_ACCELERATION",
"value": 9.81
},
{
"name": "IDSLC_EPSILON_X",
"value": 0
},
{
"name": "IDSLC_EPSILON_Y",
"value": 0
},
{
"name": "IDSLC_EPSILON_Z",
"value": 0
},
{
"name": "IDSLC_CALCULATION_WITH_OWN_WEIGHT",
"value": "CO_CALCULATION_WITHOUT_OWN_WEIGHT"
},
{
"name": "IDSLC_CALCULATION_WITH_TEMPERATURE",
"value": "CO_CALCULATION_WITH_TEMPERATURE"
},
{
"name": "IDSLC_FLAG_FOR_LOADCASE_CALCULATION",
"value": "LB_CALCULATE_LOADCASE"
},
{
"name": "IDSLC_STATUS_OF_LOADCASE_CALCULATION",
"value": false
}
I want to add something like ENTRY_ONE and ENTRY_TWO, like this:
"rootObject": {
"id": "6ff0010c-00fe-485b-b695-4ddd6aca4dcd",
"type": "IDO_GEAR",
"children": [
{
"id": "1dd94d1a-e52d-40b3-a82b-6db02a8fbbab",
"type": "IDO_SYSTEM_LOADCASE",
"children": [],
"childList": "SYSTEMLOADCASE",
"properties": [
{
"name": "IDCO_IDENTIFICATION",
"value": "1dd94d1a-e52d-40b3-a82b-6db02a8fbbab"
},
{
"name": "IDCO_DESIGNATION",
"value": "Lastfall 1"
},
{
"name": "IDSLC_TIME_PORTION",
"value": 100
},
{
"name": "IDSLC_DISTANCE_PORTION",
"value": 100
},
{
"name": "ENTRY_ONE",
"value": 100
},
{
"name": "ENTRY_TWO",
"value": 55
},
{
"name": "IDSLC_OPERATING_TIME_IN_HOURS",
"value": 1
},
{
"name": "IDSLC_OPERATING_TIME_IN_SECONDS",
"value": 3600
},
{
"name": "IDSLC_OPERATING_REVOLUTIONS",
"value": 1
},
{
"name": "IDSLC_OPERATING_DISTANCE",
"value": 1
},
{
"name": "IDSLC_ACCELERATION",
"value": 9.81
},
{
"name": "IDSLC_EPSILON_X",
"value": 0
},
{
"name": "IDSLC_EPSILON_Y",
"value": 0
},
{
"name": "IDSLC_EPSILON_Z",
"value": 0
},
{
"name": "IDSLC_CALCULATION_WITH_OWN_WEIGHT",
"value": "CO_CALCULATION_WITHOUT_OWN_WEIGHT"
},
{
"name": "IDSLC_CALCULATION_WITH_TEMPERATURE",
"value": "CO_CALCULATION_WITH_TEMPERATURE"
},
{
"name": "IDSLC_FLAG_FOR_LOADCASE_CALCULATION",
"value": "LB_CALCULATE_LOADCASE"
},
{
"name": "IDSLC_STATUS_OF_LOADCASE_CALCULATION",
"value": false
}
How can I add the entries so that they are under the IDCO_IDENTIFICATION tag, and not under the rootObject?
The way I see your json file, it WOULD be under rootObject as EVERYTHING is under that key. There's quite a few closing brackets and braces missing.
So I can only assume you are meaning you want it directly under IDCO_IDENTIFICATION (which is nested under rootObject). But that doesn't match what you have as your example output either. You have the new ENTRY_ONE and ENTRY_TWO within the properties, within the children, within the rootObject, not "under" IDCO_IDENTIFICATION. So I'm going to follow what you are asking for from your example output.
import json
# Load the existing document into a dictionary
with open('C:/test.json') as f:
    data = json.load(f)

new_elements = [
    {"name": "ENTRY_ONE", "value": 100},
    {"name": "ENTRY_TWO", "value": 55},
]

# Add the new entries at the end of the first child's property list
properties = data['rootObject']['children'][0]['properties']
properties.extend(new_elements)

# Overwrite the file with the updated document
with open('C:/test.json', 'w') as f:
    json.dump(data, f)
Output:
import pprint
# Pretty-print the updated structure; the new entries appear at the end of 'properties'
pprint.pprint(data)
{'rootObject': {'children': [{'childList': 'SYSTEMLOADCASE',
'children': [],
'id': '1dd94d1a-e52d-40b3-a82b-6db02a8fbbab',
'properties': [{'name': 'IDCO_IDENTIFICATION',
'value': '1dd94d1a-e52d-40b3-a82b-6db02a8fbbab'},
{'name': 'IDCO_DESIGNATION',
'value': 'Lastfall 1'},
{'name': 'IDSLC_TIME_PORTION',
'value': 100},
{'name': 'IDSLC_DISTANCE_PORTION',
'value': 100},
{'name': 'IDSLC_OPERATING_TIME_IN_HOURS',
'value': 1},
{'name': 'IDSLC_OPERATING_TIME_IN_SECONDS',
'value': 3600},
{'name': 'IDSLC_OPERATING_REVOLUTIONS',
'value': 1},
{'name': 'IDSLC_OPERATING_DISTANCE',
'value': 1},
{'name': 'IDSLC_ACCELERATION',
'value': 9.81},
{'name': 'IDSLC_EPSILON_X',
'value': 0},
{'name': 'IDSLC_EPSILON_Y',
'value': 0},
{'name': 'IDSLC_EPSILON_Z',
'value': 0},
{'name': 'IDSLC_CALCULATION_WITH_OWN_WEIGHT',
'value': 'CO_CALCULATION_WITHOUT_OWN_WEIGHT'},
{'name': 'IDSLC_CALCULATION_WITH_TEMPERATURE',
'value': 'CO_CALCULATION_WITH_TEMPERATURE'},
{'name': 'IDSLC_FLAG_FOR_LOADCASE_CALCULATION',
'value': 'LB_CALCULATE_LOADCASE'},
{'name': 'IDSLC_STATUS_OF_LOADCASE_CALCULATION',
'value': False},
{'name': 'ENTRY_ONE',
'value': 100},
{'name': 'ENTRY_TWO',
'value': 55}],
'type': 'IDO_SYSTEM_LOADCASE'}],
'id': '6ff0010c-00fe-485b-b695-4ddd6aca4dcd',
'type': 'IDO_GEAR'}}

Python - Adding fields and labels to nested json file

I have a dataframe as follows:
Name_ID | URL | Count | Rating
------------------------------------------------
ABC | www.example.com/ABC | 10 | 5
123 | www.example.com/123 | 9 | 4
XYZ | www.example.com/XYZ | 5 | 2
ABC111 | www.example.com/ABC111 | 5 | 2
ABC121 | www.example.com/ABC121 | 5 | 2
222 | www.example.com/222 | 5 | 3
abc222 | www.example.com/abc222 | 4 | 2
ABCaaa | www.example.com/ABCaaa | 4 | 2
I am trying to create a JSON as follows:
{
"name": "sampledata",
"children": [
{
"name": 9,
"children": [
{
"name": 4,
"children": [
{
"name": "123",
"size": 100
}
]
}
]
},
{
"name": 10,
"children": [
{
"name": 5,
"children": [
{
"name": "ABC",
"size": 100
}
]
}
]
},
{
"name": 4,
"children": [
{
"name": 2,
"children": [
{
"name": "abc222",
"size": 50
},
{
"name": "ABCaaa",
"size": 50
}
]
}
]
},
{
"name": 5,
"children": [
{
"name": 2,
"children": [
{
"name": "ABC",
"size": 16
},
{
"name": "ABC111",
"size": 16
},
{
"name": "ABC121",
"size": 16
}
]
},
{
"name": 3,
"children": [
{
"name": "222",
"size": 50
}
]
}
]
}
]
}
In order to do that:
I am trying to add labels such as "name" and "children" to the json while creating it.
I tried something like
# Wrap every key/value pair of the dict in a {"name", "children"} node; rebinds `results` to a list
results = [{"name": i, "children": j} for i,j in results.items()]
But it won't label it properly I believe.
Also, add another field with the label "size", which I am planning to calculate based on the formula:
(Rating*Count*10000)/number_of_children_to_the_immediate_parent
Here is my dirty code:
import pandas as pd
from collections import defaultdict
import json

data = [('ABC', 'www.example.com/ABC', 10, 5), ('123', 'www.example.com/123', 9, 4), ('XYZ', 'www.example.com/XYZ', 5, 2), ('ABC111', 'www.example.com/ABC111', 5, 2), ('ABC121', 'www.example.com/ABC121', 5, 2), ('222', 'www.example.com/222', 5, 3), ('abc222', 'www.example.com/abc222', 4, 2), ('ABCaaa', 'www.example.com/ABCaaa', 4, 2)]
df = pd.DataFrame(data, columns=['Name', 'URL', 'Count', 'Rating'])
# Group by the bare column name so each key is a scalar rather than a 1-tuple
gp = df.groupby('Count')
dict_json = {"name": "flare"}
children = []
for name, group in gp:
    temp = {}
    # int() turns the group key into a plain Python int that json can serialize
    temp["name"] = int(name)
    temp["children"] = []
    rgp = group.groupby('Rating')
    for n, g in rgp:
        temp2 = {}
        temp2["name"] = int(n)
        # list(): on Python 3 .values() is a view, which json cannot serialize
        temp2["children"] = list(g.reset_index().T.to_dict().values())
        for t in temp2["children"]:
            # Floor division keeps the size an integer on Python 2 and 3 alike
            t["size"] = int(t["Rating"] * t["Count"] * 10000) // len(temp2["children"])
            t["name"] = t["Name"]
            # Drop the raw columns so only "name" and "size" remain
            del t["Count"]
            del t["Rating"]
            del t["URL"]
            del t["Name"]
            del t["index"]
        temp["children"].append(temp2)
    children.append(temp)
dict_json["children"] = children
print(json.dumps(dict_json, indent=4))
Though the above code does print what I need, I am looking for more efficient and cleaner way to do the same, mainly because the actual dataset might be even more nested and complicated. Any help/suggestion will be much appreciated.
Quite an interesting problem and a great question!
You can improve your approach by reorganizing the code inside the loops and using list comprehensions. No need to delete things and introduce temp variables inside loops:
dict_json = {"name": "flare"}
children = []
for name, group in gp:
    temp = {"name": name, "children": []}
    # Group by the bare column name so `n` is a scalar rather than a 1-tuple
    rgp = group.groupby('Rating')
    for n, g in rgp:
        temp["children"].append({
            "name": n,
            "children": [
                {"name": row["Name"],
                 # Floor division keeps the size an integer on Python 3 as well
                 "size": row["Rating"] * row["Count"] * 10000 // len(g)}
                for _, row in g.iterrows()
            ]
        })
    children.append(temp)
dict_json["children"] = children
Or, a "wrapped" version:
dict_json = {
    "name": "flare",
    "children": [
        {
            "name": name,
            "children": [
                {
                    "name": n,
                    "children": [
                        {
                            "name": row["Name"],
                            # Floor division keeps the size an integer on Python 3 as well
                            "size": row["Rating"] * row["Count"] * 10000 // len(g)
                        } for _, row in g.iterrows()
                    ]
                # Bare column name, so `n` is a scalar rather than a 1-tuple
                } for n, g in group.groupby('Rating')
            ]
        } for name, group in gp
    ]
}
I'm getting the following dictionary printed for your sample input dataframe:
{
"name": "flare",
"children": [
{
"name": 4,
"children": [
{
"name": 2,
"children": [
{
"name": "abc222",
"size": 40000
},
{
"name": "ABCaaa",
"size": 40000
}
]
}
]
},
{
"name": 5,
"children": [
{
"name": 2,
"children": [
{
"name": "XYZ",
"size": 33333
},
{
"name": "ABC111",
"size": 33333
},
{
"name": "ABC121",
"size": 33333
}
]
},
{
"name": 3,
"children": [
{
"name": "222",
"size": 150000
}
]
}
]
},
{
"name": 9,
"children": [
{
"name": 4,
"children": [
{
"name": "123",
"size": 360000
}
]
}
]
},
{
"name": 10,
"children": [
{
"name": 5,
"children": [
{
"name": "ABC",
"size": 500000
}
]
}
]
}
]
}
If I understand correctly, what you want to do is put a groupby into a nested JSON. If that is the case, you could use pandas groupby and cast the result into a nested list of lists, like so:
# Collect the Name_ID values of every (Count, Rating) pair into a list,
# then flatten the result into rows of [Count, Rating, [Name_ID, ...]]
names_per_group = df.groupby(['Count', 'Rating'])['Name_ID'].apply(list)
lol = names_per_group.reset_index().values.tolist()
lol should look something like this:
[['10', '5', ['ABC']],
['4', '2', ['abc222', 'ABCaaa']],
['5', '2', ['XYZ ', 'ABC111', 'ABC121']],
['5', '3', ['222 ']],
['9', '4', ['123 ']]]
After that you can loop over lol to put it into a dict, but since you want to set nested items you'll have to use autovivification (check it out):
class autovividict(dict):
    """Dictionary that creates a nested instance of itself for any missing key."""

    def __missing__(self, key):
        # Called by dict on a failed lookup: store and return a fresh child
        # of the same class so chained assignments like d[a][b] = x work.
        child = type(self)()
        self[key] = child
        return child
d = autovividict()
for row in lol:
    # row[0] is the outer key, row[1] the inner key, row[2] the stored value
    outer_key, inner_key, names = row[0], row[1], row[2]
    d[outer_key][inner_key] = names
now you can use the json pack for printing and exporting:
# print() call form: valid on Python 3, and prints the same on Python 2
print(json.dumps(d, indent=2))
In case you need more than one groupby, you could concat your groups with pandas, cast to lol, remove any nans, and then loop, let me know if a full example can help.
setup
from io import StringIO
import pandas as pd
# Sample data as inline CSV text: a header row followed by one row per Name_ID
txt = """Name_ID,URL,Count,Rating
ABC,www.example.com/ABC,10,5
123,www.example.com/123,9,4
XYZ,www.example.com/XYZ,5,2
ABC111,www.example.com/ABC111,5,2
ABC121,www.example.com/ABC121,5,2
222,www.example.com/222,5,3
abc222,www.example.com/abc222,4,2
ABCaaa,www.example.com/ABCaaa,4,2"""
# Wrap the text in a file-like object so read_csv can parse it
df = pd.read_csv(StringIO(txt))
size
pre-calculate it
# Number of rows sharing each (Count, Rating) pair, aligned to the original rows
group_sizes = df.groupby(['Count', 'Rating'])['Name_ID'].transform('count')
# Count * Rating * 10000 split evenly across the group, truncated to an integer
df['size'] = (df['Count'] * df['Rating'] * 10000).div(group_sizes).astype(int)
solution
create recursive function
def h(d):
    """Recursively turn an indexed frame into nested name/children nodes.

    While the index has more than one level, or more than one distinct
    value, returns a list with one ``{'name', 'children'}`` dictionary per
    value of the outermost index level. Otherwise returns a single
    ``{'name', 'size'}`` dictionary built from the first row.
    """
    # A single row selected with .xs() arrives as a Series; make it a 1-row frame
    if isinstance(d, pd.Series):
        d = d.to_frame().T

    idx = d.index
    if idx.nlevels > 1 or idx.nunique() > 1:
        # Recurse into each group with the outermost level's key selected
        return [dict(name=str(n), children=h(g.xs(n))) for n, g in d.groupby(level=0)]

    return {'name': str(idx[0]), 'size': str(d['size'].iloc[0])}
demo
import json
# Index by the three grouping columns, outermost first, and nest the result under a root node
my_dict = dict(name='flare', children=h(df.set_index(['Count', 'Rating', 'Name_ID'])))
# Serialize the nested structure to a JSON string
json.dumps(my_dict)
'{"name": "flare", "children": [{"name": "4", "children": [{"name": "2", "children": [{"name": "ABCaaa", "children": {"name": "ABCaaa", "size": "40000"}}, {"name": "abc222", "children": {"name": "abc222", "size": "40000"}}]}]}, {"name": "5", "children": [{"name": "2", "children": [{"name": "ABC111", "children": {"name": "ABC111", "size": "33333"}}, {"name": "ABC121", "children": {"name": "ABC121", "size": "33333"}}, {"name": "XYZ", "children": {"name": "XYZ", "size": "33333"}}]}, {"name": "3", "children": {"name": "222", "size": "150000"}}]}, {"name": "9", "children": [{"name": "4", "children": {"name": "123", "size": "360000"}}]}, {"name": "10", "children": [{"name": "5", "children": {"name": "ABC", "size": "500000"}}]}]}'
my_dict
{'children': [{'children': [{'children': [{'children': {'name': 'ABCaaa',
'size': '40000'},
'name': 'ABCaaa'},
{'children': {'name': 'abc222', 'size': '40000'}, 'name': 'abc222'}],
'name': '2'}],
'name': '4'},
{'children': [{'children': [{'children': {'name': 'ABC111', 'size': '33333'},
'name': 'ABC111'},
{'children': {'name': 'ABC121', 'size': '33333'}, 'name': 'ABC121'},
{'children': {'name': 'XYZ', 'size': '33333'}, 'name': 'XYZ'}],
'name': '2'},
{'children': {'name': '222', 'size': '150000'}, 'name': '3'}],
'name': '5'},
{'children': [{'children': {'name': '123', 'size': '360000'}, 'name': '4'}],
'name': '9'},
{'children': [{'children': {'name': 'ABC', 'size': '500000'}, 'name': '5'}],
'name': '10'}],
'name': 'flare'}

Categories