Complex Pandas Dataframe to Nested Dictionary/JSON

Complex Pandas Dataframe to Nested Dictionary/JSON - python

I have 3 Dataframes, I have merged them into a single one, and want to represent the dataframe into a Nested dictionary / json format.
df1: This contains general information about a patient.
>>> df1 = pd.DataFrame({'PatientId' : [1,2], 'Gender' : ['M', 'F'], 'Marital_status':['married', 'unmarried']})
>>> df1
PatientId Gender Marital_status
1 M married
2 F unmarried
df2:
this contains details of each admission of a patient, and the diagnosis.
>>> df2 = pd.DataFrame({'PatientId': [1,1,2,2], 'AdmissionId' : [1,2,1,2], 'Diagnosis_Code': ['DXS', 'SDE', 'DEF', 'ATR'], 'Stay_Duration' : [45,14,79,32]})
>>> df2
PatientId AdmissionId Diagnosis_Code Stay_Duration
1 1 DXS 45
1 2 SDE 14
2 1 DEF 79
2 2 ATR 32
df3:
this dataframe contains all the lab test reports of a patient carried out in each admission.
>>> df3 = pd.DataFrame(
{
'PatientId':[1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2],
'AdmissionId' : [1,1,1,1,2,2,2,2,1,1,1,1,2,2,2,2],
'LabTest' : [1,1,2,2,1,1,2,2,1,1,2,2,1,1,2,2],
'LabName' : ['ABC', 'XYZ', 'ABC', 'XYZ', 'PQR', 'XYZ', 'ABC', 'XYZ', 'ABC', 'XYZ', 'ABC', 'XYZ', 'PQR', 'XYZ', 'ABC', 'PQR'],
'LabValue' : [5.7, 1.9, 5.6, 2.4, 5.7, 1.9, 5.6, 2.4, 5.7, 1.9, 5.6, 2.4, 5.7, 1.9, 5.6, 2.4],
'IsNormal' : [True, False, True, True, True, False, True, True, True, False, True, True, True, False, True, True]
}
)
>>> df3
PatientId AdmissionId LabTest LabName LabValue IsNormal
1 1 1 ABC 5.7 True
1 1 1 XYZ 1.9 False
1 1 2 ABC 5.6 True
1 1 2 XYZ 2.4 True
1 2 1 PQR 5.7 True
1 2 1 XYZ 1.9 False
1 2 2 ABC 5.6 True
1 2 2 XYZ 2.4 True
2 1 1 ABC 5.7 True
2 1 1 XYZ 1.9 False
2 1 2 ABC 5.6 True
2 1 2 XYZ 2.4 True
2 2 1 PQR 5.7 True
2 2 1 XYZ 1.9 False
2 2 2 ABC 5.6 True
2 2 2 PQR 2.4 True
I want my output to look like this --
"PatientId" : 1
"Gender":M
"Marital_Status" : married
"AdmissionsInfo":
{
"AdmissionID": 1
"Diagnosis": DXS
"Stay_Duration" : 45
"lab reports" :
{
"labtest":1
"labinfo":
{
"labName":ABC
"labValue":5.6
"isNormal":True
},
{
"labName": XYZ
"labValue": 2.4
"isNormal": True
}
"labtest":2
"labinfo":
{
"labName":ABC
"labValue":5.7
"isNormal":True
},
{
"labName": XYZ
"labValue": 1.9
"isNormal":False
}
}
"AdmissionID": 2
"Diagnosis": SDE
"Stay_Duration" : 45
/
/
//
} end of patient 1's all admissions' info
"PatientId" : 2
"Gender": F
"Marital_Status" : unmarried
"AdmissionsInfo":
//
//
and so on }}}

Find below a full pandas (although redundant) solution.
First merge your three dataframes into one called df_merged:
df_merged = df3.merge(df1, on="PatientId").merge(df2, on=["PatientId", "AdmissionId"])
Now create the hierarchy you need (this part is ugly, but works, happy to receive feedback on it):
# Build the nesting from the innermost level outwards: every groupby/apply
# step folds one level into a list of dicts, and the following step groups
# one level higher. The JSON string is bound to `j` because the
# json.loads(j) call further down reads it under that name.
j = (
    df_merged.groupby(["PatientId", "Gender", "Marital_status", "AdmissionId", "Diagnosis_Code", "Stay_Duration", "LabTest"])
    # "records" is the full orient name; the "r" shorthand is rejected by pandas >= 2.0.
    .apply(lambda x: x[["LabName", "LabValue", "IsNormal"]].to_dict("records"))
    .reset_index()
    # The unnamed apply() result shows up as column 0 after reset_index().
    .rename(columns={0: "LabInfo"})
    .groupby(["PatientId", "Gender", "Marital_status", "AdmissionId", "Diagnosis_Code", "Stay_Duration"])
    .apply(lambda x: x[["LabTest", "LabInfo"]].to_dict("records"))
    .reset_index()
    .rename(columns={0: "LabReports"})
    .groupby(["PatientId", "Gender", "Marital_status"])
    .apply(lambda x: x[["AdmissionId", "Diagnosis_Code", "Stay_Duration", "LabReports"]].to_dict("records"))
    .reset_index()
    .rename(columns={0: "AdmissionsInfo"})
    .to_json(orient="records")
)
And dump that into a json object:
>>> import json
>>> print(json.dumps(json.loads(j), indent=2, sort_keys=False))
And the result:
[
{
"PatientId": 1,
"Gender": "M",
"Marital_status": "married",
"AdmissionsInfo": [
{
"AdmissionId": "1",
"Diagnosis_Code": "DXS",
"Stay_Duration": "45",
"LabReports": [
{
"LabTest": "1",
"LabInfo": [
{
"LabName": "ABC",
"LabValue": "5.7",
"IsNormal": "True"
},
{
"LabName": "XYZ",
"LabValue": "1.9",
"IsNormal": "False"
}
]
},
{
"LabTest": "2",
"LabInfo": [
{
"LabName": "ABC",
"LabValue": "5.6",
"IsNormal": "True"
},
{
"LabName": "XYZ",
"LabValue": "2.4",
"IsNormal": "True"
}
]
}
]
},
{
"AdmissionId": "2",
"Diagnosis_Code": "SDE",
"Stay_Duration": "14",
"LabReports": [
{
"LabTest": "1",
"LabInfo": [
{
"LabName": "PQR",
"LabValue": "5.7",
"IsNormal": "True"
},
{
"LabName": "XYZ",
"LabValue": "1.9",
"IsNormal": "False"
}
]
},
{
"LabTest": "2",
"LabInfo": [
{
"LabName": "ABC",
"LabValue": "5.6",
"IsNormal": "True"
},
{
"LabName": "XYZ",
"LabValue": "2.4",
"IsNormal": "True"
}
]
}
]
}
]
},
{
"PatientId": 2,
"Gender": "F",
"Marital_status": "unmarried",
"AdmissionsInfo": [
{
"AdmissionId": "1",
"Diagnosis_Code": "DEF",
"Stay_Duration": "79",
"LabReports": [
{
"LabTest": "1",
"LabInfo": [
{
"LabName": "ABC",
"LabValue": "5.7",
"IsNormal": "True"
},
{
"LabName": "XYZ",
"LabValue": "1.9",
"IsNormal": "False"
}
]
},
{
"LabTest": "2",
"LabInfo": [
{
"LabName": "ABC",
"LabValue": "5.6",
"IsNormal": "True"
},
{
"LabName": "XYZ",
"LabValue": "2.4",
"IsNormal": "True"
}
]
}
]
},
{
"AdmissionId": "2",
"Diagnosis_Code": "ATR",
"Stay_Duration": "32",
"LabReports": [
{
"LabTest": "1",
"LabInfo": [
{
"LabName": "PQR",
"LabValue": "5.7",
"IsNormal": "True"
},
{
"LabName": "XYZ",
"LabValue": "1.9",
"IsNormal": "False"
}
]
},
{
"LabTest": "2",
"LabInfo": [
{
"LabName": "ABC",
"LabValue": "5.6",
"IsNormal": "True"
},
{
"LabName": "PQR",
"LabValue": "2.4",
"IsNormal": "True"
}
]
}
]
}
]
}
]

In [84]: json_list = []
In [85]: for index1, row1 in df1.iterrows():
...: d = dict(row1)
...: json_list.append(d)
...: for index2, row2 in df2[df2['PatientId'] == row1['PatientId']].iterrows():
...: d2 = dict(row2)
...: del d2['PatientId']
...: d['AdmissionsInfo'] = d2
...: lab_reports_list = []
...: for index3, row3 in df3[(df3['PatientId'] == row2['PatientId']) & (df3['AdmissionId'] == row2['AdmissionId'])].iterrows():
...: d3 = dict(row3)
...: d4 = {}
...: d4['labtest'] = row3['LabTest']
...: d4['labinfo'] = {'labName': row3['LabName'], 'labValue': row3['LabValue'], 'isNormal': row3['IsNormal']}
...: lab_reports_list.append(d4)
...: d2['lab reports'] = lab_reports_list
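Output (note that `d['AdmissionsInfo']` is reassigned for every admission, so only the last admission of each patient is kept, and the lab rows are listed one by one rather than grouped by `labtest`):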
In [87]: json_list
Out[87]:
[{'PatientId': 1,
'Gender': 'M',
'Marital_status': 'married',
'AdmissionsInfo': {'AdmissionId': 2,
'Diagnosis_Code': 'SDE',
'Stay_Duration': 14,
'lab reports': [{'labtest': 1,
'labinfo': {'labName': 'PQR', 'labValue': 5.7, 'isNormal': True}},
{'labtest': 1,
'labinfo': {'labName': 'XYZ', 'labValue': 1.9, 'isNormal': False}},
{'labtest': 2,
'labinfo': {'labName': 'ABC', 'labValue': 5.6, 'isNormal': True}},
{'labtest': 2,
'labinfo': {'labName': 'XYZ', 'labValue': 2.4, 'isNormal': True}}]}},
{'PatientId': 2,
'Gender': 'F',
'Marital_status': 'unmarried',
'AdmissionsInfo': {'AdmissionId': 2,
'Diagnosis_Code': 'ATR',
'Stay_Duration': 32,
'lab reports': [{'labtest': 1,
'labinfo': {'labName': 'PQR', 'labValue': 5.7, 'isNormal': True}},
{'labtest': 1,
'labinfo': {'labName': 'XYZ', 'labValue': 1.9, 'isNormal': False}},
{'labtest': 2,
'labinfo': {'labName': 'ABC', 'labValue': 5.6, 'isNormal': True}},
{'labtest': 2,
'labinfo': {'labName': 'PQR', 'labValue': 2.4, 'isNormal': True}}]}}]

Arnaud's response does the job but seems not very "Pythonic".
There might be something possible with aggregate functions on DataFrameGroupBy. I tried `.agg(dict)`, but it did not work.
for those wanting to help:
patient_df = pd.DataFrame({'PatientId' : [1,2], 'Gender' : ['M', 'F'], 'Marital_status':['married', 'unmarried']})
admission_df = pd.DataFrame({'PatientId': [1,1,2,2], 'AdmissionId' : [1,2,1,2], 'Diagnosis_Code': ['DXS', 'SDE', 'DEF', 'ATR'], 'Stay_Duration' : [45,14,79,32]})
lab_df = pd.DataFrame(
{
'PatientId':[1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2],
'AdmissionId' : [1,1,1,1,2,2,2,2,1,1,1,1,2,2,2,2],
'LabTest' : [1,1,2,2,1,1,2,2,1,1,2,2,1,1,2,2],
'LabName' : ['ABC', 'XYZ', 'ABC', 'XYZ', 'PQR', 'XYZ', 'ABC', 'XYZ', 'ABC', 'XYZ', 'ABC', 'XYZ', 'PQR', 'XYZ', 'ABC', 'PQR'],
'LabValue' : [5.7, 1.9, 5.6, 2.4, 5.7, 1.9, 5.6, 2.4, 5.7, 1.9, 5.6, 2.4, 5.7, 1.9, 5.6, 2.4],
'IsNormal' : [True, False, True, True, True, False, True, True, True, False, True, True, True, False, True, True]
}
)

Related

Parse JSON from Api to pandas dataframe with doubled names

I'm parsing a json and I don't understand how to correctly decompose it into a dataframe.
Json structure i have (api response):
{
"result": {
"data": [],
"totals": [
0
]
},
"timestamp": "2021-11-25 15:19:21"
}
response_ =
{
"result":{
"data":[
{
"dimensions":[
{
"id":"2023-01-10",
"name":""
},
{
"id":"123",
"name":"good3"
}
],
"metrics":[
10,
20,
30,
40
]
},
{
"dimensions":[
{
"id":"2023-01-10",
"name":""
},
{
"id":"234",
"name":"good2"
}
],
"metrics":[
1,
2,
3,
4
]
}
],
"totals":[
11,
22,
33,
44
]
},
"timestamp":"2023-02-07 12:58:40"
}
I don't need "timestamp" and "totals" - just "data". So i do:
...
response_ = requests.post(url, headers=head, data=body)
datas = response_.json()
datas_ = datas['result']['data']
df1 = pd.json_normalize(datas_)
I got:
dimensions
metrics
0
[{'id': '2023-01-10', 'name': ''}, {'id': '123', 'name': 'good1'}]
[10, 20, 30, 40]
1
[{'id': '2023-01-10', 'name': ''}, {'id': '234', 'name': 'good2'}]
[1, 2, 3, 4]
But i need dataframe like:
id_
name_
id
name
metric1
metric2
metric3
metric4
0
2023-01-10
123
good1
10
20
30
40
1
2023-01-10
234
good2
1
2
3
4
When i try like:
df1 = pd.json_normalize(datas_, 'dimensions')
i get all id's and name's in one column.
Explain step by step if possible. Thank you.

Try:
# Sample API payload; only result -> data is consumed below.
response = {
    "result": {
        "data": [
            {
                "dimensions": [
                    {"id": "2023-01-10", "name": ""},
                    {"id": "123", "name": "good3"},
                ],
                "metrics": [10, 20, 30, 40],
            },
            {
                "dimensions": [
                    {"id": "2023-01-10", "name": ""},
                    {"id": "234", "name": "good2"},
                ],
                "metrics": [1, 2, 3, 4],
            },
        ],
        "totals": [11, 22, 33, 44],
    },
    "timestamp": "2023-02-07 12:58:40",
}


def _to_row(entry):
    """Flatten one data entry into a single flat dict (one dataframe row).

    Keys of the first dimension get a trailing underscore, keys of the
    second dimension are kept as they are, and the metrics become
    ``metric1``, ``metric2``, ... in their original order.
    """
    leading = entry["dimensions"][0]
    trailing = entry["dimensions"][1]
    row = {f"{key}_": value for key, value in leading.items()}
    row.update(trailing)
    for position, metric in enumerate(entry["metrics"], 1):
        row[f'metric{position}'] = metric
    return row


tmp = list(map(_to_row, response["result"]["data"]))
df = pd.DataFrame(tmp)
print(df)
Prints:
id_ name_ id name metric1 metric2 metric3 metric4
0 2023-01-10 123 good3 10 20 30 40
1 2023-01-10 234 good2 1 2 3 4

Get nested JSON from pandas dataframe grouped by multiple columns

I have a pandas dataframe:
d = {'key': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar', 'crow', 'crow', 'crow', 'crow'],
'date': ['2021-01-01', '2021-01-01', '2021-01-02', '2021-01-02', '2021-01-01', '2021-01-01','2021-01-02', '2021-01-02', '2021-01-01', '2021-01-01', '2021-01-02', '2021-01-02'],
'class': [1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2],
'count': [12, 3, 5, 5, 3, 1, 4, 1, 7, 3, 8, 2],
'percent': [.8, .2, .5, .5, .75, .25, .8, .2, .7, .3, .8, .2]
}
df = pd.DataFrame(data=d)
df
key date class count percent
0 foo 2021-01-01 1 12 0.80
1 foo 2021-01-01 2 3 0.20
2 foo 2021-01-02 1 5 0.50
3 foo 2021-01-02 2 5 0.50
4 bar 2021-01-01 1 3 0.75
5 bar 2021-01-01 2 1 0.25
6 bar 2021-01-02 1 4 0.80
7 bar 2021-01-02 2 1 0.20
8 crow 2021-01-01 1 7 0.70
9 crow 2021-01-01 2 3 0.30
10 crow 2021-01-02 1 8 0.80
11 crow 2021-01-02 2 2 0.20
I would like to create a nested JSON file that grouped by key and date where count: is a list containing the sums of the counts of key for that day and percent: are lists containing the percentages of the class counts over the total count (there needs to be one list per day containing the percentages of each class).
[
[
{
"key": "foo",
"count": [
15,
10
],
"predictions": [
[
.80,
.20
],
[
.50,
.50,
]
]
},
{
"key": "bar",
"count": [
4,
5
],
"predictions": [
[
.75,
.25
],
[
.80,
.20
]
]
},
{
"key": "crow",
"count": [
10,
10
],
"predictions": [
[
.70,
.30
],
[
.80,
.20
]
]
}
]
]
So far I have:
import json
# NOTE(review): `dfd` is not defined in the snippet shown here; presumably it
# is the frame built above as `df` -- confirm.
# One list of row dicts per (key, date) group. "records" is spelled out
# because the "r" shorthand is rejected by pandas >= 2.0.
dfj = dfd.groupby(["key","date"]).apply(lambda x: x.to_dict("records")).to_json(orient="records")
print(json.dumps(json.loads(dfj), indent=2, sort_keys=True))
which returns:
[
[
{
"class": 1,
"count": 3,
"date": "2021-01-01",
"key": "bar",
"percent": 0.75
},
{
"class": 2,
"count": 1,
"date": "2021-01-01",
"key": "bar",
"percent": 0.25
}
],
[
{
"class": 1,
"count": 4,
"date": "2021-01-02",
"key": "bar",
"percent": 0.8
},
{
"class": 2,
"count": 1,
"date": "2021-01-02",
"key": "bar",
"percent": 0.2
}
],
[
{
"class": 1,
"count": 7,
"date": "2021-01-01",
"key": "crow",
"percent": 0.7
},
{
"class": 2,
"count": 3,
"date": "2021-01-01",
"key": "crow",
"percent": 0.3
}
],
[
{
"class": 1,
"count": 8,
"date": "2021-01-02",
"key": "crow",
"percent": 0.8
},
{
"class": 2,
"count": 2,
"date": "2021-01-02",
"key": "crow",
"percent": 0.2
}
],
[
{
"class": 1,
"count": 12,
"date": "2021-01-01",
"key": "foo",
"percent": 0.8
},
{
"class": 2,
"count": 3,
"date": "2021-01-01",
"key": "foo",
"percent": 0.2
}
],
[
{
"class": 1,
"count": 5,
"date": "2021-01-02",
"key": "foo",
"percent": 0.5
},
{
"class": 2,
"count": 5,
"date": "2021-01-02",
"key": "foo",
"percent": 0.5
}
]
]
Any help would be appreciated. Thank you.

You can use:
# Named aggregations: output column -> (source column, aggregation).
d = {'count': ('count', 'sum'), 'predictions': ('percent', list)}
# First collapse every (key, date) pair, then gather the per-date results of
# each key into lists.
g = df.groupby(['key', 'date']).agg(**d).groupby(level=0).agg(list)
# The orient is spelled out in full ('index'); the one-letter shorthand 'i'
# is rejected by pandas >= 2.0.
dct = [{'key': k, **v} for k, v in g.to_dict('index').items()]
Details:
groupby the given dataframe on key and date and agg using the dictionary d,
groupby the aggregated frame from step 1 on level=0 and agg using list
Finally using to_dict with orient=index to convert the frame from step 2 to dictionary followed by dict comprehension to add the key variable in dictionary.
Result:
[{'key': 'bar', 'count': [4, 5], 'predictions': [[0.75, 0.25], [0.8, 0.2]]},
{'key': 'crow', 'count': [10, 10], 'predictions': [[0.7, 0.3], [0.8, 0.2]]},
{'key': 'foo', 'count': [15, 10], 'predictions': [[0.8, 0.2], [0.5, 0.5]]}]

Dataframe to PowerBI's json format

I am trying to convert Dataframe data into PowerBI's JSON format. But no luck so far.
DataFrame:
ProductID Name Category IsCompete ManufacturedOn
0 1 Adjustable Race Components true 07/30/2014
1 2 LL Crankarm Components false 07/30/2014
2 3 HL Mountain Frame - Silver Bikes true 07/30/2019
Expected JSON Format:
{
"rows": [
{
"ProductID": 1,
"Name": "Adjustable Race",
"Category": "Components",
"IsCompete": true,
"ManufacturedOn": "07/30/2014"
},
{
"ProductID": 2,
"Name": "LL Crankarm",
"Category": "Components",
"IsCompete": true,
"ManufacturedOn": "07/30/2014"
},
{
"ProductID": 3,
"Name": "HL Mountain Frame - Silver",
"Category": "Bikes",
"IsCompete": true,
"ManufacturedOn": "07/30/2014"
}
]
}

use pandas to_dict method :
json = {'rows':df.to_dict('records')}
print(json)
{'rows': [{'ProductID': 1,
'Name': 'Adjustable Race',
'Category': 'Components',
'IsCompete': True,
'ManufacturedOn': '07/30/2014'},
{'ProductID': 2,
'Name': 'LL Crankarm',
'Category': 'Components',
'IsCompete': False,
'ManufacturedOn': '07/30/2014'},
{'ProductID': 3,
'Name': 'HL Mountain Frame - Silver',
'Category': 'Bikes',
'IsCompete': True,
'ManufacturedOn': '07/30/2019'}]}

Convert Pandas Dataframe to nested dictionary

I am trying to convert a dataframe to a nested dictionary but no success so far.
Dataframe: clean_data['Model', 'Problem', 'Size']
Here's how my data looks like:
Model Problem Size
lenovo a6020 screen broken 1
lenovo a6020a40 battery 60
bluetooth 60
buttons 60
lenovo k4 wi-fi 3
bluetooth 3
My desired output:
{
"name": "Brand",
"children": [
{
"name": "Lenovo",
"children": [
{
"name": "lenovo a6020",
"children": {
"name": "screen broken",
"size": 1
}
},
{
"name": "lenovo a6020a40",
"children": [
{
"name": "battery",
"size": 60
},
{
"name": "bluetooth",
"size": 60
},
{
"name": "buttons",
"size": 60
}
]
},
{
"name": "lenovo k4",
"children": [
{
"name": "wi-fi",
"size": 3
},
{
"name": "bluetooth",
"size": 3
}
]
}
]
}
]
}
I have tried pandas.DataFrame.to_dict method But it is returning a simple dictionary but I want it like the one mentioned above.

Use:
print (df)
Model Problem size
0 lenovo a6020 screen broken 1
1 lenovo a6020a40 battery 60
2 NaN bluetooth 60
3 NaN buttons 60
4 lenovo k4 wi-fi 3
5 NaN bluetooth 3
# replace missing values by forward filling
df = df.ffill()
# split the Model column on the first whitespace into 2 columns
df[['a','b']] = df['Model'].str.split(n=1, expand=True)
# convert each level to a list of dictionaries
# use rename to get the correct keys
# Columns are selected with a list ([['name','size']]) and the orient is
# spelled out ('records'): the tuple-style selection and the 'r' shorthand
# are both rejected by current pandas versions.
L = (df.rename(columns={'Problem':'name'})
       .groupby(['a','b'])[['name','size']]
       .apply(lambda x: x.to_dict('records'))
       .rename('children')
       .reset_index()
       .rename(columns={'b':'name'})
       .groupby('a')[['name','children']]
       .apply(lambda x: x.to_dict('records'))
       .rename('children')
       .reset_index()
       .rename(columns={'a':'name'})
       .to_dict('records')
    )
#print (L)
# create the outer level with a plain dict literal
d = { "name": "Brand", "children": L}
print (d)
{
'name': 'Brand',
'children': [{
'name': 'lenovo',
'children': [{
'name': 'a6020',
'children': [{
'name': 'screen broken',
'size': 1
}]
}, {
'name': 'a6020a40',
'children': [{
'name': 'battery',
'size': 60
}, {
'name': 'bluetooth',
'size': 60
}, {
'name': 'buttons',
'size': 60
}]
}, {
'name': 'k4',
'children': [{
'name': 'wi-fi',
'size': 3
}, {
'name': 'bluetooth',
'size': 3
}]
}]
}]
}

Python - Adding fields and labels to nested json file

I have a dataframe as follows:
Name_ID | URL | Count | Rating
------------------------------------------------
ABC | www.example.com/ABC | 10 | 5
123 | www.example.com/123 | 9 | 4
XYZ | www.example.com/XYZ | 5 | 2
ABC111 | www.example.com/ABC111 | 5 | 2
ABC121 | www.example.com/ABC121 | 5 | 2
222 | www.example.com/222 | 5 | 3
abc222 | www.example.com/abc222 | 4 | 2
ABCaaa | www.example.com/ABCaaa | 4 | 2
I am trying to create a JSON as follows:
{
"name": "sampledata",
"children": [
{
"name": 9,
"children": [
{
"name": 4,
"children": [
{
"name": "123",
"size": 100
}
]
}
]
},
{
"name": 10,
"children": [
{
"name": 5,
"children": [
{
"name": "ABC",
"size": 100
}
]
}
]
},
{
"name": 4,
"children": [
{
"name": 2,
"children": [
{
"name": "abc222",
"size": 50
},
{
"name": "ABCaaa",
"size": 50
}
]
}
]
},
{
"name": 5,
"children": [
{
"name": 2,
"children": [
{
"name": "ABC",
"size": 16
},
{
"name": "ABC111",
"size": 16
},
{
"name": "ABC121",
"size": 16
}
]
},
{
"name": 3,
"children": [
{
"name": "222",
"size": 50
}
]
}
]
}
]
}
In order to do that:
I am trying to add labels such as "name" and "children" to the json while creating it.
I tried something like
results = [{"name": i, "children": j} for i,j in results.items()]
But it won't label it properly I believe.
Also, add another field with the label `"size"`, which I am planning to calculate based on the formula:
(Rating*Count*10000)/number_of_children_to_the_immediate_parent
Here is my dirty code:
import pandas as pd
from collections import defaultdict
import json
data = [
    ('ABC', 'www.example.com/ABC', 10, 5),
    ('123', 'www.example.com/123', 9, 4),
    ('XYZ', 'www.example.com/XYZ', 5, 2),
    ('ABC111', 'www.example.com/ABC111', 5, 2),
    ('ABC121', 'www.example.com/ABC121', 5, 2),
    ('222', 'www.example.com/222', 5, 3),
    ('abc222', 'www.example.com/abc222', 4, 2),
    ('ABCaaa', 'www.example.com/ABCaaa', 4, 2),
]
df = pd.DataFrame(data, columns=['Name', 'URL', 'Count', 'Rating'])
# Group by a scalar key rather than a one-element list: with a list, recent
# pandas versions hand back each group name as a 1-tuple.
gp = df.groupby('Count')
dict_json = {"name": "flare"}
children = []
for name, group in gp:
    temp = {}
    temp["name"] = name
    temp["children"] = []
    rgp = group.groupby('Rating')
    for n, g in rgp:
        temp2 = {}
        temp2["name"] = n
        # Materialise the per-row dicts as a list: a dict view cannot be
        # serialised by json.dumps on Python 3.
        temp2["children"] = list(g.reset_index().T.to_dict().values())
        for t in temp2["children"]:
            # Floor division keeps "size" an integer on Python 3 as well.
            t["size"] = (t["Rating"] * t["Count"] * 10000) // len(temp2["children"])
            t["name"] = t["Name"]
            # Drop the raw columns so only "name" and "size" remain.
            del t["Count"]
            del t["Rating"]
            del t["URL"]
            del t["Name"]
            del t["index"]
        temp["children"].append(temp2)
    children.append(temp)
dict_json["children"] = children
print(json.dumps(dict_json, indent=4))
Though the above code does print what I need, I am looking for more efficient and cleaner way to do the same, mainly because the actual dataset might be even more nested and complicated. Any help/suggestion will be much appreciated.

Quite an interesting problem and a great question!
You can improve your approach by reorganizing the code inside the loops and using list comprehensions. No need to delete things and introduce temp variables inside loops:
dict_json = {"name": "flare"}
children = []
# NOTE(review): `gp` is the Count grouping built in the question's code. On
# pandas >= 2.0 its group names are 1-tuples if it was created from a
# one-element list -- confirm it is grouped by the scalar key 'Count'.
for name, group in gp:
    temp = {"name": name, "children": []}
    # Scalar grouping key, so `n` is the plain rating value (not a 1-tuple).
    rgp = group.groupby('Rating')
    for n, g in rgp:
        temp["children"].append({
            "name": n,
            "children": [
                {"name": row["Name"],
                 # Floor division keeps "size" an integer on Python 3 too.
                 "size": row["Rating"] * row["Count"] * 10000 // len(g)}
                for _, row in g.iterrows()
            ]
        })
    children.append(temp)
dict_json["children"] = children
Or, a "wrapped" version:
# Same tree as the loop version, built as one nested comprehension:
# Count groups -> Rating groups -> one {"name", "size"} leaf per row.
# NOTE(review): `gp` is the Count grouping built in the question's code. On
# pandas >= 2.0 its group names are 1-tuples if it was created from a
# one-element list -- confirm it is grouped by the scalar key 'Count'.
dict_json = {
    "name": "flare",
    "children": [
        {
            "name": name,
            "children": [
                {
                    "name": n,
                    "children": [
                        {
                            "name": row["Name"],
                            # Floor division keeps "size" an integer on Python 3 too.
                            "size": row["Rating"] * row["Count"] * 10000 // len(g)
                        } for _, row in g.iterrows()
                    ]
                # Scalar grouping key, so `n` is the plain rating value.
                } for n, g in group.groupby('Rating')
            ]
        } for name, group in gp
    ]
}
I'm getting the following dictionary printed for your sample input dataframe:
{
"name": "flare",
"children": [
{
"name": 4,
"children": [
{
"name": 2,
"children": [
{
"name": "abc222",
"size": 40000
},
{
"name": "ABCaaa",
"size": 40000
}
]
}
]
},
{
"name": 5,
"children": [
{
"name": 2,
"children": [
{
"name": "XYZ",
"size": 33333
},
{
"name": "ABC111",
"size": 33333
},
{
"name": "ABC121",
"size": 33333
}
]
},
{
"name": 3,
"children": [
{
"name": "222",
"size": 150000
}
]
}
]
},
{
"name": 9,
"children": [
{
"name": 4,
"children": [
{
"name": "123",
"size": 360000
}
]
}
]
},
{
"name": 10,
"children": [
{
"name": 5,
"children": [
{
"name": "ABC",
"size": 500000
}
]
}
]
}
]
}

If I understand correctly, what you want to do is put a groupby into a nested JSON. If that is the case, then you could use pandas groupby and cast it into a nested list of lists like so:
lol = pd.DataFrame(df.groupby(['Count','Rating'])\
.apply(lambda x: list(x['Name_ID']))).reset_index().values.tolist()
lol should look something like this:
[['10', '5', ['ABC']],
['4', '2', ['abc222', 'ABCaaa']],
['5', '2', ['XYZ ', 'ABC111', 'ABC121']],
['5', '3', ['222 ']],
['9', '4', ['123 ']]]
After that you could loop over lol to put it into a dict, but since you want to set nested items you'll have to use autovivification (check it out):
class autovividict(dict):
    """Dict that creates nested dicts of its own type on first access.

    Looking up a key that is not present stores a new, empty instance of
    the same class under that key and returns it, so chained assignments
    such as ``d[a][b] = value`` work without creating ``d[a]`` first.
    """

    def __missing__(self, key):
        # Called by dict.__getitem__ when `key` is absent: store a fresh
        # child of the same type under `key` and hand it back.
        value = self[key] = type(self)()
        return value
d = autovividict()
# Each entry of lol is [Count, Rating, [Name_ID, ...]]; nest the name list
# under its count and rating.
for count, rating, names in lol:
    d[count][rating] = names
now you can use the json pack for printing and exporting:
# Function-call form of print, which also runs on Python 3.
print(json.dumps(d, indent=2))
In case you need more than one groupby, you could concat your groups with pandas, cast to lol, remove any nans, and then loop, let me know if a full example can help.

setup
from io import StringIO
import pandas as pd
txt = """Name_ID,URL,Count,Rating
ABC,www.example.com/ABC,10,5
123,www.example.com/123,9,4
XYZ,www.example.com/XYZ,5,2
ABC111,www.example.com/ABC111,5,2
ABC121,www.example.com/ABC121,5,2
222,www.example.com/222,5,3
abc222,www.example.com/abc222,4,2
ABCaaa,www.example.com/ABCaaa,4,2"""
df = pd.read_csv(StringIO(txt))
size
pre-calculate it
df['size'] = df.Count.mul(df.Rating) \
.mul(10000) \
.div(df.groupby(
['Count', 'Rating']).Name_ID.transform('count')
).astype(int)
solution
create recursive function
def h(d):
    """Recursively turn an indexed frame into nested name/children nodes.

    Args:
        d: A DataFrame with a ``size`` column whose index levels define the
            hierarchy, or a Series holding a single such row.

    Returns:
        A leaf dict ``{'name': ..., 'size': ...}`` (both as strings) when
        the index has a single level with a single distinct value.
        Otherwise a list with one ``{'name': ..., 'children': ...}`` dict
        per value of the first index level, where ``children`` is the
        result of calling ``h`` on that value's cross-section.
    """
    # A single row selected with xs() arrives as a Series; turn it back
    # into a one-row frame so the code below handles one shape only.
    if isinstance(d, pd.Series):
        d = d.to_frame().T
    rec_cond = d.index.nlevels > 1 or d.index.nunique() > 1
    if not rec_cond:
        return {'name': str(d.index[0]), 'size': str(d['size'].iloc[0])}
    return [dict(name=str(n), children=h(g.xs(n))) for n, g in d.groupby(level=0)]
demo
import json
my_dict = dict(name='flare', children=h(df.set_index(['Count', 'Rating', 'Name_ID'])))
json.dumps(my_dict)
'{"name": "flare", "children": [{"name": "4", "children": [{"name": "2", "children": [{"name": "ABCaaa", "children": {"name": "ABCaaa", "size": "40000"}}, {"name": "abc222", "children": {"name": "abc222", "size": "40000"}}]}]}, {"name": "5", "children": [{"name": "2", "children": [{"name": "ABC111", "children": {"name": "ABC111", "size": "33333"}}, {"name": "ABC121", "children": {"name": "ABC121", "size": "33333"}}, {"name": "XYZ", "children": {"name": "XYZ", "size": "33333"}}]}, {"name": "3", "children": {"name": "222", "size": "150000"}}]}, {"name": "9", "children": [{"name": "4", "children": {"name": "123", "size": "360000"}}]}, {"name": "10", "children": [{"name": "5", "children": {"name": "ABC", "size": "500000"}}]}]}'
my_dict
{'children': [{'children': [{'children': [{'children': {'name': 'ABCaaa',
'size': '40000'},
'name': 'ABCaaa'},
{'children': {'name': 'abc222', 'size': '40000'}, 'name': 'abc222'}],
'name': '2'}],
'name': '4'},
{'children': [{'children': [{'children': {'name': 'ABC111', 'size': '33333'},
'name': 'ABC111'},
{'children': {'name': 'ABC121', 'size': '33333'}, 'name': 'ABC121'},
{'children': {'name': 'XYZ', 'size': '33333'}, 'name': 'XYZ'}],
'name': '2'},
{'children': {'name': '222', 'size': '150000'}, 'name': '3'}],
'name': '5'},
{'children': [{'children': {'name': '123', 'size': '360000'}, 'name': '4'}],
'name': '9'},
{'children': [{'children': {'name': 'ABC', 'size': '500000'}, 'name': '5'}],
'name': '10'}],
'name': 'flare'}

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Complex Pandas Dataframe to Nested Dictionary/JSON - python

Related

Parse JSON from Api to pandas dataframe with doubled names

Get nested JSON from pandas dataframe grouped by multiple columns

Dataframe to PowerBI's json format

Convert Pandas Dataframe to nested dictionary

Python - Adding fields and labels to nested json file

Categories

Resources