Complex Pandas Dataframe to Nested Dictionary/JSON - python

I have 3 Dataframes, I have merged them into a single one, and want to represent the dataframe into a Nested dictionary / json format.
df1: This contains general information about a patient.
>>> df1 = pd.DataFrame({'PatientId' : [1,2], 'Gender' : ['M', 'F'], 'Marital_status':['married', 'unmarried']})
>>> df1
PatientId Gender Marital_status
1 M married
2 F unmarried
df2:
this contains details of each admission of a patient, and the diagnosis.
>>> df2 = pd.DataFrame({'PatientId': [1,1,2,2], 'AdmissionId' : [1,2,1,2], 'Diagnosis_Code': ['DXS', 'SDE', 'DEF', 'ATR'], 'Stay_Duration' : [45,14,79,32]})
>>> df2
PatientId AdmissionId Diagnosis_Code Stay_Duration
1 1 DXS 45
1 2 SDE 14
2 1 DEF 79
2 2 ATR 32
df3:
this dataframe contains all the lab test reports of a patient carried out in each admission.
>>> df3 = pd.DataFrame(
{
'PatientId':[1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2],
'AdmissionId' : [1,1,1,1,2,2,2,2,1,1,1,1,2,2,2,2],
'LabTest' : [1,1,2,2,1,1,2,2,1,1,2,2,1,1,2,2],
'LabName' : ['ABC', 'XYZ', 'ABC', 'XYZ', 'PQR', 'XYZ', 'ABC', 'XYZ', 'ABC', 'XYZ', 'ABC', 'XYZ', 'PQR', 'XYZ', 'ABC', 'PQR'],
'LabValue' : [5.7, 1.9, 5.6, 2.4, 5.7, 1.9, 5.6, 2.4, 5.7, 1.9, 5.6, 2.4, 5.7, 1.9, 5.6, 2.4],
'IsNormal' : [True, False, True, True, True, False, True, True, True, False, True, True, True, False, True, True]
}
)
>>> df3
PatientId AdmissionId LabTest LabName LabValue IsNormal
1 1 1 ABC 5.7 True
1 1 1 XYZ 1.9 False
1 1 2 ABC 5.6 True
1 1 2 XYZ 2.4 True
1 2 1 PQR 5.7 True
1 2 1 XYZ 1.9 False
1 2 2 ABC 5.6 True
1 2 2 XYZ 2.4 True
2 1 1 ABC 5.7 True
2 1 1 XYZ 1.9 False
2 1 2 ABC 5.6 True
2 1 2 XYZ 2.4 True
2 2 1 PQR 5.7 True
2 2 1 XYZ 1.9 False
2 2 2 ABC 5.6 True
2 2 2 PQR 2.4 True
I want my output to look like this --
"PatientId" : 1
"Gender":M
"Marital_Status" : married
"AdmissionsInfo":
{
"AdmissionID": 1
"Diagnosis": DXS
"Stay_Duration" : 45
"lab reports" :
{
"labtest":1
"labinfo":
{
"labName":ABC
"labValue":5.6
"isNormal":True
},
{
"labName": XYZ
"labValue": 2.4
"isNormal": True
}
"labtest":2
"labinfo":
{
"labName":ABC
"labValue":5.7
"isNormal":True
},
{
"labName": XYZ
"labValue": 1.9
"isNormal":False
}
}
"AdmissionID": 2
"Diagnosis": SDE
"Stay_Duration" : 45
/
/
//
} end of patient 1's all admissions' info
"PatientId" : 2
"Gender": F
"Marital_Status" : unmarried
"AdmissionsInfo":
//
//
and so on }}}

Find below a full pandas (although redundant) solution.
First merge your three dataframes into one called df_merged:
df_merged = df3.merge(df1, on="PatientId").merge(df2, on=["PatientId", "AdmissionId"])
Now create the hierarchy you need (this part is ugly, but works, happy to receive feedback on it):
# Build the hierarchy from the innermost level outwards. Each groupby/apply
# pass folds the rows of one level into a list of dicts (one dict per row);
# reset_index() names that new column 0, so the following rename gives it
# its final key. The resulting JSON string is bound to `j` because the
# json.loads(j) call further down reads it.
j = (
    df_merged.groupby(["PatientId", "Gender", "Marital_status", "AdmissionId", "Diagnosis_Code", "Stay_Duration", "LabTest"])
    # innermost level: one dict per lab result of a lab test
    .apply(lambda x: x[["LabName", "LabValue", "IsNormal"]].to_dict("records"))
    .reset_index()
    .rename(columns={0: "LabInfo"})
    .groupby(["PatientId", "Gender", "Marital_status", "AdmissionId", "Diagnosis_Code", "Stay_Duration"])
    # middle level: one dict per lab test of an admission
    .apply(lambda x: x[["LabTest", "LabInfo"]].to_dict("records"))
    .reset_index()
    .rename(columns={0: "LabReports"})
    .groupby(["PatientId", "Gender", "Marital_status"])
    # outer level: one dict per admission of a patient
    .apply(lambda x: x[["AdmissionId", "Diagnosis_Code", "Stay_Duration", "LabReports"]].to_dict("records"))
    .reset_index()
    .rename(columns={0: "AdmissionsInfo"})
    .to_json(orient="records")
)
And dump that into a json object:
>>> import json
>>> print(json.dumps(json.loads(j), indent=2, sort_keys=False))
And the result:
[
{
"PatientId": 1,
"Gender": "M",
"Marital_status": "married",
"AdmissionsInfo": [
{
"AdmissionId": "1",
"Diagnosis_Code": "DXS",
"Stay_Duration": "45",
"LabReports": [
{
"LabTest": "1",
"LabInfo": [
{
"LabName": "ABC",
"LabValue": "5.7",
"IsNormal": "True"
},
{
"LabName": "XYZ",
"LabValue": "1.9",
"IsNormal": "False"
}
]
},
{
"LabTest": "2",
"LabInfo": [
{
"LabName": "ABC",
"LabValue": "5.6",
"IsNormal": "True"
},
{
"LabName": "XYZ",
"LabValue": "2.4",
"IsNormal": "True"
}
]
}
]
},
{
"AdmissionId": "2",
"Diagnosis_Code": "SDE",
"Stay_Duration": "14",
"LabReports": [
{
"LabTest": "1",
"LabInfo": [
{
"LabName": "PQR",
"LabValue": "5.7",
"IsNormal": "True"
},
{
"LabName": "XYZ",
"LabValue": "1.9",
"IsNormal": "False"
}
]
},
{
"LabTest": "2",
"LabInfo": [
{
"LabName": "ABC",
"LabValue": "5.6",
"IsNormal": "True"
},
{
"LabName": "XYZ",
"LabValue": "2.4",
"IsNormal": "True"
}
]
}
]
}
]
},
{
"PatientId": 2,
"Gender": "F",
"Marital_status": "unmarried",
"AdmissionsInfo": [
{
"AdmissionId": "1",
"Diagnosis_Code": "DEF",
"Stay_Duration": "79",
"LabReports": [
{
"LabTest": "1",
"LabInfo": [
{
"LabName": "ABC",
"LabValue": "5.7",
"IsNormal": "True"
},
{
"LabName": "XYZ",
"LabValue": "1.9",
"IsNormal": "False"
}
]
},
{
"LabTest": "2",
"LabInfo": [
{
"LabName": "ABC",
"LabValue": "5.6",
"IsNormal": "True"
},
{
"LabName": "XYZ",
"LabValue": "2.4",
"IsNormal": "True"
}
]
}
]
},
{
"AdmissionId": "2",
"Diagnosis_Code": "ATR",
"Stay_Duration": "32",
"LabReports": [
{
"LabTest": "1",
"LabInfo": [
{
"LabName": "PQR",
"LabValue": "5.7",
"IsNormal": "True"
},
{
"LabName": "XYZ",
"LabValue": "1.9",
"IsNormal": "False"
}
]
},
{
"LabTest": "2",
"LabInfo": [
{
"LabName": "ABC",
"LabValue": "5.6",
"IsNormal": "True"
},
{
"LabName": "PQR",
"LabValue": "2.4",
"IsNormal": "True"
}
]
}
]
}
]
}
]

In [84]: json_list = []
In [85]: for index1, row1 in df1.iterrows():
...: d = dict(row1)
...: json_list.append(d)
...: for index2, row2 in df2[df2['PatientId'] == row1['PatientId']].iterrows():
...: d2 = dict(row2)
...: del d2['PatientId']
...: d['AdmissionsInfo'] = d2
...: lab_reports_list = []
...: for index3, row3 in df3[(df3['PatientId'] == row2['PatientId']) & (df3['AdmissionId'] == row2['AdmissionId'])].iterrows():
...: d3 = dict(row3)
...: d4 = {}
...: d4['labtest'] = row3['LabTest']
...: d4['labinfo'] = {'labName': row3['LabName'], 'labValue': row3['LabValue'], 'isNormal': row3['IsNormal']}
...: lab_reports_list.append(d4)
...: d2['lab reports'] = lab_reports_list
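Output (note that each patient ends up with only the last admission, because d['AdmissionsInfo'] is reassigned on every pass of the inner loop instead of being appended to a list):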
In [87]: json_list
Out[87]:
[{'PatientId': 1,
'Gender': 'M',
'Marital_status': 'married',
'AdmissionsInfo': {'AdmissionId': 2,
'Diagnosis_Code': 'SDE',
'Stay_Duration': 14,
'lab reports': [{'labtest': 1,
'labinfo': {'labName': 'PQR', 'labValue': 5.7, 'isNormal': True}},
{'labtest': 1,
'labinfo': {'labName': 'XYZ', 'labValue': 1.9, 'isNormal': False}},
{'labtest': 2,
'labinfo': {'labName': 'ABC', 'labValue': 5.6, 'isNormal': True}},
{'labtest': 2,
'labinfo': {'labName': 'XYZ', 'labValue': 2.4, 'isNormal': True}}]}},
{'PatientId': 2,
'Gender': 'F',
'Marital_status': 'unmarried',
'AdmissionsInfo': {'AdmissionId': 2,
'Diagnosis_Code': 'ATR',
'Stay_Duration': 32,
'lab reports': [{'labtest': 1,
'labinfo': {'labName': 'PQR', 'labValue': 5.7, 'isNormal': True}},
{'labtest': 1,
'labinfo': {'labName': 'XYZ', 'labValue': 1.9, 'isNormal': False}},
{'labtest': 2,
'labinfo': {'labName': 'ABC', 'labValue': 5.6, 'isNormal': True}},
{'labtest': 2,
'labinfo': {'labName': 'PQR', 'labValue': 2.4, 'isNormal': True}}]}}]

Arnaud's response does the job but seems not very "Pythonic".
There might be something possible with aggregate functions on DataFrameGroupBy. I tried .agg(dict), but it did not work.
for those wanting to help:
patient_df = pd.DataFrame({'PatientId' : [1,2], 'Gender' : ['M', 'F'], 'Marital_status':['married', 'unmarried']})
admission_df = pd.DataFrame({'PatientId': [1,1,2,2], 'AdmissionId' : [1,2,1,2], 'Diagnosis_Code': ['DXS', 'SDE', 'DEF', 'ATR'], 'Stay_Duration' : [45,14,79,32]})
lab_df = pd.DataFrame(
{
'PatientId':[1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2],
'AdmissionId' : [1,1,1,1,2,2,2,2,1,1,1,1,2,2,2,2],
'LabTest' : [1,1,2,2,1,1,2,2,1,1,2,2,1,1,2,2],
'LabName' : ['ABC', 'XYZ', 'ABC', 'XYZ', 'PQR', 'XYZ', 'ABC', 'XYZ', 'ABC', 'XYZ', 'ABC', 'XYZ', 'PQR', 'XYZ', 'ABC', 'PQR'],
'LabValue' : [5.7, 1.9, 5.6, 2.4, 5.7, 1.9, 5.6, 2.4, 5.7, 1.9, 5.6, 2.4, 5.7, 1.9, 5.6, 2.4],
'IsNormal' : [True, False, True, True, True, False, True, True, True, False, True, True, True, False, True, True]
}
)

Related

Parse JSON from Api to pandas dataframe with doubled names

I'm parsing a json and I don't understand how to correctly decompose it into a dataframe.
Json structure i have (api response):
{
"result": {
"data": [],
"totals": [
0
]
},
"timestamp": "2021-11-25 15:19:21"
}
response_ =
{
"result":{
"data":[
{
"dimensions":[
{
"id":"2023-01-10",
"name":""
},
{
"id":"123",
"name":"good3"
}
],
"metrics":[
10,
20,
30,
40
]
},
{
"dimensions":[
{
"id":"2023-01-10",
"name":""
},
{
"id":"234",
"name":"good2"
}
],
"metrics":[
1,
2,
3,
4
]
}
],
"totals":[
11,
22,
33,
44
]
},
"timestamp":"2023-02-07 12:58:40"
}
I don't need "timestamp" and "totals" - just "data". So i do:
...
response_ = requests.post(url, headers=head, data=body)
datas = response_.json()
datas_ = datas['result']['data']
df1 = pd.json_normalize(datas_)
I got:
dimensions
metrics
0
[{'id': '2023-01-10', 'name': ''}, {'id': '123', 'name': 'good1'}]
[10, 20, 30, 40]
1
[{'id': '2023-01-10', 'name': ''}, {'id': '234', 'name': 'good2'}]
[1, 2, 3, 4]
But i need dataframe like:
id_
name_
id
name
metric1
metric2
metric3
metric4
0
2023-01-10
123
good1
10
20
30
40
1
2023-01-10
234
good2
1
2
3
4
When i try like:
df1 = pd.json_normalize(datas_, 'dimensions')
i get all id's and name's in one column.
Explain step by step if possible. Thank you.
Try:
# Sample payload with the same shape as the API response from the question.
response = {
    "result": {
        "data": [
            {
                "dimensions": [
                    {"id": "2023-01-10", "name": ""},
                    {"id": "123", "name": "good3"},
                ],
                "metrics": [10, 20, 30, 40],
            },
            {
                "dimensions": [
                    {"id": "2023-01-10", "name": ""},
                    {"id": "234", "name": "good2"},
                ],
                "metrics": [1, 2, 3, 4],
            },
        ],
        "totals": [11, 22, 33, 44],
    },
    "timestamp": "2023-02-07 12:58:40",
}

# Flatten every entry of result.data into one flat record:
#   * keys of the first dimension get a trailing underscore (id_, name_),
#   * keys of the second dimension are kept as they are (id, name),
#   * metrics are numbered from 1 (metric1, metric2, ...).
tmp = []
for entry in response["result"]["data"]:
    first_dim, second_dim = entry["dimensions"][0], entry["dimensions"][1]
    record = {}
    for field, value in first_dim.items():
        record[f"{field}_"] = value
    record.update(second_dim)
    for position, metric in enumerate(entry["metrics"], start=1):
        record[f"metric{position}"] = metric
    tmp.append(record)

# One row per entry; the column order follows the key insertion order above.
df = pd.DataFrame(tmp)
print(df)
Prints:
id_ name_ id name metric1 metric2 metric3 metric4
0 2023-01-10 123 good3 10 20 30 40
1 2023-01-10 234 good2 1 2 3 4

Get nested JSON from pandas dataframe grouped by multiple columns

I have a pandas dataframe:
d = {'key': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar', 'crow', 'crow', 'crow', 'crow'],
'date': ['2021-01-01', '2021-01-01', '2021-01-02', '2021-01-02', '2021-01-01', '2021-01-01','2021-01-02', '2021-01-02', '2021-01-01', '2021-01-01', '2021-01-02', '2021-01-02'],
'class': [1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2],
'count': [12, 3, 5, 5, 3, 1, 4, 1, 7, 3, 8, 2],
'percent': [.8, .2, .5, .5, .75, .25, .8, .2, .7, .3, .8, .2]
}
df = pd.DataFrame(data=d)
df
key date class count percent
0 foo 2021-01-01 1 12 0.80
1 foo 2021-01-01 2 3 0.20
2 foo 2021-01-02 1 5 0.50
3 foo 2021-01-02 2 5 0.50
4 bar 2021-01-01 1 3 0.75
5 bar 2021-01-01 2 1 0.25
6 bar 2021-01-02 1 4 0.80
7 bar 2021-01-02 2 1 0.20
8 crow 2021-01-01 1 7 0.70
9 crow 2021-01-01 2 3 0.30
10 crow 2021-01-02 1 8 0.80
11 crow 2021-01-02 2 2 0.20
I would like to create a nested JSON file that grouped by key and date where count: is a list containing the sums of the counts of key for that day and percent: are lists containing the percentages of the class counts over the total count (there needs to be one list per day containing the percentages of each class).
[
[
{
"key": "foo",
"count": [
15,
10
],
"predictions": [
[
.80,
.20
],
[
.50,
.50,
]
]
},
{
"key": "bar",
"count": [
4,
5
],
"predictions": [
[
.75,
.25
],
[
.80,
.20
]
]
},
{
"key": "crow",
"count": [
10,
10
],
"predictions": [
[
.70,
.30
],
[
.80,
.20
]
]
}
]
]
So far I have:
import json

# Group the frame built above (`df`) by key and date and turn every group
# into a list of row dicts. "records" is the full name of the orient that
# the short alias "r" stood for; the alias is not accepted by current pandas.
dfj = df.groupby(["key", "date"]).apply(lambda x: x.to_dict("records")).to_json(orient="records")
# Round-trip through json to pretty-print the string produced by to_json.
print(json.dumps(json.loads(dfj), indent=2, sort_keys=True))
which returns:
[
[
{
"class": 1,
"count": 3,
"date": "2021-01-01",
"key": "bar",
"percent": 0.75
},
{
"class": 2,
"count": 1,
"date": "2021-01-01",
"key": "bar",
"percent": 0.25
}
],
[
{
"class": 1,
"count": 4,
"date": "2021-01-02",
"key": "bar",
"percent": 0.8
},
{
"class": 2,
"count": 1,
"date": "2021-01-02",
"key": "bar",
"percent": 0.2
}
],
[
{
"class": 1,
"count": 7,
"date": "2021-01-01",
"key": "crow",
"percent": 0.7
},
{
"class": 2,
"count": 3,
"date": "2021-01-01",
"key": "crow",
"percent": 0.3
}
],
[
{
"class": 1,
"count": 8,
"date": "2021-01-02",
"key": "crow",
"percent": 0.8
},
{
"class": 2,
"count": 2,
"date": "2021-01-02",
"key": "crow",
"percent": 0.2
}
],
[
{
"class": 1,
"count": 12,
"date": "2021-01-01",
"key": "foo",
"percent": 0.8
},
{
"class": 2,
"count": 3,
"date": "2021-01-01",
"key": "foo",
"percent": 0.2
}
],
[
{
"class": 1,
"count": 5,
"date": "2021-01-02",
"key": "foo",
"percent": 0.5
},
{
"class": 2,
"count": 5,
"date": "2021-01-02",
"key": "foo",
"percent": 0.5
}
]
]
Any help would be appreciated. Thank you.
You can use:
# Named aggregation spec: output column -> (input column, aggregation).
d = {'count': ('count', 'sum'), 'predictions': ('percent', list)}
# First pass: one row per (key, date) holding the summed count and the list
# of percents. Second pass: collect those rows into lists per key (level 0).
g = df.groupby(['key', 'date']).agg(**d).groupby(level=0).agg(list)
# orient='index' yields {key: {column: value}}; fold the key into each record.
# The full orient name is spelled out because the short alias 'i' is not
# accepted by current pandas.
dct = [{'key': k, **v} for k, v in g.to_dict('index').items()]
Details:
groupby the given dataframe on key and date and agg using the dictionary d,
groupby the aggregated frame from step 1 on level=0 and agg using list
Finally using to_dict with orient=index to convert the frame from step 2 to dictionary followed by dict comprehension to add the key variable in dictionary.
Result:
[{'key': 'bar', 'count': [4, 5], 'predictions': [[0.75, 0.25], [0.8, 0.2]]},
{'key': 'crow', 'count': [10, 10], 'predictions': [[0.7, 0.3], [0.8, 0.2]]},
{'key': 'foo', 'count': [15, 10], 'predictions': [[0.8, 0.2], [0.5, 0.5]]}]

Dataframe to PowerBI's json format

I am trying to convert Dataframe data into PowerBI's JSON format. But no luck so far.
DataFrame:
ProductID Name Category IsCompete ManufacturedOn
0 1 Adjustable Race Components true 07/30/2014
1 2 LL Crankarm Components false 07/30/2014
2 3 HL Mountain Frame - Silver Bikes true 07/30/2019
Expected JSON Format:
{
"rows": [
{
"ProductID": 1,
"Name": "Adjustable Race",
"Category": "Components",
"IsCompete": true,
"ManufacturedOn": "07/30/2014"
},
{
"ProductID": 2,
"Name": "LL Crankarm",
"Category": "Components",
"IsCompete": true,
"ManufacturedOn": "07/30/2014"
},
{
"ProductID": 3,
"Name": "HL Mountain Frame - Silver",
"Category": "Bikes",
"IsCompete": true,
"ManufacturedOn": "07/30/2014"
}
]
}
use pandas to_dict method :
json = {'rows':df.to_dict('records')}
print(json)
{'rows': [{'ProductID': 1,
'Name': 'Adjustable Race',
'Category': 'Components',
'IsCompete': True,
'ManufacturedOn': '07/30/2014'},
{'ProductID': 2,
'Name': 'LL Crankarm',
'Category': 'Components',
'IsCompete': False,
'ManufacturedOn': '07/30/2014'},
{'ProductID': 3,
'Name': 'HL Mountain Frame - Silver',
'Category': 'Bikes',
'IsCompete': True,
'ManufacturedOn': '07/30/2019'}]}

Convert Pandas Dataframe to nested dictionary

I am trying to convert a dataframe to a nested dictionary but no success so far.
Dataframe: clean_data['Model', 'Problem', 'Size']
Here's how my data looks like:
Model Problem Size
lenovo a6020 screen broken 1
lenovo a6020a40 battery 60
bluetooth 60
buttons 60
lenovo k4 wi-fi 3
bluetooth 3
My desired output:
{
"name": "Brand",
"children": [
{
"name": "Lenovo",
"children": [
{
"name": "lenovo a6020",
"children": {
"name": "screen broken",
"size": 1
}
},
{
"name": "lenovo a6020a40",
"children": [
{
"name": "battery",
"size": 60
},
{
"name": "bluetooth",
"size": 60
},
{
"name": "buttons",
"size": 60
}
]
},
{
"name": "lenovo k4",
"children": [
{
"name": "wi-fi",
"size": 3
},
{
"name": "bluetooth",
"size": 3
}
]
}
]
}
]
}
I have tried the pandas.DataFrame.to_dict method, but it returns a simple dictionary, whereas I want a nested one like the structure shown above.
Use:
print (df)
Model Problem size
0 lenovo a6020 screen broken 1
1 lenovo a6020a40 battery 60
2 NaN bluetooth 60
3 NaN buttons 60
4 lenovo k4 wi-fi 3
5 NaN bluetooth 3
#replace missing values by forward filling
df = df.ffill()
#split Model column on the first whitespace into 2 columns
df[['a','b']] = df['Model'].str.split(n=1, expand=True)
#each level convert to list of dictionaries
#for correct keys use rename
# Build the tree bottom-up: every groupby/apply pass folds one level into a
# list of dicts that is stored under 'children', and the grouping column of
# that level is then renamed to 'name' for the next pass.
L = (df.rename(columns={'Problem':'name'})
       # several columns must be selected with a list; a bare tuple such as
       # ['name','size'] is rejected by current pandas
       .groupby(['a','b'])[['name','size']]
       .apply(lambda x: x.to_dict('records'))
       .rename('children')
       .reset_index()
       .rename(columns={'b':'name'})
       .groupby('a')[['name','children']]
       .apply(lambda x: x.to_dict('records'))
       .rename('children')
       .reset_index()
       .rename(columns={'a':'name'})
       # full orient name; the short alias 'r' is not accepted any more
       .to_dict('records')
    )
#print (L)
#create outer level by constructor
d = { "name": "Brand", "children": L}
print (d)
{
'name': 'Brand',
'children': [{
'name': 'lenovo',
'children': [{
'name': 'a6020',
'children': [{
'name': 'screen broken',
'size': 1
}]
}, {
'name': 'a6020a40',
'children': [{
'name': 'battery',
'size': 60
}, {
'name': 'bluetooth',
'size': 60
}, {
'name': 'buttons',
'size': 60
}]
}, {
'name': 'k4',
'children': [{
'name': 'wi-fi',
'size': 3
}, {
'name': 'bluetooth',
'size': 3
}]
}]
}]
}

Python - Adding fields and labels to nested json file

I have a dataframe as follows:
Name_ID | URL | Count | Rating
------------------------------------------------
ABC | www.example.com/ABC | 10 | 5
123 | www.example.com/123 | 9 | 4
XYZ | www.example.com/XYZ | 5 | 2
ABC111 | www.example.com/ABC111 | 5 | 2
ABC121 | www.example.com/ABC121 | 5 | 2
222 | www.example.com/222 | 5 | 3
abc222 | www.example.com/abc222 | 4 | 2
ABCaaa | www.example.com/ABCaaa | 4 | 2
I am trying to create a JSON as follows:
{
"name": "sampledata",
"children": [
{
"name": 9,
"children": [
{
"name": 4,
"children": [
{
"name": "123",
"size": 100
}
]
}
]
},
{
"name": 10,
"children": [
{
"name": 5,
"children": [
{
"name": "ABC",
"size": 100
}
]
}
]
},
{
"name": 4,
"children": [
{
"name": 2,
"children": [
{
"name": "abc222",
"size": 50
},
{
"name": "ABCaaa",
"size": 50
}
]
}
]
},
{
"name": 5,
"children": [
{
"name": 2,
"children": [
{
"name": "ABC",
"size": 16
},
{
"name": "ABC111",
"size": 16
},
{
"name": "ABC121",
"size": 16
}
]
},
{
"name": 3,
"children": [
{
"name": "222",
"size": 50
}
]
}
]
}
]
}
In order to do that:
I am trying to add labels such as "name" and "children" to the json while creating it.
I tried something like
results = [{"name": i, "children": j} for i,j in results.items()]
But it won't label it properly I believe.
Also, add another field with the label "size", which I am planning to calculate based on the formula:
(Rating*Count*10000)/number_of_children_to_the_immediate_parent
Here is my dirty code:
import pandas as pd
from collections import defaultdict
import json
data =[('ABC', 'www.example.com/ABC', 10 , 5), ('123', 'www.example.com/123', 9, 4), ('XYZ', 'www.example.com/XYZ', 5, 2), ('ABC111', 'www.example.com/ABC111', 5, 2), ('ABC121', 'www.example.com/ABC121', 5, 2), ('222', 'www.example.com/222', 5, 3), ('abc222', 'www.example.com/abc222', 4, 2), ('ABCaaa', 'www.example.com/ABCaaa', 4, 2)]
df = pd.DataFrame(data, columns=['Name', 'URL', 'Count', 'Rating'])
gp = df.groupby(['Count'])
dict_json = {"name": "flare"}
children = []
for name, group in gp:
temp = {}
temp["name"] = name
temp["children"] = []
rgp = group.groupby(['Rating'])
for n, g in rgp:
temp2 = {}
temp2["name"] = n
temp2["children"] = g.reset_index().T.to_dict().values()
for t in temp2["children"]:
t["size"] = (t["Rating"] * t["Count"] * 10000) / len(temp2["children"])
t["name"] = t["Name"]
del t["Count"]
del t["Rating"]
del t["URL"]
del t["Name"]
del t["index"]
temp["children"].append(temp2)
children.append(temp)
dict_json["children"] = children
print json.dumps(dict_json, indent=4)
Though the above code does print what I need, I am looking for more efficient and cleaner way to do the same, mainly because the actual dataset might be even more nested and complicated. Any help/suggestion will be much appreciated.
Quite an interesting problem and a great question!
You can improve your approach by reorganizing the code inside the loops and using list comprehensions. No need to delete things and introduce temp variables inside loops:
dict_json = {"name": "flare"}
children = []
for name, group in gp:
temp = {"name": name, "children": []}
rgp = group.groupby(['Rating'])
for n, g in rgp:
temp["children"].append({
"name": n,
"children": [
{"name": row["Name"],
"size": row["Rating"] * row["Count"] * 10000 / len(g)}
for _, row in g.iterrows()
]
})
children.append(temp)
dict_json["children"] = children
Or, a "wrapped" version:
dict_json = {
"name": "flare",
"children": [
{
"name": name,
"children": [
{
"name": n,
"children": [
{
"name": row["Name"],
"size": row["Rating"] * row["Count"] * 10000 / len(g)
} for _, row in g.iterrows()
]
} for n, g in group.groupby(['Rating'])
]
} for name, group in gp
]
}
I'm getting the following dictionary printed for your sample input dataframe:
{
"name": "flare",
"children": [
{
"name": 4,
"children": [
{
"name": 2,
"children": [
{
"name": "abc222",
"size": 40000
},
{
"name": "ABCaaa",
"size": 40000
}
]
}
]
},
{
"name": 5,
"children": [
{
"name": 2,
"children": [
{
"name": "XYZ",
"size": 33333
},
{
"name": "ABC111",
"size": 33333
},
{
"name": "ABC121",
"size": 33333
}
]
},
{
"name": 3,
"children": [
{
"name": "222",
"size": 150000
}
]
}
]
},
{
"name": 9,
"children": [
{
"name": 4,
"children": [
{
"name": "123",
"size": 360000
}
]
}
]
},
{
"name": 10,
"children": [
{
"name": 5,
"children": [
{
"name": "ABC",
"size": 500000
}
]
}
]
}
]
}
If I understand correctly, what you want to do is put a groupby into a nested json; if that is the case, then you could use pandas groupby and cast it into a nested list of lists like so:
lol = pd.DataFrame(df.groupby(['Count','Rating'])\
.apply(lambda x: list(x['Name_ID']))).reset_index().values.tolist()
lol should look something like this:
[['10', '5', ['ABC']],
['4', '2', ['abc222', 'ABCaaa']],
['5', '2', ['XYZ ', 'ABC111', 'ABC121']],
['5', '3', ['222 ']],
['9', '4', ['123 ']]]
after that you could loop over lol to put it into a dict, but since you want to set nested items you'll have to use autovivification (check it out):
class autovividict(dict):
    """Dictionary whose missing keys spring into existence as nested dicts.

    Looking up a key that is not present stores a new, empty instance of the
    same class under that key and returns it, so chained assignments such as
    ``d[a][b] = value`` work without creating the intermediate level first.
    """

    def __missing__(self, key):
        """Create, store and return an empty nested mapping for ``key``.

        Called by ``dict.__getitem__`` when ``key`` is absent.
        """
        # type(self)() rather than autovividict() so subclasses nest themselves
        value = self[key] = type(self)()
        return value
d = autovividict()
for l in lol:
d[l[0]][l[1]] = l[2]
now you can use the json pack for printing and exporting:
# print is a function in Python 3; the statement form is a syntax error there.
print(json.dumps(d, indent=2))
In case you need more than one groupby, you could concat your groups with pandas, cast to lol, remove any nans, and then loop, let me know if a full example can help.
setup
from io import StringIO
import pandas as pd
txt = """Name_ID,URL,Count,Rating
ABC,www.example.com/ABC,10,5
123,www.example.com/123,9,4
XYZ,www.example.com/XYZ,5,2
ABC111,www.example.com/ABC111,5,2
ABC121,www.example.com/ABC121,5,2
222,www.example.com/222,5,3
abc222,www.example.com/abc222,4,2
ABCaaa,www.example.com/ABCaaa,4,2"""
df = pd.read_csv(StringIO(txt))
size
pre-calculate it
df['size'] = df.Count.mul(df.Rating) \
.mul(10000) \
.div(df.groupby(
['Count', 'Rating']).Name_ID.transform('count')
).astype(int)
solution
create recursive function
def h(d):
    """Recursively turn an indexed frame into nested name/children records.

    Parameters
    ----------
    d : pandas.DataFrame or pandas.Series
        Data with a ``size`` column whose index levels describe the
        hierarchy. A Series (a single row) is first promoted to a one-row
        frame.

    Returns
    -------
    dict or list of dict
        ``{'name': ..., 'size': ...}`` (both as strings) when the index has
        one level and one unique value; otherwise a list with one
        ``{'name': ..., 'children': ...}`` dict per group of the first index
        level, where ``children`` is the result of recursing into that group.
    """
    if isinstance(d, pd.Series):
        # a single row arrives as a Series; transpose it back into a frame
        d = d.to_frame().T
    # keep descending while there is more than one level or more than one key
    rec_cond = d.index.nlevels > 1 or d.index.nunique() > 1
    if not rec_cond:
        return {'name': str(d.index[0]), 'size': str(d['size'].iloc[0])}
    # xs(n) selects the group's rows and strips the level that was grouped on
    return [dict(name=str(n), children=h(g.xs(n))) for n, g in d.groupby(level=0)]
demo
import json
my_dict = dict(name='flare', children=h(df.set_index(['Count', 'Rating', 'Name_ID'])))
json.dumps(my_dict)
'{"name": "flare", "children": [{"name": "4", "children": [{"name": "2", "children": [{"name": "ABCaaa", "children": {"name": "ABCaaa", "size": "40000"}}, {"name": "abc222", "children": {"name": "abc222", "size": "40000"}}]}]}, {"name": "5", "children": [{"name": "2", "children": [{"name": "ABC111", "children": {"name": "ABC111", "size": "33333"}}, {"name": "ABC121", "children": {"name": "ABC121", "size": "33333"}}, {"name": "XYZ", "children": {"name": "XYZ", "size": "33333"}}]}, {"name": "3", "children": {"name": "222", "size": "150000"}}]}, {"name": "9", "children": [{"name": "4", "children": {"name": "123", "size": "360000"}}]}, {"name": "10", "children": [{"name": "5", "children": {"name": "ABC", "size": "500000"}}]}]}'
my_dict
{'children': [{'children': [{'children': [{'children': {'name': 'ABCaaa',
'size': '40000'},
'name': 'ABCaaa'},
{'children': {'name': 'abc222', 'size': '40000'}, 'name': 'abc222'}],
'name': '2'}],
'name': '4'},
{'children': [{'children': [{'children': {'name': 'ABC111', 'size': '33333'},
'name': 'ABC111'},
{'children': {'name': 'ABC121', 'size': '33333'}, 'name': 'ABC121'},
{'children': {'name': 'XYZ', 'size': '33333'}, 'name': 'XYZ'}],
'name': '2'},
{'children': {'name': '222', 'size': '150000'}, 'name': '3'}],
'name': '5'},
{'children': [{'children': {'name': '123', 'size': '360000'}, 'name': '4'}],
'name': '9'},
{'children': [{'children': {'name': 'ABC', 'size': '500000'}, 'name': '5'}],
'name': '10'}],
'name': 'flare'}

Categories