Get nested JSON from pandas dataframe grouped by multiple columns - python

I have a pandas dataframe:
d = {'key': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar', 'crow', 'crow', 'crow', 'crow'],
'date': ['2021-01-01', '2021-01-01', '2021-01-02', '2021-01-02', '2021-01-01', '2021-01-01','2021-01-02', '2021-01-02', '2021-01-01', '2021-01-01', '2021-01-02', '2021-01-02'],
'class': [1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2],
'count': [12, 3, 5, 5, 3, 1, 4, 1, 7, 3, 8, 2],
'percent': [.8, .2, .5, .5, .75, .25, .8, .2, .7, .3, .8, .2]
}
df = pd.DataFrame(data=d)
df
key date class count percent
0 foo 2021-01-01 1 12 0.80
1 foo 2021-01-01 2 3 0.20
2 foo 2021-01-02 1 5 0.50
3 foo 2021-01-02 2 5 0.50
4 bar 2021-01-01 1 3 0.75
5 bar 2021-01-01 2 1 0.25
6 bar 2021-01-02 1 4 0.80
7 bar 2021-01-02 2 1 0.20
8 crow 2021-01-01 1 7 0.70
9 crow 2021-01-01 2 3 0.30
10 crow 2021-01-02 1 8 0.80
11 crow 2021-01-02 2 2 0.20
I would like to create a nested JSON file grouped by key and date, where count is a list containing the sum of the counts of each key for every day, and percent holds the percentages of the class counts over the total count (there needs to be one list per day containing the percentage of each class).
[
[
{
"key": "foo",
"count": [
15,
10
],
"predictions": [
[
.80,
.20
],
[
.50,
.50,
]
]
},
{
"key": "bar",
"count": [
4,
5
],
"predictions": [
[
.75,
.25
],
[
.80,
.20
]
]
},
{
"key": "crow",
"count": [
10,
10
],
"predictions": [
[
.70,
.30
],
[
.80,
.20
]
]
}
]
]
So far I have:
import json
# Group the frame defined above (`df`) by key and date and turn every group
# into a list of row dicts. The orient is spelled out in full because the
# one-letter abbreviation "r" is rejected by pandas 2.x.
dfj = df.groupby(["key","date"]).apply(lambda x: x.to_dict("records")).to_json(orient="records")
# Round-trip through json only to pretty-print the serialized string.
print(json.dumps(json.loads(dfj), indent=2, sort_keys=True))
which returns:
[
[
{
"class": 1,
"count": 3,
"date": "2021-01-01",
"key": "bar",
"percent": 0.75
},
{
"class": 2,
"count": 1,
"date": "2021-01-01",
"key": "bar",
"percent": 0.25
}
],
[
{
"class": 1,
"count": 4,
"date": "2021-01-02",
"key": "bar",
"percent": 0.8
},
{
"class": 2,
"count": 1,
"date": "2021-01-02",
"key": "bar",
"percent": 0.2
}
],
[
{
"class": 1,
"count": 7,
"date": "2021-01-01",
"key": "crow",
"percent": 0.7
},
{
"class": 2,
"count": 3,
"date": "2021-01-01",
"key": "crow",
"percent": 0.3
}
],
[
{
"class": 1,
"count": 8,
"date": "2021-01-02",
"key": "crow",
"percent": 0.8
},
{
"class": 2,
"count": 2,
"date": "2021-01-02",
"key": "crow",
"percent": 0.2
}
],
[
{
"class": 1,
"count": 12,
"date": "2021-01-01",
"key": "foo",
"percent": 0.8
},
{
"class": 2,
"count": 3,
"date": "2021-01-01",
"key": "foo",
"percent": 0.2
}
],
[
{
"class": 1,
"count": 5,
"date": "2021-01-02",
"key": "foo",
"percent": 0.5
},
{
"class": 2,
"count": 5,
"date": "2021-01-02",
"key": "foo",
"percent": 0.5
}
]
]
Any help would be appreciated. Thank you.

You can use:
# Named aggregation spec: output column -> (source column, aggregation).
d = {'count': ('count', 'sum'), 'predictions': ('percent', list)}
# Step 1: one row per (key, date) with the summed count and the list of percents.
# Step 2: regroup on the first index level (key) and collect the per-day values
# into lists.
g = df.groupby(['key', 'date']).agg(**d).groupby(level=0).agg(list)
# Step 3: one dict per key. The orient is written out as 'index' because the
# abbreviation 'i' is rejected by pandas 2.x.
dct = [{'key': k, **v} for k, v in g.to_dict('index').items()]
Details:
groupby the given dataframe on key and date and agg using the dictionary d,
groupby the aggregated frame from step 1 on level=0 and agg using list
Finally using to_dict with orient=index to convert the frame from step 2 to dictionary followed by dict comprehension to add the key variable in dictionary.
Result:
[{'key': 'bar', 'count': [4, 5], 'predictions': [[0.75, 0.25], [0.8, 0.2]]},
{'key': 'crow', 'count': [10, 10], 'predictions': [[0.7, 0.3], [0.8, 0.2]]},
{'key': 'foo', 'count': [15, 10], 'predictions': [[0.8, 0.2], [0.5, 0.5]]}]

Related

Parse JSON from Api to pandas dataframe with doubled names

I'm parsing a json and I don't understand how to correctly decompose it into a dataframe.
Json structure i have (api response):
{
"result": {
"data": [],
"totals": [
0
]
},
"timestamp": "2021-11-25 15:19:21"
}
response_ =
{
"result":{
"data":[
{
"dimensions":[
{
"id":"2023-01-10",
"name":""
},
{
"id":"123",
"name":"good3"
}
],
"metrics":[
10,
20,
30,
40
]
},
{
"dimensions":[
{
"id":"2023-01-10",
"name":""
},
{
"id":"234",
"name":"good2"
}
],
"metrics":[
1,
2,
3,
4
]
}
],
"totals":[
11,
22,
33,
44
]
},
"timestamp":"2023-02-07 12:58:40"
}
I don't need "timestamp" and "totals" - just "data". So i do:
...
response_ = requests.post(url, headers=head, data=body)
datas = response_.json()
datas_ = datas['result']['data']
df1 = pd.json_normalize(datas_)
I got:
dimensions
metrics
0
[{'id': '2023-01-10', 'name': ''}, {'id': '123', 'name': 'good1'}]
[10, 20, 30, 40]
1
[{'id': '2023-01-10', 'name': ''}, {'id': '234', 'name': 'good2'}]
[1, 2, 3, 4]
But i need dataframe like:
id_
name_
id
name
metric1
metric2
metric3
metric4
0
2023-01-10
123
good1
10
20
30
40
1
2023-01-10
234
good2
1
2
3
4
When i try like:
df1 = pd.json_normalize(datas_, 'dimensions')
i get all id's and name's in one column.
Explain step by step if possible. Thank you.
Try:
# Sample API payload, identical in shape to the response shown in the question.
response = {
    "result": {
        "data": [
            {
                "dimensions": [
                    {"id": "2023-01-10", "name": ""},
                    {"id": "123", "name": "good3"},
                ],
                "metrics": [10, 20, 30, 40],
            },
            {
                "dimensions": [
                    {"id": "2023-01-10", "name": ""},
                    {"id": "234", "name": "good2"},
                ],
                "metrics": [1, 2, 3, 4],
            },
        ],
        "totals": [11, 22, 33, 44],
    },
    "timestamp": "2023-02-07 12:58:40",
}

# Flatten every entry of result.data into a single row dict:
#   * keys of the first dimension get a trailing underscore ("id_", "name_"),
#   * keys of the second dimension are kept unchanged ("id", "name"),
#   * metrics become "metric1", "metric2", ... (numbered from 1).
tmp = []
for record in response["result"]["data"]:
    first_dim = record["dimensions"][0]
    second_dim = record["dimensions"][1]

    row = {}
    for field, value in first_dim.items():
        row[f"{field}_"] = value
    row.update(second_dim)
    for position, metric in enumerate(record["metrics"], start=1):
        row[f"metric{position}"] = metric

    tmp.append(row)

# One DataFrame row per flattened record.
df = pd.DataFrame(tmp)
print(df)
Prints:
id_ name_ id name metric1 metric2 metric3 metric4
0 2023-01-10 123 good3 10 20 30 40
1 2023-01-10 234 good2 1 2 3 4

Fetching multiple values and keys from a dict

movies={
'actors':{'prabhas':{'knownAs':'Darling', 'awards':{'nandi':1, 'cinemaa':1, 'siima':1},'remuneration':100, 'hits':{'industry':2, 'super':3,'flops':8}, 'age':41, 'height':6.1, 'mStatus':'single','sRate':'35%'},
'pavan':{'knownAs':'Power Star', 'awards':{'nandi':2, 'cinemaa':2, 'siima':5}, 'hits':{'industry':2, 'super':7,'flops':16}, 'age':48, 'height':5.9, 'mStatus':'married','sRate':'37%','remuneration':50},
},
'actress':{
'tamanna':{'knownAs':'Milky Beauty', 'awards':{'nandi':0, 'cinemaa':1, 'siima':1}, 'remuneration':10, 'hits':{'industry':1, 'super':7,'flops':11}, 'age':28, 'height':5.9, 'mStatus':'single', 'sRate':'40%'},
'rashmika':{'knownAs':'Butter Milky Beauty', 'awards':{'nandi':0, 'cinemaa':0, 'siima':2}, 'remuneration':12,'hits':{'industry':0, 'super':4,'flops':2}, 'age':36, 'height':5.9, 'mStatus':'single', 'sRate':'30%'},
1.What are the total number of Nandi Awards won by actors?
2. What is the success rate of Prince?
3.What is the name of Prince?
you can answer the first question with this:
import jmespath
movies={
"actors": {
"prabhas": {
"knownAs": "Darling",
"awards": {
"nandi": 1,
"cinemaa": 1,
"siima": 1
},
"remuneration": 100,
"hits": {
"industry": 2,
"super": 3,
"flops": 8
},
"age": 41,
"height": 6.1,
"mStatus": "single",
"sRate": "35%"
},
"pavan": {
"knownAs": "Power Star",
"awards": {
"nandi": 2,
"cinemaa": 2,
"siima": 5
},
"hits": {
"industry": 2,
"super": 7,
"flops": 16
},
"age": 48,
"height": 5.9,
"mStatus": "married",
"sRate": "37%",
"remuneration": 50
}
},
"actress": {
"tamanna": {
"knownAs": "Milky Beauty",
"awards": {
"nandi": 0,
"cinemaa": 1,
"siima": 1
},
"remuneration": 10,
"hits": {
"industry": 1,
"super": 7,
"flops": 11
},
"age": 28,
"height": 5.9,
"mStatus": "single",
"sRate": "40%"
},
"rashmika": {
"knownAs": "Butter Milky Beauty",
"awards": {
"nandi": 0,
"cinemaa": 0,
"siima": 2
},
"remuneration": 12,
"hits": {
"industry": 0,
"super": 4,
"flops": 2
},
"age": 36,
"height": 5.9,
"mStatus": "single",
"sRate": "30%"
}
}
}
# Total "nandi" awards for the entries under movies["actors"].
# The inner search collects the "nandi" values and the outer '[]' (JMESPath's
# flatten expression) turns the nested result into one flat list for sum().
# NOTE(review): 'actors.*.*.nandi' looks for a "nandi" key in every nested dict
# of an actor, not only in "awards" - confirm that is the intent.
total_nandies_by_actors = sum(jmespath.search('[]',jmespath.search('actors.*.*.nandi',movies)))
but there is no Prince in the data you've provided

Python Pandas - Convert dataframe into json

I have this pandas.dataframe:
date. pid value interval
0 2021-09-05 00:04:24 1 5.554 2021-09-05 00:00:00
1 2021-09-05 00:06:38 1 4.359 2021-09-05 00:05:00
2 2021-09-05 00:06:46 1 18.364 2021-09-05 00:05:00
3 2021-09-05 00:04:24 2 15.554 2021-09-05 00:00:00
4 2021-09-05 00:06:38 2 3.359 2021-09-05 00:05:00
5 2021-09-05 00:06:46 2 10.364 2021-09-05 00:05:00
which I want to turn into JSON like this:
{
"2021-09-05 00:00:00": {
"pid1": [
{
"date": "2021-09-05 00:04:24",
"pid": 1,
"value": 5.554,
},
],
"pid2": [
{
"date": "2021-09-05 00:04:24",
"pid": 2,
"value": 15.554,
}
],
},
"2021-09-05 00:05:00": {
"pid1": [
{
"date": "2021-09-05 00:04:24",
"pid": 1,
"value": 4.359,
},
{
"date": "2021-09-05 00:04:24",
"pid": 1,
"value": 18.364,
},
],
"pid2": [
{
"date": "2021-09-05 00:06:38",
"pid": 2,
"value": 3.359,
},{
"date": "2021-09-05 00:06:46",
"pid": 1,
"value": 10.364,
},
],
}
}
Basically, I want to group the data by the interval value.
Is there a quick way to format this?
Create a helper column from pid, convert to a Series with a MultiIndex, and finally create the nested dictionary:
# Helper labels "pid1", "pid2", ... used as the second grouping key.
pid_labels = 'pid' + df['pid'].astype(str)
rows_by_interval_and_pid = (
    df.assign(new=pid_labels)
    .groupby(['interval', 'new'])[['date', 'pid', 'value']]
)
# Series indexed by (interval, pid label); each value is that group's rows as
# a list of record dicts.
s = rows_by_interval_and_pid.apply(lambda rows: rows.to_dict(orient='records'))

# Outer key: interval; inner key: pid label.
d = {}
for level in s.index.levels[0]:
    d[level] = s.xs(level).to_dict()
print(d)
{
'2021-09-05 00:00:00': {
'pid1': [{
'date': '2021-09-05 00:04:24',
'pid': 1,
'value': 5.554
}],
'pid2': [{
'date': '2021-09-05 00:04:24',
'pid': 2,
'value': 15.554
}]
},
'2021-09-05 00:05:00': {
'pid1': [{
'date': '2021-09-05 00:06:38',
'pid': 1,
'value': 4.359
},
{
'date': '2021-09-05 00:06:46',
'pid': 1,
'value': 18.364
}
],
'pid2': [{
'date': '2021-09-05 00:06:38',
'pid': 2,
'value': 3.359
},
{
'date': '2021-09-05 00:06:46',
'pid': 2,
'value': 10.364
}
]
}
}
Last for json use:
import json
# Keep the serialized text under its own name: assigning the result to `json`
# would shadow the module and break any later json.* call.
json_str = json.dumps(d)

Complex Pandas Dataframe to Nested Dictionary/JSON

I have 3 Dataframes, I have merged them into a single one, and want to represent the dataframe into a Nested dictionary / json format.
df1: This contains general information about a patient.
>>> df1 = pd.DataFrame({'PatientId' : [1,2], 'Gender' : ['M', 'F'], 'Marital_status':['married', 'unmarried']})
>>> df1
PatientId Gender Marital_status
1 M married
2 F unmarried
df2:
this contains details of each admission of a patient, and the diagnosis.
>>> df2 = pd.DataFrame({'PatientId': [1,1,2,2], 'AdmissionId' : [1,2,1,2], 'Diagnosis_Code': ['DXS', 'SDE', 'DEF', 'ATR'], 'Stay_Duration' : [45,14,79,32]})
>>> df2
PatientId AdmissionId Diagnosis_Code Stay_Duration
1 1 DXS 45
1 2 SDE 14
2 1 DEF 79
2 2 ATR 32
df3:
this dataframe contains all the lab test reports of a patient carried out in each admission.
>>> df3 = pd.DataFrame(
{
'PatientId':[1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2],
'AdmissionId' : [1,1,1,1,2,2,2,2,1,1,1,1,2,2,2,2],
'LabTest' : [1,1,2,2,1,1,2,2,1,1,2,2,1,1,2,2],
'LabName' : ['ABC', 'XYZ', 'ABC', 'XYZ', 'PQR', 'XYZ', 'ABC', 'XYZ', 'ABC', 'XYZ', 'ABC', 'XYZ', 'PQR', 'XYZ', 'ABC', 'PQR'],
'LabValue' : [5.7, 1.9, 5.6, 2.4, 5.7, 1.9, 5.6, 2.4, 5.7, 1.9, 5.6, 2.4, 5.7, 1.9, 5.6, 2.4],
'IsNormal' : [True, False, True, True, True, False, True, True, True, False, True, True, True, False, True, True]
}
)
>>> df3
PatientId AdmissionId LabTest LabName LabValue IsNormal
1 1 1 ABC 5.7 True
1 1 1 XYZ 1.9 False
1 1 2 ABC 5.6 True
1 1 2 XYZ 2.4 True
1 2 1 PQR 5.7 True
1 2 1 XYZ 1.9 False
1 2 2 ABC 5.6 True
1 2 2 XYZ 2.4 True
2 1 1 ABC 5.7 True
2 1 1 XYZ 1.9 False
2 1 2 ABC 5.6 True
2 1 2 XYZ 2.4 True
2 2 1 PQR 5.7 True
2 2 1 XYZ 1.9 False
2 2 2 ABC 5.6 True
2 2 2 PQR 2.4 True
I want my output to look like this --
"PatientId" : 1
"Gender":M
"Marital_Status" : married
"AdmissionsInfo":
{
"AdmissionID": 1
"Diagnosis": DXS
"Stay_Duration" : 45
"lab reports" :
{
"labtest":1
"labinfo":
{
"labName":ABC
"labValue":5.6
"isNormal":True
},
{
"labName": XYZ
"labValue": 2.4
"isNormal": True
}
"labtest":2
"labinfo":
{
"labName":ABC
"labValue":5.7
"isNormal":True
},
{
"labName": XYZ
"labValue": 1.9
"isNormal":False
}
}
"AdmissionID": 2
"Diagnosis": SDE
"Stay_Duration" : 45
/
/
//
} end of patient 1's all admissions' info
"PatientId" : 2
"Gender": F
"Marital_Status" : unmarried
"AdmissionsInfo":
//
//
and so on }}}
Find below a full pandas (although redundant) solution.
First merge your three dataframes into one called df_merged:
df_merged = df3.merge(df1, on="PatientId").merge(df2, on=["PatientId", "AdmissionId"])
Now create the hierarchy you need (this part is ugly, but works, happy to receive feedback on it):
# Build the nesting from the innermost level outwards. Each pass groups by the
# columns that stay at the outer level, packs the remaining columns of every
# group into a list of record dicts, and names the new column
# (reset_index() labels it 0).
# The result is bound to `j` because the json.loads(j) call that follows reads
# it, and the orient is spelled "records" because the "r" abbreviation is
# rejected by pandas 2.x.
j = (df_merged.groupby(["PatientId", "Gender", "Marital_status", "AdmissionId", "Diagnosis_Code", "Stay_Duration", "LabTest"])
    # innermost level: the lab results of one lab test
    .apply(lambda x: x[["LabName", "LabValue", "IsNormal"]].to_dict("records"))
    .reset_index()
    .rename(columns={0:"LabInfo"})
    .groupby(["PatientId", "Gender", "Marital_status", "AdmissionId", "Diagnosis_Code", "Stay_Duration"])
    # middle level: the lab tests of one admission
    .apply(lambda x: x[["LabTest", "LabInfo"]].to_dict("records"))
    .reset_index()
    .rename(columns={0:"LabReports"})
    .groupby(["PatientId", "Gender", "Marital_status"])
    # outer level: the admissions of one patient
    .apply(lambda x: x[["AdmissionId", "Diagnosis_Code", "Stay_Duration", "LabReports"]].to_dict("records"))
    .reset_index()
    .rename(columns={0:"AdmissionsInfo"})
    .to_json(orient="records"))
And dump that into a json object:
>>> import json
>>> print(json.dumps(json.loads(j), indent=2, sort_keys=False))
And the result:
[
{
"PatientId": 1,
"Gender": "M",
"Marital_status": "married",
"AdmissionsInfo": [
{
"AdmissionId": "1",
"Diagnosis_Code": "DXS",
"Stay_Duration": "45",
"LabReports": [
{
"LabTest": "1",
"LabInfo": [
{
"LabName": "ABC",
"LabValue": "5.7",
"IsNormal": "True"
},
{
"LabName": "XYZ",
"LabValue": "1.9",
"IsNormal": "False"
}
]
},
{
"LabTest": "2",
"LabInfo": [
{
"LabName": "ABC",
"LabValue": "5.6",
"IsNormal": "True"
},
{
"LabName": "XYZ",
"LabValue": "2.4",
"IsNormal": "True"
}
]
}
]
},
{
"AdmissionId": "2",
"Diagnosis_Code": "SDE",
"Stay_Duration": "14",
"LabReports": [
{
"LabTest": "1",
"LabInfo": [
{
"LabName": "PQR",
"LabValue": "5.7",
"IsNormal": "True"
},
{
"LabName": "XYZ",
"LabValue": "1.9",
"IsNormal": "False"
}
]
},
{
"LabTest": "2",
"LabInfo": [
{
"LabName": "ABC",
"LabValue": "5.6",
"IsNormal": "True"
},
{
"LabName": "XYZ",
"LabValue": "2.4",
"IsNormal": "True"
}
]
}
]
}
]
},
{
"PatientId": 2,
"Gender": "F",
"Marital_status": "unmarried",
"AdmissionsInfo": [
{
"AdmissionId": "1",
"Diagnosis_Code": "DEF",
"Stay_Duration": "79",
"LabReports": [
{
"LabTest": "1",
"LabInfo": [
{
"LabName": "ABC",
"LabValue": "5.7",
"IsNormal": "True"
},
{
"LabName": "XYZ",
"LabValue": "1.9",
"IsNormal": "False"
}
]
},
{
"LabTest": "2",
"LabInfo": [
{
"LabName": "ABC",
"LabValue": "5.6",
"IsNormal": "True"
},
{
"LabName": "XYZ",
"LabValue": "2.4",
"IsNormal": "True"
}
]
}
]
},
{
"AdmissionId": "2",
"Diagnosis_Code": "ATR",
"Stay_Duration": "32",
"LabReports": [
{
"LabTest": "1",
"LabInfo": [
{
"LabName": "PQR",
"LabValue": "5.7",
"IsNormal": "True"
},
{
"LabName": "XYZ",
"LabValue": "1.9",
"IsNormal": "False"
}
]
},
{
"LabTest": "2",
"LabInfo": [
{
"LabName": "ABC",
"LabValue": "5.6",
"IsNormal": "True"
},
{
"LabName": "PQR",
"LabValue": "2.4",
"IsNormal": "True"
}
]
}
]
}
]
}
]
In [84]: json_list = []
In [85]: for index1, row1 in df1.iterrows():
...: d = dict(row1)
...: json_list.append(d)
...: for index2, row2 in df2[df2['PatientId'] == row1['PatientId']].iterrows():
...: d2 = dict(row2)
...: del d2['PatientId']
...: d['AdmissionsInfo'] = d2
...: lab_reports_list = []
...: for index3, row3 in df3[(df3['PatientId'] == row2['PatientId']) & (df3['AdmissionId'] == row2['AdmissionId'])].iterrows():
...: d3 = dict(row3)
...: d4 = {}
...: d4['labtest'] = row3['LabTest']
...: d4['labinfo'] = {'labName': row3['LabName'], 'labValue': row3['LabValue'], 'isNormal': row3['IsNormal']}
...: lab_reports_list.append(d4)
...: d2['lab reports'] = lab_reports_list
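Output (only the last admission of each patient appears, because d['AdmissionsInfo'] is reassigned on every pass of the inner loop):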
In [87]: json_list
Out[87]:
[{'PatientId': 1,
'Gender': 'M',
'Marital_status': 'married',
'AdmissionsInfo': {'AdmissionId': 2,
'Diagnosis_Code': 'SDE',
'Stay_Duration': 14,
'lab reports': [{'labtest': 1,
'labinfo': {'labName': 'PQR', 'labValue': 5.7, 'isNormal': True}},
{'labtest': 1,
'labinfo': {'labName': 'XYZ', 'labValue': 1.9, 'isNormal': False}},
{'labtest': 2,
'labinfo': {'labName': 'ABC', 'labValue': 5.6, 'isNormal': True}},
{'labtest': 2,
'labinfo': {'labName': 'XYZ', 'labValue': 2.4, 'isNormal': True}}]}},
{'PatientId': 2,
'Gender': 'F',
'Marital_status': 'unmarried',
'AdmissionsInfo': {'AdmissionId': 2,
'Diagnosis_Code': 'ATR',
'Stay_Duration': 32,
'lab reports': [{'labtest': 1,
'labinfo': {'labName': 'PQR', 'labValue': 5.7, 'isNormal': True}},
{'labtest': 1,
'labinfo': {'labName': 'XYZ', 'labValue': 1.9, 'isNormal': False}},
{'labtest': 2,
'labinfo': {'labName': 'ABC', 'labValue': 5.6, 'isNormal': True}},
{'labtest': 2,
'labinfo': {'labName': 'PQR', 'labValue': 2.4, 'isNormal': True}}]}}]
Arnaud's response does the job but seems not very "Pythonic".
There might be something possible with aggregate functions on DataFrameGroupBy. I tried .agg(dict), but it did not work.
for those wanting to help:
patient_df = pd.DataFrame({'PatientId' : [1,2], 'Gender' : ['M', 'F'], 'Marital_status':['married', 'unmarried']})
admission_df = pd.DataFrame({'PatientId': [1,1,2,2], 'AdmissionId' : [1,2,1,2], 'Diagnosis_Code': ['DXS', 'SDE', 'DEF', 'ATR'], 'Stay_Duration' : [45,14,79,32]})
lab_df = pd.DataFrame(
{
'PatientId':[1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2],
'AdmissionId' : [1,1,1,1,2,2,2,2,1,1,1,1,2,2,2,2],
'LabTest' : [1,1,2,2,1,1,2,2,1,1,2,2,1,1,2,2],
'LabName' : ['ABC', 'XYZ', 'ABC', 'XYZ', 'PQR', 'XYZ', 'ABC', 'XYZ', 'ABC', 'XYZ', 'ABC', 'XYZ', 'PQR', 'XYZ', 'ABC', 'PQR'],
'LabValue' : [5.7, 1.9, 5.6, 2.4, 5.7, 1.9, 5.6, 2.4, 5.7, 1.9, 5.6, 2.4, 5.7, 1.9, 5.6, 2.4],
'IsNormal' : [True, False, True, True, True, False, True, True, True, False, True, True, True, False, True, True]
}
)

Python - Adding fields and labels to nested json file

I have a dataframe as follows:
Name_ID | URL | Count | Rating
------------------------------------------------
ABC | www.example.com/ABC | 10 | 5
123 | www.example.com/123 | 9 | 4
XYZ | www.example.com/XYZ | 5 | 2
ABC111 | www.example.com/ABC111 | 5 | 2
ABC121 | www.example.com/ABC121 | 5 | 2
222 | www.example.com/222 | 5 | 3
abc222 | www.example.com/abc222 | 4 | 2
ABCaaa | www.example.com/ABCaaa | 4 | 2
I am trying to create a JSON as follows:
{
"name": "sampledata",
"children": [
{
"name": 9,
"children": [
{
"name": 4,
"children": [
{
"name": "123",
"size": 100
}
]
}
]
},
{
"name": 10,
"children": [
{
"name": 5,
"children": [
{
"name": "ABC",
"size": 100
}
]
}
]
},
{
"name": 4,
"children": [
{
"name": 2,
"children": [
{
"name": "abc222",
"size": 50
},
{
"name": "ABCaaa",
"size": 50
}
]
}
]
},
{
"name": 5,
"children": [
{
"name": 2,
"children": [
{
"name": "ABC",
"size": 16
},
{
"name": "ABC111",
"size": 16
},
{
"name": "ABC121",
"size": 16
}
]
},
{
"name": 3,
"children": [
{
"name": "222",
"size": 50
}
]
}
]
}
]
}
In order to do that:
I am trying to add labels such as "name" and "children" to the json while creating it.
I tried something like
results = [{"name": i, "children": j} for i,j in results.items()]
But it won't label it properly I believe.
Also, add another field with the label "size", which I am planning to calculate based on the formula:
(Rating*Count*10000)/number_of_children_to_the_immediate_parent
Here is my dirty code:
import pandas as pd
from collections import defaultdict
import json
data =[('ABC', 'www.example.com/ABC', 10 , 5), ('123', 'www.example.com/123', 9, 4), ('XYZ', 'www.example.com/XYZ', 5, 2), ('ABC111', 'www.example.com/ABC111', 5, 2), ('ABC121', 'www.example.com/ABC121', 5, 2), ('222', 'www.example.com/222', 5, 3), ('abc222', 'www.example.com/abc222', 4, 2), ('ABCaaa', 'www.example.com/ABCaaa', 4, 2)]
df = pd.DataFrame(data, columns=['Name', 'URL', 'Count', 'Rating'])
gp = df.groupby(['Count'])
dict_json = {"name": "flare"}
children = []
for name, group in gp:
temp = {}
temp["name"] = name
temp["children"] = []
rgp = group.groupby(['Rating'])
for n, g in rgp:
temp2 = {}
temp2["name"] = n
temp2["children"] = g.reset_index().T.to_dict().values()
for t in temp2["children"]:
t["size"] = (t["Rating"] * t["Count"] * 10000) / len(temp2["children"])
t["name"] = t["Name"]
del t["Count"]
del t["Rating"]
del t["URL"]
del t["Name"]
del t["index"]
temp["children"].append(temp2)
children.append(temp)
dict_json["children"] = children
print json.dumps(dict_json, indent=4)
Though the above code does print what I need, I am looking for more efficient and cleaner way to do the same, mainly because the actual dataset might be even more nested and complicated. Any help/suggestion will be much appreciated.
Quite an interesting problem and a great question!
You can improve your approach by reorganizing the code inside the loops and using list comprehensions. No need to delete things and introduce temp variables inside loops:
# Build the hierarchy root -> Count -> Rating -> one leaf per row.
dict_json = {"name": "flare"}
children = []
# Group by the scalar key 'Count' (not the list ['Count']): with a
# one-element list pandas 2.x hands back 1-tuples such as (4,) as group names,
# which would end up as JSON arrays instead of plain values.
for name, group in df.groupby('Count'):
    temp = {"name": name, "children": []}
    # Same reason for the scalar 'Rating' key.
    rgp = group.groupby('Rating')
    for n, g in rgp:
        temp["children"].append({
            "name": n,
            "children": [
                # size = Rating * Count * 10000 shared evenly between the
                # rows of this (Count, Rating) group
                {"name": row["Name"],
                 "size": row["Rating"] * row["Count"] * 10000 / len(g)}
                for _, row in g.iterrows()
            ]
        })
    children.append(temp)
dict_json["children"] = children
Or, a "wrapped" version:
# Single-expression variant: root -> Count -> Rating -> one leaf per row.
# Both groupby calls use scalar keys ('Count', 'Rating') rather than
# one-element lists, because pandas 2.x yields 1-tuples such as (4,) as group
# names for list keys, which would serialize as JSON arrays.
dict_json = {
    "name": "flare",
    "children": [
        {
            "name": name,
            "children": [
                {
                    "name": n,
                    "children": [
                        {
                            "name": row["Name"],
                            # Rating * Count * 10000 shared evenly between
                            # the rows of this (Count, Rating) group
                            "size": row["Rating"] * row["Count"] * 10000 / len(g)
                        } for _, row in g.iterrows()
                    ]
                } for n, g in group.groupby('Rating')
            ]
        } for name, group in df.groupby('Count')
    ]
}
I'm getting the following dictionary printed for your sample input dataframe:
{
"name": "flare",
"children": [
{
"name": 4,
"children": [
{
"name": 2,
"children": [
{
"name": "abc222",
"size": 40000
},
{
"name": "ABCaaa",
"size": 40000
}
]
}
]
},
{
"name": 5,
"children": [
{
"name": 2,
"children": [
{
"name": "XYZ",
"size": 33333
},
{
"name": "ABC111",
"size": 33333
},
{
"name": "ABC121",
"size": 33333
}
]
},
{
"name": 3,
"children": [
{
"name": "222",
"size": 150000
}
]
}
]
},
{
"name": 9,
"children": [
{
"name": 4,
"children": [
{
"name": "123",
"size": 360000
}
]
}
]
},
{
"name": 10,
"children": [
{
"name": 5,
"children": [
{
"name": "ABC",
"size": 500000
}
]
}
]
}
]
}
If I understand correctly, what you want to do is put a groupby result into nested JSON. If that is the case, you could use pandas groupby and cast the result into a nested list of lists like so:
# One entry per (Count, Rating) pair holding the list of Name_ID values in it.
names_per_group = df.groupby(['Count', 'Rating']).apply(
    lambda grp: list(grp['Name_ID'])
)
# Move the group keys back into columns and dump the rows as plain lists:
# [count, rating, [names...]].
lol = pd.DataFrame(names_per_group).reset_index().values.tolist()
lol should look something like this:
[['10', '5', ['ABC']],
['4', '2', ['abc222', 'ABCaaa']],
['5', '2', ['XYZ ', 'ABC111', 'ABC121']],
['5', '3', ['222 ']],
['9', '4', ['123 ']]]
After that you could loop over lol to put it into a dict, but since you want to set nested items you'll have to use autovivification (check it out):
class autovividict(dict):
    """Dict that creates nested dicts of its own type on first access.

    Looking up a missing key with ``d[key]`` stores a new, empty instance of
    the same class under that key and returns it, so chained assignments such
    as ``d[a][b] = value`` work without preparing the intermediate levels.
    """

    def __missing__(self, key):
        # type(self) rather than autovividict, so subclasses nest as themselves.
        child = type(self)()
        self[key] = child
        return child
d = autovividict()
for l in lol:
d[l[0]][l[1]] = l[2]
now you can use the json pack for printing and exporting:
# print() call form: the bare Python 2 print statement is a SyntaxError on Python 3.
print(json.dumps(d, indent=2))
In case you need more than one groupby, you could concat your groups with pandas, cast to lol, remove any nans, and then loop, let me know if a full example can help.
setup
from io import StringIO
import pandas as pd
txt = """Name_ID,URL,Count,Rating
ABC,www.example.com/ABC,10,5
123,www.example.com/123,9,4
XYZ,www.example.com/XYZ,5,2
ABC111,www.example.com/ABC111,5,2
ABC121,www.example.com/ABC121,5,2
222,www.example.com/222,5,3
abc222,www.example.com/abc222,4,2
ABCaaa,www.example.com/ABCaaa,4,2"""
df = pd.read_csv(StringIO(txt))
size
pre-calculate it
df['size'] = df.Count.mul(df.Rating) \
.mul(10000) \
.div(df.groupby(
['Count', 'Rating']).Name_ID.transform('count')
).astype(int)
solution
create recursive function
def h(d):
    """Recursively turn an indexed frame into nested ``name``/``children`` data.

    ``d`` is a DataFrame (or a single row as a Series) that has a ``size``
    column. While the index has more than one level, or more than one distinct
    value, the frame is split on its first index level and a list with one
    ``{'name': ..., 'children': ...}`` dict per group is returned. Otherwise a
    leaf dict ``{'name': ..., 'size': ...}`` is returned, taken from the first
    index value and the first ``size`` entry. All names and sizes are
    converted with ``str``.

    Note that ``children`` holds a list when the recursion splits again and a
    single leaf dict when it does not.
    """
    # A Series is a single row: turn it back into a one-row frame.
    frame = d.to_frame().T if isinstance(d, pd.Series) else d
    idx = frame.index

    if idx.nlevels > 1 or idx.nunique() > 1:
        # xs(key) selects the group's rows and drops the consumed index level.
        return [
            {'name': str(key), 'children': h(sub.xs(key))}
            for key, sub in frame.groupby(level=0)
        ]

    return {'name': str(idx[0]), 'size': str(frame['size'].iloc[0])}
demo
import json
my_dict = dict(name='flare', children=h(df.set_index(['Count', 'Rating', 'Name_ID'])))
json.dumps(my_dict)
'{"name": "flare", "children": [{"name": "4", "children": [{"name": "2", "children": [{"name": "ABCaaa", "children": {"name": "ABCaaa", "size": "40000"}}, {"name": "abc222", "children": {"name": "abc222", "size": "40000"}}]}]}, {"name": "5", "children": [{"name": "2", "children": [{"name": "ABC111", "children": {"name": "ABC111", "size": "33333"}}, {"name": "ABC121", "children": {"name": "ABC121", "size": "33333"}}, {"name": "XYZ", "children": {"name": "XYZ", "size": "33333"}}]}, {"name": "3", "children": {"name": "222", "size": "150000"}}]}, {"name": "9", "children": [{"name": "4", "children": {"name": "123", "size": "360000"}}]}, {"name": "10", "children": [{"name": "5", "children": {"name": "ABC", "size": "500000"}}]}]}'
my_dict
{'children': [{'children': [{'children': [{'children': {'name': 'ABCaaa',
'size': '40000'},
'name': 'ABCaaa'},
{'children': {'name': 'abc222', 'size': '40000'}, 'name': 'abc222'}],
'name': '2'}],
'name': '4'},
{'children': [{'children': [{'children': {'name': 'ABC111', 'size': '33333'},
'name': 'ABC111'},
{'children': {'name': 'ABC121', 'size': '33333'}, 'name': 'ABC121'},
{'children': {'name': 'XYZ', 'size': '33333'}, 'name': 'XYZ'}],
'name': '2'},
{'children': {'name': '222', 'size': '150000'}, 'name': '3'}],
'name': '5'},
{'children': [{'children': {'name': '123', 'size': '360000'}, 'name': '4'}],
'name': '9'},
{'children': [{'children': {'name': 'ABC', 'size': '500000'}, 'name': '5'}],
'name': '10'}],
'name': 'flare'}

Categories