Convert a multi index dataframe to json - python

Here is the multi index data frame
accounting sales
PhNumber age firstName lastName PhNumber age firstName lastName
0 <PH_Number> 29 <first_Name> <last_Name> <PH_Number> 29 <first_Name> <last_Name>
1 <PH_Number> 38 <first_Name> <last_Name> <PH_Number> 48 <first_Name> <last_Name>
How do I convert this to a proper json?
I have used pandas.to_json().
But couldn't get the desired output like this
{ "accounting": [{"firstName": <first_name>,
"lastName": <last_name>,
"age": 29,
"PhNumber": <PH_Number>},
{"firstName": <first_name>,
"lastName": "<last_name>",
"age": 38,
"PhNumber": <PH_Number>}],
"sales": [{"firstName": "<first_name>",
"lastName": "<last_name>",
"age": 29,
"PhNumber": <PH_Number>},
{"firstName": "<first_name>",
"lastName": "<last_name>",
"age": 48,
"PhNumber": <PH_Number>}]}

What you ask is beyond the possibilities of to_json, so you should first build the Python data structure and then convert it to JSON:
# Build {top-level column label: list of row dicts} for the MultiIndex frame.
# Iterate the labels actually present in the columns (in column order):
# `columns.levels[0]` may still list labels that were removed by earlier
# slicing, and `df[k]` would then raise KeyError for them.
data_struct = {
    k: df[k].to_dict(orient='records')
    for k in df.columns.get_level_values(0).unique()
}
You can then easily build a JSON file (or string):
import json

# Serialize the nested structure; indent=2 pretty-prints it.
print(json.dumps(data_struct, indent=2))
gives:
{
"accounting": [
{
"PhNumber": "<PH_Number>",
"age": 29,
"firstName": "<first_Name>",
"lastName": "<last_Name>"
},
{
"PhNumber": "<PH_Number>",
"age": 38,
"firstName": "<first_Name>",
"lastName": "<last_Name>"
}
],
"sales": [
{
"PhNumber": "<PH_Number>",
"age": 29,
"firstName": "<first_Name>",
"lastName": "<last_Name>"
},
{
"PhNumber": "<PH_Number>",
"age": 48,
"firstName": "<first_Name>",
"lastName": "<last_Name>"
}
]
}

Related

Converting nested json data to csv using pandas dataframe

I have a JSON data like the below:
jsonStr = '''
{
"student_details": [
{
"ID": 101,
"Name": [
{
"First_Name": "AAA",
"Last_Name": "BBB"
},
{
"Father": "AAA1",
"Mother": "BBB1"
}
],
"Phone_Number": [
{
"Student_PhoneNum1": 1111111111,
"Student_PhoneNum2": 1111111112
},
{
"Parent_PhoneNum1": 1111111121,
"Parent_PhoneNum2": 1111111132
}
],
"DOB": "1998-05-05",
"Place_of_Birth": "AA",
"Marks": [
{
"DataStructures": 95,
"ObjectOrientedProgramming": 85,
"DiscreteMathematics": 100,
"AnalysisOfAlgorithm": 99,
"NetworkSecurity": 85
}
],
"DateOfJoining": "2022-05-05"
},
{
"ID": 102,
"Name": [
{
"First_Name": "ZZZ",
"Last_Name": "YYY"
},
{
"Father": "ZZZ1",
"Mother": "YYY1"
}
],
"Phone_Number": [
{
"Student_PhoneNum1": 1111111182,
"Student_PhoneNum2": 1111111182
},
{
"Parent_PhoneNum1": 1111111128,
"Parent_PhoneNum2": 1111111832
}
],
"DOB": "1998-06-10",
"Place_of_Birth": "ZZ",
"Marks": [
{
"DataStructures": 25,
"ObjectOrientedProgramming": 50,
"DiscreteMathematics": 75,
"AnalysisOfAlgorithm": 60,
"NetworkSecurity": 30
}
],
"DateOfJoining": "2022-05-05"
},
{
"ID": 103,
"Name": [
{
"First_Name": "TTT",
"Last_Name": "UUU"
},
{
"Father": "TTT1",
"Mother": "UUU1"
}
],
"Phone_Number": [
{
"Student_PhoneNum1": 1111118753,
"Student_PhoneNum2": 1111111153
},
{
"Parent_PhoneNum1": 1111111523,
"Parent_PhoneNum2": 1111111533
}
],
"DOB": "1999-01-01",
"Place_of_Birth": "TT",
"Marks": [
{
"DataStructures": 50,
"ObjectOrientedProgramming": 75,
"DiscreteMathematics": 65,
"AnalysisOfAlgorithm": 75,
"NetworkSecurity": 40
}
],
"DateOfJoining": "2022-05-06"
}
]
}
'''
I'm trying to convert every key-value pair to a csv file from this data using the below code
import pandas as pd
ar = pd.read_json(jsonStr)
df = pd.json_normalize(ar['student_details'])
print(df)
df.to_csv('CSVresult.csv', index=False)
for accessing the JSON data, I have passed json data header named student_details.
Result:
Is there any way to get the data like the below (every key-value pair in a separate column) without passing the header student_details and the column names directly? (The JSON data contains a lot of nested data like this.)
you can use:
import json

import numpy as np

# jsonStr holds JSON *text*; pd.DataFrame() cannot be built from a str, so
# decode it first (an already-decoded dict is passed through unchanged).
parsed = json.loads(jsonStr) if isinstance(jsonStr, str) else jsonStr
df = pd.DataFrame(parsed)

# One column per student field, then one row per element of each nested list.
df = (
    df['student_details']
    .apply(pd.Series)
    .explode('Name')
    .explode('Phone_Number')
    .explode('Marks')
)

# Copy each row's ID into its nested Name / Phone_Number dicts so the expanded
# frames below can be grouped by ID.
# NOTE(review): relies on to_dict('records') handing back the same dict
# objects that are stored in df -- confirm on your pandas version.
for row in df.to_dict('records'):
    row['Name']['ID'] = row['ID']
    row['Phone_Number']['ID'] = row['ID']


def get_values_without_nans(col_name):
    """Expand the dicts stored in column ``col_name`` into one row per ID.

    The dicts are turned into columns, duplicate rows are dropped, and the
    rows are grouped by "ID"; within each group a column yields NaN when all
    of its values are null, otherwise its non-null value(s).
    """
    return df[col_name].apply(pd.Series).drop_duplicates().groupby("ID").agg(lambda x: np.nan if x.isnull().all() else x.dropna())


name = get_values_without_nans('Name')
phone_number = get_values_without_nans('Phone_Number')
# Cast the ID index of the phone-number frame to int32 before joining.
phone_number.index = phone_number.index.astype('int32')
marks = df.set_index('ID').Marks.apply(pd.Series).drop_duplicates()
meta = df[['ID', 'DOB', 'Place_of_Birth', 'DateOfJoining']].drop_duplicates().set_index('ID')
# Join the per-ID pieces on the shared ID index.
final = meta.join([name, phone_number, marks])
print(final)
'''
ID DOB Place_of_Birth DateOfJoining First_Name Last_Name Father Mother Student_PhoneNum1 Student_PhoneNum2 Parent_PhoneNum1 Parent_PhoneNum2 DataStructures ObjectOrientedProgramming DiscreteMathematics AnalysisOfAlgorithm
0 101 1998-05-05 AA 2022-05-05 AAA BBB AAA1 BBB1 1111111111.0 1111111112.0 1111111121.0 1111111132.0 95 85 100 99
1 102 1998-06-10 ZZ 2022-05-05 ZZZ YYY ZZZ1 YYY1 1111111182.0 1111111182.0 1111111128.0 1111111832.0 25 50 75 60
2 103 1999-01-01 TT 2022-05-06 TTT UUU TTT1 UUU1 1111118753.0 1111111153.0 1111111523.0 1111111533.0 50 75 65 75
'''

Include JSON section numbers as columns in a df while converting JSON to DF

I have a nested JSON as follows:
{
"group": {
"groupname": "grp1",
"groupid": 1,
"city": "London"
},
"persons": {
"0": {
"name": "john",
"age": 12,
"gender": "M",
"groupid": 1
},
"1": {
"name": "maat",
"age": 15,
"gender": "M",
"groupid": 1
},
"2": {
"name": "chrissle",
"age": 10,
"gender": "F",
"groupid": 1
},
"3": {
"name": "stacy",
"age": 11,
"gender": "F",
"groupid": 1
},
"4": {
"name": "mark",
"age": 12,
"gender": "M",
"groupid": 1
},
"5": {
"name": "job",
"age": 12,
"gender": "M",
"groupid": 1
}
},
"group": {
"groupname": "grp1",
"groupid": 2,
"city": "NewYork"
},
"persons": {
"0": {
"name": "will",
"age": 12,
"gender": "M",
"groupid": 2
},
"1": {
"name": "phil",
"age": 15,
"gender": "M",
"groupid": 2
},
"2": {
"name": "winnie",
"age": 10,
"gender": "F",
"groupid": 2
}
}
}
I want to separate the two sections group and persons into two df respectively.
For the second df persons I want to include the section numbers as columns as follows:
id name age gender groupid
0 john 12 M 1
1 maat 15 M 1
2 chrissle 10 F 1
I have loaded the JSON as a list of dict and converted it into a df:
data= pd.DataFrame.from_dict(data)
I can then get persons
personsdf= personsdf['persons']
This will however give me a df with one column that has dict rows for every persons section.
I have tried below to unnest the dict rows:
finaldf= pd.DataFrame()
for index, row in personsdf.iterrows():
row_data=row['personsdf']
row_data = pd.DataFrame.from_dict(row_data)
row_data = row_data.T
finaldf= finaldf.append(row_data, ignore_index=True)
But then I get all the columns except the section number which gets lost.
Is there a better way to approach this?
If I understand you correctly you want to create two dataframes: one for groups and the second for persons:
# Two records, each holding one "group" dict and a "persons" mapping keyed by
# section number.
data = [
    {
        "group": {"groupname": "grp1", "groupid": 1, "city": "London"},
        "persons": {
            "0": {"name": "john", "age": 12, "gender": "M", "groupid": 1},
            "1": {"name": "maat", "age": 15, "gender": "M", "groupid": 1},
            "2": {"name": "chrissle", "age": 10, "gender": "F", "groupid": 1},
            "3": {"name": "stacy", "age": 11, "gender": "F", "groupid": 1},
            "4": {"name": "mark", "age": 12, "gender": "M", "groupid": 1},
            "5": {"name": "job", "age": 12, "gender": "M", "groupid": 1},
        },
    },
    {
        "group": {"groupname": "grp1", "groupid": 2, "city": "NewYork"},
        "persons": {
            "0": {"name": "will", "age": 12, "gender": "M", "groupid": 2},
            "1": {"name": "phil", "age": 15, "gender": "M", "groupid": 2},
            "2": {"name": "winnie", "age": 10, "gender": "F", "groupid": 2},
        },
    },
]


def _person_rows(entries):
    """Yield one flat dict per person, with the section number stored as "id"."""
    for entry in entries:
        for section_id, person in entry["persons"].items():
            yield {"id": section_id, **person}


# df1: one row per group; df2: one row per person across all groups.
df1 = pd.DataFrame([entry["group"] for entry in data])
df2 = pd.DataFrame(list(_person_rows(data)))

print(df1)
print(df2)
Prints:
groupname groupid city
0 grp1 1 London
1 grp1 2 NewYork
id name age gender groupid
0 0 john 12 M 1
1 1 maat 15 M 1
2 2 chrissle 10 F 1
3 3 stacy 11 F 1
4 4 mark 12 M 1
5 5 job 12 M 1
6 0 will 12 M 2
7 1 phil 15 M 2
8 2 winnie 10 F 2

To delete dictionaries with duplicate values for keys in two dictionary lists

I have two lists of dictionaries, l and l2. I want to remove from l2 every dictionary whose "name" value also appears in a dictionary in l.
How do I do this?
# Reference list: any "name" found here must be filtered out of l2.
l = [
    {
        "id": 1,
        "name": "John"
    },
    {
        "id": 2,
        "name": "Tom"
    }
]
# Candidate list (a comma was missing after each "gender" entry).
l2 = [
    {
        "name": "John",
        "gender": "male",
        "country": "USA"
    },
    {
        "name": "Alex",
        "gender": "male",
        "country": "Canada"
    },
    {
        "name": "Sofía",
        "gender": "female",
        "country": "Mexico"
    },
]
Results sought
[
{
"name": "Alex",
"gender": "male",
"country": "Canada"
},
{
"name": "Sofía",
"gender": "female",
"country": "Mexico"
},
]
Try:
>>> [d for d in l2 if d["name"] not in [d1["name"] for d1 in l]]
[{'name': 'Alex', 'gender': 'male', 'country': 'Canada'},
{'name': 'Sofía', 'gender': 'female', 'country': 'Mexico'}]

Python Json data Group it by same last name

Newbie here. I have JSON data keyed by full name, where each entry has an age, Country and Department. Using Python, how can I generate a new JSON structure keyed by last name, where each entry contains the total number of people sharing that last name, plus the lists of their ages, countries and departments?
Json as data
{
"John Jane": {
"age": 30,
"Country": "Denmark",
"Department": "Marketing"
},
"Gennie Jane": {
"age": 45,
"Country": "New Zealand",
"Department": "Finance"
},
"Mark Michael": {
"age": 55,
"Country": "Australia",
"Department": "HR"
},
"Jenny Jane": {
"age": 45,
"Country": "United States",
"Department": "IT"
},
"Jane Michael": {
"age": 27,
"Country": "United States",
"Department": "HR"
},
"Scofield Michael": {
"age": 37,
"Country": "England",
"Department": "HR"
}
}
Expected Result:
{
"Michael": {
"count": 3, // number of people that have same last name,
"age": {
"age1": 55,
"age2": 27,
"age3": 37
},
"Country": {
"Country1":"Australia",
"Country2":"United States",
"Country3":"England"
},
"Department": {
"Department1": "HR",
"Department2": "HR",
"Department3": "HR"
},
...
...
...
}
}
In my view, using a dict for 'age', 'Country' or 'Department' is unnecessary and more complicated; a list is a better fit.
import json

text = """{
"John Jane": {
"age": 30,
"Country": "Denmark",
"Department": "Marketing"
},
"Gennie Jane": {
"age": 45,
"Country": "New Zealand",
"Department": "Finance"
},
"Mark Michael": {
"age": 55,
"Country": "Australia",
"Department": "HR"
},
"Jenny Jane": {
"age": 45,
"Country": "United States",
"Department": "IT"
},
"Jane Michael": {
"age": 27,
"Country": "United States",
"Department": "HR"
},
"Scofield Michael": {
"age": 37,
"Country": "England",
"Department": "HR"
}
}"""


def group_by_last_name(people):
    """Group person records by last name.

    Args:
        people: mapping of full name -> dict with 'age', 'Country' and
            'Department' keys.

    Returns:
        dict mapping each last name to
        {'count': int, 'age': [...], 'Country': [...], 'Department': [...]},
        with list items in the iteration order of ``people``.

    Raises:
        KeyError: if a record lacks one of the three expected keys.
    """
    grouped = {}
    for full_name, details in people.items():
        parts = full_name.split()
        # The last name is the final word, so "Ana Maria Lopez" -> "Lopez";
        # a blank name is kept as-is instead of raising IndexError.
        last_name = parts[-1] if parts else full_name
        entry = grouped.setdefault(
            last_name,
            {'count': 0, 'age': [], 'Country': [], 'Department': []},
        )
        entry['count'] += 1
        for field in ('age', 'Country', 'Department'):
            entry[field].append(details[field])
    return grouped


dictionary = json.loads(text)
result = group_by_last_name(dictionary)
print(result)
{'Jane': {'count': 3, 'age': [30, 45, 45], 'Country': ['Denmark', 'New Zealand', 'United States'], 'Department': ['Marketing', 'Finance', 'IT']}, 'Michael': {'count': 3, 'age': [55, 27, 37], 'Country': ['Australia', 'United States', 'England'], 'Department': ['HR', 'HR', 'HR']}}

Get different values from repeating item JSON

I have this json derived dict:
{
"stats": [
{
"name": "Jengas",
"time": 166,
"uid": "177098244407558145",
"id": 1
},
{
"name": "- k",
"time": 20,
"uid": "199295228664872961",
"id": 2
},
{
"name": "MAD MARX",
"time": "0",
"uid": "336539711785009153",
"id": 3
},
{
"name": "loli",
"time": 20,
"uid": "366299640976375818",
"id": 4
},
{
"name": "Woona",
"time": 20,
"uid": "246996981178695686",
"id": 5
}
]
}
I want to get the "time" from everybody in the list and use it with sort.
So the result I get has this:
TOP 10:
Jengas: 166
Loli: 20
My first try is to list different values from repeating item.
Right now the code is:
with open('db.json') as json_data:
topvjson = json.load(json_data)
print(topvjson)
d = topvjson['stats'][0]['time']
print(d)
Extract the stats list, apply sort to it with the appropriate key:
from json import loads

data = loads("""{
"stats": [{
"name": "Jengas",
"time": 166,
"uid": "177098244407558145",
"id": 1
}, {
"name": "- k",
"time": 20,
"uid": "199295228664872961",
"id": 2
}, {
"name": "MAD MARX",
"time": "0",
"uid": "336539711785009153",
"id": 3
}, {
"name": "loli",
"time": 20,
"uid": "366299640976375818",
"id": 4
}, {
"name": "Woona",
"time": 20,
"uid": "246996981178695686",
"id": 5
}]
}""")
stats = data['stats']
# "time" is stored both as int and as str ("0"), so normalize with int()
# before comparing. The sort is in place and stable: ties keep input order.
stats.sort(key=lambda entry: int(entry['time']), reverse=True)
print("TOP 10:")
# Slicing copes with lists shorter than ten entries.
for entry in stats[:10]:
    print("%s: %d" % (entry['name'], int(entry['time'])))
This prints:
TOP 10:
Jengas: 166
- k: 20
loli: 20
Woona: 20
MAD MARX: 0
Note that your time values are not consistently typed: the dataset contains both integers (166, 20) and a string ("0"). That's why you need the conversion int(...).
You can sort the list of dict values like:
Code:
top_three = [(x[1], -x[0]) for x in sorted(
(-int(user['time']), user['name']) for user in stats['stats'])][:3]
This works by building a (-time, name) tuple for each user. The tuples can then be sorted (negating the time puts the largest first), and after the sort each one is turned back into a (name, time) pair via (x[1], -x[0]).
Test Code:
stats = {
    "stats": [{
        "name": "Jengas",
        "time": 166,
        "uid": "177098244407558145",
        "id": 1
    }, {
        "name": "- k",
        "time": 20,
        "uid": "199295228664872961",
        "id": 2
    }, {
        "name": "MAD MARX",
        "time": "0",
        "uid": "336539711785009153",
        "id": 3
    }, {
        "name": "loli",
        "time": 20,
        "uid": "366299640976375818",
        "id": 4
    }, {
        "name": "Woona",
        "time": 20,
        "uid": "246996981178695686",
        "id": 5
    }]
}
# Sort (-time, name) tuples: negating the time puts the largest first and ties
# fall back to the name. Each tuple is then turned back into (name, time), so
# the printed value is a list of pairs rather than bare names.
top_three = [(x[1], -x[0]) for x in sorted(
    (-int(user['time']), user['name']) for user in stats['stats'])][:3]
print(top_three)
Results:
[('Jengas', 166), ('- k', 20), ('Woona', 20)]
Here's a way to do it using the built-in sorted() function:
data = {
"stats": [
{
"name": "Jengas",
"time": 166,
"uid": "177098244407558145",
"id": 1
},
{
etc ...
}
]
}
print('TOP 3')
# Highest "time" first; int() normalizes values that are stored as strings.
sorted_by_time = sorted(data['stats'], key=lambda d: int(d['time']), reverse=True)
# Slicing caps the listing at three entries and copes with shorter lists.
for d in sorted_by_time[:3]:
    print('{name}: {time}'.format(**d))
Output:
TOP 3
Jengas: 166
- k: 20
loli: 20

Categories