Flatten nested JSON into multiple rows - Python

I am working with a heavily nested JSON file that contains multiple dictionaries and lists. Some of the keys are similar, and I am wondering if I can put each dictionary into a different row. It doesn't matter if some columns are not the same; the missing values can be left blank.
JSON file (similar in form to the original):
{
    "ID": "01",
    "session": [
        {
            "A": ["abc", "cde", "efj"],
            "B": 14,
            "C": 14,
            "D": [
                {"F": {"a": "ldjf", "b": "kdj", "ID": "01", "c": "kjasgfk", "d": ["pw"], "e": "dsg"}},
                {"F": {"a": "ldjewiorf", "ID": "01", "c": "kjasnbgfk", "d": "mbxzc", "e": "dsg"}},
                {"F": {"f": "1232", "g": "rege", "h": "en-gb", "i": "dfkj34", "j": "iyt658"}}
            ],
            "properties": {"AA": "esg", "BB": "skdjghk", "CC": "adfkh", "DD": "sdlkfh"}
        },
        {
            "A": ["abc", "cde", "efj"],
            "B": 16,
            "C": 14,
            "D": [
                {"F": {"a": "sdg", "b": "sg", "ID": "01", "c": "sg", "d": "shfh", "e": "weitu"}},
                {"F": {"f": "1232", "m": "sdg", "n": "en-sdg", "o": "eqe", "p": "sdg"}}
            ],
            "properties": {"AA": "ekjhsg", "BB": "skkldjghk", "CC": "adfyurkh", "DD": "sdlkfmlh"}
        }
    ],
    "G": {
        "A1": {"year": 2016, "month": 5, "dayOfMonth": 1, "hourOfDay": 0, "minute": 0, "second": 0},
        "A2": "ksjdf",
        "A3": "s38764",
        "A4": [
            {"year": 2016, "month": 5, "dayOfMonth": 1, "hourOfDay": 0, "minute": 0, "second": 0}
        ]
    }
}
I tried this code, but it puts the whole file into one row with many columns:
def flatten_json(y):
    out = {}
    def flatten(x, name=''):
        if type(x) is dict:
            for a in x:
                flatten(x[a], name + a + '_')
        elif type(x) is list:
            i = 0
            for a in x:
                flatten(a, name + str(i) + '_')
                i += 1
        else:
            out[name[:-1]] = x
    flatten(y)
    return out
Is there a solution to what I am trying to do, or is it impossible?
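One possible starting point (a sketch only, not an answer from the thread): apply the flatten_json helper above to each element of the "session" list instead of to the whole document, so that each session becomes its own flat row. This assumes the file has been loaded into data and that pandas is available; the file name is hypothetical.
import json

import pandas as pd

# Sketch only: reuses the flatten_json helper defined above.
with open("nested.json") as f:   # hypothetical file name
    data = json.load(f)

# One flat dict per entry of the "session" list; keys that appear in only
# some sessions simply become NaN columns in the DataFrame.
rows = [flatten_json(session) for session in data["session"]]
df = pd.DataFrame(rows)

# Top-level fields such as "ID" and "G" are not included here; they could be
# flattened separately and merged into every row if needed.
print(df)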

Related

Create one 'list' by userID

I want to create one list per user, so I have this JSON file:
data = [
    {"id": "1", "price": 1},
    {"id": "1", "price": 10},
    {"id": "2", "price": 3},
    {"id": "1", "price": 10},
    {"id": "2", "price": 8},
]
I'm using Python and I want a result like this:
for the user with "id": "1": [1, 10, 10]
and for the user with "id": "2": [3, 8]
So, two lists of prices, grouped by id. Is it possible to do that in Python?
Note: in reality the user ids are UUIDs and randomly generated.
Edit: quantity was a mistake; all the data only has price and id, sorry.
collections.defaultdict to the rescue.
Assuming you really do have mixed quantities and prices, and you don't care about mixing them into the same list:
from collections import defaultdict
data = [
    {"id": "1", "price": 1},
    {"id": "1", "price": 10},
    {"id": "2", "quantity": 3},
    {"id": "1", "price": 10},
    {"id": "2", "price": 8},
]
by_id = defaultdict(list)
for item in data:
    item = item.copy()  # we need to mutate the item
    id = item.pop("id")
    # whatever is the other value in the dict, grab that:
    other_value = item.popitem()[1]
    by_id[id].append(other_value)
print(dict(by_id))
The output is
{'1': [1, 10, 10], '2': [3, 8]}
If you really only have prices, the loop is simpler:
by_id = defaultdict(list)
for item in data:
    by_id[item["id"]].append(item.get("price"))
or
by_id = defaultdict(list)
for item in data:
    by_id[item["id"]].append(item["price"])
to fail fast when the price is missing.
First: your data structure, {[]}, is not valid in Python.
Assume your data is:
my_json = [
    {"id": "1", "price": 1},
    {"id": "1", "price": 10},
    {"id": "2", "quantity": 3},
    {"id": "1", "price": 10},
    {"id": "2", "price": 8},
]
Then you can achieve it with this:
results = {}
for data in my_json:
    if data.get('id') not in results:
        results[data.get('id')] = [data.get('price') or data.get('quantity')]
    else:
        results[data.get('id')].append(data.get('price') or data.get('quantity'))
print(results)
output:
{'1': [1, 10, 10], '2': [3, 8]}
Maybe like this:
data = [
    {"id": "1", "price": 1},
    {"id": "1", "price": 10},
    {"id": "2", "quantity": 3},
    {"id": "1", "price": 10},
    {"id": "2", "price": 8}
]
result = {}
for item in data:
    try:
        result[item['id']].append(item.get('price'))
    except KeyError:
        result[item['id']] = [item.get('price')]
print(result)
Here None is put in place of the missing price for that entry, and the quantity key is ignored.
Result:
{'1': [1, 10, 10], '2': [None, 8]}
A simple loop that enumerates your list (it's not JSON) in conjunction with setdefault() is all you need:
data = [
    {"id": "1", "price": 1},
    {"id": "1", "price": 10},
    {"id": "2", "price": 3},
    {"id": "1", "price": 10},
    {"id": "2", "price": 8}
]
dict_ = {}
for d in data:
    dict_.setdefault(d['id'], []).append(d['price'])
print(dict_)
Output:
{'1': [1, 10, 10], '2': [3, 8]}
Note:
This will raise a KeyError if either 'id' or 'price' is missing from one of the dictionaries in the list.
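If those keys might be absent, here is a minimal defensive variant (my own sketch, not part of the original answer) using .get:
dict_ = {}
for d in data:
    # .get avoids the KeyError: entries without an 'id' are grouped under None,
    # and a missing 'price' is recorded as None
    dict_.setdefault(d.get('id'), []).append(d.get('price'))
print(dict_)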

Include JSON section numbers as columns in a df while converting JSON to DF

I have a nested JSON as follows:
{
    "group": {"groupname": "grp1", "groupid": 1, "city": "London"},
    "persons": {
        "0": {"name": "john", "age": 12, "gender": "M", "groupid": 1},
        "1": {"name": "maat", "age": 15, "gender": "M", "groupid": 1},
        "2": {"name": "chrissle", "age": 10, "gender": "F", "groupid": 1},
        "3": {"name": "stacy", "age": 11, "gender": "F", "groupid": 1},
        "4": {"name": "mark", "age": 12, "gender": "M", "groupid": 1},
        "5": {"name": "job", "age": 12, "gender": "M", "groupid": 1}
    },
    "group": {"groupname": "grp1", "groupid": 2, "city": "NewYork"},
    "persons": {
        "0": {"name": "will", "age": 12, "gender": "M", "groupid": 2},
        "1": {"name": "phil", "age": 15, "gender": "M", "groupid": 2},
        "2": {"name": "winnie", "age": 10, "gender": "F", "groupid": 2}
    }
}
I want to separate the two sections, group and persons, into two DataFrames.
For the second DataFrame, persons, I want to include the section numbers as a column, as follows:
id name age gender groupid
0 john 12 M 1
1 maat 15 M 1
2 chrissle 10 F 1
I have loaded the JSON as a list of dicts and converted it into a DataFrame:
data = pd.DataFrame.from_dict(data)
I can then get persons:
personsdf = data[['persons']]
This, however, gives me a DataFrame with one column that has a dict row for every persons section.
I have tried the code below to unnest the dict rows:
finaldf = pd.DataFrame()
for index, row in personsdf.iterrows():
    row_data = row['persons']
    row_data = pd.DataFrame.from_dict(row_data)
    row_data = row_data.T
    finaldf = finaldf.append(row_data, ignore_index=True)
But then I get all the columns except the section number, which gets lost.
Is there a better way to approach this?
If I understand you correctly, you want to create two DataFrames: one for groups and the second for persons:
import pandas as pd

data = [
    {
        "group": {"groupname": "grp1", "groupid": 1, "city": "London"},
        "persons": {
            "0": {"name": "john", "age": 12, "gender": "M", "groupid": 1},
            "1": {"name": "maat", "age": 15, "gender": "M", "groupid": 1},
            "2": {"name": "chrissle", "age": 10, "gender": "F", "groupid": 1},
            "3": {"name": "stacy", "age": 11, "gender": "F", "groupid": 1},
            "4": {"name": "mark", "age": 12, "gender": "M", "groupid": 1},
            "5": {"name": "job", "age": 12, "gender": "M", "groupid": 1},
        },
    },
    {
        "group": {"groupname": "grp1", "groupid": 2, "city": "NewYork"},
        "persons": {
            "0": {"name": "will", "age": 12, "gender": "M", "groupid": 2},
            "1": {"name": "phil", "age": 15, "gender": "M", "groupid": 2},
            "2": {"name": "winnie", "age": 10, "gender": "F", "groupid": 2},
        },
    },
]

df1 = pd.DataFrame([d["group"] for d in data])
df2 = pd.DataFrame(
    [{"id": k, **v} for d in data for k, v in d["persons"].items()]
)
print(df1)
print(df2)
Prints:
groupname groupid city
0 grp1 1 London
1 grp1 2 NewYork
id name age gender groupid
0 0 john 12 M 1
1 1 maat 15 M 1
2 2 chrissle 10 F 1
3 3 stacy 11 F 1
4 4 mark 12 M 1
5 5 job 12 M 1
6 0 will 12 M 2
7 1 phil 15 M 2
8 2 winnie 10 F 2

How to merge and create dict of dicts from a dictionary [closed]

Closed. This question needs to be more focused. It is not currently accepting answers. Closed 1 year ago.
I have a dictionary like the one below, in which some of the values that appear in the lists are also keys elsewhere in the same dictionary.
{"a": ["b", "c"], "b": ["D"], "c": ["A", "B", "C"], "A": ["abc", "aab", "aba"], "B": ["bcd", "bdc"], "C": ["dab", "dbc", "def", "dgr"], "abc": ["eee", "ehj"], "eee": ["ghi"], "aab": ["tuv", "xuv"], "ehj": ["giu"], "aba": ["suv", "ruv"]}
I want to merge all of them as below.
{"a": [{"b": ["D"]}, {"c": [{"A": [{"abc": [{"eee": ["ghi"], "ehj": ["giu"]}, {"aab": ["tuv", "xuv"]}, {"aba": ["suv", "ruv"]}]}, {"B": ["bcd", "bdc"]}, {"C": ["dab", "dbc", "def", "dgr"]}]}]}]}
JSON Format:
{
    "a": [{
        "b": ["D"]
    }, {
        "c": [{
            "A": [{
                "abc": [{
                    "eee": ["ghi"],
                    "ehj": ["giu"]
                }, {
                    "aab": ["tuv", "xuv"]
                }, {
                    "aba": ["suv", "ruv"]
                }]
            }, {
                "B": ["bcd", "bdc"]
            }, {
                "C": ["dab", "dbc", "def", "dgr"]
            }]
        }]
    }]
}
Also, the number of values (the length of each key's list) is not the same for every key.
Thanks for your help!
You can use recursion:
import json
d = {"a": ["b", "c"], "b": ["D"], "c": ["A", "B", "C"], "A": ["abc", "aab", "aba"], "B": ["bcd", "bdc"], "C": ["dab", "dbc", "def", "dgr"], "abc": ["eee", "ehj"], "eee": ["ghi"], "aab": ["tuv", "xuv"], "ehj": ["giu"], "aba": ["suv", "ruv"]}
def merge(n):
    r = [(i, merge(i)) if i in d else i for i in d[n]]
    if all(isinstance(j, str) for j in r):
        return r
    return [{a:b} for a, b in r] if any(any(isinstance(j, dict) for j in b) for _, b in r) else \
        [{a:b for a, b in r}]
result = {a:merge(a) for a in d if all(a not in b for b in d.values())}
print(json.dumps(result, indent=4))
Output:
{
    "a": [
        {
            "b": [
                "D"
            ]
        },
        {
            "c": [
                {
                    "A": [
                        {
                            "abc": [
                                {
                                    "eee": [
                                        "ghi"
                                    ],
                                    "ehj": [
                                        "giu"
                                    ]
                                }
                            ]
                        },
                        {
                            "aab": [
                                "tuv",
                                "xuv"
                            ]
                        },
                        {
                            "aba": [
                                "suv",
                                "ruv"
                            ]
                        }
                    ]
                },
                {
                    "B": [
                        "bcd",
                        "bdc"
                    ]
                },
                {
                    "C": [
                        "dab",
                        "dbc",
                        "def",
                        "dgr"
                    ]
                }
            ]
        }
    ]
}

How to flatten a nested JSON recursively, with flatten_json

This question is specific to using flatten_json from the GitHub repo flatten.
The package is on PyPI as flatten-json and can be installed with pip install flatten-json.
This question is specific to the following component of the package:
def flatten_json(nested_json: dict, exclude: list=[''], sep: str='_') -> dict:
    """
    Flatten a list of nested dicts.
    """
    out = dict()

    def flatten(x: (list, dict, str), name: str='', exclude=exclude):
        if type(x) is dict:
            for a in x:
                if a not in exclude:
                    flatten(x[a], f'{name}{a}{sep}')
        elif type(x) is list:
            i = 0
            for a in x:
                flatten(a, f'{name}{i}{sep}')
                i += 1
        else:
            out[name[:-1]] = x

    flatten(nested_json)
    return out
Use recursion to flatten nested dicts
Thinking Recursively in Python
Flattening JSON objects in Python
How nested can the data be? flatten_json has been used to unpack a file that ended up with over 100,000 columns.
Can the flattened JSON be unflattened? Yes, but this question doesn't cover that. If you install the flatten package, there is an unflatten method, but I haven't tested it.
How to flatten a JSON or dict is a common question, to which there are many answers.
This answer focuses on using flatten_json to recursively flatten a nested dict or JSON.
Assumptions:
This answer assumes you already have the JSON or dict loaded into some variable (e.g. from a file, an API, etc.).
In this case we will use data
How is data loaded into flatten_json:
It accepts a dict, as shown by the function type hint.
The most common forms of data:
Just a dict: {}
flatten_json(data)
List of dicts: [{}, {}, {}]
[flatten_json(x) for x in data]
JSON with top-level keys, where the values repeat: {1: {}, 2: {}, 3: {}}
[flatten_json(data[key]) for key in data]
Other
{'key': [{}, {}, {}]}: [flatten_json(x) for x in data['key']]
Practical Examples:
I typically flatten data into a pandas.DataFrame for further analysis.
Load pandas with import pandas as pd
flatten_json returns a dict, which can also be written to a file directly with the csv package.
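For example, a minimal sketch of writing the flattened rows out with csv.DictWriter, assuming data is a list of dicts (as in Data 2 below) and a hypothetical output file name:
import csv

flat_rows = [flatten_json(x) for x in data]          # list of flat dicts

# Collect the union of keys so rows with missing columns still line up
fieldnames = sorted({key for row in flat_rows for key in row})

with open("flattened.csv", "w", newline="") as f:    # hypothetical file name
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(flat_rows)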
Data 1:
{
    "id": 1,
    "class": "c1",
    "owner": "myself",
    "metadata": {
        "m1": {"value": "m1_1", "timestamp": "d1"},
        "m2": {"value": "m1_2", "timestamp": "d2"},
        "m3": {"value": "m1_3", "timestamp": "d3"},
        "m4": {"value": "m1_4", "timestamp": "d4"}
    },
    "a1": {"a11": []},
    "m1": {},
    "comm1": "COMM1",
    "comm2": "COMM21529089656387",
    "share": "xxx",
    "share1": "yyy",
    "hub1": "h1",
    "hub2": "h2",
    "context": []
}
Flatten 1:
df = pd.DataFrame([flatten_json(data)])
id class owner metadata_m1_value metadata_m1_timestamp metadata_m2_value metadata_m2_timestamp metadata_m3_value metadata_m3_timestamp metadata_m4_value metadata_m4_timestamp comm1 comm2 share share1 hub1 hub2
1 c1 myself m1_1 d1 m1_2 d2 m1_3 d3 m1_4 d4 COMM1 COMM21529089656387 xxx yyy h1 h2
Data 2:
[{
    'accuracy': 17,
    'activity': [{
        'activity': [{'confidence': 100, 'type': 'STILL'}],
        'timestampMs': '1542652'
    }],
    'altitude': -10,
    'latitudeE7': 3777321,
    'longitudeE7': -122423125,
    'timestampMs': '1542654',
    'verticalAccuracy': 2
}, {
    'accuracy': 17,
    'activity': [{
        'activity': [{'confidence': 100, 'type': 'STILL'}],
        'timestampMs': '1542652'
    }],
    'altitude': -10,
    'latitudeE7': 3777321,
    'longitudeE7': -122423125,
    'timestampMs': '1542654',
    'verticalAccuracy': 2
}, {
    'accuracy': 17,
    'activity': [{
        'activity': [{'confidence': 100, 'type': 'STILL'}],
        'timestampMs': '1542652'
    }],
    'altitude': -10,
    'latitudeE7': 3777321,
    'longitudeE7': -122423125,
    'timestampMs': '1542654',
    'verticalAccuracy': 2
}]
Flatten 2:
df = pd.DataFrame([flatten_json(x) for x in data])
accuracy activity_0_activity_0_confidence activity_0_activity_0_type activity_0_timestampMs altitude latitudeE7 longitudeE7 timestampMs verticalAccuracy
17 100 STILL 1542652 -10 3777321 -122423125 1542654 2
17 100 STILL 1542652 -10 3777321 -122423125 1542654 2
17 100 STILL 1542652 -10 3777321 -122423125 1542654 2
Data 3:
{
    "1": {
        "VENUE": "JOEBURG",
        "COUNTRY": "HAE",
        "ITW": "XAD",
        "RACES": {
            "1": {"NO": 1, "TIME": "12:35"},
            "2": {"NO": 2, "TIME": "13:10"},
            "3": {"NO": 3, "TIME": "13:40"},
            "4": {"NO": 4, "TIME": "14:10"},
            "5": {"NO": 5, "TIME": "14:55"},
            "6": {"NO": 6, "TIME": "15:30"},
            "7": {"NO": 7, "TIME": "16:05"},
            "8": {"NO": 8, "TIME": "16:40"}
        }
    },
    "2": {
        "VENUE": "FOOBURG",
        "COUNTRY": "ABA",
        "ITW": "XAD",
        "RACES": {
            "1": {"NO": 1, "TIME": "12:35"},
            "2": {"NO": 2, "TIME": "13:10"},
            "3": {"NO": 3, "TIME": "13:40"},
            "4": {"NO": 4, "TIME": "14:10"},
            "5": {"NO": 5, "TIME": "14:55"},
            "6": {"NO": 6, "TIME": "15:30"},
            "7": {"NO": 7, "TIME": "16:05"},
            "8": {"NO": 8, "TIME": "16:40"}
        }
    }
}
Flatten 3:
df = pd.DataFrame([flatten_json(data[key]) for key in data])
VENUE COUNTRY ITW RACES_1_NO RACES_1_TIME RACES_2_NO RACES_2_TIME RACES_3_NO RACES_3_TIME RACES_4_NO RACES_4_TIME RACES_5_NO RACES_5_TIME RACES_6_NO RACES_6_TIME RACES_7_NO RACES_7_TIME RACES_8_NO RACES_8_TIME
JOEBURG HAE XAD 1 12:35 2 13:10 3 13:40 4 14:10 5 14:55 6 15:30 7 16:05 8 16:40
FOOBURG ABA XAD 1 12:35 2 13:10 3 13:40 4 14:10 5 14:55 6 15:30 7 16:05 8 16:40
Other Examples:
Python Pandas - Flatten Nested JSON
handling nested json in pandas
How to flatten a nested JSON from the NASA Weather Insight API in Python

How can I merge and sum the keys and items of two dictionaries?

I have two dictionaries:
dict1 = {
    "list": {
        "alpha": {
            "a": {"score": 1, "visit": 2},
            "b": {"score": 3, "visit": 4}
        },
        "beta": {
            "a": {"score": 1, "visit": 2},
            "b": {"score": 3, "visit": 4}
        }
    }
}
dict2 = {
    "list": {
        "alpha": {
            "a": {"score": 1, "visit": 2},
            "c": {"score": 5, "visit": 6}
        },
        "beta": {
            "a": {"score": 1, "visit": 2},
            "c": {"score": 5, "visit": 6}
        }
    }
}
I want to merge the dictionaries like this:
dict1 = {
    "list": {
        "alpha": {
            "a": {"score": 2, "visit": 4},
            "b": {"score": 3, "visit": 4},
            "c": {"score": 5, "visit": 6}
        },
        "beta": {
            "a": {"score": 2, "visit": 4},
            "b": {"score": 3, "visit": 4},
            "c": {"score": 5, "visit": 6}
        }
    }
}
Condition 1: a value is always either a nested dictionary or an int (never a str).
Condition 2: if the same key exists at the same depth, the values for that key must be summed.
I think I can solve this problem using for loops, but Python seems to have a simpler and faster way. This is my best attempt.
Code:
def logic(d1, d2, inconflict=lambda v1, v2: v1 + v2):
    for k in d2:
        if k in d1:
            if isinstance(d1[k], dict) and isinstance(d2[k], dict):
                logic(d1[k], d2[k], inconflict)
            elif isinstance(d1[k], int):
                d1[k] = inconflict(d1[k], d2[k])
        else:
            d1[k] = d2[k]
    return d1

print logic(dict1, dict2)
It's a recursive data structure; let's use recursion.
Edit: I missed the python-2.6 tag; no dict comprehensions there. Edit 2: copy values in case they exist in only one of the two dicts, otherwise you'll run into surprises with references to the same dictionary ending up inside two separate dictionaries.
import copy

def recursively_sum(var1, var2):
    """If var1 and var2 are ints, return their sum. Otherwise they are dicts
    and we recursively sum for each key that is either in var1 or var2."""
    if var1 is None:
        return copy.deepcopy(var2)
    elif var2 is None:
        return copy.deepcopy(var1)
    elif isinstance(var1, int):
        return var1 + var2
    else:
        result = {}
        for key in set(var1) | set(var2):
            result[key] = recursively_sum(var1.get(key), var2.get(key))
        return result
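A quick usage sketch (assuming dict1 and dict2 as defined in the question):
merged = recursively_sum(dict1, dict2)
# e.g. merged["list"]["alpha"]["a"] == {"score": 2, "visit": 4}
print(merged)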
