Convert nested JSON to Dataframe with columns referencing nested paths - python

I am trying to convert a nested JSON into a CSV file with three columns: the level 0 key, the branch, and the lowest level leaf.
For example, in the JSON below:
{
    "protein": {
        "meat": {
            "chicken": {},
            "beef": {},
            "pork": {}
        },
        "powder": {
            "^ISOPURE": {},
            "substitute": {}
        }
    },
    "carbs": {
        "_vegetables": {
            "veggies": {
                "lettuce": {},
                "carrots": {},
                "corn": {}
            }
        },
        "bread": {
            "white": {},
            "multigrain": {
                "whole wheat": {}
            },
            "other": {}
        }
    },
    "fat": {
        "healthy": {
            "avocado": {}
        },
        "unhealthy": {}
    }
}
I want to create an output like this (I didn't include the entire tree, just enough to get the point across):
level 0    branch          leaf
protein    protein.meat    chicken
protein    protein.meat    beef
I tried using json_normalize, but the actual file will not have paths I can use to identify the nested fields, since each dictionary is unique.
It returns the level 0 fields, but I need them as rows, not columns. Any help would be very much appreciated.
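For reference, this is roughly the kind of call I mean (a sketch; it assumes a pandas version where json_normalize accepts a max_level argument):
import pandas as pd

# my_json is the dict loaded from the file in the snippet below;
# this gives back the level-0 keys, but as the columns of one wide
# row rather than one row per leaf
df = pd.json_normalize(my_json, max_level=0)
print(df.columns.tolist())  # ['protein', 'carbs', 'fat']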
I created a function that can unnest the JSON based on key values, like this:
import json

with open('path/to/json') as m:
    my_json = json.load(m)

def unnest_json(data):
    for key, value in data.items():
        print(str(key) + '.' + str(value))
        if isinstance(value, dict):
            unnest_json(value)
        elif isinstance(value, list):
            for val in value:
                if isinstance(val, str):
                    pass
                elif isinstance(val, list):
                    pass
                else:
                    unnest_json(val)

unnest_json(my_json)

Probably not the cleanest approach, but I think you can use a recursive function (traverse in the code below) to convert the dictionary into a list of column values and then build a pandas DataFrame from them.
import pandas as pd

data = {
    "protein": {
        "meat": {
            "chicken": {},
            "beef": {},
            "pork": {}
        },
        "powder": {
            "^ISOPURE": {},
            "substitute": {}
        }
    },
    "carbs": {
        "_vegetables": {
            "veggies": {
                "lettuce": {},
                "carrots": {},
                "corn": {}
            }
        },
        "bread": {
            "white": {},
            "multigrain": {
                "whole wheat": {}
            },
            "other": {}
        }
    },
    "fat": {
        "healthy": {
            "avocado": {}
        },
        "unhealthy": {}
    }
}

def traverse(col_values, dictionary, rows):
    # col_values is [level 0, branch so far, leaf]; a key whose value is an
    # empty dict is treated as a leaf, otherwise the branch path is extended
    for key in dictionary:
        new_col_values = list(col_values)
        if dictionary[key]:
            new_col_values[1] += '.' + key
            traverse(new_col_values, dictionary[key], rows)
        else:
            new_col_values[2] = key
            rows.append(new_col_values)

rows = []
for key in data:
    traverse([key, str(key), None], data[key], rows)

df = pd.DataFrame(rows, columns=["level 0", "branch", "leaf"])
print(df)
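Since the end goal is a CSV with those three columns, you can write the DataFrame straight to a file (the file name here is just an example):
# dump the rows to CSV without the pandas index column
df.to_csv('food_tree.csv', index=False)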

Related

MongoDB Python Update/ Insert dict in dict without overwriting

I can't insert my new document value (dict) without overwriting my existing data. I've looked through all the different resources and can't find an answer.
I've also thought of putting the values from first_level_dict into a list, "first_level_dict": [dict1, dict2], but I wouldn't know how to append the dict either.
Sample Data:
# Create the document
target_dict = {
    "_id": 55,
    "Root_dict": {
        "first_level_dict": {
            "second_level_dict1": {"Content1": "Value1"}
        }
    },
    "Root_key": "Root_value"
}
collection.insert_one(target_dict)
The result I'm looking for:
result_dict = {
    "_id": 55,
    "Root_dict": {
        "first_level_dict": {
            "second_level_dict1": {"Content1": "Value1"},
            "second_level_dict2": {"Content2": "Value2"}
        }
    },
    "Root_key": "Root_value"
}
Update: New Values example 2:
# New Values Sample
new_values = {
    "_id": 55,
    "Root_dict": {
        "first_level_dict": {
            "second_level_dict2": {"Content2": "Value2"},
            "second_level_dict3": {"Content3": "Value3"}
        }
    }
}
collection.insert_one(target_dict)
Update: The result I'm looking for example 2:
result_dict = {
    "_id": 55,
    "Root_dict": {
        "first_level_dict": {
            "second_level_dict1": {"Content1": "Value1"},
            "second_level_dict2": {"Content2": "Value2"},
            "second_level_dict3": {"Content3": "Value3"},
        }
    },
    "Root_key": "Root_value"
}
What I've tried:
# Update document "$setOnInsert"
q = {"_id": 55}
target_dict = {"$set": {"Root_dict": {"first_level_dict": {"second_level_dict2": {"Content2": "Value2"}}}}}
collection.update_one(q, target_dict)
What I've tried example 2:
# Update document
q = {"_id": 55}
target_dict = {"$set": {"Root_dict.first_level_dict": {
"second_level_dict2": {"Content2": "Value2"},
"second_level_dict3": {"Content3": "Value3"}}}}
collection.update_one(q, target_dict)
Try using the dot notation:
target_dict = {"$set": {"Root_dict.first_level_dict.second_level_dict2": {"Content2": "Value2"}}}
Additionally, to update/add multiple fields (for "example 2"):
target_dict = {"$set": {
    "Root_dict.first_level_dict.second_level_dict2": {"Content2": "Value2"},
    "Root_dict.first_level_dict.second_level_dict3": {"Content3": "Value3"}
}}
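To confirm nothing else was overwritten, a quick sanity check (a sketch that reuses the collection handle from the question):
# after the dot-notation update, second_level_dict1 is still present
# alongside the newly added second_level_dict2/3
doc = collection.find_one({"_id": 55})
print(doc["Root_dict"]["first_level_dict"])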

How to move sub-dictionary to another dictionary?

I am writing a function that takes two strings as inputs and moves a section of the dictionary to another section.
def move(item_to_move, destination):
    # do something....
My initial dictionary looks like this.
directories = {
    'beers': {
        'ipa': {
            'stone': {}
        }
    },
    'wines': {
        'red': {
            'cabernet': {}
        }
    },
    'other': {}
}
I would like to move either a subsection or section of the dictionary to another section. The sections are represented by each key of the path delimited by a '/'. For example, the inputs for my function would be:
item_to_move = 'beers/ipa'
destination = 'other'
move(directories, item_to_move, destination)
The output would be:
{
    'wines': {
        'red': {
            'cabernet': {}
        },
    },
    'other': {
        'beers': {
            'ipa': {
                'stone': {}
            }
        }
    },
}
NOTE: I am assuming all input paths for items_to_move are valid.
Find the origin's parent dictionary and the target's dictionary, then update the target's dictionary with the origin's key and value (removing it from the origin's parent):
def move(tree, originPath, targetPath):
    originKey = None
    for originName in originPath.split("/"):
        originParent = originParent[originKey] if originKey else tree
        originKey = originName
    targetDict = tree
    for targetName in targetPath.split("/"):
        targetDict = targetDict[targetName]
    targetDict.update({originKey: originParent.pop(originKey)})
output:
directories = {
    'beers': {
        'ipa': {
            'stone': {}
        }
    },
    'wines': {
        'red': {
            'cabernet': {}
        }
    },
    'other': {}
}
move(directories, 'beers/ipa', 'other')
print(directories)
{'beers': {},
 'wines': {'red': {'cabernet': {}}},
 'other': {'ipa': {'stone': {}}}}
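If you prefer resolving the paths with functools.reduce, here is an equivalent sketch (same behaviour as the loop version above):
from functools import reduce

def move(tree, origin_path, target_path):
    # split the origin into its parent path and the key to relocate
    *parent_keys, key = origin_path.split("/")
    origin_parent = reduce(lambda d, k: d[k], parent_keys, tree)
    target = reduce(lambda d, k: d[k], target_path.split("/"), tree)
    # pop from the old parent and attach under the target
    target[key] = origin_parent.pop(key)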

Comparing Nested Python dict with list and dict

I've seen similar questions, but none that exactly matches what I'm doing, and I believe other developers might face the same issue if they are working with MongoDB.
I'm looking to compare two nested dict objects containing dicts and arrays and return a dict with the additions and deletions (like you would git diff two files).
Here is what I have so far:
def dict_diff(alpha, beta, recurse_adds=False, recurse_dels=False):
    """
    :return: differences between two python dict with adds and dels
    example:
    (This is the expected output)
    {
        'adds':
        {
            'specific_hours': [{'ends_at': '2015-12-25'}],
        }
        'dels':
        {
            'specific_hours': [{'ends_at': '2015-12-24'}],
            'subscription_products': {'review_management': {'thiswillbedeleted': 'deleteme'}}
        }
    }
    """
    if type(alpha) is dict and type(beta) is dict:
        a_keys = alpha.keys()
        b_keys = beta.keys()
        dels = {}
        adds = {}
        for key in a_keys:
            if type(alpha[key]) is list:
                if alpha[key] != beta[key]:
                    adds[key] = dict_diff(alpha[key], beta[key], recurse_adds=True)
                    dels[key] = dict_diff(alpha[key], beta[key], recurse_dels=True)
            elif type(alpha[key]) is dict:
                if alpha[key] != beta[key]:
                    adds[key] = dict_diff(alpha[key], beta[key], recurse_adds=True)
                    dels[key] = dict_diff(alpha[key], beta[key], recurse_dels=True)
            elif key not in b_keys:
                dels[key] = alpha[key]
            elif alpha[key] != beta[key]:
                adds[key] = beta[key]
                dels[key] = alpha[key]
        for key in b_keys:
            if key not in a_keys:
                adds[key] = beta[key]
    elif type(alpha) is list and type(beta) is list:
        index = 0
        adds = []
        dels = []
        for elem in alpha:
            if alpha[index] != beta[index]:
                dels.append(alpha[index])
                adds.append(beta[index])
                # print('update', adds, dels)
            index += 1
    else:
        raise Exception("dict_diff function can only get dict objects")
    if recurse_adds:
        if bool(adds):
            return adds
        return {}
    if recurse_dels:
        if bool(dels):
            return dels
        return {}
    return {'adds': adds, 'dels': dels}
The result I'm getting now is:
{'adds': {'specific_hours': [{'ends_at': '2015-12-24',
                              'open_hours': ['07:30-11:30', '12:30-21:30'],
                              'starts_at': '2015-12-22'},
                             {'ends_at': '2015-01-03',
                              'open_hours': ['07:30-11:30'],
                              'starts_at': '2015-01-0'}],
          'subscription_products': {'review_management': {}}},
 'dels': {'specific_hours': [{'ends_at': '2015-12-24',
                              'open_hours': ['07:30-11:30', '12:30-21:30'],
                              'starts_at': '2015-12-2'},
                             {'ends_at': '2015-01-03',
                              'open_hours': ['07:30-11:30'],
                              'starts_at': '2015-01-0'}],
          'subscription_products': {'review_management': {'thiswillbedeleted': 'deleteme'}}}}
And these are the two objects I'm trying to compare:
alpha = {
    'specific_hours': [
        {
            "starts_at": "2015-12-2",
            "ends_at": "2015-12-24",
            "open_hours": [
                "07:30-11:30",
                "12:30-21:30"
            ]
        },
        {
            "starts_at": "2015-01-0",
            "ends_at": "2015-01-03",
            "open_hours": [
                "07:30-11:30"
            ]
        }
    ],
    'subscription_products': {
        'presence_management': {
            'expiration_date': 1953291600,
            'payment_type': {
                'free': 'iamfree',
                'test': "test",
            },
        },
        'review_management': {
            'expiration_date': 1511799660,
            'payment_type': {
                'free': 'iamfree',
                'test': "test",
            },
            'thiswillbedeleted': "deleteme",
        }
    },
}
beta = {
    'specific_hours': [
        {
            "starts_at": "2015-12-22",
            "ends_at": "2015-12-24",
            "open_hours": [
                "07:30-11:30",
                "12:30-21:30"
            ]
        },
        {
            "starts_at": "2015-01-0",
            "ends_at": "2015-01-03",
            "open_hours": [
                "07:30-11:30"
            ]
        }
    ],
    'subscription_products': {
        'presence_management': {
            'expiration_date': 1953291600,
            'payment_type': {
                'free': 'iamfree',
                'test': "test",
            },
        },
        'review_management': {
            'expiration_date': 1511799660,
            'payment_type': {
                'free': 'iamfree',
                'test': "test",
            },
        }
    },
}

Python get a dict entry containing certain field

I have a nested dict in Python containing YAML structures like
- id: left_time
  type: u2
  doc: Time left
and I want to obtain pairs like {id: doc}. For this example I want it to be: {"left_time": "Time left"}. The problem is I need to walk through them recursively.
My attempt is
def get_dict_recursively(search_dict, field):
    fields_found = []
    name = ""
    for key, value in search_dict.items():
        if key == "id":
            name = value
        if key == field:
            fields_found.append({name: value})
        elif isinstance(value, dict):
            results = get_dict_recursively(value, field)
            for result in results:
                fields_found.append({name: result})
        elif isinstance(value, list):
            for item in value:
                if isinstance(item, dict):
                    more_results = get_dict_recursively(item, field)
                    for another_result in more_results:
                        fields_found.append({name: another_result})
    return fields_found
calling it like
get_dict_recursively(dict, "doc")
where
dict = {
    meta:
        id: foo
        title: Foo
    types:
        data:
            seq:
                - id: left_time
                  type: u2
                  doc: Time left
        gps:
            seq:
                - id: gps_st
                  type: b2
                - id: sats
                  type: b6
                  doc: Number of satellites
}
There's a mistake somewhere, but I can't figure out where.
Let's first state your example data as a dict:
data = {
    "meta": {
        "id": "foo",
        "title": "Foo"
    },
    "types": {
        "data": {
            "seq": [
                {
                    "id": "left_time",
                    "type": "u2",
                    "doc": "Time left"
                }
            ]
        },
        "gps": {
            "seq": [
                {
                    "id": "gps_st",
                    "type": "b2"
                },
                {
                    "id": "sats",
                    "type": "b6",
                    "doc": "Number of satellites"
                }
            ]
        }
    }
}
Next, we can simplify your recursive function to look like this:
def extract_docs(data):
    result = []
    if isinstance(data, list):
        for d in data:
            result += extract_docs(d)
    elif isinstance(data, dict):
        if "id" in data and "doc" in data:
            result.append((data["id"], data["doc"]))
        else:
            for d in data.values():
                result += extract_docs(d)
    return result
With this you get
>>> dict(extract_docs(data))
{'sats': 'Number of satellites', 'left_time': 'Time left'}
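If the structures actually come from a YAML file, PyYAML will load them into exactly this dict/list shape; a small sketch (assuming PyYAML is installed and 'spec.yaml' is a placeholder file name):
import yaml  # PyYAML

# safe_load turns the YAML document into nested dicts and lists,
# the shape extract_docs expects
with open("spec.yaml") as f:
    data = yaml.safe_load(f)

print(dict(extract_docs(data)))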

Python - Getting the intersection of two Json-Files

I'm looking for a way to calculate the intersection of two JSON files. I have been searching and found that I can use sets for my problem. This works "okay", but I need a more detailed view of the intersection, and this is where the problems start.
How I calculate the intersection:
# imports inferred from the j/js aliases and OrderedDict used below
import json as j
from collections import OrderedDict

import jsonschema as js

def calcIntersect(ValidationFile, json_object1, json_object2):
    with open(ValidationFile) as schema_file:
        schema = j.load(schema_file)
    js.Draft4Validator.check_schema(schema)
    with open(json_object1) as spec_file:
        spec1 = j.load(spec_file, object_pairs_hook=OrderedDict)
    js.validate(spec1, schema)
    with open(json_object2) as spec_file:
        spec2 = j.load(spec_file, object_pairs_hook=OrderedDict)
    js.validate(spec2, schema)
    x = set(spec1) & set(spec2)
    print(x)
Example Data1:
{
    "Car": {
        "Brand": "Audi",
        "Nationality": "Germany",
        "Modelname": "A6"
    },
    "Engine": {
        "cubic capacity": "2967",
        "Enginetype": "V6",
        "Fuel": "Diesel",
        "MaxSpeed": "250"
    },
    "Colors": {
        "Carcolor": "Black",
        "Interiorrcolor": "white"
    }
}
Example Data2:
{
    "Car": {
        "Brand": "Audi",
        "Nationality": "USA",
        "Modelname": "A6"
    },
    "Engine": {
        "cubic capacity": "2995",
        "Enginetype": "V6",
        "Fuel": "Petrol",
        "MaxSpeed": "250"
    },
    "Colors": {
        "Carcolor": "Black",
        "Interiorrcolor": "Black"
    }
}
Example-Output:
{'Car', 'Colors', 'Engine'}
These are just the keys, but I need the dictionaries. At the moment it only gives me the keys to say there is an intersection. For example, under 'Car' both files contain an "Audi", but the nationality differs because one car is produced in America and the other in Germany. Still, it returns 'Car' and not the "Audi".
I hope I was able to describe my problem. It's my first question.
The following lines, inspired by likeon's answer, will give you a dictionary whose keys are the keys of the intersecting objects in your specs, and whose values are a list containing both objects.
intersect = {key: [o, spec2[key]] for key, o in spec1.iteritems()
             if key in spec2}
Edit:
If you are using Python 3, you must use items instead of iteritems:
intersect = {key: [o, spec2[key]] for key, o in spec1.items()
             if key in spec2}
Why don't you just iterate over spec1 and compare the values with spec2, like this:
x = {k: v for k, v in spec1.iteritems() if k in spec2 and spec2[k] == v}
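Worth noting: with Example Data1/Data2 above, this flat comparison comes back empty, because every top-level value differs in at least one nested field; that is what pushes you toward the recursive approach below.
# spec1/spec2 hold Example Data1/Data2; no top-level dict is equal as a
# whole, so the shallow intersection is empty (Python 3 items() shown)
x = {k: v for k, v in spec1.items() if k in spec2 and spec2[k] == v}
print(x)  # {}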
You'll need a recursive solution:
json1 = {
    "Car": {
        "Brand": "Audi",
        "Nationality": "Germany",
        "Modelname": "A6"
    },
    "Engine": {
        "cubic capacity": "2967",
        "Enginetype": "V6",
        "Fuel": "Diesel",
        "MaxSpeed": "250"
    },
    "Colors": {
        "Carcolor": "Black",
        "Interiorrcolor": "white"
    }
}
json2 = {
    "Car": {
        "Brand": "Audi",
        "Nationality": "USA",
        "Modelname": "A6"
    },
    "Engine": {
        "cubic capacity": "2995",
        "Enginetype": "V6",
        "Fuel": "Petrol",
        "MaxSpeed": "250"
    },
    "Colors": {
        "Carcolor": "Black",
        "Interiorrcolor": "Black"
    }
}
def common_dict(d1, d2):
    output = {}
    for k in set(d1) & set(d2):
        o1, o2 = d1[k], d2[k]
        if isinstance(o1, dict) and isinstance(o2, dict):
            output[k] = common_dict(o1, o2)
        elif o1 == o2:
            output[k] = o1
    return output
print common_dict(json1, json2)
# {'Engine': {'MaxSpeed': '250', 'Enginetype': 'V6'}, 'Car': {'Brand': 'Audi', 'Modelname': 'A6'}, 'Colors': {'Carcolor': 'Black'}}
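The answer above is written in Python 2 style; under Python 3 only the print call changes, and pprint makes the nested result easier to read (a usage sketch):
from pprint import pprint

# Python 3: print is a function; pprint shows the nested dict more clearly
pprint(common_dict(json1, json2))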
