Comparing 2 JSON files using Python and outputting the difference in a new output file - python

I am trying to compare 2 json files to do a delta check between them.
Existing JSON:
{
"rules": [
{
"customer_id": "abc123",
"id": "1",
},
{
"customer_id": "xyz456",
"id": "2",
}
]
}
Updated json:
{
"rules": [
{
"customer_id": "abc123",
"id": "1",
},
{
"customer_id": "def456",
"id": "3",
},
{
"customer_id": "xyz789",
"id": "2",
}
]
}
What I want is for my code to get the new objects from the new JSON (in this case id: 3 with customer_id def456).
However, I also want to keep the original existing values (for id: 2, customer_id should remain xyz456 instead of being updated to the new value xyz789).
Here is my current code:
import json
# Opening JSON file
f = open('1.json',)
y = open('2.json',)
# returns JSON object as a dictionary
less_data = json.load(f)
more_data = json.load(y)
# Iterating through the json list
for x in more_data['rules']:
for y in less_data['rules']:
if x['id']== y['id']:
print("x:" + x['id'],"y:" + y['id'])
break
print(x['id'] + " is not found")
//take action to add in new objects into json output
running the program i get the following output:
x:1 y:1
1 is found
x:3 y:1
3 is not found
x:3 y:2
3 is not found
x:2 y:1
2 is not found
x:2 y:2
2 is found
I only want "3 is not found" to be printed once, after the inner for loop has run to the end, instead of being printed on every iteration. Any help would be appreciated.

You can try flattening the JSON and comparing its keys, since a dict (JSON object) is an unordered data type.
If your list ordering matters (list positions become part of the flattened keys):
import collections
from itertools import chain
def flatten(d, parent_key='', sep='__'):
    """Flatten a nested dict/list structure into a single-level dict.

    Nested keys are joined with ``sep``; list positions are inserted into
    the key as their decimal index, e.g. ``{'rules': [{'id': '1'}]}``
    becomes ``{'rules__0__id': '1'}``.

    Args:
        d: The mapping to flatten.
        parent_key: Prefix prepended (followed by ``sep``) to every key.
        sep: Separator placed between the parts of a flattened key.

    Returns:
        A new dict mapping each flattened key to its leaf value. Empty
        nested dicts and empty lists contribute no entries.
    """
    # Imported locally: the ``collections.MutableMapping`` alias used before
    # was removed in Python 3.10; the ABCs live in ``collections.abc``.
    from collections.abc import Mapping

    flat = {}

    def _walk(value, key):
        if isinstance(value, Mapping):
            for k, v in value.items():
                # An empty prefix must not produce a leading separator.
                _walk(v, key + sep + k if key else k)
        elif isinstance(value, list):
            for idx, item in enumerate(value):
                # List items may be dicts, nested lists, or plain scalars;
                # recursing through _walk handles all three.
                _walk(item, key + sep + str(idx))
        else:
            flat[key] = value

    _walk(d, parent_key)
    return flat
a1 = {'rules': [{'customer_id': 'abc123', 'id': '1'}, {'customer_id': 'xyz456', 'id': '2'}]}
a2 = {'rules': [{'customer_id': 'abc123', 'id': '1'}, {'customer_id': 'def456', 'id': '3'}, {'customer_id': 'xyz789', 'id': '2'}]}
flatten_a1 = flatten(a1)
flatten_a2 = flatten(a2)
print(flatten_a1)
print(flatten_a2)
Output:
>>> flatten_a1 => {'rules__0__customer_id': 'abc123', 'rules__0__id': '1', 'rules__1__customer_id': 'xyz456', 'rules__1__id': '2'}
>>> flatten_a2 => {'rules__0__customer_id': 'abc123', 'rules__0__id': '1', 'rules__1__customer_id': 'def456', 'rules__1__id': '3', 'rules__2__customer_id': 'xyz789', 'rules__2__id': '2'}
From the flatten_a1 and flatten_a2 values you can easily find the differences, because the data structure is now single-level (i.e. no longer nested).
You can find:
which keys are missing
keys are present but values are different

Related

python dynamic nested dictionary to csv

The obtained output below are from query results.
{'_id': ObjectId('651f3e6e5723b7c1'), 'fruits': {'pineapple': '2', 'grape': '0', 'apple': 'unknown'},'day': 'Tues', 'month': 'July', 'address': 'long', 'buyer': 'B1001', 'seller': 'S1301', 'date': {'date': 210324}}
{'_id': ObjectId('651f3e6e5723b7c1'), 'fruits': {'lemon': '2', 'grape': '0', 'apple': 'unknown', 'strawberry': '1'},'day': 'Mon', 'month': 'January', 'address': 'longer', 'buyer': 'B1001', 'seller': 'S1301', 'date': {'date': 210324}}
#worked but not with fruits and dynamic header
date = json.dumps(q['date']) #convert it to string
date = re.split("(:|\}| )", date)[4] #and split to get value
for q in db.fruits.aggregate(query):
print('"' + q['day'] + '","' + q['month'] + '","' + date + '","' + q['time'] + '","' + q['buyer'] + '","' + q['seller'] + '"')
#below close to what I want but having issue with nested and repeated rows
ffile = open("fruits.csv", "w")
w = csv.DictWriter(ffile, q.keys())
w.writeheader()
w.writerow(q)
I want to create a csv from it.
I am able to get everything exactly like the below table shown but not the fruits. I am stuck at nested dictionary field, and with the dynamic table header.
Mongoexport doesn’t work for me at the moment.
The field fruits could have more different nested key and value for each time.
I am currently still trying/exploring on csv.writer and try to add condition if i found nested dict. [will update answer if i manage to create the csv]
A hint to create this csv will be nice to have.
Thank you if anyone is sharing the link to similar question.
Not a problem!
We'll need to flatten the deep structure so we can gather all possible keys from there to form a CSV with. That requires a recursive function (flatten_dict here) to take an input dict and turn it into an output dict that contains no more dicts; here, the keys are tuples, e.g. ('foo', 'bar', 'baz').
We run that function over all input rows, gathering up the keys we've encountered along the way to the known_keys set.
That set is sorted (since we assume that the original dicts don't really have an intrinsic order either) and the dots joined to re-form the CSV header row.
Then, the flattened rows are simply iterated over and written (taking care to write an empty string for non-existent values).
The output is e.g.
_id,address,buyer,date.date,day,fruits.apple,fruits.grape,fruits.lemon,fruits.pineapple,fruits.strawberry,month,seller
651f3e6e5723b7c1,long,B1001,210324,Tues,unknown,0,,2,,July,S1301
651f3e6e5723b7c2,longer,B1001,210324,Mon,unknown,0,2,,1,January,S1301
import csv
import sys
rows = [
{
"_id": "651f3e6e5723b7c1",
"fruits": {"pineapple": "2", "grape": "0", "apple": "unknown"},
"day": "Tues",
"month": "July",
"address": "long",
"buyer": "B1001",
"seller": "S1301",
"date": {"date": 210324},
},
{
"_id": "651f3e6e5723b7c2",
"fruits": {
"lemon": "2",
"grape": "0",
"apple": "unknown",
"strawberry": "1",
},
"day": "Mon",
"month": "January",
"address": "longer",
"buyer": "B1001",
"seller": "S1301",
"date": {"date": 210324},
},
]
def flatten_dict(d: dict) -> dict:
    """
    Flatten hierarchical dicts into a dict of path tuples -> deep values.

    Each key of the result is a tuple describing the path to a leaf value:
    dict keys appear as-is and list positions appear as integer indices,
    e.g. ``{"a": [{"b": 1}]}`` becomes ``{("a", 0, "b"): 1}``.
    Empty nested dicts and empty lists contribute no entries.
    """
    out = {}

    def _flatten_into(into, pairs, prefix=()):
        # ``pairs`` yields (key, value) tuples: dict items or enumerated
        # list elements.
        for key, value in pairs:
            p_key = prefix + (key,)
            if isinstance(value, list):
                # Enumerate the list *value*; enumerating the builtin
                # ``list`` type raises TypeError.
                _flatten_into(into, enumerate(value), p_key)
            elif isinstance(value, dict):
                _flatten_into(into, value.items(), p_key)
            else:
                # Write through the accumulator that was passed in rather
                # than reaching for the enclosing ``out`` directly.
                into[p_key] = value

    _flatten_into(out, d.items())
    return out
# Flatten every input row first, then collect the union of all path-tuple
# keys seen in any row; that union defines the CSV columns.
flat_rows = [flatten_dict(row) for row in rows]
known_keys = set().union(*flat_rows)

# Sort the columns so the header order is deterministic.
ordered_keys = sorted(known_keys)

writer = csv.writer(sys.stdout)

# Header row: each path tuple is rendered as a dotted name.
header = [".".join(str(part) for part in key) for key in ordered_keys]
writer.writerow(header)

# Data rows: a column that is absent from a row is written as an empty string.
for flat_row in flat_rows:
    writer.writerow([str(flat_row.get(key, "")) for key in ordered_keys])

Python: Convert multiple columns of CSV file to nested Json

This is my input CSV file with multiple columns, I would like to convert this csv file to a json file with department, departmentID, and one nested field called customer and put first and last nested to this field.
department, departmentID, first, last
fans, 1, Caroline, Smith
fans, 1, Jenny, White
students, 2, Ben, CJ
students, 2, Joan, Carpenter
...
Output json file what I need:
[
{
"department" : "fans",
"departmentID: "1",
"customer" : [
{
"first" : "Caroline",
"last" : "Smith"
},
{
"first" : "Jenny",
"last" : "White"
}
]
},
{
"department" : "students",
"departmentID":2,
"user" :
[
{
"first" : "Ben",
"last" : "CJ"
},
{
"first" : "Joan",
"last" : "Carpenter"
}
]
}
]
my code:
from csv import DictReader
from itertools import groupby
with open('data.csv') as csvfile:
r = DictReader(csvfile, skipinitialspace=True)
data = [dict(d) for d in r]
groups = []
uniquekeys = []
for k, g in groupby(data, lambda r: (r['group'], r['groupID'])):
groups.append({
"group": k[0],
"groupID": k[1],
"user": [{k:v for k, v in d.items() if k != 'group'} for d in list(g)]
})
uniquekeys.append(k)
pprint(groups)
My issue is: groupID shows twice in the data, in and out nested json. What I want is group and groupID as grouby key.
The issue was that you mixed up the names of the keys, so this line
"user": [{k:v for k, v in d.items() if k != 'group'} for d in list(g)]
did not strip them properly from your dictionary: there was no such key, so nothing was deleted.
I do not fully understand which keys you want, so the following example assumes that data.csv looks exactly like the one in your question (with department and departmentID columns), and the script converts them to group and groupID.
from csv import DictReader
from itertools import groupby
from pprint import pprint
# Columns that identify a group. They are lifted out of every row into the
# group record and removed from the nested "user" entries.
GROUP_COLUMNS = ('department', 'departmentID')


def group_key(row):
    """Return the (department, departmentID) tuple that a row belongs to."""
    return tuple(row[column] for column in GROUP_COLUMNS)


# newline='' is what the csv module documentation asks for when opening files.
with open('data.csv', newline='') as csvfile:
    r = DictReader(csvfile, skipinitialspace=True)
    data = [dict(d) for d in r]

# groupby() only merges *adjacent* rows with equal keys, so the rows must be
# sorted by the same key first; otherwise a department whose rows are
# scattered through the file would show up as several separate groups.
# sorted() is stable, so rows keep their file order within each group.
data = sorted(data, key=group_key)

groups = []
uniquekeys = []
for k, g in groupby(data, group_key):
    groups.append({
        "group": k[0],
        "groupID": k[1],
        # Distinct names inside the comprehension avoid shadowing the
        # group key ``k`` of the enclosing loop.
        "user": [
            {column: value for column, value in d.items() if column not in GROUP_COLUMNS}
            for d in g
        ],
    })
    uniquekeys.append(k)
pprint(groups)
Output:
[{'group': 'fans',
'groupID': '1',
'user': [{'first': 'Caroline', 'last': 'Smith'},
{'first': 'Jenny', 'last': 'White'}]},
{'group': 'students',
'groupID': '2',
'user': [{'first': 'Ben', 'last': 'CJ'},
{'first': 'Joan', 'last': 'Carpenter'}]}]
I used different keys so it would be really obvious which line does what and easy to customize it for different keys in input or output

How do I force one specific key to come at the top of a python dict or JSON dump [duplicate]

Is there any way in Python 2.6 to supply a custom key or cmp function to JSON's sort_keys?
I've got a list of dicts coming from JSON like so:
[
{
"key": "numberpuzzles1",
"url": "number-puzzle-i.html",
"title": "Number Puzzle I",
"category": "nestedloops",
"points": "60",
"n": "087"
},
{
"key": "gettingindividualdigits",
"url": "getting-individual-digits.html",
"title": "Getting Individual Digits",
"category": "nestedloops",
"points": "80",
"n": "088"
}
]
...which I've stored into the list variable assigndb. I'd like to be able to load in the JSON, modify it, and serialized it back out with dumps (or whatever), keeping the orders of the keys intact.
So far, I've tried something like this:
ordering = {'key': 0, 'url': 1, 'title': 2, 'category': 3,
'flags': 4, 'points': 5, 'n': 6}
def key_func(k):
return ordering[k]
# renumber assignments sequentially
for (i, a) in enumerate(assigndb):
a["n"] = "%03d" % (i+1)
s = json.dumps(assigndb, indent=2, sort_keys=True, key=key_func)
...but of course dumps doesn't support a custom key like list.sort() does. Something with a custom JSONEncoder maybe? I can't seem to get it going.
An idea (tested with 2.7):
# Demonstration: dump an OrderedDict whose keys were inserted in a
# non-alphabetical order and inspect the resulting JSON text.
import json
import collections
# Replaces the json.encoder module's ``c_make_encoder`` attribute with None.
# NOTE(review): presumably this makes the module fall back to its pure-Python
# encoder; the accompanying text says the hack seems to be needed only on
# Python 2.x - confirm before relying on it elsewhere.
json.encoder.c_make_encoder = None
# "b" is deliberately inserted before "a".
d = collections.OrderedDict([("b", 2), ("a", 1)])
json.dumps(d)
# '{"b": 2, "a": 1}'
See: OrderedDict + issue6105. The c_make_encoder hack seems only to be needed for Python 2.x. Not a direct solution because you have to change dicts for OrderedDicts, but it may be still usable. I checked the json library (encoder.py) and the ordering is hardcoded:
if _sort_keys:
items = sorted(dct.items(), key=lambda kv: kv[0])
This is kind of ugly, but in case tokland's solution does not work for you:
data = [
    {'category': 'nestedloops', 'title': 'Number Puzzle I', 'url': 'number-puzzle-i.html', 'n': '087', 'points': '60', 'key': 'numberpuzzles1'},
    {'category': 'nestedloops', 'title': 'Getting Individual Digits', 'url': 'getting-individual-digits.html', 'n': '088', 'points': '80', 'key': 'gettingindividualdigits'},
]
# Desired position of every known key in the serialized objects.
ordering = {'key': 0, 'url': 1, 'title': 2, 'category': 3,
            'flags': 4, 'points': 5, 'n': 6}

# Serialize each dict one key at a time, in the desired order, and glue the
# fragments back together into one JSON object per element.
outlist = []
for d in data:
    # Keys missing from ``ordering`` sort after all known keys (keeping their
    # relative order, since sorted() is stable) instead of raising KeyError.
    ordered_keys = sorted(d, key=lambda k: ordering.get(k, len(ordering)))
    # json.dumps({k: v}) yields '{"k": v}'; [1:-1] strips the outer braces so
    # the fragments can be joined inside a single pair of braces.
    fragments = (json.dumps({k: d[k]})[1:-1] for k in ordered_keys)
    outlist.append("{" + ",".join(fragments) + "}")
s = "[" + ",".join(outlist) + "]"
Compact yet powerful recursive implementation with "prepended" and "appended" keys: https://gist.github.com/jeromerg/91f73d5867c5fa04ee7dbc0c5a03d611
def sort_recursive(node, first_keys=(), last_keys=()):
    """Sort the dictionary entries in a whole JSON object tree.

    Every dict found anywhere inside ``node`` (including dicts nested in
    lists) is rebuilt with its keys in this order:

    1. keys named in ``first_keys``, in the order given there;
    2. all other keys, in ascending order;
    3. keys named in ``last_keys``, in the order given there.

    Args:
        node: A JSON-like value (dict, list, or scalar).
        first_keys: Keys to place at the start of every dict. Optional;
            defaults to no pinned keys.
        last_keys: Keys to place at the end of every dict. Optional;
            defaults to no pinned keys. A key named in both sequences is
            placed according to ``last_keys``.

    Returns:
        A new structure with freshly built dicts and lists; values that are
        neither dict nor list are returned unchanged. The ordering is
        expressed through dict insertion order.
    """
    # The first tuple element selects the band (0 = first, 1 = middle,
    # 2 = last); the second orders keys within the band. ``last_keys`` is
    # merged second, so it wins when a key is named in both sequences.
    fixed_placements = {
        **{key: (0, idx) for idx, key in enumerate(first_keys)},
        **{key: (2, idx) for idx, key in enumerate(last_keys)},
    }
    return _sort_recursive(node, lambda key: fixed_placements.get(key, (1, key)))


def _sort_recursive(node, key_fn):
    """Rebuild ``node`` with every nested dict ordered by ``key_fn``."""
    if isinstance(node, list):
        return [_sort_recursive(val, key_fn) for val in node]
    if isinstance(node, dict):
        return {
            k: _sort_recursive(node[k], key_fn)
            for k in sorted(node, key=key_fn)
        }
    return node
I had the same problem and collections.OrderedDict was just not fit for the task because it ordered everything alphabetically. So I wrote something similar to Andrew Clark's solution:
def json_dumps_sorted(data, **kwargs):
    """Serialize a list of dicts to JSON with a caller-defined key order.

    Keyword Args:
        sorted_keys: Sequence of keys in the order they should appear in
            every serialized object. When missing or empty, the result is
            simply ``json.dumps(data)``.

    With ``sorted_keys`` given, ``data`` is iterated and each element is
    treated as a dict: keys named in ``sorted_keys`` come first, in that
    order (names an element does not contain are skipped), followed by the
    element's remaining keys in their existing order.

    Returns:
        The JSON text as a ``str``.
    """
    sorted_keys = kwargs.get('sorted_keys', tuple())
    if not sorted_keys:
        return json.dumps(data)

    out_list = []
    for element in data:
        # Requested keys first, then whatever else the element holds, so no
        # data is dropped from the output. ``seen`` also guards against a
        # key listed twice in ``sorted_keys`` being emitted twice.
        ordered_keys = []
        seen = set()
        for key in list(sorted_keys) + list(element):
            if key in element and key not in seen:
                seen.add(key)
                ordered_keys.append(key)
        # json.dumps({k: v}) yields '{"k": v}'; [1:-1] strips the braces so
        # the fragments can be joined into one object in the chosen order.
        fragments = (json.dumps({key: element[key]})[1:-1] for key in ordered_keys)
        out_list.append('{' + ','.join(fragments) + '}')
    return '[' + ','.join(out_list) + ']'
You use it like this:
json_string = json_dumps_sorted([
{
"key": "numberpuzzles1",
"url": "number-puzzle-i.html",
"title": "Number Puzzle I",
"category": "nestedloops",
"points": "60",
"n": "087"
}, {
"key": "gettingindividualdigits",
"url": "getting-individual-digits.html",
"title": "Getting Individual Digits",
"category": "nestedloops",
"points": "80",
"n": "088"
}
], sorted_keys=(
'key',
'url',
'title',
'category',
'flags',
'points',
'n'
))
Thanks. I needed to put a timestamp key:value at the top of my JSON object no matter what. Obviously sorting the keys screwed this up as it starts with "t".
Using something like this, while putting the timestamp key in the dict_data right away worked:
d = collections.OrderedDict(dict_data)

How to perform quick upleveling in python?

I have the following object in python:
{
name: John,
age: {
years:18
},
computer_skills: {
years:4
},
mile_runner: {
years:2
}
}
I have an array with 100 people with the same structure.
What is the best way to go through all 100 people and make it such that there is no more "years"? In other words, each object in the 100 would look something like:
{
name: John,
age:18,
computer_skills:4,
mile_runner:2
}
I know I can do something in pseudocode:
for(item in list):
if('years' in (specific key)):
specifickey = item[(specific key)][(years)]
But is there a smarter/more efficent way?
Your pseudo-code is already pretty good I think:
for person in persons:
for k, v in person.items():
if isinstance(v, dict) and 'years' in v:
person[k] = v['years']
This overwrites every property which is a dictionary that has a years property with that property’s value.
Unlike other solutions (like dict comprehensions), this will modify the object in-place, so no new memory to keep everything is required.
def flatten(d):
    """Return a copy of ``d`` with single-entry ``{"years": n}`` values unwrapped.

    A value is replaced by its ``"years"`` entry only when it is a dict whose
    sole key is ``"years"``; every other value is copied across unchanged.
    The input dict is not modified.
    """
    ret = {}
    # ``items()`` works on both Python 2 and Python 3, whereas
    # ``iteritems()`` exists only on Python 2.
    for key, value in d.items():
        if isinstance(value, dict) and len(value) == 1 and "years" in value:
            ret[key] = value["years"]
        else:
            ret[key] = value
    return ret
d = {
"name": "John",
"age": {
"years":18
},
"computer_skills": {
"years":4
},
"mile_runner": {
"years":2
}
}
print flatten(d)
Result:
{'age': 18, 'mile_runner': 2, 'name': 'John', 'computer_skills': 4}
Dictionary comprehension:
import json
with open("input.json") as f:
    cont = json.load(f)
# Unwrap only values that are real JSON objects containing "years". Without
# the isinstance() check, ``"years" in value`` performs a substring search on
# string values and raises TypeError on numbers.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print({
    el: value["years"] if isinstance(value, dict) and "years" in value else value
    for el, value in cont.items()
})
prints
{u'age': 18, u'mile_runner': 2, u'name': u'John', u'computer_skills': 4}
where input.json contains
{
"name": "John",
"age": {
"years":18
},
"computer_skills": {
"years":4
},
"mile_runner": {
"years":2
}
}
Linear with regards to number of elements, you can't really hope for any lower.
As people said in the comments, it isn't exactly clear what your "object" is, but assuming that you actually have a list of dicts like this:
list = [{
'name': 'John',
'age': {
'years': 18
},
'computer_skills': {
'years':4
},
'mile_runner': {
'years':2
}
}]
Then you can do something like this:
for item in list:
for key in item:
try:
item[key] = item[key]['years']
except (TypeError, KeyError):
pass
Result:
list = [{'age': 18, 'mile_runner': 2, 'name': 'John', 'computer_skills': 4}]

custom JSON sort_keys order in Python

Is there any way in Python 2.6 to supply a custom key or cmp function to JSON's sort_keys?
I've got a list of dicts coming from JSON like so:
[
{
"key": "numberpuzzles1",
"url": "number-puzzle-i.html",
"title": "Number Puzzle I",
"category": "nestedloops",
"points": "60",
"n": "087"
},
{
"key": "gettingindividualdigits",
"url": "getting-individual-digits.html",
"title": "Getting Individual Digits",
"category": "nestedloops",
"points": "80",
"n": "088"
}
]
...which I've stored into the list variable assigndb. I'd like to be able to load in the JSON, modify it, and serialized it back out with dumps (or whatever), keeping the orders of the keys intact.
So far, I've tried something like this:
ordering = {'key': 0, 'url': 1, 'title': 2, 'category': 3,
'flags': 4, 'points': 5, 'n': 6}
def key_func(k):
return ordering[k]
# renumber assignments sequentially
for (i, a) in enumerate(assigndb):
a["n"] = "%03d" % (i+1)
s = json.dumps(assigndb, indent=2, sort_keys=True, key=key_func)
...but of course dumps doesn't support a custom key like list.sort() does. Something with a custom JSONEncoder maybe? I can't seem to get it going.
An idea (tested with 2.7):
# Demonstration: dump an OrderedDict whose keys were inserted in a
# non-alphabetical order and inspect the resulting JSON text.
import json
import collections
# Replaces the json.encoder module's ``c_make_encoder`` attribute with None.
# NOTE(review): presumably this makes the module fall back to its pure-Python
# encoder; the accompanying text says the hack seems to be needed only on
# Python 2.x - confirm before relying on it elsewhere.
json.encoder.c_make_encoder = None
# "b" is deliberately inserted before "a".
d = collections.OrderedDict([("b", 2), ("a", 1)])
json.dumps(d)
# '{"b": 2, "a": 1}'
See: OrderedDict + issue6105. The c_make_encoder hack seems only to be needed for Python 2.x. Not a direct solution because you have to change dicts for OrderedDicts, but it may be still usable. I checked the json library (encoder.py) and the ordering is hardcoded:
if _sort_keys:
items = sorted(dct.items(), key=lambda kv: kv[0])
This is kind of ugly, but in case tokland's solution does not work for you:
data = [
    {'category': 'nestedloops', 'title': 'Number Puzzle I', 'url': 'number-puzzle-i.html', 'n': '087', 'points': '60', 'key': 'numberpuzzles1'},
    {'category': 'nestedloops', 'title': 'Getting Individual Digits', 'url': 'getting-individual-digits.html', 'n': '088', 'points': '80', 'key': 'gettingindividualdigits'},
]
# Desired position of every known key in the serialized objects.
ordering = {'key': 0, 'url': 1, 'title': 2, 'category': 3,
            'flags': 4, 'points': 5, 'n': 6}

# Serialize each dict one key at a time, in the desired order, and glue the
# fragments back together into one JSON object per element.
outlist = []
for d in data:
    # Keys missing from ``ordering`` sort after all known keys (keeping their
    # relative order, since sorted() is stable) instead of raising KeyError.
    ordered_keys = sorted(d, key=lambda k: ordering.get(k, len(ordering)))
    # json.dumps({k: v}) yields '{"k": v}'; [1:-1] strips the outer braces so
    # the fragments can be joined inside a single pair of braces.
    fragments = (json.dumps({k: d[k]})[1:-1] for k in ordered_keys)
    outlist.append("{" + ",".join(fragments) + "}")
s = "[" + ",".join(outlist) + "]"
Compact yet powerful recursive implementation with "prepended" and "appended" keys: https://gist.github.com/jeromerg/91f73d5867c5fa04ee7dbc0c5a03d611
def sort_recursive(node, first_keys=(), last_keys=()):
    """Sort the dictionary entries in a whole JSON object tree.

    Every dict found anywhere inside ``node`` (including dicts nested in
    lists) is rebuilt with its keys in this order:

    1. keys named in ``first_keys``, in the order given there;
    2. all other keys, in ascending order;
    3. keys named in ``last_keys``, in the order given there.

    Args:
        node: A JSON-like value (dict, list, or scalar).
        first_keys: Keys to place at the start of every dict. Optional;
            defaults to no pinned keys.
        last_keys: Keys to place at the end of every dict. Optional;
            defaults to no pinned keys. A key named in both sequences is
            placed according to ``last_keys``.

    Returns:
        A new structure with freshly built dicts and lists; values that are
        neither dict nor list are returned unchanged. The ordering is
        expressed through dict insertion order.
    """
    # The first tuple element selects the band (0 = first, 1 = middle,
    # 2 = last); the second orders keys within the band. ``last_keys`` is
    # merged second, so it wins when a key is named in both sequences.
    fixed_placements = {
        **{key: (0, idx) for idx, key in enumerate(first_keys)},
        **{key: (2, idx) for idx, key in enumerate(last_keys)},
    }
    return _sort_recursive(node, lambda key: fixed_placements.get(key, (1, key)))


def _sort_recursive(node, key_fn):
    """Rebuild ``node`` with every nested dict ordered by ``key_fn``."""
    if isinstance(node, list):
        return [_sort_recursive(val, key_fn) for val in node]
    if isinstance(node, dict):
        return {
            k: _sort_recursive(node[k], key_fn)
            for k in sorted(node, key=key_fn)
        }
    return node
I had the same problem and collections.OrderedDict was just not fit for the task because it ordered everything alphabetically. So I wrote something similar to Andrew Clark's solution:
def json_dumps_sorted(data, **kwargs):
    """Serialize a list of dicts to JSON with a caller-defined key order.

    Keyword Args:
        sorted_keys: Sequence of keys in the order they should appear in
            every serialized object. When missing or empty, the result is
            simply ``json.dumps(data)``.

    With ``sorted_keys`` given, ``data`` is iterated and each element is
    treated as a dict: keys named in ``sorted_keys`` come first, in that
    order (names an element does not contain are skipped), followed by the
    element's remaining keys in their existing order.

    Returns:
        The JSON text as a ``str``.
    """
    sorted_keys = kwargs.get('sorted_keys', tuple())
    if not sorted_keys:
        return json.dumps(data)

    out_list = []
    for element in data:
        # Requested keys first, then whatever else the element holds, so no
        # data is dropped from the output. ``seen`` also guards against a
        # key listed twice in ``sorted_keys`` being emitted twice.
        ordered_keys = []
        seen = set()
        for key in list(sorted_keys) + list(element):
            if key in element and key not in seen:
                seen.add(key)
                ordered_keys.append(key)
        # json.dumps({k: v}) yields '{"k": v}'; [1:-1] strips the braces so
        # the fragments can be joined into one object in the chosen order.
        fragments = (json.dumps({key: element[key]})[1:-1] for key in ordered_keys)
        out_list.append('{' + ','.join(fragments) + '}')
    return '[' + ','.join(out_list) + ']'
You use it like this:
json_string = json_dumps_sorted([
{
"key": "numberpuzzles1",
"url": "number-puzzle-i.html",
"title": "Number Puzzle I",
"category": "nestedloops",
"points": "60",
"n": "087"
}, {
"key": "gettingindividualdigits",
"url": "getting-individual-digits.html",
"title": "Getting Individual Digits",
"category": "nestedloops",
"points": "80",
"n": "088"
}
], sorted_keys=(
'key',
'url',
'title',
'category',
'flags',
'points',
'n'
))
Thanks. I needed to put a timestamp key:value at the top of my JSON object no matter what. Obviously sorting the keys screwed this up as it starts with "t".
Using something like this, while putting the timestamp key in the dict_data right away worked:
d = collections.OrderedDict(dict_data)

Categories