Python: dynamic nested dictionary to CSV

The obtained output below are from query results.
{'_id': ObjectId('651f3e6e5723b7c1'), 'fruits': {'pineapple': '2', 'grape': '0', 'apple': 'unknown'},'day': 'Tues', 'month': 'July', 'address': 'long', 'buyer': 'B1001', 'seller': 'S1301', 'date': {'date': 210324}}
{'_id': ObjectId('651f3e6e5723b7c1'), 'fruits': {'lemon': '2', 'grape': '0', 'apple': 'unknown', 'strawberry': '1'},'day': 'Mon', 'month': 'January', 'address': 'longer', 'buyer': 'B1001', 'seller': 'S1301', 'date': {'date': 210324}}
#worked but not with fruits and dynamic header
date = json.dumps(q['date']) #convert it to string
date = re.split("(:|\}| )", date)[4] #and split to get value
for q in db.fruits.aggregate(query):
print('"' + q['day'] + '","' + q['month'] + '","' + date + '","' + q['time'] + '","' + q['buyer'] + '","' + q['seller'] + '"')
#below close to what I want but having issue with nested and repeated rows
ffile = open("fruits.csv", "w")
w = csv.DictWriter(ffile, q.keys())
w.writeheader()
w.writerow(q)
I want to create a csv from it.
I am able to get everything exactly like the below table shown but not the fruits. I am stuck at nested dictionary field, and with the dynamic table header.
Mongoexport doesn’t work for me at the moment.
The field fruits could have more different nested key and value for each time.
I am currently still trying/exploring on csv.writer and try to add condition if i found nested dict. [will update answer if i manage to create the csv]
A hint to create this csv will be nice to have.
Thank you if anyone is sharing the link to similar question.

Not a problem!
We'll need to flatten the deep structure so we can collect all possible keys from it to form a CSV with. That requires a recursive function (flatten_dict here) to take an input dict and turn it into an output dict that contains no more dicts; here, the keys are tuples, e.g. ('foo', 'bar', 'baz').
We run that function over all input rows, gathering up the keys we've encountered along the way to the known_keys set.
That set is sorted (since we assume that the original dicts don't really have an intrinsic order either) and the path components are joined with dots to form the CSV header row.
Then, the flattened rows are simply iterated over and written (taking care to write an empty string for non-existent values).
The output is e.g.
_id,address,buyer,date.date,day,fruits.apple,fruits.grape,fruits.lemon,fruits.pineapple,fruits.strawberry,month,seller
651f3e6e5723b7c1,long,B1001,210324,Tues,unknown,0,,2,,July,S1301
651f3e6e5723b7c2,longer,B1001,210324,Mon,unknown,0,2,,1,January,S1301
import csv
import sys

# Sample query results, as plain Python literals.
# NOTE(review): the ObjectId(...) values from the original query output are
# represented here as plain strings — confirm that is acceptable downstream.
rows = [
    {
        "_id": "651f3e6e5723b7c1",
        "fruits": {"pineapple": "2", "grape": "0", "apple": "unknown"},
        "day": "Tues",
        "month": "July",
        "address": "long",
        "buyer": "B1001",
        "seller": "S1301",
        "date": {"date": 210324},
    },
    {
        "_id": "651f3e6e5723b7c2",
        # The set of fruit keys differs per row — this is the "dynamic"
        # part that the flattening step must accommodate.
        "fruits": {
            "lemon": "2",
            "grape": "0",
            "apple": "unknown",
            "strawberry": "1",
        },
        "day": "Mon",
        "month": "January",
        "address": "longer",
        "buyer": "B1001",
        "seller": "S1301",
        "date": {"date": 210324},
    },
]
def flatten_dict(d: dict) -> dict:
    """
    Flatten hierarchical dicts into a dict of path tuples -> deep values.

    Nested dicts recurse on their items and nested lists recurse on their
    enumerated elements (so list indices become path components), e.g.
    {"a": {"b": 1}} -> {("a", "b"): 1}.
    """
    out = {}

    def _flatten_into(into, pairs, prefix=()):
        for key, value in pairs:
            p_key = prefix + (key,)
            if isinstance(value, list):
                # BUG FIX: was enumerate(list) — enumerating the builtin
                # `list` type instead of the value, which raised
                # "TypeError: 'type' object is not iterable".
                _flatten_into(into, enumerate(value), p_key)
            elif isinstance(value, dict):
                _flatten_into(into, value.items(), p_key)
            else:
                # Write through the accumulator parameter for consistency
                # (the original closed over `out` directly, ignoring `into`).
                into[p_key] = value

    _flatten_into(out, d.items())
    return out
# Flatten every input row, accumulating the union of all key paths so the
# CSV header covers every column that appears anywhere in the data.
known_keys = set()
flat_rows = []
for row in rows:
    flat = flatten_dict(row)
    known_keys.update(flat)
    flat_rows.append(flat)

# Deterministic column order: sort the path tuples, then dot-join them
# into header names like "fruits.apple".
ordered_keys = sorted(known_keys)
writer = csv.writer(sys.stdout)
writer.writerow(".".join(map(str, key)) for key in ordered_keys)
for flat in flat_rows:
    # Missing columns for a given row become empty strings.
    writer.writerow(str(flat.get(key, "")) for key in ordered_keys)

Related

Statistics on a list of dictionaries considering multiples keys

I have a list of dicts:
input = [{'name':'A', 'Status':'Passed','id':'x1'},
{'name':'A', 'Status':'Passed','id':'x2'},
{'name':'A','Status':'Failed','id':'x3'},
{'name':'B', 'Status':'Passed','id':'x4'},
{'name':'B', 'Status':'Passed','id':'x5'}]
I want an output like :
output = [{'name':'A', 'Passed':'2', 'Failed':'1', 'Total':'3', '%Pass':'66%'},
{'name':'B', 'Passed':'2', 'Failed':'0', 'Total':'2', '%Pass':'100%'},
{'name':'Total', 'Passed':'4', 'Failed':'1', 'Total':'5', '%Pass':'80%'}]\
i started retrieving the different names by using a lookup :
lookup = {(d["name"]): d for d in input [::-1]}
names= [e for e in lookup.values()]
names= names[::-1]
and after using the list comprehension something like :\
for name in names :
name_passed = sum(["Passed" and "name" for d in input if 'Status' in d and name in d])
name_faled = sum(["Failed" and "name" for d in input if 'Status' in d and name in d])\
But i am not sure if there is a smartest way ? a simple loop and comparing dict values will be more simple!?
Assuming your input entries will always be grouped according to the "name" key-value pair:
entries = [
    {"name": "A", "Status": "Passed", "id": "x1"},
    {"name": "A", "Status": "Passed", "id": "x2"},
    {"name": "A", "Status": "Failed", "id": "x3"},
    {"name": "B", "Status": "Passed", "id": "x4"},
    {"name": "B", "Status": "Passed", "id": "x5"}
]


def to_grouped(entries):
    """Yield one pass/fail summary dict per consecutive "name" group.

    Assumes entries are already grouped by their "name" value, since
    itertools.groupby only merges adjacent items. Each yielded dict has
    string values: {"name", "Passed", "Failed", "Total", "%Pass"}.
    """
    from itertools import groupby
    from operator import itemgetter

    for key, group_iter in groupby(entries, key=itemgetter("name")):
        group = list(group_iter)
        total = len(group)
        passed = sum(1 for entry in group if entry["Status"] == "Passed")
        failed = total - passed
        # BUG FIX: (100 // total) * passed loses precision for totals that
        # don't divide 100 evenly (e.g. 7 passed of 7 gave 98%). Multiply
        # first, then floor-divide. Sample outputs (66%, 100%) unchanged.
        perc_pass = passed * 100 // total
        yield {
            "name": key,
            "Passed": str(passed),
            "Failed": str(failed),
            "Total": str(total),
            "%Pass": f"{perc_pass:.0f}%"
        }


print(list(to_grouped(entries)))
Output:
[{'name': 'A', 'Passed': '2', 'Failed': '1', 'Total': '3', '%Pass': '66%'}, {'name': 'B', 'Passed': '2', 'Failed': '0', 'Total': '2', '%Pass': '100%'}]
This will not create the final entry you're looking for, which sums the statistics of all other entries. Though, that shouldn't be too hard to do.

Comparing 2 JSON files using Python and outputting the difference in a new output file

I am trying to compare 2 json files to do a delta check between them.
Exising json:
{
"rules": [
{
"customer_id": "abc123",
"id": "1",
},
{
"customer_id": "xyz456",
"id": "2",
}
]
}
Updated json:
{
"rules": [
{
"customer_id": "abc123",
"id": "1",
},
{
"customer_id": "def456",
"id": "3",
},
{
"customer_id": "xyz789",
"id": "2",
}
]
}
What i want is my code to get the new objects from the new json(in this case id:3 and customer id def456)
however i also want to keep the original existing values (id:2 customer id should remain as xyz456 instead of updated to the new value xyz789)
Here is my current code:
import json
# Opening JSON file
f = open('1.json',)
y = open('2.json',)
# returns JSON object as a dictionary
less_data = json.load(f)
more_data = json.load(y)
# Iterating through the json list
for x in more_data['rules']:
for y in less_data['rules']:
if x['id']== y['id']:
print("x:" + x['id'],"y:" + y['id'])
break
print(x['id'] + " is not found")
//take action to add in new objects into json output
running the program i get the following output:
x:1 y:1
1 is found
x:3 y:1
3 is not found
x:3 y:2
3 is not found
x:2 y:1
2 is not found
x:2 y:2
2 is found
I only want "3 is not found" to be printed once, after running till the end of the inner for loop, instead of printing it out every iteration. Any help would be appreciated.
You can try flattening the JSON & comparing its keys, since dict/JSON datatypes are unordered:
If your list ordering matters
import collections.abc
from itertools import chain  # unused in this snippet; kept from the original


def flatten(d, parent_key='', sep='__'):
    """Flatten nested dicts (and lists of dicts) into a single-level dict.

    Path components are joined with `sep`; list elements contribute their
    index as a component, e.g. {'rules': [{'id': '1'}]} -> {'rules__0__id': '1'}.
    """
    items = []
    for k, v in d.items():
        new_key = parent_key + sep + k if parent_key else k
        # BUG FIX: collections.MutableMapping was removed in Python 3.10;
        # the ABC lives in collections.abc.
        if isinstance(v, collections.abc.MutableMapping):
            items.extend(flatten(v, new_key, sep=sep).items())
        elif isinstance(v, list):
            # NOTE(review): list elements are assumed to be dicts; a scalar
            # element would raise AttributeError in the recursive call.
            for idx, value in enumerate(v):
                items.extend(flatten(value, new_key + sep + str(idx), sep).items())
        else:
            items.append((new_key, v))
    return dict(items)


a1 = {'rules': [{'customer_id': 'abc123', 'id': '1'}, {'customer_id': 'xyz456', 'id': '2'}]}
a2 = {'rules': [{'customer_id': 'abc123', 'id': '1'}, {'customer_id': 'def456', 'id': '3'}, {'customer_id': 'xyz789', 'id': '2'}]}

flatten_a1 = flatten(a1)
flatten_a2 = flatten(a2)

print(flatten_a1)
print(flatten_a2)
Output:
>>> flatten_a1 => {'rules__0__customer_id': 'abc123', 'rules__0__id': '1', 'rules__1__customer_id': 'xyz456', 'rules__1__id': '2'}
>>> flatten_a2 => {'rules__0__customer_id': 'abc123', 'rules__0__id': '1', 'rules__1__customer_id': 'def456', 'rules__1__id': '3', 'rules__2__customer_id': 'xyz789', 'rules__2__id': '2'}
from flatten_a1 & flatten_a2 value, you can easily find the differences as data structure comes in single level (i.e. not in nested format)
You can find:
which keys are missing
keys are present but values are different

python trasform data from csv to array of dictionaries and group by field value

I have csv like this:
id,company_name,country,country_id
1,batstop,usa, xx
2,biorice,italy, yy
1,batstop,italy, yy
3,legstart,canada, zz
I want an array of dictionaries to import to firebase. I need to group the different country informations for the same company in a nested list of dictionaries. This is the desired output:
[ {'id':'1', 'agency_name':'batstop', 'countries': [{'country':'usa','country_id':'xx'}, {'country':'italy','country_id':'yy'}]} ,
{'id':'2', 'agency_name':'biorice', 'countries': [{'country':'italy','country_id':'yy'}]},
{'id':'3', 'agency_name':'legstart', 'countries': [{'country':'canada','country_id':'zz'}]} ]
Recently I had a similar task, the groupby function from itertools and the itemgetter function from operator - both standard python libraries - helped me a lot. Here's the code considering your csv, note how defining the primary keys of your csv dataset is important.
import csv
import json
from operator import itemgetter
from itertools import groupby

# Columns that uniquely identify a company; rows sharing these values are
# merged into a single output record.
primary_keys = ['id', 'company_name']

# Start extraction
with open('input.csv', 'r') as file:
    # Read data from csv
    reader = csv.DictReader(file)
    # Sort data accordingly to primary keys
    # (groupby only merges *adjacent* rows, so pre-sorting by the same key
    # is required; sorted() also materializes the reader while the file is
    # still open, so the rows survive past the `with` block)
    reader = sorted(reader, key=itemgetter(*primary_keys))
    # Create a list of tuples
    # Each tuple containing a dict of the group primary keys and its values, and a list of the group ordered dicts
    # itemgetter with several keys returns a tuple, so _[0] zips
    # positionally back onto primary_keys.
    groups = [(dict(zip(primary_keys, _[0])), list(_[1])) for _ in groupby(reader, key=itemgetter(*primary_keys))]

# Create formatted dict to be converted into firebase objects
group_dicts = []
for group in groups:
    group_dict = {
        "id": group[0]['id'],
        "agency_name": group[0]['company_name'],
        # Each member row of the group contributes one country sub-dict.
        "countries": [
            dict(country=_['country'], country_id=_['country_id']) for _ in group[1]
        ],
    }
    group_dicts.append(group_dict)

print("\n".join([json.dumps(_, indent=2) for _ in group_dicts]))
Here's the output:
{
"id": "1",
"agency_name": "batstop",
"countries": [
{
"country": "usa",
"country_id": " xx"
},
{
"country": "italy",
"country_id": " yy"
}
]
}
{
"id": "2",
"agency_name": "biorice",
"countries": [
{
"country": "italy",
"country_id": " yy"
}
]
}
{
"id": "3",
"agency_name": "legstart",
"countries": [
{
"country": "canada",
"country_id": " zz"
}
]
}
There's no external library,
Hope it suits you well!
You can try this, you may have to change a few parts to get it working with your csv, but hope it's enough to get you started:
# Input rows as raw comma-separated strings.
csv = [
    "1,batstop,usa, xx",
    "2,biorice,italy, yy",
    "1,batstop,italy, yy",
    "3,legstart,canada, zz"
]

output = {}  # dictionary useful to avoid searching in list for existing ids

# Parse each row
for line in csv:
    cols = line.split(',')
    # NOTE: the leading space in fields like " xx" is preserved, matching
    # the original behavior (no strip() on split parts).
    row_id = int(cols[0])  # renamed from `id` to avoid shadowing the builtin
    agency_name = cols[1]
    country = cols[2]
    country_id = cols[3]
    country_entry = {'country': country, 'country_id': country_id}
    if row_id in output:
        # BUG FIX: the original appended [country_entry] (a one-element
        # list), nesting a list inside 'countries' for repeated ids.
        # Append the dict itself so 'countries' stays a flat list of dicts.
        output[row_id]['countries'].append(country_entry)
    else:
        output[row_id] = {'id': row_id,
                          'agency_name': agency_name,
                          'countries': [country_entry]}

# Put into list (dicts preserve insertion order on Python 3.7+)
json_output = list(output.values())

# Check output
for row in json_output:
    print(row)

How do I force one specific key to come at the top of a python dict or JSON dump [duplicate]

Is there any way in Python 2.6 to supply a custom key or cmp function to JSON's sort_keys?
I've got a list of dicts coming from JSON like so:
[
{
"key": "numberpuzzles1",
"url": "number-puzzle-i.html",
"title": "Number Puzzle I",
"category": "nestedloops",
"points": "60",
"n": "087"
},
{
"key": "gettingindividualdigits",
"url": "getting-individual-digits.html",
"title": "Getting Individual Digits",
"category": "nestedloops",
"points": "80",
"n": "088"
}
]
...which I've stored into the list variable assigndb. I'd like to be able to load in the JSON, modify it, and serialize it back out with dumps (or whatever), keeping the order of the keys intact.
So far, I've tried something like this:
ordering = {'key': 0, 'url': 1, 'title': 2, 'category': 3,
'flags': 4, 'points': 5, 'n': 6}
def key_func(k):
return ordering[k]
# renumber assignments sequentially
for (i, a) in enumerate(assigndb):
a["n"] = "%03d" % (i+1)
s = json.dumps(assigndb, indent=2, sort_keys=True, key=key_func)
...but of course dumps doesn't support a custom key like list.sort() does. Something with a custom JSONEncoder maybe? I can't seem to get it going.
An idea (tested with 2.7):
import json
import collections

# Force the pure-Python encoder. On Python 2.x the C encoder ignored
# OrderedDict ordering, so disabling it was needed to preserve key order;
# modern Python 3 preserves dict/OrderedDict order without this hack.
json.encoder.c_make_encoder = None
d = collections.OrderedDict([("b", 2), ("a", 1)])
json.dumps(d)
# '{"b": 2, "a": 1}'
See: OrderedDict + issue6105. The c_make_encoder hack seems only to be needed for Python 2.x. Not a direct solution because you have to change dicts for OrderedDicts, but it may be still usable. I checked the json library (encode.py) and the ordered is hardcoded:
if _sort_keys:
items = sorted(dct.items(), key=lambda kv: kv[0])
This is kind of ugly, but in case tokland's solution does not work for you:
# Manually assemble the JSON text so each object's keys follow `ordering`.
data = [{'category': 'nestedloops', 'title': 'Number Puzzle I', 'url': 'number-puzzle-i.html', 'n': '087', 'points': '60', 'key': 'numberpuzzles1'}, {'category': 'nestedloops', 'title': 'Getting Individual Digits', 'url': 'getting-individual-digits.html', 'n': '088', 'points': '80', 'key': 'gettingindividualdigits'}]
ordering = {'key': 0, 'url': 1, 'title': 2, 'category': 3,
            'flags': 4, 'points': 5, 'n': 6}
outlist = []
for record in data:
    # Serialize each key/value pair on its own (json.dumps({k: v}) yields
    # '{"k": v}'), strip the braces with [1:-1], and stitch the fragments
    # back into a single object literal in the desired key order.
    pieces = [json.dumps({key: record[key]})[1:-1]
              for key in sorted(record, key=ordering.get)]
    outlist.append("{" + ",".join(pieces) + "}")
s = "[" + ",".join(outlist) + "]"
Compact yet powerful recursive implementation with "prepended" and "appended" keys: https://gist.github.com/jeromerg/91f73d5867c5fa04ee7dbc0c5a03d611
def sort_recursive(node, first_keys, last_keys):
    """Recursively sort every dict in a JSON object tree.

    Keys listed in `first_keys` come first (in the given order), keys in
    `last_keys` come last, and all remaining keys sort alphabetically in
    between.
    """
    # Sort key: a (bucket, rank) tuple — bucket 0 for pinned-first keys,
    # bucket 2 for pinned-last keys, bucket 1 (ranked by the key itself)
    # for everything else.
    placement = {}
    for rank, key in enumerate(first_keys):
        placement[key] = (0, rank)
    for rank, key in enumerate(last_keys):
        placement[key] = (2, rank)
    return _sort_recursive(node, lambda key: placement.get(key, (1, key)))


def _sort_recursive(node, key_fn):
    # Lists keep their order; only their elements are rewritten.
    if isinstance(node, list):
        return [_sort_recursive(item, key_fn) for item in node]
    # Rebuild dicts with keys in the desired order (insertion order is
    # preserved on Python 3.7+).
    if isinstance(node, dict):
        return {key: _sort_recursive(node[key], key_fn)
                for key in sorted(node, key=key_fn)}
    return node
I had the same problem and collections.OrderedDict was just not fit for the task because it ordered everything alphabetically. So I wrote something similar to Andrew Clark's solution:
def json_dumps_sorted(data, **kwargs):
    """Serialize a list of dicts to JSON, emitting each object's keys in
    the order given by the `sorted_keys` keyword argument.

    Keys absent from `sorted_keys` are omitted from the output. Without
    `sorted_keys`, this defers to a plain json.dumps(data).
    """
    key_order = kwargs.get('sorted_keys', tuple())
    if not key_order:
        return json.dumps(data)
    objects = []
    for element in data:
        # dumps({k: v}) yields '{"k": v}'; strip the braces so the pairs
        # can be re-joined into one object literal in the desired order.
        pairs = [json.dumps({key: element[key]})[1:-1]
                 for key in key_order if key in element]
        objects.append('{{{}}}'.format(','.join(pairs)))
    return '[{}]'.format(','.join(objects))
You use it like this:
# Example usage: emit the dicts with keys in the explicit order below.
# Note that 'flags' is listed in sorted_keys but absent from the data,
# so it is simply skipped in the output.
json_string = json_dumps_sorted([
    {
        "key": "numberpuzzles1",
        "url": "number-puzzle-i.html",
        "title": "Number Puzzle I",
        "category": "nestedloops",
        "points": "60",
        "n": "087"
    }, {
        "key": "gettingindividualdigits",
        "url": "getting-individual-digits.html",
        "title": "Getting Individual Digits",
        "category": "nestedloops",
        "points": "80",
        "n": "088"
    }
], sorted_keys=(
    'key',
    'url',
    'title',
    'category',
    'flags',
    'points',
    'n'
))
Thanks. I needed to put a timestamp key:value at the top of my JSON object no matter what. Obviously sorting the keys screwed this up as it starts with "t".
Using something like this, while putting the timestamp key in the dict_data right away worked:
d = collections.OrderedDict(dict_data)

custom JSON sort_keys order in Python

Is there any way in Python 2.6 to supply a custom key or cmp function to JSON's sort_keys?
I've got a list of dicts coming from JSON like so:
[
{
"key": "numberpuzzles1",
"url": "number-puzzle-i.html",
"title": "Number Puzzle I",
"category": "nestedloops",
"points": "60",
"n": "087"
},
{
"key": "gettingindividualdigits",
"url": "getting-individual-digits.html",
"title": "Getting Individual Digits",
"category": "nestedloops",
"points": "80",
"n": "088"
}
]
...which I've stored into the list variable assigndb. I'd like to be able to load in the JSON, modify it, and serialize it back out with dumps (or whatever), keeping the order of the keys intact.
So far, I've tried something like this:
ordering = {'key': 0, 'url': 1, 'title': 2, 'category': 3,
'flags': 4, 'points': 5, 'n': 6}
def key_func(k):
return ordering[k]
# renumber assignments sequentially
for (i, a) in enumerate(assigndb):
a["n"] = "%03d" % (i+1)
s = json.dumps(assigndb, indent=2, sort_keys=True, key=key_func)
...but of course dumps doesn't support a custom key like list.sort() does. Something with a custom JSONEncoder maybe? I can't seem to get it going.
An idea (tested with 2.7):
import json
import collections

# Force the pure-Python encoder. On Python 2.x the C encoder ignored
# OrderedDict ordering, so disabling it was needed to preserve key order;
# modern Python 3 preserves dict/OrderedDict order without this hack.
json.encoder.c_make_encoder = None
d = collections.OrderedDict([("b", 2), ("a", 1)])
json.dumps(d)
# '{"b": 2, "a": 1}'
See: OrderedDict + issue6105. The c_make_encoder hack seems only to be needed for Python 2.x. Not a direct solution because you have to change dicts for OrderedDicts, but it may be still usable. I checked the json library (encode.py) and the ordered is hardcoded:
if _sort_keys:
items = sorted(dct.items(), key=lambda kv: kv[0])
This is kind of ugly, but in case tokland's solution does not work for you:
# Hand-rolled ordered dump: serialize each key/value pair separately in the
# order given by `ordering`, then splice the fragments back together.
data = [{'category': 'nestedloops', 'title': 'Number Puzzle I', 'url': 'number-puzzle-i.html', 'n': '087', 'points': '60', 'key': 'numberpuzzles1'}, {'category': 'nestedloops', 'title': 'Getting Individual Digits', 'url': 'getting-individual-digits.html', 'n': '088', 'points': '80', 'key': 'gettingindividualdigits'}]
ordering = {'key': 0, 'url': 1, 'title': 2, 'category': 3,
            'flags': 4, 'points': 5, 'n': 6}
outlist = []
for d in data:
    outlist.append([])
    for k in sorted(d.keys(), key=lambda k: ordering[k]):
        outlist[-1].append(json.dumps({k: d[k]}))
for i, l in enumerate(outlist):
    # '{"k": v}'[1:-1] strips the braces so the pairs join into one
    # object literal per input dict.
    outlist[i] = "{" + ",".join((s[1:-1] for s in outlist[i])) + "}"
s = "[" + ",".join(outlist) + "]"
Compact yet powerful recursive implementation with "prepended" and "appended" keys: https://gist.github.com/jeromerg/91f73d5867c5fa04ee7dbc0c5a03d611
def sort_recursive(node, first_keys, last_keys):
    """ Sort the dictionary entries in a whole JSON object tree"""
    # Map pinned keys to (bucket, index) sort keys: bucket 0 sorts before
    # the alphabetical middle bucket (1, key); bucket 2 sorts after it.
    fixed_placements = {
        **{key: (0, idx) for idx, key in enumerate(first_keys)},
        **{key: (2, idx) for idx, key in enumerate(last_keys)},
    }
    return _sort_recursive(node, lambda key: fixed_placements.get(key, (1, key)))


def _sort_recursive(node, key_fn):
    # Depth-first walk: lists keep their order, dicts are rebuilt with
    # keys sorted by key_fn (dict insertion order is preserved on 3.7+).
    if isinstance(node, list):
        return [_sort_recursive(val, key_fn) for val in node]
    elif isinstance(node, dict):
        sorted_keys = sorted(node.keys(), key=key_fn)
        return {k:_sort_recursive(node[k], key_fn) for k in sorted_keys}
    else:
        return node
I had the same problem and collections.OrderedDict was just not fit for the task because it ordered everything alphabetically. So I wrote something similar to Andrew Clark's solution:
def json_dumps_sorted(data, **kwargs):
    """Dump a list of dicts to JSON with each object's keys in the order
    given by the `sorted_keys` keyword argument.

    NOTE(review): keys not listed in sorted_keys are dropped from the
    output; without sorted_keys this falls back to plain json.dumps.
    """
    sorted_keys = kwargs.get('sorted_keys', tuple())
    if not sorted_keys:
        return json.dumps(data)
    else:
        out_list = []
        for element in data:
            element_list = []
            for key in sorted_keys:
                if key in element:
                    # dumps({k: v}) yields '{"k": v}'; the braces are
                    # stripped below so pairs re-join into one object.
                    element_list.append(json.dumps({key: element[key]}))
            out_list.append('{{{}}}'.format(','.join((s[1:-1] for s in element_list))))
        return '[{}]'.format(','.join(out_list))
You use it like this:
# Example usage: emit the dicts with keys in the explicit order below.
# Note that 'flags' is listed in sorted_keys but absent from the data,
# so it is simply skipped in the output.
json_string = json_dumps_sorted([
    {
        "key": "numberpuzzles1",
        "url": "number-puzzle-i.html",
        "title": "Number Puzzle I",
        "category": "nestedloops",
        "points": "60",
        "n": "087"
    }, {
        "key": "gettingindividualdigits",
        "url": "getting-individual-digits.html",
        "title": "Getting Individual Digits",
        "category": "nestedloops",
        "points": "80",
        "n": "088"
    }
], sorted_keys=(
    'key',
    'url',
    'title',
    'category',
    'flags',
    'points',
    'n'
))
Thanks. I needed to put a timestamp key:value at the top of my JSON object no matter what. Obviously sorting the keys screwed this up as it starts with "t".
Using something like this, while putting the timestamp key in the dict_data right away worked:
d = collections.OrderedDict(dict_data)

Categories