Python: Convert multiple columns of CSV file to nested Json - python

This is my input CSV file with multiple columns, I would like to convert this csv file to a json file with department, departmentID, and one nested field called customer and put first and last nested to this field.
department, departmentID, first, last
fans, 1, Caroline, Smith
fans, 1, Jenny, White
students, 2, Ben, CJ
students, 2, Joan, Carpenter
...
Output json file what I need:
[
{
"department" : "fans",
"departmentID: "1",
"customer" : [
{
"first" : "Caroline",
"last" : "Smith"
},
{
"first" : "Jenny",
"last" : "White"
}
]
},
{
"department" : "students",
"departmentID":2,
"user" :
[
{
"first" : "Ben",
"last" : "CJ"
},
{
"first" : "Joan",
"last" : "Carpenter"
}
]
}
]
my code:
from csv import DictReader
from itertools import groupby
from pprint import pprint  # BUG FIX: pprint was used below but never imported


def read_rows(path):
    """Read the CSV at *path* into a list of plain dicts.

    skipinitialspace=True trims the padding after the commas in the sample
    data ("fans, 1, Caroline, Smith").
    """
    with open(path) as csvfile:
        return [dict(row) for row in DictReader(csvfile, skipinitialspace=True)]


def group_rows(rows):
    """Group consecutive rows by (department, departmentID) and nest the
    remaining columns under "customer".

    Note: itertools.groupby only merges *consecutive* rows, so the input
    must already be ordered by the grouping columns (as the sample CSV is).
    """
    groups = []
    # BUG FIX: the original grouped on r['group'] / r['groupID'], keys that do
    # not exist in the CSV (columns are 'department' / 'departmentID').
    for key, members in groupby(rows, lambda r: (r['department'], r['departmentID'])):
        groups.append({
            "department": key[0],
            "departmentID": key[1],
            # Strip the grouping columns so they appear only at the outer level.
            "customer": [
                {k: v for k, v in row.items()
                 if k not in ('department', 'departmentID')}
                for row in members
            ],
        })
    return groups


if __name__ == "__main__":
    pprint(group_rows(read_rows('data.csv')))
My issue is: groupID shows up twice in the data — both outside and inside the nested JSON. What I want is to use department and departmentID together as the groupby key.

The issue was you mixed the names of the keys so this line
"user": [{k:v for k, v in d.items() if k != 'group'} for d in list(g)]
did not strip them properly from your dictionary there was no such key. So nothing was deleted.
I do not fully understand what keys you want so the following example assumes that data.csv looks exactly like in your question department and departmentID but the script converts it to group and groupID
from csv import DictReader
from itertools import groupby
from pprint import pprint

# Load the CSV into plain dicts; skipinitialspace strips the blank after
# each comma in the sample data.
with open('data.csv') as csvfile:
    reader = DictReader(csvfile, skipinitialspace=True)
    rows = [dict(row) for row in reader]

groups = []
uniquekeys = []
# groupby merges *consecutive* rows sharing the same (department, departmentID),
# which is fine here because the CSV is already ordered by department.
for (dept, dept_id), members in groupby(rows, lambda row: (row['department'], row['departmentID'])):
    nested = []
    for row in members:
        # Drop the grouping columns so they only appear at the outer level.
        nested.append({col: val for col, val in row.items()
                       if col not in ('department', 'departmentID')})
    groups.append({"group": dept, "groupID": dept_id, "user": nested})
    uniquekeys.append((dept, dept_id))
pprint(groups)
Output:
[{'group': 'fans',
'groupID': '1',
'user': [{'first': 'Caroline', 'last': 'Smith'},
{'first': 'Jenny', 'last': 'White'}]},
{'group': 'students',
'groupID': '2',
'user': [{'first': 'Ben', 'last': 'CJ'},
{'first': 'Joan', 'last': 'Carpenter'}]}]
I used different keys so it would be really obvious which line does what and easy to customize it for different keys in input or output

Related

Manipulating api response into a list of object key/value pairs

Having trouble manipulating this data for my front end.
Here is my API response, which is a list of dictionaries:
{
"name": "bob",
"age": 22,
"gender": male
},
{
"name": "zack",
"age": 43,
"gender": male
}
Here is the desired output, another list of dictionaries:
{id: 0, column: "age", bob: 22, zack: 43},
{id: 1, column: "gender", bob: male, zack: male}
So if another name was returned, it would simply be added as a key and grab the corresponding value for age/gender etc..
Here is the output I'm currently getting:
{id: 0, column: "age", bob: 22},
{id: 1, column: "gender", bob: male},
{id: 0, column: "age", zack: 43},
{id: 1, column: "gender", zack: male}
So for each list of dicts, I'm selecting the columns, using the name has a type of identifier, and assigning the corresponding value for a particular column.
I'm having trouble adding each person's name (the key in this case) to the same list of dicts with the corresponding value, age, gender, etc.. Here is the code I currently have.
# NOTE(review): indentation was lost in this paste — the loop bodies below are
# not indented, so this fragment does not parse as-is. Reconstructed intent:
# everything from `dict = {}` down belongs inside the inner `for k, v` loop.
my_list = []
counter = 0
# `data` is the API response (list of dicts) shown earlier on this page.
for d in data:
for k, v in d.items():
# shadows the builtin `dict`; a name like `row` would be clearer
dict = {}
length = len(d.keys())
# resets the per-record column counter; as written this emits one new row
# per (person, column) pair instead of merging people into shared rows,
# which is exactly the duplicated output the question describes
if counter == length:
counter = 0
value = d['name']
dict['id'] = counter
dict['column'] = k
dict[value] = v
my_list.append(dict)
counter += 1
Can someone point me in the right direction?
Use:
data = [{
    "name": "bob",
    "age": 22,
    "gender": "male"
}, {
    "name": "zack",
    "age": 43,
    "gender": "male"
}]

# One output row per column, keyed by column name for O(1) lookup while filling.
keys = ["age", "gender"]
lookup = {}
for row_id, column in enumerate(keys):
    lookup[column] = {"id": row_id, "column": column}

# Attach each person's value to the matching column row.
# Note: pop() removes "name" from the source dicts (mutates `data`).
for record in data:
    person = record.pop("name")
    for column, value in record.items():
        lookup[column][person] = value

res = list(lookup.values())
print(res)
Output
[{'id': 0, 'column': 'age', 'bob': 22, 'zack': 43}, {'id': 1, 'column': 'gender', 'bob': 'male', 'zack': 'male'}]
Or an alternative that does not alter the original dictionary:
# Same idea as above, but reads "name" instead of popping it, so the
# source dicts in `data` are left untouched.
keys = ["age", "gender"]
lookup = {}
for row_id, column in enumerate(keys):
    lookup[column] = {"id": row_id, "column": column}

for record in data:
    person = record["name"]
    # Iterate every key except "name" (set difference on the keys view).
    for column in record.keys() - {"name"}:
        lookup[column][person] = record[column]

res = list(lookup.values())
print(res)
Output
[{'id': 0, 'column': 'age', 'bob': 22, 'zack': 43}, {'id': 1, 'column': 'gender', 'bob': 'male', 'zack': 'male'}]
UPDATE
If the keys are not known before hand, you could do:
# Build the column rows lazily, so the set of columns need not be known
# up front; ids are assigned in first-encounter order.
lookup = {}
for d in data:
    name = d["name"]
    for key in (d.keys() - {"name"}):
        if key not in lookup:
            # BUG FIX: store the row dict directly. The original wrote
            # lookup[key] = {key: {"id": ..., "column": key}}, nesting the
            # row one level too deep, so the printed rows did not match
            # the documented output.
            lookup[key] = {"id": len(lookup), "column": key}
        lookup[key][name] = d[key]
res = list(lookup.values())
print(res)

Statistics on a list of dictionaries considering multiples keys

I have a list of dicts:
# Raw test results, one dict per test case.
# (NOTE: the variable name `input` shadows the builtin of the same name.)
input = [
    {'name': 'A', 'Status': 'Passed', 'id': 'x1'},
    {'name': 'A', 'Status': 'Passed', 'id': 'x2'},
    {'name': 'A', 'Status': 'Failed', 'id': 'x3'},
    {'name': 'B', 'Status': 'Passed', 'id': 'x4'},
    {'name': 'B', 'Status': 'Passed', 'id': 'x5'},
]
I want an output like :
output = [{'name':'A', 'Passed':'2', 'Failed':'1', 'Total':'3', '%Pass':'66%'},
{'name':'B', 'Passed':'2', 'Failed':'0', 'Total':'2', '%Pass':'100%'},
{'name':'Total', 'Passed':'4', 'Failed':'1', 'Total':'5', '%Pass':'80%'}]
i started retrieving the different names by using a lookup :
# Deduplicate by name: iterating the reversed list keeps the FIRST occurrence
# of each name last, then the final [::-1] restores original name order.
lookup = {row["name"]: row for row in input[::-1]}
names = list(lookup.values())[::-1]
and after, using a list comprehension, something like:
# NOTE(review): this fragment does not run as pasted — the loop body is not
# indented; `"Passed" and "name"` always evaluates to the string "name", so
# sum() over strings raises TypeError; and `name in d` tests the dict's KEYS,
# not its values. "name_faled" also looks like a typo for "name_failed".
for name in names :
name_passed = sum(["Passed" and "name" for d in input if 'Status' in d and name in d])
name_faled = sum(["Failed" and "name" for d in input if 'Status' in d and name in d])\
But i am not sure if there is a smartest way ? a simple loop and comparing dict values will be more simple!?
Assuming your input entries will always be grouped according to the "name" key-value pair:
entries = [
    {"name": "A", "Status": "Passed", "id": "x1"},
    {"name": "A", "Status": "Passed", "id": "x2"},
    {"name": "A", "Status": "Failed", "id": "x3"},
    {"name": "B", "Status": "Passed", "id": "x4"},
    {"name": "B", "Status": "Passed", "id": "x5"}
]

def to_grouped(entries):
    """Yield one pass/fail summary dict per consecutive run of entries
    sharing the same "name".

    Entries must already be grouped (e.g. sorted) by "name", because
    itertools.groupby only merges consecutive items. Percentages are
    whole percents, rounded down.
    """
    from itertools import groupby
    from operator import itemgetter
    for key, group_iter in groupby(entries, key=itemgetter("name")):
        group = list(group_iter)
        total = len(group)
        passed = sum(1 for entry in group if entry["Status"] == "Passed")
        failed = total - passed
        # BUG FIX: (100 // total) * passed loses precision because it floors
        # the per-item share first (e.g. 5 of 7 gave 70% instead of 71%).
        # Multiply first, then floor-divide.
        perc_pass = 100 * passed // total
        yield {
            "name": key,
            "Passed": str(passed),
            "Failed": str(failed),
            "Total": str(total),
            "%Pass": f"{perc_pass:.0f}%"
        }

print(list(to_grouped(entries)))
Output:
[{'name': 'A', 'Passed': '2', 'Failed': '1', 'Total': '3', '%Pass': '66%'}, {'name': 'B', 'Passed': '2', 'Failed': '0', 'Total': '2', '%Pass': '100%'}]
This will not create the final entry you're looking for, which sums the statistics of all other entries. Though, that shouldn't be too hard to do.

Comparing 2 JSON files using Python and outputting the difference in a new output file

I am trying to compare 2 json files to do a delta check between them.
Exising json:
{
"rules": [
{
"customer_id": "abc123",
"id": "1",
},
{
"customer_id": "xyz456",
"id": "2",
}
]
}
Updated json:
{
"rules": [
{
"customer_id": "abc123",
"id": "1",
},
{
"customer_id": "def456",
"id": "3",
},
{
"customer_id": "xyz789",
"id": "2",
}
]
}
What i want is my code to get the new objects from the new json(in this case id:3 and customer id def456)
however i also want to keep the original existing values (id:2 customer id should remain as xyz456 instead of updated to the new value xyz789)
Here is my current code:
import json


def find_new_rules(more_data, less_data):
    """Return the rules of *more_data* whose 'id' is absent from *less_data*.

    Existing rules are left alone (the old customer_id wins), and each
    missing id is reported exactly once — the original nested-loop version
    printed "is not found" on every non-matching inner iteration.
    """
    existing_ids = {rule['id'] for rule in less_data['rules']}
    new_rules = []
    for rule in more_data['rules']:
        if rule['id'] in existing_ids:
            print("x:" + rule['id'], "y:" + rule['id'])
        else:
            print(rule['id'] + " is not found")
            new_rules.append(rule)
    return new_rules


if __name__ == "__main__":
    # BUG FIX: the original used a C-style `//` comment (a SyntaxError in
    # Python) and never closed the files; `with` handles both.
    with open('1.json') as f, open('2.json') as y:
        less_data = json.load(f)
        more_data = json.load(y)
    # take action to add the new objects into the json output
    new_rules = find_new_rules(more_data, less_data)
running the program i get the following output:
x:1 y:1
1 is found
x:3 y:1
3 is not found
x:3 y:2
3 is not found
x:2 y:1
2 is not found
x:2 y:2
2 is found
I only want "3 is not found" to be printed once, after the inner for loop has run to completion, instead of being printed on every iteration. Any help would be appreciated.
You can try flattening the JSON and comparing the resulting keys, since dict/JSON is an unordered datatype:
If you list ordering matters
import collections.abc


def flatten(d, parent_key='', sep='__'):
    """Flatten nested dicts/lists into one level, joining path segments with *sep*.

    List elements are keyed by their index, e.g. {'rules': [{'id': 1}]}
    becomes {'rules__0__id': 1}.
    """
    items = []
    for k, v in d.items():
        new_key = parent_key + sep + k if parent_key else k
        # BUG FIX: collections.MutableMapping was removed in Python 3.10;
        # the ABC lives in collections.abc.
        if isinstance(v, collections.abc.MutableMapping):
            items.extend(flatten(v, new_key, sep=sep).items())
        elif isinstance(v, list):
            for idx, value in enumerate(v):
                items.extend(flatten(value, new_key + sep + str(idx), sep).items())
        else:
            items.append((new_key, v))
    return dict(items)


a1 = {'rules': [{'customer_id': 'abc123', 'id': '1'}, {'customer_id': 'xyz456', 'id': '2'}]}
a2 = {'rules': [{'customer_id': 'abc123', 'id': '1'}, {'customer_id': 'def456', 'id': '3'}, {'customer_id': 'xyz789', 'id': '2'}]}
flatten_a1 = flatten(a1)
flatten_a2 = flatten(a2)
print(flatten_a1)
print(flatten_a2)
Output:
>>> flatten_a1 => {'rules__0__customer_id': 'abc123', 'rules__0__id': '1', 'rules__1__customer_id': 'xyz456', 'rules__1__id': '2'}
>>> flatten_a2 => {'rules__0__customer_id': 'abc123', 'rules__0__id': '1', 'rules__1__customer_id': 'def456', 'rules__1__id': '3', 'rules__2__customer_id': 'xyz789', 'rules__2__id': '2'}
from flatten_a1 & flatten_a2 value, you can easily find the differences as data structure comes in single level (i.e. not in nested format)
You can find:
which keys are missing
keys are present but values are different

How do I force one specific key to come at the top of a python dict or JSON dump [duplicate]

Is there any way in Python 2.6 to supply a custom key or cmp function to JSON's sort_keys?
I've got a list of dicts coming from JSON like so:
[
{
"key": "numberpuzzles1",
"url": "number-puzzle-i.html",
"title": "Number Puzzle I",
"category": "nestedloops",
"points": "60",
"n": "087"
},
{
"key": "gettingindividualdigits",
"url": "getting-individual-digits.html",
"title": "Getting Individual Digits",
"category": "nestedloops",
"points": "80",
"n": "088"
}
]
...which I've stored into the list variable assigndb. I'd like to be able to load in the JSON, modify it, and serialize it back out with dumps (or whatever), keeping the order of the keys intact.
So far, I've tried something like this:
# Desired serialization order for each assignment dict (lower rank first).
ordering = {'key': 0, 'url': 1, 'title': 2, 'category': 3,
'flags': 4, 'points': 5, 'n': 6}
# Sort key mapping a dict key to its rank in `ordering`.
def key_func(k):
return ordering[k]
# renumber assignments sequentially
# (`assigndb` is the list of dicts loaded from the JSON shown above)
for (i, a) in enumerate(assigndb):
a["n"] = "%03d" % (i+1)
# NOTE(review): this call raises TypeError — json.dumps() accepts no `key`
# parameter; sort_keys=True always sorts alphabetically. That limitation is
# exactly what this question is asking about.
s = json.dumps(assigndb, indent=2, sort_keys=True, key=key_func)
...but of course dumps doesn't support a custom key like list.sort() does. Something with a custom JSONEncoder maybe? I can't seem to get it going.
An idea (tested with 2.7):
import json
import collections
# Setting the C encoder factory to None forces json to fall back to the
# pure-Python encoder, which iterates dict items in order — so OrderedDict
# insertion order survives serialization. Per the note below, this hack is
# only needed on Python 2.x.
json.encoder.c_make_encoder = None
d = collections.OrderedDict([("b", 2), ("a", 1)])
# REPL-style snippet: the expression's repr (shown on the next line)
# demonstrates that insertion order, not alphabetical order, is emitted.
json.dumps(d)
# '{"b": 2, "a": 1}'
See: OrderedDict + issue6105. The c_make_encoder hack seems only to be needed for Python 2.x. Not a direct solution because you have to change dicts for OrderedDicts, but it may be still usable. I checked the json library (encode.py) and the ordered is hardcoded:
if _sort_keys:
items = sorted(dct.items(), key=lambda kv: kv[0])
This is kind of ugly, but in case tokland's solution does not work for you:
data = [{'category': 'nestedloops', 'title': 'Number Puzzle I', 'url': 'number-puzzle-i.html', 'n': '087', 'points': '60', 'key': 'numberpuzzles1'}, {'category': 'nestedloops', 'title': 'Getting Individual Digits', 'url': 'getting-individual-digits.html', 'n': '088', 'points': '80', 'key': 'gettingindividualdigits'}]
ordering = {'key': 0, 'url': 1, 'title': 2, 'category': 3,
            'flags': 4, 'points': 5, 'n': 6}
# Dump each key/value pair on its own through json.dumps, strip the
# surrounding braces ([1:-1]), and splice the fragments back together in
# the order dictated by `ordering`.
outlist = []
for record in data:
    fragments = [
        json.dumps({field: record[field]})[1:-1]
        for field in sorted(record.keys(), key=lambda field: ordering[field])
    ]
    outlist.append("{" + ",".join(fragments) + "}")
s = "[" + ",".join(outlist) + "]"
Compact yet powerful recursive implementation with "prepended" and "appended" keys: https://gist.github.com/jeromerg/91f73d5867c5fa04ee7dbc0c5a03d611
def sort_recursive(node, first_keys, last_keys):
""" Sort the dictionary entries in a whole JSON object tree"""
fixed_placements = {
**{key: (0, idx) for idx, key in enumerate(first_keys)},
**{key: (2, idx) for idx, key in enumerate(last_keys)},
}
return _sort_recursive(node, lambda key: fixed_placements.get(key, (1, key)))
def _sort_recursive(node, key_fn):
if isinstance(node, list):
return [_sort_recursive(val, key_fn) for val in node]
elif isinstance(node, dict):
sorted_keys = sorted(node.keys(), key=key_fn)
return {k:_sort_recursive(node[k], key_fn) for k in sorted_keys}
else:
return node
I had the same problem and collections.OrderedDict was just not fit for the task because it ordered everything alphabetically. So I wrote something similar to Andrew Clark's solution:
def json_dumps_sorted(data, **kwargs):
    """Serialize a list of dicts to JSON, emitting each dict's keys in the
    order given by the 'sorted_keys' keyword. Keys listed in 'sorted_keys'
    but absent from an element are skipped; keys not listed are dropped.
    Without 'sorted_keys', falls back to plain json.dumps(data)."""
    key_order = kwargs.get('sorted_keys', tuple())
    if not key_order:
        return json.dumps(data)
    rendered = []
    for element in data:
        # Dump each pair separately and strip the braces ([1:-1]) so the
        # fragments can be rejoined in the requested order.
        pairs = [json.dumps({key: element[key]})[1:-1]
                 for key in key_order if key in element]
        rendered.append('{{{}}}'.format(','.join(pairs)))
    return '[{}]'.format(','.join(rendered))
You use it like this:
# Example call: serialize with an explicit key order via json_dumps_sorted
# (defined above). 'flags' appears in sorted_keys but in neither element,
# so it is simply skipped in the output.
json_string = json_dumps_sorted([
{
"key": "numberpuzzles1",
"url": "number-puzzle-i.html",
"title": "Number Puzzle I",
"category": "nestedloops",
"points": "60",
"n": "087"
}, {
"key": "gettingindividualdigits",
"url": "getting-individual-digits.html",
"title": "Getting Individual Digits",
"category": "nestedloops",
"points": "80",
"n": "088"
}
], sorted_keys=(
'key',
'url',
'title',
'category',
'flags',
'points',
'n'
))
Thanks. I needed to put a timestamp key:value at the top of my JSON object no matter what. Obviously sorting the keys screwed this up as it starts with "t".
Using something like this, while putting the timestamp key in the dict_data right away worked:
d = collections.OrderedDict(dict_data)

custom JSON sort_keys order in Python

Is there any way in Python 2.6 to supply a custom key or cmp function to JSON's sort_keys?
I've got a list of dicts coming from JSON like so:
[
{
"key": "numberpuzzles1",
"url": "number-puzzle-i.html",
"title": "Number Puzzle I",
"category": "nestedloops",
"points": "60",
"n": "087"
},
{
"key": "gettingindividualdigits",
"url": "getting-individual-digits.html",
"title": "Getting Individual Digits",
"category": "nestedloops",
"points": "80",
"n": "088"
}
]
...which I've stored into the list variable assigndb. I'd like to be able to load in the JSON, modify it, and serialize it back out with dumps (or whatever), keeping the order of the keys intact.
So far, I've tried something like this:
# Desired serialization order for each assignment dict (lower rank first).
ordering = {'key': 0, 'url': 1, 'title': 2, 'category': 3,
'flags': 4, 'points': 5, 'n': 6}
# Sort key mapping a dict key to its rank in `ordering`.
def key_func(k):
return ordering[k]
# renumber assignments sequentially
# (`assigndb` is the list of dicts loaded from the JSON shown above)
for (i, a) in enumerate(assigndb):
a["n"] = "%03d" % (i+1)
# NOTE(review): this call raises TypeError — json.dumps() accepts no `key`
# parameter; sort_keys=True always sorts alphabetically. That limitation is
# exactly what this question is asking about.
s = json.dumps(assigndb, indent=2, sort_keys=True, key=key_func)
...but of course dumps doesn't support a custom key like list.sort() does. Something with a custom JSONEncoder maybe? I can't seem to get it going.
An idea (tested with 2.7):
import json
import collections
# Setting the C encoder factory to None forces json to fall back to the
# pure-Python encoder, which iterates dict items in order — so OrderedDict
# insertion order survives serialization. Per the note below, this hack is
# only needed on Python 2.x.
json.encoder.c_make_encoder = None
d = collections.OrderedDict([("b", 2), ("a", 1)])
# REPL-style snippet: the expression's repr (shown on the next line)
# demonstrates that insertion order, not alphabetical order, is emitted.
json.dumps(d)
# '{"b": 2, "a": 1}'
See: OrderedDict + issue6105. The c_make_encoder hack seems only to be needed for Python 2.x. Not a direct solution because you have to change dicts for OrderedDicts, but it may be still usable. I checked the json library (encode.py) and the ordered is hardcoded:
if _sort_keys:
items = sorted(dct.items(), key=lambda kv: kv[0])
This is kind of ugly, but in case tokland's solution does not work for you:
data = [{'category': 'nestedloops', 'title': 'Number Puzzle I', 'url': 'number-puzzle-i.html', 'n': '087', 'points': '60', 'key': 'numberpuzzles1'}, {'category': 'nestedloops', 'title': 'Getting Individual Digits', 'url': 'getting-individual-digits.html', 'n': '088', 'points': '80', 'key': 'gettingindividualdigits'}]
ordering = {'key': 0, 'url': 1, 'title': 2, 'category': 3,
            'flags': 4, 'points': 5, 'n': 6}
# Dump each key/value pair on its own through json.dumps, strip the
# surrounding braces ([1:-1]), and splice the fragments back together in
# the order dictated by `ordering`.
outlist = []
for record in data:
    fragments = [
        json.dumps({field: record[field]})[1:-1]
        for field in sorted(record.keys(), key=lambda field: ordering[field])
    ]
    outlist.append("{" + ",".join(fragments) + "}")
s = "[" + ",".join(outlist) + "]"
Compact yet powerful recursive implementation with "prepended" and "appended" keys: https://gist.github.com/jeromerg/91f73d5867c5fa04ee7dbc0c5a03d611
def sort_recursive(node, first_keys, last_keys):
""" Sort the dictionary entries in a whole JSON object tree"""
fixed_placements = {
**{key: (0, idx) for idx, key in enumerate(first_keys)},
**{key: (2, idx) for idx, key in enumerate(last_keys)},
}
return _sort_recursive(node, lambda key: fixed_placements.get(key, (1, key)))
def _sort_recursive(node, key_fn):
if isinstance(node, list):
return [_sort_recursive(val, key_fn) for val in node]
elif isinstance(node, dict):
sorted_keys = sorted(node.keys(), key=key_fn)
return {k:_sort_recursive(node[k], key_fn) for k in sorted_keys}
else:
return node
I had the same problem and collections.OrderedDict was just not fit for the task because it ordered everything alphabetically. So I wrote something similar to Andrew Clark's solution:
def json_dumps_sorted(data, **kwargs):
    """Serialize a list of dicts to JSON, emitting each dict's keys in the
    order given by the 'sorted_keys' keyword. Keys listed in 'sorted_keys'
    but absent from an element are skipped; keys not listed are dropped.
    Without 'sorted_keys', falls back to plain json.dumps(data)."""
    key_order = kwargs.get('sorted_keys', tuple())
    if not key_order:
        return json.dumps(data)
    rendered = []
    for element in data:
        # Dump each pair separately and strip the braces ([1:-1]) so the
        # fragments can be rejoined in the requested order.
        pairs = [json.dumps({key: element[key]})[1:-1]
                 for key in key_order if key in element]
        rendered.append('{{{}}}'.format(','.join(pairs)))
    return '[{}]'.format(','.join(rendered))
You use it like this:
# Example call: serialize with an explicit key order via json_dumps_sorted
# (defined above). 'flags' appears in sorted_keys but in neither element,
# so it is simply skipped in the output.
json_string = json_dumps_sorted([
{
"key": "numberpuzzles1",
"url": "number-puzzle-i.html",
"title": "Number Puzzle I",
"category": "nestedloops",
"points": "60",
"n": "087"
}, {
"key": "gettingindividualdigits",
"url": "getting-individual-digits.html",
"title": "Getting Individual Digits",
"category": "nestedloops",
"points": "80",
"n": "088"
}
], sorted_keys=(
'key',
'url',
'title',
'category',
'flags',
'points',
'n'
))
Thanks. I needed to put a timestamp key:value at the top of my JSON object no matter what. Obviously sorting the keys screwed this up as it starts with "t".
Using something like this, while putting the timestamp key in the dict_data right away worked:
d = collections.OrderedDict(dict_data)

Categories