Merge json files in Python

Merge json files in Python - python

I'm trying to merge 2 json files in Python. Here are the files:
test1.json
{
"version": "1.0",
"data": {
"admin1": {
"id": "1",
"location": "NY"
},
"admin2": {
"id": "2",
"name": "Bob",
"location": "LA",
"admin_key": {
"adminvalue1": "admin1",
"adminvalue2": "admin2"
}
},
"admin3": {
"name": "john"
}
}
}
test2.json
{
"data": {
"user1": {
"name": "jane",
"phone": "555-666-7777",
"enail": "jane#jane.com"
},
"user2": {
"location": "LA",
"id": "5"
},
"user3": {
"description": "user",
"location": "NY",
"name": "zoe",
"phone": "111-222-3333",
"user_key": {
"uservalue1": "user1",
"uservalue2": "user2"
}
}
}
}
I have this code to merge the two files
import json
with open("test1.json", "r") as data1_file:
data1 = json.load(data1_file)
with open("test2.json", "r") as data2_file:
data2 = json.load(data2_file)
data1.update(data2)
with open("out.json", "w") as out_file:
json.dump(data1, out_file, indent=4)
The output I'm getting is this. It only has test2.json contents under "data".
{
"version": "1.0",
"data": {
"user1": {
"name": "jane",
"phone": "555-666-7777",
"enail": "jane#jane.com"
},
"user2": {
"location": "LA",
"id": "5"
},
"user3": {
"description": "user",
"location": "NY",
"name": "zoe",
"phone": "111-222-3333",
"user_key": {
"uservalue1": "user1",
"uservalue2": "user2"
}
}
}
}
I want the output to have contents of both files under "data" like below
{
"version": "1.0",
"data": {
"admin1": {
"id": "1",
"location": "NY"
},
"admin2": {
"id": "2",
"name": "Bob",
"location": "LA",
"admin_key": {
"adminvalue1": "admin1",
"adminvalue2": "admin2"
}
},
"admin3": {
"name": "john"
},
"user1": {
"name": "jane",
"phone": "555-666-7777",
"enail": "jane#jane.com"
},
"user2": {
"location": "LA",
"id": "5"
},
"user3": {
"description": "user",
"location": "NY",
"name": "zoe",
"phone": "111-222-3333",
"user_key": {
"uservalue1": "user1",
"uservalue2": "user2"
}
}
}
}
How can I achieve this? Thanks!

You need to merge the "sub-dictionary" data1['data'], not data1 itself. In the current code, you are updating data1 with data2, so that data2['data'] overwrites data1['data'].
So replace data1.update(data2) with:
data1['data'].update(data2['data'])

I think this is what you are looking for:
https://stackoverflow.com/a/7205107/8786297
def merge(a, b, path=None):
"merges b into a"
if path is None: path = []
for key in b:
if key in a:
if isinstance(a[key], dict) and isinstance(b[key], dict):
merge(a[key], b[key], path + [str(key)])
elif a[key] == b[key]:
pass # same leaf value
else:
raise Exception('Conflict at %s' % '.'.join(path + [str(key)]))
else:
a[key] = b[key]
return a

Related

Explode json without pandas

I have a JSON object:
{
"data": {
"geography": [
{
"id": "1",
"state": "USA",
"properties": [
{
"code": "CMD-01",
"value": "34"
},
{
"code": "CMD-02",
"value": "24"
}
]
},
{
"id": "2",
"state": "Canada",
"properties": [
{
"code": "CMD-04",
"value": "50"
},
{
"code": "CMD-05",
"value": "60"
}
]
}
]
}
}
I want to get the result as a new JSON, but without using pandas (and all those explode, flatten and normalize functions...). Is there any option to get this structure without using pandas or having an Out of memory issue?
The output should be:
{ "id": "1",
"state": "USA",
"code": "CMD-01",
"value": "34"
},
{ "id": "1",
"state": "USA",
"code": "CMD-02",
"value": "24",
},
{ "id": "2",
"state": "Canada",
"code": "CMD-04",
"value": "50"
},
{ "id": "2",
"state": "Canada",
"code": "CMD-05",
"value": "60"
},

You can simply loop over the list associated with "geography" and build new dictionaries that you will add to a newly created list:
dict_in = {
"data": {
"geography": [
{
"id": "1",
"state": "USA",
"properties": [
{
"code": "CMD-01",
"value": "34"
},
{
"code": "CMD-02",
"value": "24"
}
]
},
{
"id": "2",
"state": "Canada",
"properties": [
{
"code": "CMD-04",
"value": "50"
},
{
"code": "CMD-05",
"value": "60"
}
]
}
]
}
}
import json
rec_out = []
for obj in dict_in["data"]["geography"]:
for prop in obj["properties"]:
dict_out = {
"id": obj["id"],
"state": obj["state"]
}
dict_out.update(prop)
rec_out.append(dict_out)
print(json.dumps(rec_out, indent=4))
Output:
[
{
"id": "1",
"state": "USA",
"code": "CMD-01",
"value": "34"
},
{
"id": "1",
"state": "USA",
"code": "CMD-02",
"value": "24"
},
{
"id": "2",
"state": "Canada",
"code": "CMD-04",
"value": "50"
},
{
"id": "2",
"state": "Canada",
"code": "CMD-05",
"value": "60"
}
]

How to extract data from complex JSON object?

I am trying to extract data from the json file I got from a get request.
{
"data": [
{
"type": "Projects",
"id": "102777c7-50a7-592d-1b65-621d5850a5bb",
"attributes": {
"name": "Hydroelectric Project Updated from Postman",
"projectid": "001"
},
"relationships": {
"Accounts": "Account1"
"Notes": "Note1"
}
},
{
"type": "Projects",
"id": "102c7131-d797-c085-d248-621d5820494f",
"attributes": {
"name": "Ana Hydroelectric Project",
"projectid": "002"
},
"relationships": {
"Accounts": "Account1"
"Notes": "Note1"
}
},
{
"type": "Projects",
"id": "1041f300-5acf-4bd9-2ec4-621d58bbe6bc",
"attributes": {
"name": "Methane Capture Project",
"projectid": "003"
},
"relationships": {
"Accounts": "Account1"
"Notes": "Note1"
}
}
]
}
I have an empty dictionary that stores projectid as Key.
projectids = {
001:"",
002:"",
003:"",
004:"",
}
I was looking for a way to find "projectid" inside "attributes" and the corresponding value for "id" and populate the dictionary projectids with the key(['attributes']['projectid']) and values(id):
{
"001": "102777c7-50a7-592d-1b65-621d5850a5bb",
"002": "102c7131-d797-c085-d248-621d5820494f",
"003": "1041f300-5acf-4bd9-2ec4-621d58bbe6bc",
"004": ""
}

You can try this, assuming data is your variable for the response from the GET request
# this solution will populate for all project ids
projectids = {}
for item in data['data']:
projectids[item['attributes']['projectid']] = item['id']
Output:
{
'001': '102777c7-50a7-592d-1b65-621d5850a5bb',
'002': '102c7131-d797-c085-d248-621d5820494f',
'003': '1041f300-5acf-4bd9-2ec4-621d58bbe6bc'
}
if you're trying to match with already existing projectids in a dict then try
# this solution will search for only pre-specified project ids
projectids = {
"001": "",
"002": "",
"003": "",
"004": "",
}
for idx in projectids.keys():
# find the index of matching dict from data['data']
# will return None if match is not found
matching_index = next((i for i, item in enumerate(data['data']) if
item["attributes"]["projectid"] == idx), None)
if matching_index is not None:
projectids[idx] = data['data'][matching_index]['id']

If data is your input data from the question, then:
projectids = {f"{i:>03}": "" for i in range(1, 5)}
out = {
**projectids,
**{d["attributes"]["projectid"]: d["id"] for d in data["data"]},
}
print(out)
Prints:
{
"001": "102777c7-50a7-592d-1b65-621d5850a5bb",
"002": "102c7131-d797-c085-d248-621d5820494f",
"003": "1041f300-5acf-4bd9-2ec4-621d58bbe6bc",
"004": "",
}

Simply try this:
json_data = {
"data": [
{
"type": "Projects",
"id": "102777c7-50a7-592d-1b65-621d5850a5bb",
"attributes": {
"name": "Hydroelectric Project Updated from Postman",
"projectid": "001"
},
"relationships": {
"Accounts": "Account1",
"Notes": "Note1"
}
},
{
"type": "Projects",
"id": "102c7131-d797-c085-d248-621d5820494f",
"attributes": {
"name": "Ana Hydroelectric Project",
"projectid": "002"
},
"relationships": {
"Accounts": "Account1",
"Notes": "Note1"
}
},
{
"type": "Projects",
"id": "1041f300-5acf-4bd9-2ec4-621d58bbe6bc",
"attributes": {
"name": "Methane Capture Project",
"projectid": "003"
},
"relationships": {
"Accounts": "Account1",
"Notes": "Note1"
}
}
]
}
Just asumme the above json data and try the following code:
project_ids = {item['attributes']['projectid']:item['id'] for item in json_data['data']}
expected output:
{'001': '102777c7-50a7-592d-1b65-621d5850a5bb',
'002': '102c7131-d797-c085-d248-621d5820494f',
'003': '1041f300-5acf-4bd9-2ec4-621d58bbe6bc'}

Merge Json with same key value pairs

I got a resultant json from an API in the following format
[{
"Uid": "40cc6103-1cf0-4735-b882-d14d32018e58",
"Id": "9e1a0057-4570-4a6e-8ff5-88b2facbaf4e",
"Details": {
"Name": "Kiran"
}
}, {
"Uid": "40cc6103-1cf0-4735-b882-d14d32018e58",
"Id": "9e1a0057-4570-4a6e-8ff5-88b2facbaf4e",
"Details": {
"Age": "24"
}
},
{
"Uid": "196f5865-e9fe-4847-86ae-97d0bf57b816",
"Id": "84909ecb-c92e-48a7-bcaa-d478bf3a9220",
"Details": {
"Name": "Shreyas"
}
}
]
since the Uid and Id are same for multiple entires, can I club them togeather with Details key being the comma seperate key,value pair? Something like mentioned below
[{
"Uid": "40cc6103-1cf0-4735-b882-d14d32018e58",
"Id": "9e1a0057-4570-4a6e-8ff5-88b2facbaf4e",
"Details": {
"Name": "Kiran",
"Age": "24"
}
},
{
"Uid": "196f5865-e9fe-4847-86ae-97d0bf57b816",
"Id": "84909ecb-c92e-48a7-bcaa-d478bf3a9220",
"Details": {
"Name": "Shreyas"
}
}]
Please Guide me on this for the approach to be followed. Thanks

What you need is the dictionary function update(). Here's an example:
A = [{
"Uid": "40cc6103-1cf0-4735-b882-d14d32018e58",
"Id": "9e1a0057-4570-4a6e-8ff5-88b2facbaf4e",
"Details": {
"Name": "Kiran"
}
}, {
"Uid": "40cc6103-1cf0-4735-b882-d14d32018e58",
"Id": "9e1a0057-4570-4a6e-8ff5-88b2facbaf4e",
"Details": {
"Age": "24"
}
},
{
"Uid": "196f5865-e9fe-4847-86ae-97d0bf57b816",
"Id": "84909ecb-c92e-48a7-bcaa-d478bf3a9220",
"Details": {
"Name": "Shreyas"
}
}
]
B = []
def find(uid, id_):
for i, d in enumerate(B):
if d['Uid'] == uid and d['Id'] == id_:
return i
return -1
for d in A:
if (i := find(d['Uid'], d['Id'])) < 0:
B.append(d)
else:
B[i]['Details'].update(d['Details'])
print(B)
Prettyfied output:
[
{
"Uid": "40cc6103-1cf0-4735-b882-d14d32018e58",
"Id": "9e1a0057-4570-4a6e-8ff5-88b2facbaf4e",
"Details": {
"Name": "Kiran",
"Age": "24"
}
},
{
"Uid": "196f5865-e9fe-4847-86ae-97d0bf57b816",
"Id": "84909ecb-c92e-48a7-bcaa-d478bf3a9220",
"Details": {
"Name": "Shreyas"
}
}
]
Note:
This could be very inefficient if your API response contains very large numbers of dictionaries. You might need a completely different approach

You should iterate over the list and merge with accumulator with (Uid, Id) as key:
from typing import Dict, List
l = [{
"Uid": "40cc6103-1cf0-4735-b882-d14d32018e58",
"Id": "9e1a0057-4570-4a6e-8ff5-88b2facbaf4e",
"Details": {
"Name": "Kiran"
}
}, {
"Uid": "40cc6103-1cf0-4735-b882-d14d32018e58",
"Id": "9e1a0057-4570-4a6e-8ff5-88b2facbaf4e",
"Details": {
"Age": "24"
}
},
{
"Uid": "196f5865-e9fe-4847-86ae-97d0bf57b816",
"Id": "84909ecb-c92e-48a7-bcaa-d478bf3a9220",
"Details": {
"Name": "Shreyas"
}
}
]
def mergeItem(it: Dict, acc: Dict) -> Dict:
uid = it["Uid"]
id = it["Id"]
if (uid, id) in acc:
acc[(uid, id)] = {"Uid": uid, "Id": id, "Details": {**acc[(uid, id)]["Details"], **it["Details"]}}
else:
acc[(uid, id)] = {"Uid": uid, "Id": id, "Details": it["Details"]}
return acc
def mergeList(a:List) -> Dict:
acc = {}
for v in a:
acc = mergeItem(v, acc)
return acc
print(list(mergeList(l).values()))
# [
# {
# 'Uid': '40cc6103-1cf0-4735-b882-d14d32018e58',
# 'Id': '9e1a0057-4570-4a6e-8ff5-88b2facbaf4e',
# 'Details': {'Name': 'Kiran', 'Age': '24'}},
# {
# 'Uid': '196f5865-e9fe-4847-86ae-97d0bf57b816',
# 'Id': '84909ecb-c92e-48a7-bcaa-d478bf3a9220',
# 'Details': {'Name': 'Shreyas'}
# }
# ]

How to use S3 Select for Nested Parquet Objects

I have dumped data into a parquet file.
When I use
SELECT * FROM s3object s LIMIT 1
it gives me the following result.
{
"name": "John",
"age": "45",
"country": "USA",
"experience": [{
"company": {
"name": "ABC",
"years": "10",
"position": "Manager"
}
},
{
"company": {
"name": "BBC",
"years": "2",
"position": "Assistant"
}
}
]
}
I want to filter the result where company.name = "ABC"
so, the output should be looks like following.
{
"name": "John",
"age": "45",
"country": "USA",
"experience": [{
"company": {
"name": "ABC",
"years": "10",
"position": "Manager"
}
}
]
}
or this
{
"name": "John",
"age": "45",
"country": "USA",
"experience.company.name": "ABC",
"experience.company.years": "10",
"experience.company.position": "Manager"
}
Any support is highly appreciated.
Thanks.

AWS DynamoDB Stream python convert native format

I've a Lambda function triggered by a DynamoDB Stream. My problem is the strange format of the event received(with type for each key/value).
Does exists a workaround to convert a whole document in a native python format(without any types). I'm looking for a dynamic solution because in the future I want use this lambda with other DynamoDB table Streams which have different format(multiple dict/list levels)
Example:
{
"Records": [
{
"eventID": "1",
"eventVersion": "1.0",
"dynamodb": {
"Keys": {
"Id": {
"N": "101"
}
},
"NewImage": {
"Message": {
"S": "New item!"
},
"Id": {
"N": "101"
}
},
"StreamViewType": "NEW_AND_OLD_IMAGES",
"SequenceNumber": "111",
"SizeBytes": 26
},
"awsRegion": "us-west-2",
"eventName": "INSERT",
"eventSourceARN": "arn:aws:dynamodb:us-west-2:account-id:table/ExampleTableWithStream/stream/2015-06-27T00:48:05.899",
"eventSource": "aws:dynamodb"
},
{
"eventID": "2",
"eventVersion": "1.0",
"dynamodb": {
"OldImage": {
"Message": {
"S": "New item!"
},
"Id": {
"N": "101"
}
},
"SequenceNumber": "222",
"Keys": {
"Id": {
"N": "101"
}
},
"SizeBytes": 59,
"NewImage": {
"Message": {
"S": "This item has changed"
},
"Id": {
"N": "101"
}
},
"StreamViewType": "NEW_AND_OLD_IMAGES"
},
"awsRegion": "us-west-2",
"eventName": "MODIFY",
"eventSourceARN": "arn:aws:dynamodb:us-west-2:account-id:table/ExampleTableWithStream/stream/2015-06-27T00:48:05.899",
"eventSource": "aws:dynamodb"
},
{
"eventID": "3",
"eventVersion": "1.0",
"dynamodb": {
"Keys": {
"Id": {
"N": "101"
}
},
"SizeBytes": 38,
"SequenceNumber": "333",
"OldImage": {
"Message": {
"S": "This item has changed"
},
"Id": {
"N": "101"
}
},
"StreamViewType": "NEW_AND_OLD_IMAGES"
},
"awsRegion": "us-west-2",
"eventName": "REMOVE",
"eventSourceARN": "arn:aws:dynamodb:us-west-2:account-id:table/ExampleTableWithStream/stream/2015-06-27T00:48:05.899",
"eventSource": "aws:dynamodb"
}
]
}
Thanks

I've been using this, it served us well until now:
from boto3.dynamodb.types import TypeDeserializer
serializer = TypeDeserializer()
def deserialize(data):
if isinstance(data, list):
return [deserialize(v) for v in data]
if isinstance(data, dict):
try:
return serializer.deserialize(data)
except TypeError:
return {k: deserialize(v) for k, v in data.items()}
else:
return data

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Merge json files in Python - python

You need to merge the "sub-dictionary" data1['data'], not data1 itself. In the current code, you are updating data1 with data2, so that data2['data'] overwrites data1['data']. So replace data1.update(data2) with: data1['data'].update(data2['data'])

Related

Explode json without pandas

How to extract data from complex JSON object?

Merge Json with same key value pairs

How to use S3 Select for Nested Parquet Objects

AWS DynamoDB Stream python convert native format

Categories

Resources