Python parse json object within object within object - python

I've been successful in reading in and parsing a JSON string using Python, including normalising and exploding arrays. However, I have a second JSON format which I'm struggling with and making little headway.
I need to pull out the 'entities' and 'transitions' objects within the 'data' object.
My json structure is:
{
"model": {
"id": "639b2970ac4d16767484b2bd",
"name": "TestImport",
"description": "",
"type": "LineageModel",
"referenceModelType": null,
"owner": {
"id": "639b2904ac4d167674849e1e",
"name": "xyz",
"firstName": null,
"lastName": null
},
"members": {
"totalUsers": 1,
"totalGroups": 0,
"users": {
"Owner": [
{
"joinTime": "2022-12-15T14:04:32.076Z",
"modificationTime": "2022-12-15T14:04:32.076Z",
"email": "sxyz",
"username": null,
"hasPendingSiteInvite": false,
"isDisabled": false,
"id": "639b2904ac4d167674849e1e",
"name": "xyz",
"firstName": null,
"lastName": null
}
]
},
"groups": {}
},
"$type": "ModelInformation"
},
"data": {
"version": "simple",
"roots": [
"8a44e4d6-4062-4e1c-a46c-c7787cab4405",
"d1494635-9005-4337-8eab-227265b29332"
],
"entities": {
"29380f60-620e-4314-9969-4ad6fe5bbea6": {
"name": "Element",
"children": [],
"id": "29380f60-620e-4314-9969-4ad6fe5bbea6",
"properties": {}
},
"86361ab4-6002-4f3b-b6ca-7e35acd69f9b": {
"name": "Application",
"children": [
"29380f60-620e-4314-9969-4ad6fe5bbea6"
],
"id": "86361ab4-6002-4f3b-b6ca-7e35acd69f9b",
"properties": {}
},
"223d9749-feb2-425d-b512-17b5322cda96": {
"name": "S_Group",
"children": [
"86361ab4-6002-4f3b-b6ca-7e35acd69f9b"
],
"id": "223d9749-feb2-425d-b512-17b5322cda96",
"properties": {}
}
},
"transitions": {
"c4e2026a-9c57-4bb0-b2e2-f7068d9c6fe5": {
"source": "29380f60-620e-4314-9969-4ad6fe5bbea6",
"target": "040677a5-820f-4d17-ae50-1296c0e36273",
"id": "c4e2026a-9c57-4bb0-b2e2-f7068d9c6fe5",
"properties": {}
}
},
"metadata": {
"queries": "{\"version\":\"3\"}",
"views": "{\"views\":[{\"name\":\"MainView\",\"description\":\"\",\"applyOnLoad\":true,\"view\":{\"version\":20,\"zoom\":{\"scale\":1,\"layerWidth\":250,\"layerSpacing\":40},\"collapsed\":{\"expanded\":[\"961847ad-1693-47b5-aa29-88ee07700b5e\",\"0da88727-e272-44a4-adeb-5a0465970490\",\"223d9749-feb2-425d-b512-17b5322cda96\",\"86361ab4-6002-4f3b-b6ca-7e35acd69f9b\"],\"collapsed\":[]},\"trace\":{\"enabled\":true,\"lock\":null,\"highlightedTraceDepth\":1,\"isHighlightedTraceDepthAll\":true,\"isTraversable\":true},\"selection\":[],\"queries\":{\"styled\":[],\"filtered\":[],\"filterType\":{\"6eecd266-2c2e-46d1-a00a-ceec18a87cdb\":\"show\"},\"expandedModules\":[]},\"settings\":{\"hideEmptyContainers\":false,\"hideFilteredLayers\":false,\"expandFilteredEntities\":false,\"portHintsEnabled\":true,\"autoBundleTransitions\":1,\"autoStyleTransitions\":1,\"autoHideTransitions\":1,\"maxSpanningTransitionDepth\":10,\"rootEntityType\":\"Layer\"}},\"options\":{\"zoom\":true,\"collapsed\":true,\"trace\":true,\"selection\":true,\"queries\":true,\"settings\":true},\"id\":\"VIEW-tcPIi3jP\"}]}"
},
"queries": [
{
"id": "9eb5c7fe-8d4c-4d94-b1a7-fee08bb2f663",
"name": "TestID",
"description": "",
"displayRules": "[{\"id\":\"e119af2d-5061-43f3-901a-4c70f402bb5d\",\"type\":\"PROPERTY\",\"staticColour\":\"#E47833\",\"dynamicColouring\":false,\"cls\":\"dr-19\",\"property\":\"TestID\",\"prefix\":\"\",\"suffix\":\"\",\"center\":false,\"left\":false}]",
"querySource": "not isEmpty(TestID)",
"modulePath": "Uncategorised",
"importedFromModel": null
}
],
"propertyDefinitions": {
"propertyDefinitionsId": "2ab6a0c3-8d24-4235-9783-e241437bf860",
"modelId": "639b2970ac4d16767484b2bd",
"properties": {
"TestID": {
"type": "Number",
"defaultValue": null,
"options": [],
"optionInfos": {}
}
}
}
},
"importedModels": {},
"importedModelsForQueries": {},
"propertyDefinitionsForImports": {},
"templateCollections": {}
}
I have been using the following to convert a json to data frame:
fInput = 'filepath to json file'
with open(fInput, encoding='utf-16') as inputfile:
df = pd.read_json(inputfile)
fOutput = 'output file path'
df.to_csv(fOutput, encoding='utf-16, index = false)
I then normalise columns using the following, which works for other JSON formats:
pd.json_normalize(df['column'])
and I explode arrays in the columns using:
df2 = pd.DataFrame([d, tup.id) for df.intertuples() for d in tup.columnName])
What I can't work out is how to pull into a data frame the 'entities' object from the 'data' object. Once I can get to that then I should be able to parse the content.
I had got to:
df = df["data"]
df = df["entities"]
When I print that it looks promising, but if I try to output to csv it fails with "'dict' object has no attribute 'to_csv'", so I'm going wrong somewhere. The traceback for the error is:
AttributeError
Input [48], in <cell line: 14>()
12 df = df["entities"]
13 print(df)
14 df.to_csv(fOutput, encoding='utf-16', index=false)
AttributeError: 'dict' object has no attribute 'to_csv'
Any pointers appreciated.

Couldn't you just do this?
I saved the json data you gave in a file named data.txt
import json

# Parse the whole file into nested Python dicts/lists.
with open('data.txt','r') as file:
    data = json.load(file)

# 'entities' and 'transitions' are plain dicts nested under the top-level
# 'data' key, so ordinary key lookups reach them.
entities = data['data']['entities']
transitions = data['data']['transitions']
print(f'Entities: {entities}\nTransitions: {transitions}')
Output
Entities: {'29380f60-620e-4314-9969-4ad6fe5bbea6': {'name': 'Element', 'children': [], 'id': '29380f60-620e-4314-9969-4ad6fe5bbea6', 'properties': {}}, '86361ab4-6002-4f3b-b6ca-7e35acd69f9b': {'name': 'Application', 'children': ['29380f60-620e-4314-9969-4ad6fe5bbea6'], 'id': '86361ab4-6002-4f3b-b6ca-7e35acd69f9b', 'properties': {}}, '223d9749-feb2-425d-b512-17b5322cda96': {'name': 'S_Group', 'children': ['86361ab4-6002-4f3b-b6ca-7e35acd69f9b'], 'id': '223d9749-feb2-425d-b512-17b5322cda96', 'properties': {}}}
Transitions: {'c4e2026a-9c57-4bb0-b2e2-f7068d9c6fe5': {'source': '29380f60-620e-4314-9969-4ad6fe5bbea6', 'target': '040677a5-820f-4d17-ae50-1296c0e36273', 'id': 'c4e2026a-9c57-4bb0-b2e2-f7068d9c6fe5', 'properties': {}}}

Related

Python: Convert json with extra data error into CSV

I have JSON in the format below, which I receive from a different team and am not allowed to change:
{
"content": [
{
"id": "5603bbaae412390b73f0c7f",
"name": "ABC",
"description": "Test",
"rsid": "pwcs",
"type": "project",
"owner": {
"id": 529932
},
"created": "2015-09-24T09:00:26Z"
},
{
"id": "56094673e4b0a7e17e310b83",
"name": "secores",
"description": "Panel",
"rsid": "pwce",
"type": "project",
"owner": {
"id": 520902
},
"created": "2015-09-28T13:53:55Z"
}
],
"totalPages": 9,
"totalElements": 8592,
"number": 0,
"numberOfElements": 1000,
"firstPage": true,
"lastPage": false,
"sort": null,
"size": 1000
}
{
"content": [
{
"id": "5bf2cc64d977553780706050",
"name": "Services Report",
"description": "",
"rsid": "pcie",
"type": "project",
"owner": {
"id": 518013
},
"created": "2018-11-19T14:44:52Z"
},
{
"id": "5bf2d56e40b39312e3e167d0",
"name": "Standard form",
"description": "",
"rsid": "wcu",
"type": "project",
"owner": {
"id": 521114
},
"created": "2018-11-19T15:23:26Z"
}
],
"totalPages": 9,
"totalElements": 8592,
"number": 1,
"numberOfElements": 1000,
"firstPage": false,
"lastPage": false,
"sort": null,
"size": 1000
}
{
"content": [
{
"id": "5d95e7d6187c6d6376fd1bad",
"name": "New Project",
"description": "",
"rsid": "pcinforrod",
"type": "project",
"owner": {
"id": 200904228
},
"created": "2019-10-03T12:21:42Z"
},
{
"id": "5d95fc6e56d2e82519629b96",
"name": "Demo - 10/03",
"description": "",
"rsid": "sitedev",
"type": "project",
"owner": {
"id": 20001494
},
"created": "2019-10-03T13:49:34Z"
}
],
"totalPages": 9,
"totalElements": 8592,
"number": 2,
"numberOfElements": 1000,
"firstPage": false,
"lastPage": false,
"sort": null,
"size": 1000
}
I am trying to convert it into CSV using below code:
import csv
import json
with open("C:\python\SampleJSON.json",'rb') as file:
data = json.load(file)
fname = "workspaceExcelDemo.csv"
with open(fname,"w", encoding="utf-8", newline='') as file:
csv_file = csv.writer(file)
csv_file.writerow(["id","name","rsid"])
for item in data["content"]:
csv_file.writerow([item['id'],item['name'],item['rsid']])
However I am getting below error message while executing the above piece of code:
raise JSONDecodeError("Extra data", s, end)
json.decoder.JSONDecodeError: Extra data: line 35 column 1 (char 937)
How do I convert the above JSON into CSV without making any changes to the JSON file?
If I understand your question and the comments correctly, you can parse the file with the json module:
import csv
import json

# The input holds several JSON documents written back to back, which is why
# json.load() stops with "Extra data" after the first one.  The documents are
# pretty-printed over many lines, so parsing line by line cannot work either.
# JSONDecoder.raw_decode() parses ONE document starting at a given offset and
# returns the offset where it ended, which lets us walk through the file
# without modifying it.
# "utf-8-sig" reads plain UTF-8 and also drops a leading BOM if present.
with open(r"C:\python\SampleJSON.json", encoding="utf-8-sig") as file:
    text = file.read().strip()

decoder = json.JSONDecoder()
data = []  # one parsed dict per JSON document found in the file
pos = 0
while pos < len(text):
    document, pos = decoder.raw_decode(text, pos)
    data.append(document)
    # raw_decode() rejects leading whitespace, so skip the gap between
    # consecutive documents before parsing the next one.
    while pos < len(text) and text[pos].isspace():
        pos += 1

fname = "workspaceExcelDemo.csv"
with open(fname, "w", encoding="utf-8", newline='') as file:
    csv_file = csv.writer(file)
    csv_file.writerow(["id", "name", "rsid"])
    # Write the rows of every page, not only those of the first document.
    for page in data:
        for item in page["content"]:
            csv_file.writerow([item['id'], item['name'], item['rsid']])

Python Avro, how to write data to a modified schema?

I'm new to Avro and I'm trying to perform basic tasks like read data from data.avro and now I want to write data to data.avro.
My problem is : ...is not an example of the schema...
I don't understand where my mistake is and I'd appreciate your help:
from avro import schema, datafile, io
from avro.datafile import DataFileWriter, DataFileReader
from avro.io import DatumWriter, DatumReader
OUTFILE_NAME = '4_2.avro'
SCHEMA_STR = """{
"namespace": "advdaba",
"type": "record",
"name": "Conference",
"fields": [
{ "name": "name", "type": "string" },
{ "name": "time", "type": "long" },
{ "name": "location", "type": "string" },
{ "name": "speakers", "type": {"type":"array","items":"string"} },
{ "name": "participants", "type": {"type": "array", "items": "string"} },
{ "name": "seating", "type": {"type": "map", "values": "int"} }
]
}"""
SCHEMA = schema.parse(SCHEMA_STR)
# read data writtent with the old schema
reader = DataFileReader(open("example.avro", "rb"), DatumReader())
#data = []
for example in reader:
print(example)
#data.append(example)
reader.close()
# generate data for new schema
data = {
'name': 'Foo',
'time': 25612345,
'location': 'Berne',
'speakers': ['Jean', 'Elton'],
'participants': ['John', 'Michel', 'Jacques'],
'seating': [{'John': 1}, {'Michel': 2}, {'Jacques': 3}]
}
rec_writer = io.DatumWriter(SCHEMA)
df_writer = datafile.DataFileWriter(
open(OUTFILE_NAME, 'wb'),
rec_writer,
writers_schema=SCHEMA,
codec='deflate'
)
df_writer.append(data)
df_writer.close()
From what I understand, it should even be possible to write the old and new data in the same .avro file.
[EDIT]
after debugging the problem comes from seating
The problem was in seating
The solution is
'seating': {"John": 1, "Michel": 2, "Jacques": 3}

extract all Json key values

I'm not experienced with JSON in Python. I have this JSON result:
{
"href": "https://api.spotify.com/v1/users/wizzler/playlists",
"items": [
{
"collaborative": false,
"external_urls": {
"spotify": "http://open.spotify.com/user/wizzler/playlists/53Y8wT46QIMz5H4WQ8O22c"
},
"href": "https://api.spotify.com/v1/users/wizzler/playlists/53Y8wT46QIMz5H4WQ8O22c",
"id": "53Y8wT46QIMz5H4WQ8O22c",
"images": [],
"name": "Wizzlers Big Playlist",
"owner": {
"external_urls": {
"spotify": "http://open.spotify.com/user/wizzler"
},
"href": "https://api.spotify.com/v1/users/wizzler",
"id": "wizzler",
"type": "user",
"uri": "spotify:user:wizzler"
},
"public": true,
"snapshot_id": "bNLWdmhh+HDsbHzhckXeDC0uyKyg4FjPI/KEsKjAE526usnz2LxwgyBoMShVL+z+",
"tracks": {
"href": "https://api.spotify.com/v1/users/wizzler/playlists/53Y8wT46QIMz5H4WQ8O22c/tracks",
"total": 30
},
"type": "playlist",
"uri": "spotify:user:wizzler:playlist:53Y8wT46QIMz5H4WQ8O22c"
},
{
"collaborative": false,
"external_urls": {
"spotify": "http://open.spotify.com/user/wizzlersmate/playlists/1AVZz0mBuGbCEoNRQdYQju"
},
"href": "https://api.spotify.com/v1/users/wizzlersmate/playlists/1AVZz0mBuGbCEoNRQdYQju",
"id": "1AVZz0mBuGbCEoNRQdYQju",
"images": [],
"name": "Another Playlist",
"owner": {
"external_urls": {
"spotify": "http://open.spotify.com/user/wizzlersmate"
},
"href": "https://api.spotify.com/v1/users/wizzlersmate",
"id": "wizzlersmate",
"type": "user",
"uri": "spotify:user:wizzlersmate"
},
"public": true,
"snapshot_id": "Y0qg/IT5T02DKpw4uQKc/9RUrqQJ07hbTKyEeDRPOo9LU0g0icBrIXwVkHfQZ/aD",
"tracks": {
"href": "https://api.spotify.com/v1/users/wizzlersmate/playlists/1AVZz0mBuGbCEoNRQdYQju/tracks",
"total": 58
},
"type": "playlist",
"uri": "spotify:user:wizzlersmate:playlist:1AVZz0mBuGbCEoNRQdYQju"
}
],
"limit": 9,
"next": null,
"offset": 0,
"previous": null,
"total": 9
}
Now I need to extract only the Playlist ids. How to do that?
Edit:
I get the Json Data from doing:
r = requests.get(BASE_URL + 'users/' + user_id + '/playlists', headers=headers)
r = r.json()
print(r) returns the JSON data. When I try data = json.load(r)
I get this error: AttributeError: 'dict' object has no attribute 'read'
First, load the JSON file using the built in json library.
import json

# json.load() reads from an open file object and returns the parsed document.
with open('path/to/json/file.json') as f:
    data = json.load(f)
Then, use a list comprehension to get only the IDs.
playlist_ids = [item['id'] for item in data['items']]
Edit: Or, if you've got your JSON parsed already, just use the list comprehension. Don't do r = r.json(); that rebinds r from the response object to the parsed data. Assign the result to another variable instead, e.g. data = r.json()
playlist_ids = [item['id'] for item in data['items']]
Edit 2: If you only want it where the owner ID is "wizzler", then add an if clause to the list comprehension.
playlist_ids = [item['id'] for item in data['items'] if item['owner']['id'] == 'wizzler']

Python how to pick 3rd occurence in nested json array

I am working on one of my requirements.
My requirement: I need to pick and print only the 3rd "id" from the "syrap" list in the nested JSON file. I am not getting the desired output. Any help will be appreciated.
Test file:
{
"id": "0001",
"type": "donut",
"name": "Cake",
"ppu": 0.55,
"batters":
{ "process": "abc",
"mix": "0303",
"syrap":
[
{ "id": "1001", "type": "Regular" },
{ "id": "1002", "type": "Chocolate" },
{ "id": "1003", "type": "Blueberry" },
{ "id": "1004", "type": "Devil's Food" }
]
},
"rate": 0.55,
"topping":
[
{ "id": "5001", "type": "None" },
{ "id": "5002", "type": "Glazed" },
{ "id": "5005", "type": "Sugar" },
{ "id": "5007", "type": "Powdered Sugar" },
{ "id": "5006", "type": "Chocolate with Sprinkles" },
{ "id": "5003", "type": "Chocolate" },
{ "id": "5004", "type": "Maple" }
]
}
Expected output in a csv:
0001,donut,abc,0303,1003
My code:
import requests
import json
import csv
f = open('testdata.json')
data = json.load(f)
f.close()
f = csv.writer(open('testout.csv', 'wb+'))
for item in data:
f.writerow([item['id'], item[type], item['batters'][0]['process'],
item['batters'][0]['mix'],
item['batters'][0]['syrap'][0]['id'],
item['batters'][0]['syrap'][1]['id'],
item['batters'][0]['syrap'][2]['id'])
Here is some sample code showing how you can iterate through json content parsed as a dictionary:
import json

# Sample document from the question, embedded as a string so the example is
# self-contained.
json_str = '''{
"id": "0001",
"type": "donut",
"name": "Cake",
"ppu": 0.55,
"batters":
{ "process": "abc",
"mix": "0303",
"syrap":
[
{ "id": "1001", "type": "Regular" },
{ "id": "1002", "type": "Chocolate" },
{ "id": "1003", "type": "Blueberry" },
{ "id": "1004", "type": "Devil's Food" }
]
},
"rate": 0.55,
"topping":
[
{ "id": "5001", "type": "None" },
{ "id": "5002", "type": "Glazed" },
{ "id": "5005", "type": "Sugar" },
{ "id": "5007", "type": "Powdered Sugar" },
{ "id": "5006", "type": "Chocolate with Sprinkles" },
{ "id": "5003", "type": "Chocolate" },
{ "id": "5004", "type": "Maple" }
]
}
'''

# json.loads() turns the text into nested dicts and lists.
jsondict = json.loads(json_str)

# 'batters' is a dict (not a list), so it is indexed by key; 'syrap' inside it
# is the list of {"id", "type"} records.
syrap_node = jsondict['batters']['syrap']
for item in syrap_node:
    print(f'id:{item["id"]} type: {item["type"]}')
Simply, data["batters"]["syrap"][2]["id"]
Much better way to achieve this would be
# On Python 3 the csv module needs a text-mode file; newline='' lets csv
# control the line endings itself.  The with-statement closes the file even
# if a row fails to serialise.
with open('testout.csv', 'w', newline='') as f:
    fnames = ['id', 'type', 'process', 'mix', 'syrap']
    writer = csv.DictWriter(f, fieldnames=fnames)
    writer.writeheader()
    # Assumes data is a list of records shaped like the sample document.
    for item in data:
        print(item)
        writer.writerow({
            'id': item['id'],
            'type': item['type'],
            'process': item['batters']['process'],
            'mix': item['batters']['mix'],
            # Index 2 is the third entry of the 'syrap' list.
            'syrap': item['batters']['syrap'][2]['id'],
        })
You need to make sure that data is actually a list. if it is not a list, don't use for loop.
simply,
writer.writerow({'id' : data['id'], 'type': data['type'],
'process' : data['batters']['process'],
'mix': data['batters']['mix'],
'syrap': data['batters']['syrap'][2]['id']})

Flatten nested json to csv with nested column names

I have a rather weird requirement. I have the JSON below and somehow have to convert it into a flat CSV.
[
{
"authorizationQualifier": "SDA",
"authorizationInformation": " ",
"securityQualifier": "ASD",
"securityInformation": " ",
"senderQualifier": "ASDAD",
"senderId": "FADA ",
"receiverQualifier": "ADSAS",
"receiverId": "ADAD ",
"date": "140101",
"time": "0730",
"standardsId": null,
"version": "00501",
"interchangeControlNumber": "123456789",
"acknowledgmentRequested": "0",
"testIndicator": "T",
"functionalGroups": [
{
"functionalIdentifierCode": "ADSAD",
"applicationSenderCode": "ASDAD",
"applicationReceiverCode": "ADSADS",
"date": "20140101",
"time": "07294900",
"groupControlNumber": "123456789",
"responsibleAgencyCode": "X",
"version": "005010X221A1",
"transactions": [
{
"name": "ASDADAD",
"transactionSetIdentifierCode": "adADS",
"transactionSetControlNumber": "123456789",
"implementationConventionReference": null,
"segments": [
{
"BPR03": "ad",
"BPR14": "QWQWDQ",
"BPR02": "1.57",
"BPR13": "23223",
"BPR01": "sad",
"BPR12": "56",
"BPR10": "32424",
"BPR09": "12313",
"BPR08": "DA",
"BPR07": "123456789",
"BPR06": "12313",
"BPR05": "ASDADSAD",
"BPR16": "21313",
"BPR04": "SDADSAS",
"BPR15": "11212",
"id": "aDSASD"
},
{
"TRN02": "2424",
"TRN03": "35435345",
"TRN01": "3435345",
"id": "FSDF"
},
{
"REF02": "fdsffs",
"REF01": "sfsfs",
"id": "fsfdsfd"
},
{
"DTM02": "2432424",
"id": "sfsfd",
"DTM01": "234243"
}
],
"loops": [
{
"id": "24324234234",
"segments": [
{
"N101": "sfsfsdf",
"N102": "sfsf",
"id": "dgfdgf"
},
{
"N301": "sfdssfdsfsf",
"N302": "effdssf",
"id": "fdssf"
},
{
"N401": "sdffssf",
"id": "sfds",
"N402": "sfdsf",
"N403": "23424"
},
{
"PER06": "Wsfsfdsfsf",
"PER05": "sfsf",
"PER04": "23424",
"PER03": "fdfbvcb",
"PER02": "Pedsdsf",
"PER01": "sfsfsf",
"id": "fdsdf"
}
]
},
{
"id": "2342",
"segments": [
{
"N101": "sdfsfds",
"N102": "vcbvcb",
"N103": "dsfsdfs",
"N104": "343443",
"id": "fdgfdg"
},
{
"N401": "dfsgdfg",
"id": "dfgdgdf",
"N402": "dgdgdg",
"N403": "234244"
},
{
"REF02": "23423342",
"REF01": "fsdfs",
"id": "sfdsfds"
}
]
}
]
}
]
}
]
}
]
The column header name corresponding to a deeper key-value pair may take a nested form, like functionalGroups[0].transactions[0].segments[0].BPR15.
I am able to do this in java using this github project (here you can find the output format I desire in the explanation) in one line:
flatJson = JSONFlattener.parseJson(new File("files/simple.json"), "UTF-8");
The output was:
date,securityQualifier,testIndicator,functionalGroups[1].functionalIdentifierCode,functionalGroups[1].date,functionalGroups[1].applicationReceiverCode, ...
140101,00,T,HP,20140101,ETIN,...
But I want to do this in python. I tried as suggested in this answer:
with open('data.json') as data_file:
data = json.load(data_file)
df = json_normalize(data, record_prefix=True)
with open('temp2.csv', "w", newline='\n') as csv_file:
csv_file.write(df.to_csv())
However, for column functionalGroups, it dumps json as a cell value.
I also tried as suggested in this answer:
with open('data.json') as f: # this ensures opening and closing file
a = json.loads(f.read())
df = pandas.DataFrame(a)
print(df.transpose())
But this also seems to do the same:
0
acknowledgmentRequested 0
authorizationInformation
authorizationQualifier SDA
date 140101
functionalGroups [{'functionalIdentifierCode': 'ADSAD', 'applic...
interchangeControlNumber 123456789
receiverId ADAD
receiverQualifier ADSAS
securityInformation
securityQualifier ASD
senderId FADA
senderQualifier ASDAD
standardsId None
testIndicator T
time 0730
version 00501
Is it possible to do what I desire in python?

Categories