I have n very complex Python dictionaries with a large nesting depth (~5), and I don't know how to merge them properly and quickly, without iterating over them a million times.
Worth mentioning: the dicts have a strict structure, as you will see below.
I have tried solutions based on:
defaultdict
the merge operator (|)
Python version: 3.9
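(As a side note on why the merge operator alone falls short here: | merges only the top level, so a duplicated key's value is replaced wholesale rather than combined. A quick illustration with throwaway dicts:

a = {"name": "Louis", "places": [{"code": "A"}]}
b = {"name": "Louis", "places": [{"code": "B"}]}
# a | b keeps the right-hand value for duplicate keys, so a's
# "places" list is discarded instead of being merged element-wise.
print(a | b)  # {'name': 'Louis', 'places': [{'code': 'B'}]}
)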
d1 = {
    "name": "Louis",
    "places": [
        {
            "code": "A",
            "subplaces": [
                {"name": "Subplace name", "subsubplaces": [{"name": "subsub1"}]},
                {"name": "Subplace name2", "subsubplaces": [{"name": "subsub1"}]}
            ]
        }
    ]
}
d2 = {
    "name": "Louis",
    "places": [
        {
            "code": "B",
            "subplaces": [
                {"name": "Subplace name", "subsubplaces": [{"name": "subsub1"}]},
                {"name": "Subplace name2", "subsubplaces": [{"name": "subsub1"}]}
            ]
        }
    ]
}
d3 = {
    "name": "Louis",
    "places": [
        {
            "code": "A",
            "subplaces": [
                {"name": "Subplace name X", "subsubplaces": [{"name": "subsub1"}]}
            ]
        }
    ]
}
In that case the merged output should be:
d_merged = {
    "name": "Louis",
    "places": [
        {
            "code": "A",
            "subplaces": [
                {"name": "Subplace name", "subsubplaces": [{"name": "subsub1"}]},
                {"name": "Subplace name2", "subsubplaces": [{"name": "subsub1"}]},
                {"name": "Subplace name X", "subsubplaces": [{"name": "subsub1"}]}
            ]
        },
        {
            "code": "B",
            "subplaces": [
                {"name": "Subplace name", "subsubplaces": [{"name": "subsub1"}]},
                {"name": "Subplace name2", "subsubplaces": [{"name": "subsub1"}]}
            ]
        }
    ]
}
Your task is quite specific, so a universal solution is not possible. I'd suggest merging all "places", "subplaces" and "subsubplaces" into one nested dictionary to clean up all possible duplicates, and then reshaping that data to match the desired format.
from itertools import groupby
from operator import itemgetter
from collections import defaultdict

def merge_places(*dicts):
    if not dicts:
        return
    # Check that all dicts share the same "name" (the all_equal recipe):
    # https://docs.python.org/3/library/itertools.html#itertools-recipes
    g = groupby(dicts, itemgetter("name"))
    if next(g, True) and next(g, False):
        raise ValueError("Dictionaries names are not equal")
    # Nested mapping: code -> subplace name -> set of subsubplace names.
    places = defaultdict(lambda: defaultdict(set))  # set values are unique
    for d in dicts:
        for place in d["places"]:
            for subplace in place["subplaces"]:
                for subsubplace in subplace["subsubplaces"]:
                    places[place["code"]][subplace["name"]].add(subsubplace["name"])
    # Rebuild the original structure from the deduplicated mapping.
    return {
        "name": d["name"],  # always exists, as dicts aren't empty
        "places": [
            {
                "code": code,
                "subplaces": [
                    {
                        "name": name,
                        "subsubplaces": [
                            {"name": subsubplace}
                            for subsubplace in subsubplaces
                        ]
                    }
                    for name, subsubplaces in subplaces.items()
                ]
            }
            for code, subplaces in places.items()
        ]
    }
Usage:
result = merge_places(d1, d2, d3)
Output:
{
    "name": "Louis",
    "places": [
        {
            "code": "A",
            "subplaces": [
                {"name": "Subplace name", "subsubplaces": [{"name": "subsub1"}]},
                {"name": "Subplace name2", "subsubplaces": [{"name": "subsub1"}]},
                {"name": "Subplace name X", "subsubplaces": [{"name": "subsub1"}]}
            ]
        },
        {
            "code": "B",
            "subplaces": [
                {"name": "Subplace name", "subsubplaces": [{"name": "subsub1"}]},
                {"name": "Subplace name2", "subsubplaces": [{"name": "subsub1"}]}
            ]
        }
    ]
}
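One caveat with this approach: subsubplaces are collected in a set, and sets don't preserve insertion order, so they may come out in arbitrary order. If order matters, a plain dict can serve as an ordered set (keys are unique and insertion-ordered since Python 3.7). A minimal sketch of that swap in the collection step above:

places = defaultdict(dict)  # code -> {subplace name -> {subsubplace name -> None}}
for d in dicts:
    for place in d["places"]:
        for subplace in place["subplaces"]:
            bucket = places[place["code"]].setdefault(subplace["name"], {})
            for subsubplace in subplace["subsubplaces"]:
                bucket[subsubplace["name"]] = None  # dict used as an ordered set

The rebuild step is unchanged, since iterating a dict yields its keys just like iterating a set.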
I think your representation of the data has a lot of unnecessary detail; we can reduce it with this solution:
from typing import Dict, List
dicts = [
    {
        "name": "Louis",
        "places": [
            {
                "code": "A",
                "subplaces": [
                    {"name": "Subplace name", "subsubplaces": [{"name": "subsub1"}]},
                    {"name": "Subplace name2", "subsubplaces": [{"name": "subsub1"}]}
                ]
            }
        ]
    },
    {
        "name": "Louis",
        "places": [
            {
                "code": "B",
                "subplaces": [
                    {"name": "Subplace name", "subsubplaces": [{"name": "subsub1"}]},
                    {"name": "Subplace name2", "subsubplaces": [{"name": "subsub1"}]}
                ]
            }
        ]
    },
    {
        "name": "Louis",
        "places": [
            {
                "code": "A",
                "subplaces": [
                    {"name": "Subplace name X", "subsubplaces": [{"name": "subsub1"}]}
                ]
            }
        ]
    }
]
def merger(dicts: List[Dict]) -> Dict:
    # Reduce each dict to {name: {code: [subplaces]}}.
    result = {}
    for d in dicts:
        name = d["name"]
        if name not in result:
            result[name] = {}
        for p in d["places"]:
            code = p["code"]
            if code not in result[name]:
                result[name][code] = []
            result[name][code].extend(p["subplaces"])
    return result
print(merger(dicts=dicts))
The output will be:
{
    'Louis': {
        'A': [
            {'name': 'Subplace name', 'subsubplaces': [{'name': 'subsub1'}]},
            {'name': 'Subplace name2', 'subsubplaces': [{'name': 'subsub1'}]},
            {'name': 'Subplace name X', 'subsubplaces': [{'name': 'subsub1'}]}
        ],
        'B': [
            {'name': 'Subplace name', 'subsubplaces': [{'name': 'subsub1'}]},
            {'name': 'Subplace name2', 'subsubplaces': [{'name': 'subsub1'}]}
        ]
    }
}
If you want your desired output, it's easy to convert this result into that shape (a sketch follows below), but this form is more readable and maintainable.
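For completeness, a sketch of that final transformation, rebuilding the original shape from the reduced {name: {code: subplaces}} mapping (expand is a hypothetical helper name; this assumes a single top-level name, as in the question):

def expand(result: Dict) -> List[Dict]:
    # Turn {name: {code: subplaces}} back into the questioner's format.
    return [
        {
            "name": name,
            "places": [
                {"code": code, "subplaces": subplaces}
                for code, subplaces in codes.items()
            ]
        }
        for name, codes in result.items()
    ]

d_merged = expand(merger(dicts))[0]  # [0]: there is only one name, "Louis"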
While trying to convert the JSON output below to CSV, I am getting an error.
Here is the JSON output:
{
  "data": [
    {
      "id": "-1000100591151294842",
      "type": "fres",
      "attributes": {
        "operationState": "In Service",
        "deploymentState": "discovered",
        "displayData": {
          "operationState": "Up",
          "adminState": "Enabled",
          "displayTopologySource": "Protocol,Derived",
          "displayPhotonicSpectrumData": [
            {"frequency": "194.950000", "wavelength": "1537.79", "channel": "CH-20"}
          ],
          "displayDeploymentState": "Discovered",
          "displayName": "J-BBEG-CHLC-P109"
        },
        "utilizationData": {
          "totalCapacity": "100.0",
          "usedCapacity": "100.0",
          "utilizationPercent": "100",
          "capacityUnits": "Gbps"
        },
        "resourceState": "discovered",
        "serviceClass": "OTU",
        "linkLabel": "BBEG-ROADM-0101:5-4-1,CHLC-ROADM-0401:7-35-1",
        "lastUpdatedAdminStateTimeStamp": "2021-05-03T00:29:24.444Z",
        "lastUpdatedOperationalStateTimeStamp": "2022-12-08T22:42:21.567Z",
        "userLabel": "J-BBEG-CHLC-P109",
        "mgmtName": "",
        "nativeName": "",
        "awarenessTime": "2022-12-08T22:42:22.123Z",
        "layerRate": "OTU4",
        "layerRateQualifier": "OTU4",
        "supportedByLayerRatePackageList": [
          {"layerRate": "OTSi", "layerRateQualifier": "100G"}
        ],
        "networkRole": "FREAP",
        "directionality": "bidirectional",
        "topologySources": ["adjacency", "stitched"],
        "adminState": "In Service",
        "photonicSpectrumPackageList": [
          {"frequency": "194.950000", "width": "37.5"}
        ],
        "active": true,
        "additionalAttributes": {"isActual": "true", "hasLowerTopology": "true"},
        "reliability": "auto",
        "resilienceLevel": "unprotected"
      },
      "relationships": {
        "freDiscovered": {
          "data": {"type": "freDiscovered", "id": "-1000100591151294842"}
        },
        "supportedByServices": {
          "data": [{"type": "fres", "id": "6765278351459212874"}]
        },
        "endPoints": {
          "data": [
            {"type": "endPoints", "id": "-1000100591151294842:1"},
            {"type": "endPoints", "id": "-1000100591151294842:2"}
          ]
        },
        "partitionFres": {
          "data": [{"type": "fres", "id": "7147507956181395827"}]
        }
      }
    },
    {
      "id": "-1013895107051577774",
      "type": "fres",
      "attributes": {
        "operationState": "In Service",
        "deploymentState": "discovered",
        "displayData": {
          "operationState": "Up",
          "adminState": "Enabled",
          "displayTopologySource": "Protocol,Derived",
          "displayPhotonicSpectrumData": [
            {"frequency": "191.600000", "wavelength": "1564.68", "channel": "CH-87"}
          ],
          "displayDeploymentState": "Discovered",
          "displayName": "J-KFF9-PNTH-P101"
        },
        "utilizationData": {
          "totalCapacity": "100.0",
          "usedCapacity": "90.0",
          "utilizationPercent": "90",
          "capacityUnits": "Gbps"
        },
        "resourceState": "discovered",
        "serviceClass": "OTU",
        "tags": ["J-KFF9-PNTH-P101"],
        "linkLabel": "KFF9-ROADM-0301:1-1-1,PNTH-ROADM-0101:1-1-1",
        "lastUpdatedAdminStateTimeStamp": "2021-09-12T20:22:59.334Z",
        "lastUpdatedOperationalStateTimeStamp": "2022-10-12T14:20:44.779Z",
        "userLabel": "J-KFF9-PNTH-P101",
        "mgmtName": "",
        "nativeName": "",
        "awarenessTime": "2022-10-12T14:20:45.417Z",
        "layerRate": "OTU4",
        "layerRateQualifier": "OTU4",
        "supportedByLayerRatePackageList": [
          {"layerRate": "OTSi", "layerRateQualifier": "100G"}
        ],
        "networkRole": "FREAP",
        "directionality": "bidirectional",
        "topologySources": ["adjacency", "stitched"],
        "adminState": "In Service",
        "photonicSpectrumPackageList": [
          {"frequency": "191.600000", "width": "37.5"}
        ],
        "active": true,
        "additionalAttributes": {"isActual": "true", "hasLowerTopology": "true"},
        "reliability": "auto",
        "resilienceLevel": "unprotected"
      },
      "relationships": {
        "freDiscovered": {
          "data": {"type": "freDiscovered", "id": "-1013895107051577774"}
        },
        "supportedByServices": {
          "data": [{"type": "fres", "id": "6055685088078365419"}]
        },
        "endPoints": {
          "data": [
            {"type": "endPoints", "id": "-1013895107051577774:1"},
            {"type": "endPoints", "id": "-1013895107051577774:2"}
          ]
        },
        "partitionFres": {
          "data": [{"type": "fres", "id": "-6727082893715936342"}]
        }
      }
    }
  ]
}
I am getting the below error and am not sure what is missing.
Here is the Python script I used. I have been trying different variations, but no luck; I just get different errors in all the other attempts.
import json
from pathlib import Path  # imports added; the original snippet omitted them

filename = Path('fre.json')
data = []
with open(filename, 'r') as json_file:
    data_str = json_file.read()
    data_str = data_str.split('[', 1)[-1]    # drop everything up to the first '['
    data_str = data_str.rsplit(']', 1)[0]    # drop everything after the last ']'
    data_str = data_str.split('][')          # split concatenated arrays, if any
    for jsonStr in data_str:
        jsonStr = '[' + jsonStr + ']'
        temp_data = json.loads(jsonStr)
        for each in temp_data:
            data.append(each)
What is wrong?
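Worth noting: the file shown above is a single JSON object, not several concatenated arrays, so the '][' splitting shouldn't be needed; json.load can read it directly and pandas can flatten the "data" list. A minimal sketch (the output filename is an assumption):

import json
from pathlib import Path

import pandas as pd

with Path('fre.json').open() as json_file:
    payload = json.load(json_file)       # one top-level object holding a "data" list

df = pd.json_normalize(payload['data'])  # nested dicts become dotted column names
df.to_csv('fre.csv', index=False)        # hypothetical output filename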
I started using Python Cubes (OLAP) recently.
I'm trying to sum/avg a JSON Postgres column; how can I do this?
My DB structure:
events
    id
    object_type
    sn_name
spectra
    id
    snx_wavelengths (json column)
    event_id
My model JSON:
{
  "dimensions": [
    {
      "name": "event",
      "levels": [
        {"name": "object_type", "label": "Object Type", "attributes": ["object_type"]},
        {"name": "sn_name", "label": "name", "attributes": ["sn_name"]}
      ]
    },
    {
      "name": "spectra",
      "levels": [
        {"name": "catalog_name", "label": "Catalog Name", "attributes": ["catalog_name"]},
        {"name": "capture_date", "label": "Capture Date", "attributes": ["capture_date"]}
      ]
    },
    {"name": "date"}
  ],
  "cubes": [
    {
      "id": "uid",
      "name": "14G31Yx98ZG8aEhFHjOWNNBmFOETg5APjZo5AiHaqog5YxLMK5",
      "dimensions": ["event", "spectra", "date"],
      "aggregates": [
        {"name": "event_snx_wavelengths_sum", "function": "sum", "measure": "event.snx_wavelengths"},
        {"name": "record_count", "function": "count"}
      ],
      "joins": [
        {"master": "14G31Yx98ZG8aEhFHjOWNNBmFOETg5APjZo5AiHaqog5YxLMK5.id", "detail": "spectra.event_id"}
      ],
      "mappings": {
        "event.sn_name": "sn_name",
        "event.object_type": "object_type",
        "spectra.catalog_name": "spectra.catalog_name",
        "spectra.capture_date": "spectra.capture_date",
        "event.snx_wavelengths": "spectra.snx_wavelengths",
        "date": "spectra.capture_date"
      }
    }
  ]
}
I'm getting the following error:
Unknown attribute 'event.snx_wavelengths'
Can anyone help? I already tried using MongoDB to do the sum, without success.
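One hedged observation: without the full model it is hard to be sure, but in Cubes an aggregate's "measure" must refer to a measure declared in the cube, and no "measures" list appears in the model above, which would explain the "Unknown attribute" error. A sketch of what the missing declaration might look like inside the cube definition (the exact names here are assumptions):

"measures": [
    {"name": "snx_wavelengths", "label": "Wavelengths"}
],
"aggregates": [
    {"name": "snx_wavelengths_sum", "function": "sum", "measure": "snx_wavelengths"},
    {"name": "record_count", "function": "count"}
]

Separately, snx_wavelengths is a JSON column, and Postgres cannot SUM() a json value directly; the array would have to be expanded on the database side (e.g. with json_array_elements) before any OLAP aggregation can apply.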
I've been struggling with a nested structure in JSON and how to convert it to the correct form:
{
  "id": "0c576f35-d704-4fa8-8cbb-311c6be36358",
  "employee_id": null,
  "creator_id": "16ca2db9-206c-4e18-891d-a00a5252dbd3",
  "closed_by_id": null,
  "request_number": 23,
  "priority": "2",
  "form_id": "urlaub-weitere-abwesenheiten",
  "status": "opened",
  "name": "Urlaub & weitere Abwesenheiten",
  "read_by_employee": false,
  "custom_status": {"id": 15793, "name": "In Bearbeitung HR"},
  "due_date": null,
  "created_at": "2021-03-29T15:18:37.572040+02:00",
  "updated_at": "2021-03-29T15:22:15.590156+02:00",
  "closed_at": null,
  "archived_at": null,
  "attachment_count": 1,
  "category": {"id": "payroll-time-management", "name": "Payroll, Time & Attendance"},
  "public_comment_count": 0,
  "form_data": [
    {"field_id": "subcategory", "values": ["Time & Attendance - Manage monthly/year-end consolidation and report"]},
    {"field_id": "separator-2", "values": [null]},
    {"field_id": "art-der-massnahme", "values": ["Fortbildung"]},
    {"field_id": "bezeichnung-der-schulung-kurses", "values": ["dfgzhujiko"]},
    {"field_id": "startdatum", "values": ["2021-03-26"]},
    {"field_id": "enddatum", "values": ["2021-03-27"]},
    {"field_id": "freistellung", "values": ["nein"]},
    {"field_id": "mit-bildungsurlaub", "values": [""]},
    {"field_id": "kommentarfeld_fortbildung", "values": [""]},
    {"field_id": "separator", "values": [null]},
    {"field_id": "instructions", "values": [null]},
    {"field_id": "entscheidung-hr-bp", "values": ["Zustimmen"]},
    {"field_id": "kommentarfeld-hr-bp", "values": ["wsdfghjkmhnbgvfcdxsybvnm,"]},
    {"field_id": "individuelle-abstimmung", "values": [""]}
  ],
  "form_files": [
    {"id": 30129, "filename": "empty_background.png", "field_id": "anhang"}
  ],
  "visible_by_employee": false,
  "organization_ids": [],
  "need_edit_by_employee": false,
  "attachments": []
}
Using a simple solution with a pandas DataFrame:
Request = pd.DataFrame.from_dict(pd.json_normalize(data), orient='columns')
it displays almost in the correct form.
How do I split the dictionaries held in the form_data and form_files columns? I've done a lot of research, but I'm still having a lot of trouble with this: how to split form_data into columns rather than rows, keeping each piece tied to the record's id.
You can do something like this.
Pass the DataFrame and the column to the function as arguments:

import ast

import pandas as pd
from pandas import json_normalize  # imports added; the original snippet omitted them

def explode_node(child_df, column_value):
    child_df = child_df.dropna(subset=[column_value])
    # If the column holds stringified lists/dicts, parse them into objects first.
    if isinstance(child_df[str(column_value)].iloc[0], str):
        child_df[column_value] = child_df[str(column_value)].apply(ast.literal_eval)
    # Normalize each cell, stack the pieces, and join them back onto the parent rows.
    expanded_child_df = (
        pd.concat({i: json_normalize(x) for i, x in child_df.pop(str(column_value)).items()})
        .reset_index(level=1, drop=True)
        .join(child_df, how='right', lsuffix='_left', rsuffix='_right')
        .reset_index(drop=True)
    )
    expanded_child_df.columns = map(str.lower, expanded_child_df.columns)
    return expanded_child_df
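Usage, passing the question's DataFrame and each nested column in turn (Request is the DataFrame built above):

form_data_df = explode_node(Request, 'form_data')
form_files_df = explode_node(Request, 'form_files')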
Converting a CSV into a nested JSON file using pandas.
This is the sample CSV for one row:

name   type   aitm      alitm     aaitm      adsc1
specs  glass  70072187  ESA65Z45  ESA 65Z45  CUT TIP FG 1808-40

I'm trying to achieve the nested JSON structure below for every row.
import pandas as pd
import json

df = pd.DataFrame(
    [['specs', 'glass', '70072187', 'ESA65Z45', 'ESA 65Z45', 'CUT TIP FG 1808-40'],
     ['specs', 'glass', '666', 'ESA6665', 'ESB 666', 'CUT TIP FG 66-40']],
    columns=['name', 'type', 'aitm', 'alitm', 'aaitm', 'adsc1'],
)

data = {'entities': []}
for key, grp in df.groupby('name'):
    for idx, row in grp.iterrows():
        temp_dict_alpha = {'name': key, 'type': row['type'], 'data': {'attributes': {}}}
        # Every column other than name/type becomes an attribute entry.
        attr_row = row[~row.index.isin(['name', 'type'])]
        for idx2, row2 in attr_row.iteritems():
            dict_temp = {idx2: {'values': []}}
            dict_temp[idx2]['values'].append({'value': row2, 'source': 'internal', 'locale': 'en_US'})
            temp_dict_alpha['data']['attributes'].update(dict_temp)
        data['entities'].append(temp_dict_alpha)
print(json.dumps(data, indent=4))
Output:
{
    "entities": [
        {
            "name": "specs",
            "type": "glass",
            "data": {
                "attributes": {
                    "aitm": {"values": [{"value": "70072187", "source": "internal", "locale": "en_US"}]},
                    "alitm": {"values": [{"value": "ESA65Z45", "source": "internal", "locale": "en_US"}]},
                    "aaitm": {"values": [{"value": "ESA 65Z45", "source": "internal", "locale": "en_US"}]},
                    "adsc1": {"values": [{"value": "CUT TIP FG 1808-40", "source": "internal", "locale": "en_US"}]}
                }
            }
        },
        {
            "name": "specs",
            "type": "glass",
            "data": {
                "attributes": {
                    "aitm": {"values": [{"value": "666", "source": "internal", "locale": "en_US"}]},
                    "alitm": {"values": [{"value": "ESA6665", "source": "internal", "locale": "en_US"}]},
                    "aaitm": {"values": [{"value": "ESB 666", "source": "internal", "locale": "en_US"}]},
                    "adsc1": {"values": [{"value": "CUT TIP FG 66-40", "source": "internal", "locale": "en_US"}]}
                }
            }
        }
    ]
}
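One forward-compatibility note: Series.iteritems() was deprecated in pandas 1.5 and removed in 2.0, so on newer pandas the inner loop should read for idx2, row2 in attr_row.items(): instead; the logic is otherwise unchanged.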
I have the following code to generate a JSON representation of a list of lists.
Levels = [['L1', 'L1', 'L2'],
          ['L1', 'L1', 'L3'],
          ['L1', 'L2'],
          ['L2', 'L2', 'L3'],
          ['L2', 'L2', 'L1'],
          ['L3', 'L2'],
          ['L4', 'L2', 'L1'],
          ['L4', 'L2', 'L4']]

root = {}  # added: the original snippet used root without defining it

def append_path(root, paths):
    # Walk/extend the nested dict one path component at a time.
    if paths:
        child = root.setdefault(paths[0], {})
        append_path(child, paths[1:])

for p in Levels:
    append_path(root, p)

def convert(d):
    # Leaves become [{}] so every node keeps an explicit "children" key.
    return [{'name': k, 'children': convert(v) if v else [{}]} for k, v in d.items()]

# Print results
import json
print(json.dumps(convert(root), indent=4))
Output:
[
    {
        "name": "L1",
        "children": [
            {
                "name": "L1",
                "children": [
                    {
                        "name": "L2",
                        "children": [{}]
                    },
                    {
                        "name": "L3",
                        "children": [{}]
                    }
                ]
            },
            {
                "name": "L2",
                "children": [{}]
            }
        ]
    }
]
for the levels:
Levels = [['L1', 'L1', 'L2'],
          ['L1', 'L1', 'L3'],
          ['L1', 'L2'], ...]
I also need to encode the count of each level. For example, starting from L1 there are two first-level children, L1(2) and L2(1), and the L1(2) child is in turn followed by L2(1) and L3(1) at the next level:

L1(3) --> L1(2) --> L2(1)
                --> L3(1)
      --> L2(1)

How can I encode this count in my JSON output?
I want my final output to look like this:

"name": "L1(3)",
"children": [
    {
        "name": "L1(2)",
        "children": [ ...
root = {}
Levels = [['L1', 'L1', 'L2'],
          ['L1', 'L1', 'L3'],
          ['L1', 'L2'],
          ['L2', 'L2', 'L3'],
          ['L2', 'L2', 'L1'],
          ['L3', 'L2'],
          ['L4', 'L2', 'L1'],
          ['L4', 'L2', 'L4']]

def append_path(root, paths):
    if paths:
        child = root.setdefault(paths[0], {})
        append_path(child, paths[1:])

for p in Levels:
    append_path(root, p)

def convert(d):
    # Returns (children_list, leaf_count); each node's label carries the
    # number of leaf paths beneath it, accumulated up through the recursion.
    templist = []
    noofchildren = 0
    if not d:  # leaf: one path ends here
        return [{}], 1
    for k, v in d.items():
        temp, children = convert(v)
        noofchildren += children
        templist.append({'name': k + "(" + str(children) + ")", 'children': temp})
    return templist, noofchildren

# Print results (convert returns a (tree, total) pair; [0] selects the tree)
import json
print(json.dumps(convert(root)[0], indent=2))
Output:
[
    {
        "name": "L1(3)",
        "children": [
            {
                "name": "L1(2)",
                "children": [
                    {"name": "L2(1)", "children": [{}]},
                    {"name": "L3(1)", "children": [{}]}
                ]
            },
            {"name": "L2(1)", "children": [{}]}
        ]
    },
    {
        "name": "L2(2)",
        "children": [
            {
                "name": "L2(2)",
                "children": [
                    {"name": "L3(1)", "children": [{}]},
                    {"name": "L1(1)", "children": [{}]}
                ]
            }
        ]
    },
    {
        "name": "L3(1)",
        "children": [
            {"name": "L2(1)", "children": [{}]}
        ]
    },
    {
        "name": "L4(2)",
        "children": [
            {
                "name": "L2(2)",
                "children": [
                    {"name": "L1(1)", "children": [{}]},
                    {"name": "L4(1)", "children": [{}]}
                ]
            }
        ]
    }
]
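A closing note: the counts above are the number of leaf paths beneath each node. An equivalent formulation, which also copes with duplicate full paths, is to count during insertion rather than during conversion. A minimal sketch (append_path_counted and the 'count'/'children' keys are illustrative, not part of the original code):

def append_path_counted(root, path):
    # Bump a counter every time an inserted path passes through a node.
    if path:
        node = root.setdefault(path[0], {'count': 0, 'children': {}})
        node['count'] += 1
        append_path_counted(node['children'], path[1:])

counted = {}
for p in Levels:
    append_path_counted(counted, p)
# counted['L1']['count'] == 3, matching L1(3) in the output above.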