Python - How can I convert S3 folders into JSON hierarchy? - python

I have created a list using boto3 that contains all the subfolders in my S3 bucket. The list is sorted as below:
s3_list = ['a', 'a/a1/a11', 'b', 'b/b1', 'b/b2', 'b/b2/b22']
I'm trying to convert this list into JSON hierarchy structure:
{
"root": [
{
"name": "a",
"path": "a",
"child": [
{
"name": "a1",
"path": "a/a1",
"child": [
{
"name": "a11",
"path": "a/a1/a11"
}
]
}
]
},
{
"name": "b",
"path": "b",
"child": [
{
"name": "b1",
"path": "b/b1"
},
{
"name": "b2",
"path": "b/b2",
"child": [
{
"name": "b22",
"path": "b/b2/b22"
}
]
}
]
}
]
}
What's the best way (or which libraries should I use) to get this going in Python?

You can use recursion with collections.defaultdict:
from collections import defaultdict

s3_list = ['a', 'a/a1/a11', 'b', 'b/b1', 'b/b2', 'b/b2/b22']

def to_dict(d, c=None):
    """Recursively convert split S3 key paths into a name/path/children tree.

    Args:
        d: list of path-component lists, e.g. [['a'], ['b', 'b1']].
        c: ancestor components accumulated so far, used to build 'path'
           (internal; callers normally omit it).

    Returns:
        A list of {'name': ..., 'path': ..., 'children': [...]} dicts;
        'children' is present only for nodes that have descendants.
    """
    # Avoid a mutable default argument; an empty tuple/list default is
    # never mutated here, but None-sentinel is the safe idiom.
    c = [] if c is None else c
    if not d:
        # Return an empty list (not {}) so the return type is consistent.
        return []
    # Group sibling paths by their first component.
    grouped = defaultdict(list)
    for head, *rest in d:
        grouped[head].append(rest)
    return [
        {
            'name': name,
            'path': '/'.join(c + [name]),
            # Recurse only when at least one suffix is non-empty.
            **({'children': to_dict(tails, c + [name])}
               if (tails := [t for t in suffixes if t]) else {}),
        }
        for name, suffixes in grouped.items()
    ]

result = {'root': to_dict([i.split('/') for i in s3_list])}

import json
print(json.dumps(result, indent=4))
Output:
{
"root": [
{
"name": "a",
"path": "a",
"children": [
{
"name": "a1",
"path": "a/a1",
"children": [
{
"name": "a11",
"path": "a/a1/a11"
}
]
}
]
},
{
"name": "b",
"path": "b",
"children": [
{
"name": "b1",
"path": "b/b1"
},
{
"name": "b2",
"path": "b/b2",
"children": [
{
"name": "b22",
"path": "b/b2/b22"
}
]
}
]
}
]
}

Related

Generate DataFrame from Nested Dictionary

I have a nested dictionary like below:
[
{
"name": "A",
"flag": "folder",
"children": [
{
"name": "A1",
"flag": "folder",
"children": [
{
"name": "A1x",
"flag": "file",
"children": []
},
{
"name": "A1y",
"flag": "file",
"children": []
}
]
}
]
}
]
From this dict, I would like to generate a dataframe as below:
Is there any nice way to make this?
With the following nested dictionary which expands on yours for demonstration purposes:
data = [
{
"name": "A",
"flag": "folder",
"children": [
{
"name": "A1",
"flag": "folder",
"children": [
{
"name": "A1x",
"flag": "file",
"children": [{"name": "A1xx", "flag": "file", "children": []}],
},
{
"name": "A1y",
"flag": "file",
"children": [{"name": "A1yy", "flag": "file", "children": []}],
},
],
},
{
"name": "A2",
"flag": "folder",
"children": [
{
"name": "A2x",
"flag": "file",
"children": [{"name": "A2xx", "flag": "file", "children": []}],
},
{
"name": "A2y",
"flag": "file",
"children": [{"name": "A2yy", "flag": "file", "children": []}],
},
],
},
],
},
{
"name": "B",
"flag": "folder",
"children": [
{
"name": "B1",
"flag": "folder",
"children": [
{
"name": "B1x",
"flag": "file",
"children": [{"name": "B1xx", "flag": "file", "children": []}],
},
{
"name": "B1y",
"flag": "file",
"children": [{"name": "B1yy", "flag": "file", "children": []}],
},
],
},
{
"name": "B2",
"flag": "folder",
"children": [
{
"name": "B2x",
"flag": "file",
"children": [{"name": "B2xx", "flag": "file", "children": []}],
},
{
"name": "B2y",
"flag": "file",
"children": [{"name": "B2yy", "flag": "file", "children": []}],
},
],
},
],
},
]
Here is one way to do it by defining two short helper functions:
import pandas as pd
def traverse(data, new_data=None):
    """Flatten a nested name/flag/children dict depth-first.

    Args:
        data: target dict with "name", "flag" and "children" keys.
        new_data: accumulator of partial rows; created on the first call.

    Returns:
        List of lists of alternating name/flag values; empty sublists
        mark branch boundaries for the row-shaping step.
    """
    new_data = new_data or [[]]
    row = new_data[-1]
    row.extend((data["name"], data["flag"]))
    for branch in data["children"]:
        traverse(branch, new_data)
        # Close this branch so the next sibling starts a fresh partial row.
        new_data.append([])
    return new_data
def make_rows(flat_data):
    """Custom function to shape data.

    Args:
        flat_data: flattened traversal output — lists of name/flag values,
            with empty lists marking the end of a branch.

    Returns:
        Dataframe with one fully expanded branch per row; ancestor columns
        are forward-filled from the previous row.
    """
    rows = [[]]
    for item in flat_data:
        if item:
            # Extend the current row with this branch segment.
            rows[-1] += item
        else:
            # Branch boundary: start a new row, padded with None so the
            # shared ancestor columns can be forward-filled afterwards.
            rows.append([None] * (len(rows[-1]) // 2))
    # `fillna(method="ffill")` is deprecated since pandas 2.1 (removed in
    # 3.x); `.ffill()` is the supported equivalent.
    return pd.DataFrame(rows).dropna(how="all").ffill()
And then:
# Flatten every top-level entry, shape each into rows, then stack them all.
frames = [make_rows(traverse(entry)) for entry in data]
df = pd.concat(frames).reset_index(drop=True)
# Two physical columns (name, flag) per hierarchy level.
level_count = df.shape[1] // 2
df.columns = pd.MultiIndex.from_product(
    [[f"Level {level}" for level in range(level_count)], ["name", "flag"]]
)
print(df)
# Output
Level 0 Level 1 Level 2 Level 3
name flag name flag name flag name flag
0 A folder A1 folder A1x file A1xx file
1 A folder A1 folder A1y file A1yy file
2 A folder A2 folder A2x file A2xx file
3 A folder A2 folder A2y file A2yy file
4 B folder B1 folder B1x file B1xx file
5 B folder B1 folder B1y file B1yy file
6 B folder B2 folder B2x file B2xx file
7 B folder B2 folder B2y file B2yy file

How to properly "merge" complex python dictionaries?

I have n very complex Python dictionaries with a large depth level (~5) and I don't know how to merge them properly and fast, without iterating over them a million times.
What is worth mentioning - that dicts have strict structure as you will see below.
I was trying solutions connected with:
defaultdict
merge operator
Version of Python - 3.9
d1 = {
"name": "Louis",
"places": [
{
"code": "A",
"subplaces": [
{
"name": "Subplace name",
"subsubplaces": [
{
"name": "subsub1"
}
]
},
{
"name": "Subplace name2",
"subsubplaces": [
{
"name": "subsub1"
}
]
}
]
}
]
}
d2 = {
"name": "Louis",
"places": [
{
"code": "B",
"subplaces": [
{
"name": "Subplace name",
"subsubplaces": [
{
"name": "subsub1"
}
]
},
{
"name": "Subplace name2",
"subsubplaces": [
{
"name": "subsub1"
}
]
}
]
}
]
}
d3 = {
"name": "Louis",
"places": [
{
"code": "A",
"subplaces": [
{
"name": "Subplace name X",
"subsubplaces": [
{
"name": "subsub1"
}
]
}
]
}
]
}
And in that case output should be
d_merged = {
"name": "Louis",
"places": [
{
"code": "A",
"subplaces": [
{
"name": "Subplace name",
"subsubplaces": [
{
"name": "subsub1"
}
]
},
{
"name": "Subplace name2",
"subsubplaces": [
{
"name": "subsub1"
}
]
},
{
"name": "Subplace name X",
"subsubplaces": [
{
"name": "subsub1"
}
]
}
]
},
{
"code": "B",
"subplaces": [
{
"name": "Subplace name",
"subsubplaces": [
{
"name": "subsub1"
}
]
},
{
"name": "Subplace name2",
"subsubplaces": [
{
"name": "subsub1"
}
]
}
]
}
]
}
Your task is quite specific, so universal solution is not possible. I'd suggest you to merge all "places", "subplaces" and "subsubplaces" in nested dictionary to clean up all possible duplicates and then modify data to match desired format.
from itertools import groupby
from operator import itemgetter
from collections import defaultdict
def merge_places(*dicts):
    """Merge place dictionaries that share the same top-level "name".

    De-duplicates nested "places"/"subplaces"/"subsubplaces" entries while
    preserving first-seen (insertion) order.

    Args:
        *dicts: input dictionaries with identical "name" values.

    Returns:
        The merged dictionary, or None when called with no arguments.

    Raises:
        ValueError: if the inputs do not all share the same "name".
    """
    if not dicts:
        return
    # All inputs must describe the same top-level entity.
    if len({d["name"] for d in dicts}) > 1:
        raise ValueError("Dictionaries names are not equal")
    # code -> subplace name -> {subsubplace name: None}. The inner dicts
    # act as insertion-ordered sets; a plain `set` would make the output
    # order nondeterministic, unlike the documented example output.
    places = defaultdict(lambda: defaultdict(dict))
    for d in dicts:
        for place in d["places"]:
            # Touch the buckets explicitly so codes/subplaces with empty
            # child lists are still represented in the merged result
            # (the original dropped them silently).
            code_bucket = places[place["code"]]
            for subplace in place["subplaces"]:
                name_bucket = code_bucket[subplace["name"]]
                for subsubplace in subplace["subsubplaces"]:
                    name_bucket[subsubplace["name"]] = None
    return {
        "name": dicts[0]["name"],
        "places": [
            {
                "code": code,
                "subplaces": [
                    {
                        "name": name,
                        "subsubplaces": [
                            {"name": subsubplace}
                            for subsubplace in subsubplaces
                        ]
                    }
                    for name, subsubplaces in subplaces.items()
                ]
            }
            for code, subplaces in places.items()
        ]
    }
Usage:
# Merge the three sample dictionaries, de-duplicating the nested places.
result = merge_places(d1, d2, d3)
Output:
{
"name": "Louis",
"places": [
{
"code": "A",
"subplaces": [
{
"name": "Subplace name",
"subsubplaces": [
{
"name": "subsub1"
}
]
},
{
"name": "Subplace name2",
"subsubplaces": [
{
"name": "subsub1"
}
]
},
{
"name": "Subplace name X",
"subsubplaces": [
{
"name": "subsub1"
}
]
}
]
},
{
"code": "B",
"subplaces": [
{
"name": "Subplace name",
"subsubplaces": [
{
"name": "subsub1"
}
]
},
{
"name": "Subplace name2",
"subsubplaces": [
{
"name": "subsub1"
}
]
}
]
}
]
}
I think your representation of the data has a lot of unnecessary details; we can reduce them with this solution:
from typing import Dict, List
dicts = [
{
"name": "Louis",
"places": [
{
"code": "A",
"subplaces": [
{
"name": "Subplace name",
"subsubplaces": [
{
"name": "subsub1"
}
]
},
{
"name": "Subplace name2",
"subsubplaces": [
{
"name": "subsub1"
}
]
}
]
}
]
},
{
"name": "Louis",
"places": [
{
"code": "B",
"subplaces": [
{
"name": "Subplace name",
"subsubplaces": [
{
"name": "subsub1"
}
]
},
{
"name": "Subplace name2",
"subsubplaces": [
{
"name": "subsub1"
}
]
}
]
}
]
},
{
"name": "Louis",
"places": [
{
"code": "A",
"subplaces": [
{
"name": "Subplace name X",
"subsubplaces": [
{
"name": "subsub1"
}
]
}
]
}
]
}]
def merger(dicts: List[Dict]) -> Dict:
    """Merge the input dicts into a {name: {code: [subplaces, ...]}} mapping.

    Subplace lists for the same name/code are concatenated in input order;
    duplicates are kept, matching the original behavior.

    Args:
        dicts: list of {"name": ..., "places": [{"code": ..., "subplaces": [...]}]}.

    Returns:
        Nested dict keyed by name, then place code.
    """
    result: Dict = {}
    for d in dicts:
        # setdefault replaces the non-idiomatic `if not name in result`
        # membership dance (PEP 8 prefers `not in`; setdefault avoids it).
        by_code = result.setdefault(d["name"], {})
        for place in d["places"]:
            by_code.setdefault(place["code"], []).extend(place["subplaces"])
    return result
# Demonstrate the merge on the sample inputs; prints the nested mapping.
print(merger(dicts=dicts))
The output will be:
{
'Louis':{
'A':[
{'name': 'Subplace name', 'subsubplaces': [{'name': 'subsub1'}]},
{'name': 'Subplace name2', 'subsubplaces': [{'name': 'subsub1'}]},
{'name': 'Subplace name X', 'subsubplaces': [{'name': 'subsub1'}]}
],
'B':[
{'name': 'Subplace name', 'subsubplaces': [{'name': 'subsub1'}]},
{'name': 'Subplace name2', 'subsubplaces': [{'name': 'subsub1'}]}]
}
}
It's easy to transform this result into your desired output, but this one is more readable and maintainable.

Creating custom JSON from existing JSON using Python

(Python beginner alert) I am trying to create a custom JSON from an existing JSON. The scenario is - I have a source which can send many set of fields but I want to cherry pick some of them and create a subset of that while maintaining the original JSON structure. Original Sample
{
"Response": {
"rCode": "11111",
"rDesc": "SUCCESS",
"pData": {
"code": "123-abc-456-xyz",
"sData": [
{
"receiptTime": "2014-03-02T00:00:00.000",
"sessionDate": "2014-02-28",
"dID": {
"d": {
"serialNo": "3432423423",
"dType": "11111",
"dTypeDesc": "123123sd"
},
"mode": "xyz"
},
"usage": {
"duration": "661",
"mOn": [
"2014-02-28_20:25:00",
"2014-02-28_22:58:00"
],
"mOff": [
"2014-02-28_21:36:00",
"2014-03-01_03:39:00"
]
},
"set": {
"abx": "1",
"ayx": "1",
"pal": "1"
},
"rEvents": {
"john": "doe",
"lorem": "ipsum"
}
},
{
"receiptTime": "2014-04-02T00:00:00.000",
"sessionDate": "2014-04-28",
"dID": {
"d": {
"serialNo": "123123",
"dType": "11111",
"dTypeDesc": "123123sd"
},
"mode": "xyz"
},
"usage": {
"duration": "123",
"mOn": [
"2014-04-28_20:25:00",
"2014-04-28_22:58:00"
],
"mOff": [
"2014-04-28_21:36:00",
"2014-04-01_03:39:00"
]
},
"set": {
"abx": "4",
"ayx": "3",
"pal": "1"
},
"rEvents": {
"john": "doe",
"lorem": "ipsum"
}
}
]
}
}
}
Here the sData array tag has got few tags out of which I want to keep only 24 and get rid of the rest. I know I could use element.pop() but I cannot go and delete a new incoming field every time the source publishes it. Below is the expected output -
Expected Output
{
"Response": {
"rCode": "11111",
"rDesc": "SUCCESS",
"pData": {
"code": "123-abc-456-xyz",
"sData": [
{
"receiptTime": "2014-03-02T00:00:00.000",
"sessionDate": "2014-02-28",
"usage": {
"duration": "661",
"mOn": [
"2014-02-28_20:25:00",
"2014-02-28_22:58:00"
],
"mOff": [
"2014-02-28_21:36:00",
"2014-03-01_03:39:00"
]
},
"set": {
"abx": "1",
"ayx": "1",
"pal": "1"
}
},
{
"receiptTime": "2014-04-02T00:00:00.000",
"sessionDate": "2014-04-28",
"usage": {
"duration": "123",
"mOn": [
"2014-04-28_20:25:00",
"2014-04-28_22:58:00"
],
"mOff": [
"2014-04-28_21:36:00",
"2014-04-01_03:39:00"
]
},
"set": {
"abx": "4",
"ayx": "3",
"pal": "1"
}
}
]
}
}
}
I myself took reference from How can I create a new JSON object form another using Python? but its not working as expected. Looking forward for inputs/solutions from all of you gurus. Thanks in advance.
Kind of like this:
import json

def subset(d):
    """Return only the whitelisted fields of one sData entry."""
    # BUG FIX: the original listed 'sessionData', but the source JSON's
    # key (and the expected output) is 'sessionDate' — a KeyError as posted.
    return {name: d[name]
            for name in ('receiptTime', 'sessionDate', 'usage', 'set')}

def main():
    """Load the full document, trim each sData entry, write the result."""
    # `with` closes the handles the original `open(...)` calls leaked.
    with open("fullset.json") as f:
        data = json.load(f)
    sdata = data['Response']['pData']['sData']
    data['Response']['pData']['sData'] = [subset(d) for d in sdata]
    with open('newdata.json', 'w') as f:
        json.dump(data, f)

if __name__ == "__main__":
    main()

Customize Python JSON object_hook

I am trying to customize json data using object_hook in Python 3, but do not know how to get started. Any pointers are much appreciated. I am trying to introduce a new key and move existing data into the new key in Python Object.
I am trying to convert below json text:
{
"output": [
{
"Id": "101",
"purpose": "xyz text",
"array": [
{
"data": "abcd"
},
{
"data": "ef gh ij"
}
]
},
{
"Id": "102",
"purpose": "11xyz text",
"array": [
{
"data": "abcd"
},
{
"data": "java"
},
{
"data": "ef gh ij"
}
]
}
]
}
to
{
"output": [
{
"Id": "101",
"mydata": {
"purpose": "xyz text",
"array": [
{
"data": "abcd"
},
{
"data": "ef gh ij"
}
]
}
},
{
"Id": "102",
"mydata": {
"purpose": "11xyz text",
"array": [
{
"data": "abcd"
},
{
"data": "java"
},
{
"data": "ef gh ij"
}
]
}
}
]
}
My Python JSON object hook is defined as:
class JSONObject:
    """Object whose attributes mirror the keys of a decoded JSON mapping.

    Intended for use as a `json.loads` object_hook, which always passes
    the decoded dict positionally.
    """

    def __init__(self, mapping):
        # Renamed from `dict`, which shadowed the builtin. Copy every
        # key/value pair onto the instance namespace.
        vars(self).update(mapping)

    def toJSON(self):
        """Serialize this object's attributes back to pretty-printed JSON."""
        return json.dumps(self, default=lambda o: o.__dict__,
                          sort_keys=True, indent=4)
You can specify a custom object_pairs_hook (input_json is the string with your input JSON).
def mydata_hook(obj):
    """object_pairs_hook: nest every non-"Id" key of an object under "mydata".

    Args:
        obj: list of (key, value) pairs for one decoded JSON object.

    Returns:
        Objects containing an "Id" key are rewritten as
        {"Id": ..., "mydata": {<all other keys>}}; others pass through.
    """
    obj_d = dict(obj)
    if 'Id' not in obj_d:
        return obj_d
    # BUG FIX: the original filtered with `'Id' not in k`, a substring
    # test on the key string, which would also exclude unrelated keys
    # such as "IdNumber". Exact comparison is intended.
    return {'Id': obj_d['Id'],
            'mydata': {k: v for k, v in obj_d.items() if k != 'Id'}}
# Decode with the hook (applied bottom-up to every object), then pretty-print.
print(json.dumps(json.loads(input_json, object_pairs_hook=mydata_hook), indent=2))
And the output:
{
"output": [
{
"mydata": {
"array": [
{
"data": "abcd"
},
{
"data": "ef gh ij"
}
],
"purpose": "xyz text"
},
"Id": "101"
},
{
"mydata": {
"array": [
{
"data": "abcd"
},
{
"data": "java"
},
{
"data": "ef gh ij"
}
],
"purpose": "11xyz text"
},
"Id": "102"
}
]
}

custom json formatting in python

I have the following code to generate json representation of list of lists.
Levels = [['L1', 'L1', 'L2'],
          ['L1', 'L1', 'L3'],
          ['L1', 'L2'],
          ['L2', 'L2', 'L3'],
          ['L2', 'L2', 'L1'],
          ['L3', 'L2'],
          ['L4', 'L2', 'L1'],
          ['L4', 'L2', 'L4']]

def append_path(root, paths):
    """Insert one path (list of level names) into the nested-dict trie."""
    if paths:
        child = root.setdefault(paths[0], {})
        append_path(child, paths[1:])

# BUG FIX: the original snippet used `root` without ever initializing it,
# raising NameError; the trie must start empty.
root = {}
for p in Levels:
    append_path(root, p)

def convert(d):
    """Turn the trie into a list of {'name', 'children'} records.

    Leaf nodes get the sentinel [{}] children list, matching the desired
    JSON output format.
    """
    return [{'name': k, 'children': convert(v) if v else [{}]} for k, v in d.items()]

# Print results
import json
print(json.dumps(convert(root), indent=4))
Output:
[
"name": "L1",
"children": [
{
"name": "L1",
"children":[
{
"name":"L3",
"children":[{}]
},
{
"name":"L1",
"children":[{}]
}]
},
{
"name":"L2",
"children":[{}]
}
]
for the levels
Levels=[['L1','L1','L2'],
['L1','L1','L3'],
['L1','L2'],
I also need to encode the count of each level
For example, the path from L1 has two first-level children, L1(2) and L2(1), followed by L2(1) and L3(1) at the next level.
L1(3)-->L1(2)-->L2(1)
-->L3(1)
-->L2(1)
How can I encode this count in my json output.
I want my final output to look like this
"name": "L1(3)",
"children": [
{
"name": "L1(2)",
"children":[
# Build the trie, then convert it while tracking per-subtree leaf counts.
root = {}
Levels = [['L1', 'L1', 'L2'],
          ['L1', 'L1', 'L3'],
          ['L1', 'L2'],
          ['L2', 'L2', 'L3'],
          ['L2', 'L2', 'L1'],
          ['L3', 'L2'],
          ['L4', 'L2', 'L1'],
          ['L4', 'L2', 'L4']]

def append_path(root, paths):
    """Insert one path (list of level names) into the nested-dict trie."""
    if paths:
        child = root.setdefault(paths[0], {})
        append_path(child, paths[1:])

for p in Levels:
    append_path(root, p)

def convert(d):
    """Convert the trie into ([{'name': 'X(n)', 'children': ...}], leaf_count).

    Each node's name is suffixed with the number of leaf paths in its
    subtree; leaves get the sentinel [{}] children list.
    """
    if not d:
        return [{}], 1
    records, total = [], 0
    for name, sub in d.items():
        child_records, leaves = convert(sub)
        total += leaves
        # The original had a dead `else` branch here: the recursion can
        # never return a falsy record list (leaves yield [{}]), so the
        # single append below covers every case.
        records.append({'name': f"{name}({leaves})", 'children': child_records})
    return records, total

# Print results
import json
print(json.dumps(convert(root)[0], indent=2))
OUTPUT
[
{
"name": "L1(3)",
"children": [
{
"name": "L1(2)",
"children": [
{
"name": "L2(1)",
"children": [
{}
]
},
{
"name": "L3(1)",
"children": [
{}
]
}
]
},
{
"name": "L2(1)",
"children": [
{}
]
}
]
},
{
"name": "L2(2)",
"children": [
{
"name": "L2(2)",
"children": [
{
"name": "L3(1)",
"children": [
{}
]
},
{
"name": "L1(1)",
"children": [
{}
]
}
]
}
]
},
{
"name": "L3(1)",
"children": [
{
"name": "L2(1)",
"children": [
{}
]
}
]
},
{
"name": "L4(2)",
"children": [
{
"name": "L2(2)",
"children": [
{
"name": "L1(1)",
"children": [
{}
]
},
{
"name": "L4(1)",
"children": [
{}
]
}
]
}
]
}
]

Categories