Generate DataFrame from Nested Dictionary

Generate DataFrame from Nested Dictionary - python

I have a nested dictionary like below:
[
{
"name": "A",
"flag": "folder",
"children": [
{
"name": "A1",
"flag": "folder",
"children": [
{
"name": "A1x",
"flag": "file",
"children": []
},
{
"name": "A1y",
"flag": "file",
"children": []
}
]
}
]
}
]
From this dict, I would like to generate a dataframe as below:
Is there any nice way to make this?

With the following nested dictionary which expands on yours for demonstration purposes:
data = [
{
"name": "A",
"flag": "folder",
"children": [
{
"name": "A1",
"flag": "folder",
"children": [
{
"name": "A1x",
"flag": "file",
"children": [{"name": "A1xx", "flag": "file", "children": []}],
},
{
"name": "A1y",
"flag": "file",
"children": [{"name": "A1yy", "flag": "file", "children": []}],
},
],
},
{
"name": "A2",
"flag": "folder",
"children": [
{
"name": "A2x",
"flag": "file",
"children": [{"name": "A2xx", "flag": "file", "children": []}],
},
{
"name": "A2y",
"flag": "file",
"children": [{"name": "A2yy", "flag": "file", "children": []}],
},
],
},
],
},
{
"name": "B",
"flag": "folder",
"children": [
{
"name": "B1",
"flag": "folder",
"children": [
{
"name": "B1x",
"flag": "file",
"children": [{"name": "B1xx", "flag": "file", "children": []}],
},
{
"name": "B1y",
"flag": "file",
"children": [{"name": "B1yy", "flag": "file", "children": []}],
},
],
},
{
"name": "B2",
"flag": "folder",
"children": [
{
"name": "B2x",
"flag": "file",
"children": [{"name": "B2xx", "flag": "file", "children": []}],
},
{
"name": "B2y",
"flag": "file",
"children": [{"name": "B2yy", "flag": "file", "children": []}],
},
],
},
],
},
]
Here is one way to do it by defining two short helper functions:
import pandas as pd
def traverse(data, new_data=None):
    """Flatten one tree node and its descendants into runs of name/flag values.

    The tree is walked depth-first. When a node is entered, its "name" and
    "flag" are appended to the last list of the container; when a node is
    left, a new empty list is appended to the container. A chain of
    first children therefore ends up in a single list, and the empty lists
    that stay empty record how many levels were closed before the next
    node was entered.

    Args:
        data: dict with "name", "flag" and "children" keys, where
            "children" is a list of dicts of the same shape.
        new_data: container (list of lists) shared across the recursion.
            Defaults to None; None or an empty container is replaced by
            ``[[]]``.

    Returns:
        The container, e.g. ``[["A", "folder", "A1", "file"], []]``.
    """
    if not new_data:
        new_data = [[]]
    new_data[-1].append(data["name"])
    new_data[-1].append(data["flag"])
    for child in data["children"]:
        traverse(child, new_data)
    # Closing marker for this node: the next node entered writes into it.
    new_data.append([])
    return new_data
def make_rows(flat_data):
    """Rebuild one full row per branch from the flat traversal output.

    ``flat_data`` is a list of lists: each non-empty list is a run of
    alternating name/flag values for a chain of nested nodes, and each
    empty list marks one additional level that was closed before the next
    run starts. The depth at which a run starts is derived from those
    markers, and the missing ancestor columns are copied from the
    previous row, so the result does not depend on a particular tree
    depth or on every node having a single child.

    Args:
        flat_data: list of lists as described above.

    Returns:
        Dataframe with one row per non-empty run, two columns (name, flag)
        per nesting level and consecutive row numbers. Rows shallower
        than the deepest row are padded with missing values.
    """
    rows = []
    last_depth = -1  # depth of the last node written to a row
    closed = 0  # empty markers seen since the last non-empty run
    for item in flat_data:
        if not item:
            closed += 1
            continue
        # Each marker closes one more level above the last node; with no
        # marker the run starts at the same depth as the last node.
        start = max(last_depth - closed, 0)
        prefix = rows[-1][: 2 * start] if rows else []
        rows.append(prefix + list(item))
        last_depth = start + len(item) // 2 - 1
        closed = 0
    return pd.DataFrame(rows)
And then:
# Build one frame per top-level entry, stack them and renumber the rows.
frames = [make_rows(traverse(entry)) for entry in data]
df = pd.concat(frames).reset_index(drop=True)

# Columns come in (name, flag) pairs, one pair per nesting level.
level_labels = [f"Level {depth}" for depth in range(df.shape[1] // 2)]
df.columns = pd.MultiIndex.from_product([level_labels, ["name", "flag"]])
print(df)
# Output
Level 0 Level 1 Level 2 Level 3
name flag name flag name flag name flag
0 A folder A1 folder A1x file A1xx file
1 A folder A1 folder A1y file A1yy file
2 A folder A2 folder A2x file A2xx file
3 A folder A2 folder A2y file A2yy file
4 B folder B1 folder B1x file B1xx file
5 B folder B1 folder B1y file B1yy file
6 B folder B2 folder B2x file B2xx file
7 B folder B2 folder B2y file B2yy file

Related

Using pandas to convert csv into nested json with dynamic structure

I am new to Python and want to convert a CSV file into a JSON file. The JSON is nested with a dynamic structure, which is defined by the CSV header.
From csv input:
ID, Name, person_id/id_type, person_id/id_value,person_id_expiry_date,additional_info/0/name,additional_info/0/value,additional_info/1/name,additional_info/1/value,salary_info/details/0/grade,salary_info/details/0/payment,salary_info/details/0/amount,salary_info/details/1/next_promotion
1,Peter,PASSPORT,A452817,1-01-2055,Age,19,Gender,M,Manager,Monthly,8956.23,unknown
2,Jane,PASSPORT,B859804,2-01-2035,Age,38,Gender,F,Worker, Monthly,125980.1,unknown
To json output:
[
{
"ID": 1,
"Name": "Peter",
"person_id": {
"id_type": "PASSPORT",
"id_value": "A452817"
},
"person_id_expiry_date": "1-01-2055",
"additional_info": [
{
"name": "Age",
"value": 19
},
{
"name": "Gender",
"value": "M"
}
],
"salary_info": {
"details": [
{
"grade": "Manager",
"payment": "Monthly",
"amount": 8956.23
},
{
"next_promotion": "unknown"
}
]
}
},
{
"ID": 2,
"Name": "Jane",
"person_id": {
"id_type": "PASSPORT",
"id_value": "B859804"
},
"person_id_expiry_date": "2-01-2035",
"additional_info": [
{
"name": "Age",
"value": 38
},
{
"name": "Gender",
"value": "F"
}
],
"salary_info": {
"details": [
{
"grade": "Worker",
"payment": " Monthly",
"amount": 125980.1
},
{
"next_promotion": "unknown"
}
]
}
}
]
Can this be done with the existing pandas API, or do I have to write a lot of complex code to construct the JSON object dynamically? Thanks.

How to browse and get only json position 0 in python [duplicate]

This question already has answers here:
How to extract data from dictionary in the list
(3 answers)
Closed 11 months ago.
I have the following json output.
"detections": [
{
"source": "detection",
"uuid": "50594028",
"detectionTime": "2022-03-27T06:50:56Z",
"ingestionTime": "2022-03-27T07:04:50Z",
"filters": [
{
"id": "F2058",
"unique_id": "3638f7c0",
"level": "critical",
"name": "Possible Right-To-Left Override Attack",
"description": "Possible Right-To-Left Override Detected in the Filename",
"tactics": [
"TA0005"
],
"techniques": [
"T1036.002"
],
"highlightedObjects": [
{
"field": "fileName",
"type": "filename",
"value": [
"1465940311.,S=473394(NONAMEFL(Z00057-PI‮fdp.exe))"
]
},
{
"field": "filePathName",
"type": "fullpath",
"value": "/exports/10_19/mail/12/91/20193/new/1465940311.,S=473394(NONAMEFL(Z00057-PI‮fdp.exe))"
},
{
"field": "malName",
"type": "detection_name",
"value": "HEUR_RLOTRICK.A"
},
{
"field": "actResult",
"type": "text",
"value": [
"Passed"
]
},
{
"field": "scanType",
"type": "text",
"value": "REALTIME"
}
]
},
{
"id": "F2140",
"unique_id": "5a313874",
"level": "medium",
"name": "Malicious Software",
"description": "A malicious software was detected on an endpoint.",
"tactics": [],
"techniques": [],
"highlightedObjects": [
{
"field": "fileName",
"type": "filename",
"value": [
"1465940311.,S=473394(NONAMEFL(Z00057-PI‮fdp.exe))"
]
},
{
"field": "filePathName",
"type": "fullpath",
"value": "/exports/10_19/mail/12/91/rs001291-excluido-20193/new/1465940311.,S=473394(NONAMEFL(Z00057-PI‮fdp.exe))"
},
{
"field": "malName",
"type": "detection_name",
"value": "HEUR_RLOTRICK.A"
},
{
"field": "actResult",
"type": "text",
"value": [
"Passed"
]
},
{
"field": "scanType",
"type": "text",
"value": "REALTIME"
},
{
"field": "endpointIp",
"type": "ip",
"value": [
"xxx.xxx.xxx"
]
}
]
}
],
"entityType": "endpoint",
"entityName": "xxx(xxx.xxx.xxx)",
"endpoint": {
"name": "xxx",
"guid": "d1dd7e61",
"ips": [
"2xx.xxx.xxx"
]
}
}
Inside 'filters' there are two entries, one with level 'critical' and one with level 'medium', and both have a 'name' field.
I want to print only the first name, but when I print the 'name', it returns both names:
How do I print only the first one?
If I put print in for filters, it returns both names:
If I put print in for detections, it only returns the second 'name' and that's not what I want:

If you only want to print the name of the first filter, why iterate over it, just index it and print the value under "name":
# Index the first filter directly instead of looping over all filters.
for d in r['detections']:
    print(d['filters'][0]['name'])

Python - How can I convert S3 folders into JSON hierarchy?

I have created a list using boto3 that contains all the subfolders in my S3 bucket. The list is sorted as below:
s3_list = ['a', 'a/a1/a11', 'b', 'b/b1', 'b/b2', 'b/b2/b22']
I'm trying to convert this list into JSON hierarchy structure:
{
"root": [
{
"name": "a",
"path": "a",
"child": [
{
"name": "a1",
"path": "a/a1",
"child": [
{
"name": "a11",
"path": "a/a1/a11"
}
]
}
]
},
{
"name": "b",
"path": "b",
"child": [
{
"name": "b1",
"path": "b/b1"
},
{
"name": "b2",
"path": "b/b2",
"child": [
{
"name": "b22",
"path": "b/b2/b22"
}
]
}
]
}
]
}
Whats the best way/libraries to get this going in Python?

You can use recursion with collections.defaultdict:
from collections import defaultdict
s3_list = ['a', 'a/a1/a11', 'b', 'b/b1', 'b/b2', 'b/b2/b22']
def to_dict(d, c=None):
    """Build a nested list of nodes from paths split into components.

    Paths are grouped by their first component; the remaining components
    of each group are processed recursively to form that node's children.

    Args:
        d: list of paths, each a non-empty list of components
            (e.g. ``"a/a1/a11".split("/")``).
        c: components of the parent path, used to build each node's
            "path" value. Defaults to None, meaning the top level.

    Returns:
        A list of dicts with "name" and "path" keys, plus a "children"
        key (a list of the same shape) only for nodes that have
        descendants. Empty input gives an empty list.
    """
    if not d:
        return []
    parent = [] if c is None else list(c)
    grouped = defaultdict(list)
    for head, *tail in d:
        grouped[head].append(tail)
    nodes = []
    for name, tails in grouped.items():
        here = parent + [name]
        node = {'name': name, 'path': '/'.join(here)}
        # An empty tail is the entry for the node itself, not a descendant.
        deeper = [tail for tail in tails if tail]
        if deeper:
            node['children'] = to_dict(deeper, here)
        nodes.append(node)
    return nodes
# Split every key on "/" so to_dict receives lists of path components.
result = {'root':to_dict([i.split('/') for i in s3_list])}
import json
# Pretty-print the resulting hierarchy as JSON.
print(json.dumps(result, indent=4))
Output:
{
"root": [
{
"name": "a",
"path": "a",
"children": [
{
"name": "a1",
"path": "a/a1",
"children": [
{
"name": "a11",
"path": "a/a1/a11"
}
]
}
]
},
{
"name": "b",
"path": "b",
"children": [
{
"name": "b1",
"path": "b/b1"
},
{
"name": "b2",
"path": "b/b2",
"children": [
{
"name": "b22",
"path": "b/b2/b22"
}
]
}
]
}
]
}

Create process tree from json file in python

I need some assistance on a somewhat simple issue.
I'm trying to convert the content of a json file from this:
{ "Timestamp": "Timestamp",
"name": "SVCHOST.EXE",
"icon": "binary_icon.png",
"Process": SVCHOST.EXE,
"Pid": "876",
"PPID": "500"],
"children": [Process details])
},
{ "Timestamp":"Timestamp",
"name": "LSAS.EXE",
"icon": "binary_icon.png",
"Process": "LSAS.EXE",
"Pid": "500",
"PPID": "4"],
"children": [Process details])
},
{ "Timestamp":"Timestamp",
"name": "SYSTEM",
"icon": "binary_icon.png",
"Process": "SYSTEM",
"Pid": "4",
"PPID": "0"],
"children": [Process details])
}
To this:
{
"name": "Root",
"children": [
{
"name": "4",
"children": [
{
"name": "500",
"children": [
{
"name": "876",
"children": []
}
]
}
]
}
}
To create a node tree graph in the end.
But after a lot of trial and error, I'm still not close to the output I need. I'm asking for some pointers, tips or tricks.
Any help is much appreciated.
Thanks,
Here is my most recent attempt.
import json
links = ({
"Timestamp": "Timestamp",
"name": "SVCHOST.EXE",
"icon": "binary_icon.png",
"Process": "SVCHOST.EXE",
"Pid": "876",
"PPID": "500",
"children": "Process_details"
},
{
"Timestamp":"Timestamp",
"name": "LSAS.EXE",
"icon": "binary_icon.png",
"Process": "LSAS.EXE",
"Pid": "500",
"PPID": "4",
"children": "Process_details"
},
{
"Timestamp":"Timestamp",
"name": "SYSTEM",
"icon": "binary_icon.png",
"Process": "SYSTEM",
"Pid": "4",
"PPID": "0",
"children": "Process_details"
})
# Attempt from the question, logic unchanged: the lookup table is read by
# Pid, but it stores a node named after the PPID under the Pid key and a
# node named after the Pid under the PPID key.
parent_proc_node = {}
root = {'name': 'Root', 'children': []}
for item in links:
    parent_node = parent_proc_node.get(item['Pid'])
    if not parent_node:
        # Nothing stored under this Pid yet: create a node named after
        # the PPID and hang it directly under the root.
        parent_proc_node[item['Pid']] = parent_node = {'name': item['PPID']}
        root['children'].append(parent_node)
    parent_proc_node[item['PPID']] = child_node = {'name': item['Pid']}
    parent_node.setdefault('children', []).append(child_node)
print(json.dumps(root, indent=4))
Current output:
{
"name": "Root",
"children": [
{
"name": "500",
"children": [
{
"name": "876",
"children": [
{
"name": "500",
"children": [
{
"name": "4"
}
]
}
]
}
]
}
}
The output is not what I want: I'm still not able to correctly match parent processes with their children.
What am I doing wrong?
The correct output would be like this:
{
"name": "Root",
"children": [
{
"name": "4",
"children": [
{
"name": "500",
"children": [
{
"name": "876",
"children": [
{
"name": ""
}
]
}
]
}
]
}
}

Here's some code that does what I think you want. It processes the links (which I turned into a list, since JSON doesn't have tuples), converting it into the nested structure that you show as the final correct output. I've also added a couple of new records so that some parents have multiple children.
The trick is to first create a dictionary (ids) that captures the parent-child relationship of the process IDs.
import json
links = [
{
"Timestamp": "Timestamp",
"name": "SVCHOST.EXE",
"icon": "binary_icon.png",
"Process": "SVCHOST.EXE",
"Pid": "876",
"PPID": "500",
"children": "Process_details"
},
{
"Timestamp": "Timestamp",
"name": "LSAS.EXE",
"icon": "binary_icon.png",
"Process": "LSAS.EXE",
"Pid": "500",
"PPID": "4",
"children": "Process_details"
},
{
"Timestamp": "Timestamp",
"name": "LSAS.EXE",
"icon": "binary_icon.png",
"Process": "LSAS.EXE",
"Pid": "510",
"PPID": "4",
"children": "Process_details"
},
{
"Timestamp": "Timestamp",
"name": "LSAS.EXE",
"icon": "binary_icon.png",
"Process": "LSAS.EXE",
"Pid": "600",
"PPID": "510",
"children": "Process_details"
},
{
"Timestamp": "Timestamp",
"name": "SYSTEM",
"icon": "binary_icon.png",
"Process": "SYSTEM",
"Pid": "4",
"PPID": "0",
"children": "Process_details"
}
]
# Create a dict mapping each parent pid to the list of its child pids
ids = {}
for d in links:
    # Use "0" as the ppid if "PPID" field is an empty string
    ppid, pid = d["PPID"] or "0", d["Pid"]
    ids.setdefault(ppid, []).append(pid)
print(ids)
# Nest the data for each pid in its parent's dict
def insert(lst, ppid, name):
    """Append a node called ``name`` to ``lst`` and fill in its subtree.

    Args:
        lst: list that receives the new node dict.
        ppid: pid whose children are looked up in the module-level ``ids``.
        name: value stored under the node's "name" key.

    A pid with children in ``ids`` gets one nested node per child; a pid
    without children gets a single placeholder child ``{"name": ""}``.
    """
    if ppid in ids:
        children = []
        lst.append({"name": name, "children": children})
        for pid in ids[ppid]:
            insert(children, pid, pid)
    else:
        lst.append({"name": name, "children": [{"name": ""}]})
# Start at ppid "0" and label the top node "Root".
nested = []
insert(nested, "0", "Root")
# The call above appended a single top node to nested; dump that node.
print(json.dumps(nested[0], indent=4))
output
{'500': ['876'], '4': ['500', '510'], '510': ['600'], '0': ['4']}
{
"name": "Root",
"children": [
{
"name": "4",
"children": [
{
"name": "500",
"children": [
{
"name": "876",
"children": [
{
"name": ""
}
]
}
]
},
{
"name": "510",
"children": [
{
"name": "600",
"children": [
{
"name": ""
}
]
}
]
}
]
}
]
}

@PM 2Ring Sorry, please disregard the PPID 0 remarks; I was missing exception handling at my end. :)
Your example works perfectly for parents with child PIDs. However, if a PID has no parent, it is not added to the root node.
procs = [{
"Timestamp": "Timestamp",
"name": "SVCHOST.EXE",
"icon": "binary_icon.png",
"Process": "SVCHOST.EXE",
"Pid": "876",
"PPID": "500",
"children": "Process_details"
},
{
"Timestamp":"Timestamp",
"name": "LSAS.EXE",
"icon": "binary_icon.png",
"Process": "LSAS.EXE",
"Pid": "500",
"PPID": "4",
"children": "Process_details"
},
{
"Timestamp":"Timestamp",
"name": "SYSTEM",
"icon": "binary_icon.png",
"Process": "SYSTEM",
"Pid": "4",
"PPID": "0",
"children": "Process_details"
}
,
{
"Timestamp":"Timestamp",
"name": "ROUGEPROC",
"icon": "binary_icon.png",
"Process": "ROUGEPROC",
"Pid": "4322",
"PPID": "",
"children": "Process_details"
}]
# Create a dict mapping each parent pid to the list of its child pids
ids = {}
for d in procs:
    # An empty "PPID" is kept as the key "" here (no fallback to "0")
    ppid, pid = d["PPID"], d["Pid"]
    ids.setdefault(ppid, []).append(pid)
print(ids)
# Nest the data for each pid in its parent's dict
def insert(lst, ppid, name):
    """Append a node called ``name`` to ``lst`` and fill in its subtree.

    Args:
        lst: list that receives the new node dict.
        ppid: pid whose children are looked up in the module-level ``ids``.
        name: value stored under the node's "name" key.

    Every node carries the same "icon" value; a pid without children in
    ``ids`` gets an empty "children" list.
    """
    children = []
    lst.append({"name": name, "children": children, "icon": "binary_icon.png"})
    for pid in ids.get(ppid, ()):
        insert(children, pid, pid)
# Collect the tree reachable from ppid "0"; "PPID" is only the name of the
# temporary top node, which is discarded below.
nested = []
insert(nested, "0", "PPID")
# Re-wrap the top node's children under a host-level node. Pids grouped
# under keys of ids that are not reachable from "0" are not included.
proc_report = {"name" :"HOSTNAME",
"icon": "win_os_icon.png",
"children": nested[0]["children"]
}
print(json.dumps(proc_report, indent=4))
Output:
{'': ['4322'], '0': ['4'], '4': ['500'], '500': ['876']}
{
"children": [
{
"icon": "binary_icon.png",
"name": "4",
"children": [
{
"icon": "binary_icon.png",
"name": "500",
"children": [
{
"icon": "binary_icon.png",
"name": "876",
"children": []
}
]
}
]
}
],
"name": "HOSTNAME",
"icon": "win_os_icon.png"
}

Filesystem tree to json

I'm trying to convert a filesystem tree to json using python.
Imagine that i have the following tree :
plans/
|-- p1/
| |-- p1_1.pdf
| |-- p1_2.pdf
| `-- test/
| `-- test.jpg
|-- p2/
| |-- p2_1.pdf
| |-- p2_2.pdf
| `-- test2/
|
`-- resume.pdf
I would like to have a json output like that :
[
{
"name": "p1",
"type": "folder",
"path": "/plans/p1",
"tag": "org",
"children": [
{
"name": "p1_1.pdf",
"type": "file",
"path": "/plans/p1/p1_1.pdf",
"tag": "org"
},
{
"name": "p1_2.pdf",
"type": "file",
"path": "/plans/p1/p1_2.pdf",
"tag": "org"
},
{
"name": "test",
"type": "folder",
"path": "/plans/p1/test",
"tag": "org",
"children": [
{
"name": "test.jpg",
"type": "file",
"path": "/plans/p1/test/test.jpg",
"tag": "org"
}
]
}
]
},
{
"name": "p2",
"type": "folder",
"path": "/plans/p2",
"tag": "org",
"children": [
{
"name": "p2_1.pdf",
"type": "file",
"path": "/plans/p2/p2_1.pdf",
"tag": "org"
},
{
"name": "p2_2.pdf",
"type": "file",
"path": "/plans/p2/p2_2.pdf",
"tag": "org"
},
{
"name": "test2",
"type": "folder",
"path": "/plans/p2/test2",
"tag": "org",
"children": [
]
}
]
},
{
"name": "resume.pdf",
"type": "file",
"path": "/plans/resume.pdf",
"tag": "org"
}
]
I'm currently using Python's os.walk() function to go through the tree and create lists of dicts, so that I get a "dumpable" list for json.dumps(), but I don't know how to do it recursively.
Here a quick code draft:
def tree_to_json(rootdir):
    """Dump every folder and file found under ``rootdir`` as a flat JSON list.

    This is the draft from the question: entries from all levels are
    appended to one list in os.walk() order, and each folder's "children"
    list is left empty (nesting is not implemented here).

    Args:
        rootdir: directory to walk.

    Returns:
        JSON text with sorted keys and a 2-space indent.
    """
    main = []
    for path, dirs, files in os.walk(rootdir):
        for curdir in dirs:
            main.append({"name": curdir,
                         "type": "folder",
                         "path": os.path.join(path, curdir),
                         "children": []})
        for curfile in files:
            main.append({"name": curfile,
                         "type": "file",
                         "path": os.path.join(path, curfile)})
    return json.dumps(main, sort_keys=True, indent=2, separators=(',', ': '))

As with anything in programming, there are many ways to solve this. Here is one solution:
import json
from os import walk, path
def file_to_dict(fpath):
    """Return the JSON-ready description of the file at ``fpath``.

    The "name" is the last component of ``fpath``; "path" is ``fpath``
    unchanged.
    """
    return {
        'name': path.basename(fpath),
        'type': 'file',
        'path': fpath,
        'tag': 'org',
    }
def folder_to_dict(rootpath):
    """Return the JSON-ready description of the folder at ``rootpath``.

    The "name" is the last component of ``rootpath``; "children" starts
    as a new empty list on every call.
    """
    return {
        'name': path.basename(rootpath),
        'type': 'folder',
        'path': rootpath,
        'tag': 'org',
        'children': [],
    }
def tree_to_dict(rootpath):
    """Describe the folder at ``rootpath`` and everything below it.

    Args:
        rootpath: path of the folder to describe.

    Returns:
        The folder's dict, whose "children" list holds its files first
        and then its sub-folders, each sub-folder described recursively.
    """
    root_dict = folder_to_dict(rootpath)
    # Only the first walk() entry (the folder itself) is needed; deeper
    # levels are handled by the recursive calls. The next() builtin
    # replaces the Python 2-only generator method .next().
    root, folders, files = next(walk(rootpath))
    root_dict['children'] = [file_to_dict(path.join(root, fpath)) for fpath in files]
    root_dict['children'] += [tree_to_dict(path.join(root, folder)) for folder in folders]
    return root_dict
def tree_to_json(rootdir, pretty_print=True):
    """Serialize the contents of ``rootdir`` as a JSON list.

    Args:
        rootdir: directory whose contents are listed; the directory
            itself does not get an entry.
        pretty_print: indent the output by 4 spaces when true, otherwise
            use the default compact layout.

    Returns:
        JSON text listing the sub-folders (described recursively) followed
        by the files directly inside ``rootdir``.
    """
    # The next() builtin replaces the Python 2-only generator method .next().
    root, folders, files = next(walk(rootdir))
    root_dict = [tree_to_dict(path.join(root, folder)) for folder in folders]
    root_dict += [file_to_dict(path.join(root, fpath)) for fpath in files]
    # json.dumps() has no ``encoding`` argument on Python 3.
    return json.dumps(root_dict, indent=4 if pretty_print else None)
print(tree_to_json('/tmp/tree'))
And here is the output:
[
{
"path": "/tmp/tree/p1",
"tag": "org",
"type": "folder",
"name": "p1",
"children": [
{
"path": "/tmp/tree/p1/p1_1.pdf",
"tag": "org",
"type": "file",
"name": "p1_1.pdf"
},
{
"path": "/tmp/tree/p1/p1_2.pdf",
"tag": "org",
"type": "file",
"name": "p1_2.pdf"
},
{
"path": "/tmp/tree/p1/test",
"tag": "org",
"type": "folder",
"name": "test",
"children": [
{
"path": "/tmp/tree/p1/test/test.jpg",
"tag": "org",
"type": "file",
"name": "test.jpg"
}
]
}
]
},
{
"path": "/tmp/tree/p2",
"tag": "org",
"type": "folder",
"name": "p2",
"children": [
{
"path": "/tmp/tree/p2/p2_1.pdf",
"tag": "org",
"type": "file",
"name": "p2_1.pdf"
},
{
"path": "/tmp/tree/p2/p2_2.pdf",
"tag": "org",
"type": "file",
"name": "p2_2.pdf"
},
{
"path": "/tmp/tree/p2/test2",
"tag": "org",
"type": "folder",
"name": "test2",
"children": []
}
]
},
{
"path": "/tmp/tree/resume.pdf",
"tag": "org",
"type": "file",
"name": "resume.pdf"
}
]

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Generate DataFrame from Nested Dictionary - python

Related

Using pandas to convert csv into nested json with dynamic strucutre

How to browse and get only json position 0 in python [duplicate]

Python - How can I convert S3 folders into JSON hierarchy?

Create process tree form json file. in python

Filesytem tree to json

Categories

Resources