nested json to csv using pandas normalize - python

With given script I am able to get output as I showed in a screenshot,
but there is a column named as cve.description.description_data which is again in json format. I want to extract that data as well.
import json
import pandas as pd
from pandas.io.json import json_normalize
#load json object
with open('nvdcve-1.0-modified.json') as f:
d = json.load(f)
#tells us parent node is 'programs'
nycphil = json_normalize(d['CVE_Items'])
nycphil.head(3)
works_data = json_normalize(data=d['CVE_Items'], record_path='cve')
works_data.head(3)
nycphil.to_csv("test4.csv")
If I change works_data = json_normalize(data=d['CVE_Items'], record_path='cve.descr') it gives this error:
"result = result[spec] KeyError: 'cve.description'"
JSON format as follows:
{
"CVE_data_type":"CVE",
"CVE_data_format":"MITRE",
"CVE_data_version":"4.0",
"CVE_data_numberOfCVEs":"1000",
"CVE_data_timestamp":"2018-04-04T00:00Z",
"CVE_Items":[
{
"cve":{
"data_type":"CVE",
"data_format":"MITRE",
"data_version":"4.0",
"CVE_data_meta":{
"ID":"CVE-2001-1594",
"ASSIGNER":"cve#mitre.org"
},
"affects":{
"vendor":{
"vendor_data":[
{
"vendor_name":"gehealthcare",
"product":{
"product_data":[
{
"product_name":"entegra_p&r",
"version":{
"version_data":[
{
"version_value":"*"
}
]
}
}
]
}
}
]
}
},
"problemtype":{
"problemtype_data":[
{
"description":[
{
"lang":"en",
"value":"CWE-255"
}
]
}
]
},
"references":{
"reference_data":[
{
"url":"http://apps.gehealthcare.com/servlet/ClientServlet/2263784.pdf?DOCCLASS=A&REQ=RAC&DIRECTION=2263784-100&FILENAME=2263784.pdf&FILEREV=5&DOCREV_ORG=5&SUBMIT=+ ACCEPT+"
},
{
"url":"http://www.forbes.com/sites/thomasbrewster/2015/07/10/vulnerable- "
},
{
"url":"https://ics-cert.us-cert.gov/advisories/ICSMA-18-037-02"
},
{
"url":"https://twitter.com/digitalbond/status/619250429751222277"
}
]
},
"description":{
"description_data":[
{
"lang":"en",
"value":"GE Healthcare eNTEGRA P&R has a password of (1) value."
}
]
}
},
"configurations":{
"CVE_data_version":"4.0",
"nodes":[
{
"operator":"OR",
"cpe":[
{
"vulnerable":true,
"cpe22Uri":"cpe:/a:gehealthcare:entegra_p%26r",
"cpe23Uri":"cpe:2.3:a:gehealthcare:entegra_p\\&r:*:*:*:*:*:*:*:*"
}
]
}
]
},
"impact":{
"baseMetricV2":{
"cvssV2":{
"version":"2.0",
"vectorString":"(AV:N/AC:L/Au:N/C:C/I:C/A:C)",
"accessVector":"NETWORK",
"accessComplexity":"LOW",
"authentication":"NONE",
"confidentialityImpact":"COMPLETE",
"integrityImpact":"COMPLETE",
"availabilityImpact":"COMPLETE",
"baseScore":10.0
},
"severity":"HIGH",
"exploitabilityScore":10.0,
"impactScore":10.0,
"obtainAllPrivilege":false,
"obtainUserPrivilege":false,
"obtainOtherPrivilege":false,
"userInteractionRequired":false
}
},
"publishedDate":"2015-08-04T14:59Z",
"lastModifiedDate":"2018-03-28T01:29Z"
}
]
}
I want to flatten all data.

Assuming the multiple URLs delineate between rows and all else meta data repeats, consider a recursive function call to extract every key-value pair in nested json object, d.
The recursive function will call global to update the needed global objects to be binded into a list of dictionaries for pd.DataFrame() call. Last loop at end updates the recursive function's dictionary, inner, to integrate the different urls (stored in multi)
import json
import pandas as pd
# load json object
with open('nvdcve-1.0-modified.json') as f:
d = json.load(f)
multi = []; inner = {}
def recursive_extract(i):
global multi, inner
if type(i) is list:
if len(i) == 1:
for k,v in i[0].items():
if type(v) in [list, dict]:
recursive_extract(v)
else:
inner[k] = v
else:
multi = i
if type(i) is dict:
for k,v in i.items():
if type(v) in [list, dict]:
recursive_extract(v)
else:
inner[k] = v
recursive_extract(d['CVE_Items'])
data_dict = []
for i in multi:
tmp = inner.copy()
tmp.update(i)
data_dict.append(tmp)
df = pd.DataFrame(data_dict)
df.to_csv('Output.csv')
Output (all columns the same except for URL, widened for emphasis)

Related

Convert Nested JSON list API data into CSV using PYTHON

Want to convert Sample JSON data into CSV file using python. I am retrieving JSON data from API.
As my JSON has nested objects, so it normally cannot be directly converted to CSV.I don't want to do any hard coding and I want to make a python code fully dynamic.
So, I have written a function that flatten my JSON Data but I am not able to work out how to iterate all records, finding relevant column names and then output those data into CSV.
In the Sample JSON file I have mentioned only 2 records but in actual there are 100 records.
Sample JSON Look like this:
[
{
"id":"Random_Company_57",
"unid":"75",
"fieldsToValues":{
"Email":"None",
"occupation":"SO1 Change",
"manager":"None",
"First Name":"Bells",
"employeeID":"21011.0",
"loginRequired":"true",
"superUser":"false",
"ldapSuperUser":"false",
"archived":"true",
"password":"None",
"externalUser":"false",
"Username":"Random_Company_57",
"affiliation":"",
"Phone":"+16 22 22 222",
"unidDominoKey":"",
"externalUserActive":"false",
"secondaryOccupation":"SO1 Change",
"retypePassword":"None",
"Last Name":"Christmas"
},
"hierarchyFieldAccess":[
],
"userHierarchies":[
{
"hierarchyField":"Company",
"value":"ABC Company"
},
{
"hierarchyField":"Department",
"value":"gfds"
},
{
"hierarchyField":"Project",
"value":"JKL-SDFGHJW"
},
{
"hierarchyField":"Division",
"value":"Silver RC"
},
{
"hierarchyField":"Site",
"value":"SQ06"
}
],
"locale":{
"id":1,
"dateFormat":"dd/MM/yyyy",
"languageTag":"en-UA"
},
"roles":[
"User"
],
"readAccessRoles":[
],
"preferredLanguage":"en-AU",
"prefName":"Christmas Bells",
"startDate":"None",
"firstName":"Bells",
"lastName":"Christmas",
"fullName":"Christmas Bells",
"lastModified":"2022-02-22T03:47:41.632Z",
"email":"None",
"docNo":"None",
"virtualSuperUser":false
},
{
"id":"xyz.abc#safe.net",
"unid":"98",
"fieldsToValues":{
"Email":"xyz.abc#safe.net",
"occupation":"SO1 Change",
"manager":"None",
"First Name":"Bells",
"employeeID":"21011.0",
"loginRequired":"false",
"superUser":"false",
"ldapSuperUser":"false",
"archived":"false",
"password":"None",
"externalUser":"false",
"Username":"xyz.abc#safe.net",
"affiliation":"",
"Phone":"+16 2222 222 222",
"unidDominoKey":"",
"externalUserActive":"false",
"secondaryOccupation":"SO1 Change",
"retypePassword":"None",
"Last Name":"Christmas"
},
"hierarchyFieldAccess":[
],
"userHierarchies":[
{
"hierarchyField":"Company",
"value":"ABC Company"
},
{
"hierarchyField":"Department",
"value":"PUHJ"
},
{
"hierarchyField":"Project",
"value":"RPOJ-SDFGHJW"
},
{
"hierarchyField":"Division",
"value":"Silver RC"
},
{
"hierarchyField":"Site",
"value":"SQ06"
}
],
"locale":{
"id":1,
"dateFormat":"dd/MM/yyyy",
"languageTag":"en-UA"
},
"roles":[
"User"
],
"readAccessRoles":[
],
"preferredLanguage":"en-AU",
"prefName":"Christmas Bells",
"startDate":"None",
"firstName":"Bells",
"lastName":"Christmas",
"fullName":"Christmas Bells",
"lastModified":"2022-03-16T05:04:13.085Z",
"email":"xyz.abc#safe.net",
"docNo":"None",
"virtualSuperUser":false
}
]
What I have tried.
def flattenjson(b, delim):
val = {}
for i in b.keys():
if isinstance(b[i], dict):
get = flattenjson(b[i], delim)
for j in get.keys():
val[i + delim + j] = get[j]
else:
val[i] = b[i]
print(val)
return val
json=[{Sample JSON String that mentioned above}]
flattenjson(json,"__")
I don't know it is a right way to deal this problem or not?
My final aim is that all the above json data will output in a csv file.
Based on this answer, you could loop through your list of json data and flatten each json with the given function (they always have the same structure?), then build a DataFrame and write the data to csv. That's the easiest way I can think of,
try this:
import pandas as pd
import json
import collections
def flatten(dictionary, parent_key=False, separator='__'):
items = []
for key, value in dictionary.items():
new_key = str(parent_key) + separator + key if parent_key else key
if isinstance(value, collections.MutableMapping):
items.extend(flatten(value, new_key, separator).items())
elif isinstance(value, list):
for k, v in enumerate(value):
items.extend(flatten({str(k): v}, new_key).items())
else:
items.append((new_key, value))
return dict(items)
with open('your_json.json') as f:
data = json.load(f) # data is a the example you provided (list of dicts)
all_records=[]
for jsn in data:
tmp = flatten(jsn)
all_records.append(tmp)
df = pd.DataFrame(all_records)
out = df.to_csv('json_to_csv.csv')

Python function to get all inner values on JSON

data_dict = {
"foo1": "bar1",
"foo2": [{"bar2": "koko"}, {"bar3": "koko2"} ],
"foo3": {
"foo4": "bar4",
"foo5": {
"foo6": "bar6",
"foo7": "bar7",
},
}
}
I need a Python function that get JSON and path of keys string like "foo2[0].bar2" and return from dict the data_dict[foo2][0][bar2] no matter if its an inner list/dict and no matter how many keys to get.
there is some external package in Python for this functionality?
You can look into jmespath-
pip install jmespath
import jmespath
data_dict = {
"foo1": "bar1",
"foo2": [{"bar2": "koko"}, {"bar3": "koko2"} ],
"foo3": {
"foo4": "bar4",
"foo5": {
"foo6": "bar6",
"foo7": "bar7",
},
}
}
print(jmespath.search('foo2[0].bar2' , data_dict))
>> koko
jmespath tutorials
Parse the path into keys and indices.
key = "foo2[0].bar2"
keys = tuple(
int(x) if x.isdigit() else x
for x in key.replace("[", ".").replace("]", "").split(".")
)
curr = data_dict
for k in keys:
curr = curr[k]
print(curr)
# koko

How to update/change both keys and values separately (not dedicated key-value pair) in a deeply nested JSON in python 3.x

I have a JSON file where I need to replace the UUID and update it with another one. I'm having trouble replacing the deeply nested keys and values.
Below is my JSON file that I need to read in python, replace the keys and values and update the file.
JSON file - myfile.json
{
"name": "Shipping box"
"company":"Detla shipping"
"description":"---"
"details" : {
"boxes":[
{
"box_name":"alpha",
"id":"a3954710-5075-4f52-8eb4-1137be51bf14"
},
{
"box_name":"beta",
"id":"31be3763-3d63-4e70-a9b6-d197b5cb6929"
} ​
​ ]
​}
"container": [
"a3954710-5075-4f52-8eb4-1137be51bf14":[],
"31be3763-3d63-4e70-a9b6-d197b5cb6929":[] ​
​]
​"data":[
{
"data_series":[],
"other":50
},
{
"data_series":[],
"other":40
},
{
"data_series":
{
"a3954710-5075-4f52-8eb4-1137be51bf14":
{
{
"dimentions":[2,10,12]
}
},
"31be3763-3d63-4e70-a9b6-d197b5cb6929":
{
{
"dimentions":[3,9,12]
}
}
},
"other":50
}
]
}
I want achieve something like the following-
"details" : {
"boxes":[
{
"box_name":"alpha"
"id":"replace_uuid"
},
}
.
.
.
​ "data":[ {
"data_series":
{
"replace_uuid":
{
{
"dimentions":[2,10,12]
}
}
]
In such a type of deeply nested dictionary, how can we replace all the occurrence of keys and values with another string, here replace_uuid?
I tried with pop() and dotty_dict but I wasn't able to replace the nested list.
I was able to achieve it in the following way-
def uuid_change(): #generate a random uuid
new_uuid = uuid.uuid4()
return str(new_uuid)
dict = json.load(f)
for uid in dict[details][boxes]:
old_id = uid['id']
replace_id = uuid_change()
uid['id'] = replace_id
for i in range(n):
for uid1 in dict['container'][i].keys()
if uid1 == old_id:
dict['container'][i][replace_id]
= dict['container'][i].pop(uid1) #replace the key
for uid2 in dict['data'][2]['data_series'].keys()
if uid2 == old_id:
dict['data'][2]['data_series'][replace_id]
= dict['data'][2]['data_series'].pop(uid2) #replace the key

How to get values of keys for changing Json

I am using python2.7
I have a json i pull that is always changing when i request it.
I need to pull out Animal_Target_DisplayName under Term7 Under Relation6 in my dict.
The problem is sometimes the object Relation6 is in another part of the Json, it could be leveled deeper or in another order.
I am trying to create code that can just export the values of the key Animal_Target_DisplayName but nothing is working. It wont even loop down the nested dict.
Now this can work if i just pull it out using something like ['view']['Term0'][0]['Relation6'] but remember the JSON is never returned in the same structure.
Code i am using to get the values of the key Animal_Target_DisplayName but it doesnt seem to loop through my dict and find all the values with that key.
array = []
for d in dict.values():
row = d['Animal_Target_DisplayName']
array.append(row)
JSON Below:
dict = {
"view":{
"Term0":[
{
"Id":"b0987b91-af12-4fe3-a56f-152ac7a4d84d",
"DisplayName":"Dog",
"FullName":"Dog",
"AssetType1":[
{
"AssetType_Id":"00000000-0000-0000-0000-000000031131",
}
]
},
{
"Id":"ee74a59d-fb74-4052-97ba-9752154f015d",
"DisplayName":"Dog2",
"FullName":"Dog",
"AssetType1":[
{
"AssetType_Id":"00000000-0000-0000-0000-000000031131",
}
]
},
{
"Id":"eb548eae-da6f-41e8-80ea-7e9984f56af6",
"DisplayName":"Dog3",
"FullName":"Dog3",
"AssetType1":[
{
"AssetType_Id":"00000000-0000-0000-0000-000000031131",
}
]
},
{
"Id":"cfac6dd4-0efa-4417-a2bf-0333204f8a42",
"DisplayName":"Animal Set",
"FullName":"Animal Set",
"AssetType1":[
{
"AssetType_Id":"00000000-0000-0000-0001-000400000001",
}
],
"StringAttribute2":[
{
"StringAttribute_00000000-0000-0000-0000-000000003114_Id":"00a701a8-be4c-4b76-a6e5-3b0a4085bcc8",
"StringAttribute_00000000-0000-0000-0000-000000003114_Value":"Desc"
}
],
"StringAttribute3":[
{
"StringAttribute_00000000-0000-0000-0000-000000000262_Id":"a81adfb4-7528-4673-8c95-953888f3b43a",
"StringAttribute_00000000-0000-0000-0000-000000000262_Value":"meow"
}
],
"BooleanAttribute4":[
{
"BooleanAttribute_00000000-0000-0000-0001-000500000001_Id":"932c5f97-c03f-4a1a-a0c5-a518f5edef5e",
"BooleanAttribute_00000000-0000-0000-0001-000500000001_Value":"true"
}
],
"SingleValueListAttribute5":[
{
"SingleValueListAttribute_00000000-0000-0000-0001-000500000031_Id":"ef51dedd-6f25-4408-99a6-5a6cfa13e198",
"SingleValueListAttribute_00000000-0000-0000-0001-000500000031_Value":"Blah"
}
],
"Relation6":[
{
"Animal_Id":"2715ca09-3ced-4b74-a418-cef4a95dddf1",
"Term7":[
{
"Animal_Target_Id":"88fd0090-4ea8-4ae6-b7f0-1b13e5cf3d74",
"Animal_Target_DisplayName":"Animaltheater",
"Animal_Target_FullName":"Animaltheater"
}
]
},
{
"Animal_Id":"6068fe78-fc8e-4542-9aee-7b4b68760dcd",
"Term7":[
{
"Animal_Target_Id":"4e87a614-2a8b-46c0-90f3-8a0cf9bda66c",
"Animal_Target_DisplayName":"Animaltitle",
"Animal_Target_FullName":"Animaltitle"
}
]
},
{
"Animal_Id":"754ec0e6-19b6-4b6b-8ba1-573393268257",
"Term7":[
{
"Animal_Target_Id":"a8986ed5-3ec8-44f3-954c-71cacb280ace",
"Animal_Target_DisplayName":"Animalcustomer",
"Animal_Target_FullName":"Animalcustomer"
}
]
},
{
"Animal_Id":"86b3ffd1-4d54-4a98-b25b-369060651bd6",
"Term7":[
{
"Animal_Target_Id":"89d02067-ebe8-4b87-9a1f-a6a0bdd40ec4",
"Animal_Target_DisplayName":"Animalfact_transaction",
"Animal_Target_FullName":"Animalfact_transaction"
}
]
},
{
"Animal_Id":"ea2e1b76-f8bc-46d9-8ebc-44ffdd60f213",
"Term7":[
{
"Animal_Target_Id":"e398cd32-1e73-46bd-8b8f-d039986d6de0",
"Animal_Target_DisplayName":"Animalfact_transaction",
"Animal_Target_FullName":"Animalfact_transaction"
}
]
}
],
"Relation10":[
{
"TargetRelation_b8b178ff-e957-47db-a4e7-6e5b789d6f03_Id":"aff80bd0-a282-4cf5-bdcc-2bad35ddec1d",
"Term11":[
{
"AnimalId":"3ac22167-eb91-469a-9d94-315aa301f55a",
"AnimalDisplayName":"Animal",
"AnimalFullName":"Animal"
}
]
}
],
"Tag12":[
{
"Tag_Id":"75968ea6-4c9f-43c9-80f7-dfc41b24ec8f",
"Tag_Name":"AnimalAnimaltitle"
},
{
"Tag_Id":"b1adbc00-aeef-415b-82b6-a3159145c60d",
"Tag_Name":"Animal2"
},
{
"Tag_Id":"5f78e4dc-2b37-41e0-a0d3-cec773af2397",
"Tag_Name":"AnimalDisplayName"
}
]
}
]
}
}
The output i am trying to get is a list of all the values from key Animal_Target_DisplayName like this ['Animaltheater','Animaltitle', 'Animalcustomer', 'Animalfact_transaction', 'Animalfact_transaction'] but we need to remember the nested structure of this json always changes but the keys for it are always the same.
I guess your only option is running through the entire dict and get the values of Animal_Target_DisplayName key, I propose the following recursive solution:
def run_json(dict_):
animal_target_sons = []
if type(dict_) is list:
for element in dict_:
animal_target_sons.append(run_json(element))
elif type(dict_) is dict:
for key in dict_:
if key=="Animal_Target_DisplayName":
animal_target_sons.append([dict_[key]])
else:
animal_target_sons.append(run_json(dict_[key]))
return [x for sublist in animal_target_sons for x in sublist]
run_json(dict_)
Then calling run_json returns a list with what you want. By the way, I recommend you to rename your json from dict to, for example dict_, since dict is a reserved word of Python for the dictionary type.
Since you're getting JSON, why not make use of the json module? That will do the parsing for you and allow you to use dictionary functions+features to get the information you need.
#!/usr/bin/python2.7
from __future__ import print_function
import json
# _somehow_ get your JSON in as a string. I'm calling it "jstr" for this
# example.
# Use the module to parse it
jdict = json.loads(jstr)
# our dict has keys...
# view -> Term0 -> keys-we're-interested-in
templist = jdict["view"]["Term0"]
results = {}
for _el in range(len(templist)):
if templist[_el]["FullName"] == "Animal Set":
# this is the one we're interested in - and it's another list
moretemp = templist[_el]["Relation6"]
for _k in range(len(moretemp)):
term7 = moretemp[_k]["Term7"][0]
displayName = term7["Animal_Target_DisplayName"]
fullName = term7["Animal_Target_FullName"]
results[fullName] = displayName
print("{0}".format(results))
Then you can dump the results dict plain, or with pretty-printing:
>>> print(json.dumps(results, indent=4))
{
"Animaltitle2": "Animaltitle2",
"Animalcustomer3": "Animalcustomer3",
"Animalfact_transaction4": "Animalfact_transaction4",
"Animaltheater1": "Animaltheater1"
}

How to group dictionary elements into lists dynamically?

I have JSON file as mentioned below,
**test.json**
{
"header1" :
{
"header1_body1":
{
"some_key":"some_value",
.......................
},
"header1_body2":
{
"some_key":"some_value",
.......................
}
},
"header2":
{
"header2_body1":
{
"some_key":"some_value",
.......................
},
"header2_body2":
{
"some_key":"some_value",
.......................
}
}
}
Would like to group the JSON content into lists as below:
header1 = ['header1_body1','header1_body2']
header2 = ['header2_body1','header2_body2']
header1, header2 can be till ....header n. So dynamically lists has to be created containing it's values as shown above.
How can i achieve this ?
What's the best optimal way to approach ?
SOLUTION:
with open('test.json') as json_data:
d = json.load(json_data)
for k,v in d.iteritems():
if k == "header1" or k == "header2":
globals()['{}'.format(k)] = d[k].keys()
now, header1 and header2 can be accessed as list.
for i in header1:
print i
Assuming you read the JSON into a variable d (maybe using json.loads), you could iterate over the keys (sorted?) and build the lists with the keys of current value:
for key in sorted(d.keys()):
l = [x for x in sorted(d[key].keys())] # using list comprehension
print(key + ' = ' + str(l))
Fixing your json structure:
{
"header1" :
{
"header1_body1":
{
"some_key":"some_value"
},
"header1_body2":
{
"some_key":"some_value"
}
},
"header2":
{
"header2_body1":
{
"some_key":"some_value"
},
"header2_body2":
{
"some_key":"some_value"
}
}
}
And then loading and creating lists:
header = []
for key, value in dictdump.items():
header.append(list(value.keys()))
for header_num in range(0, len(header)):
print("header{} : {}".format(header_num + 1, header[header_num]))
Gives:
header1 : ['header1_body1', 'header1_body2']
header2 : ['header2_body1', 'header2_body2']
Once you load your json, you can get the list you want for any key by doing something like the following (headers variable below is a placeholder for your loaded json). You don't need to convert it to a list to work with it as an iterable but wrapped it in list(...) to match the output in your question.
list(headers['header1'].keys())
If you need to actually store the list of keys for each of your "header" dicts in some sort of accessible format, then you could create another dictionary that contains the lists you want. For example:
import json
data = """{
"header1" : {
"header1_body1": {
"some_key":"some_value"
},
"header1_body2": {
"some_key":"some_value"
}
},
"header2": {
"header2_body1": {
"some_key":"some_value"
},
"header2_body2": {
"some_key":"some_value"
}
}
}"""
headers = json.loads(data)
# get the list of keys for a specific header
header = list(headers['header1'].keys())
print(header)
# ['header1_body1', 'header1_body2']
# if you really want to store them in another dict
results = {h[0]: list(h[1].keys()) for h in headers.items()}
print(results)
# OUTPUT
# {'header1': ['header1_body1', 'header1_body2'], 'header2': ['header2_body1', 'header2_body2']}
You can use recursion:
d = {'header1': {'header1_body1': {'some_key': 'some_value'}, 'header1_body2': {'some_key': 'some_value'}}, 'header2': {'header2_body1': {'some_key': 'some_value'}, 'header2_body2': {'some_key': 'some_value'}}}
def flatten(_d):
for a, b in _d.items():
yield a
if isinstance(b, dict):
yield from flatten(b)
new_results = {a:[i for i in flatten(b) if i.startswith(a)] for a, b in d.items()}
Output:
{'header1': ['header1_body1', 'header1_body2'], 'header2': ['header2_body1', 'header2_body2']}
import json
with open('test.json') as json_data:
d = json.load(json_data)
for k,v in d.iteritems():
if k == "header1" or k == "header2":
globals()['{}'.format(k)] = d[k].keys()
now, `header1` and `header2` can be accessed as list.
for i in header1:
print i

Categories