I want to convert sample JSON data into a CSV file using Python. I am retrieving the JSON data from an API.
As my JSON has nested objects, it normally cannot be converted to CSV directly. I don't want to do any hard-coding; I want the Python code to be fully dynamic.
So I have written a function that flattens my JSON data, but I am not able to work out how to iterate over all records, find the relevant column names, and then output the data to CSV.
In the sample JSON I have included only 2 records, but in reality there are 100 records.
Sample JSON Look like this:
[
{
"id":"Random_Company_57",
"unid":"75",
"fieldsToValues":{
"Email":"None",
"occupation":"SO1 Change",
"manager":"None",
"First Name":"Bells",
"employeeID":"21011.0",
"loginRequired":"true",
"superUser":"false",
"ldapSuperUser":"false",
"archived":"true",
"password":"None",
"externalUser":"false",
"Username":"Random_Company_57",
"affiliation":"",
"Phone":"+16 22 22 222",
"unidDominoKey":"",
"externalUserActive":"false",
"secondaryOccupation":"SO1 Change",
"retypePassword":"None",
"Last Name":"Christmas"
},
"hierarchyFieldAccess":[
],
"userHierarchies":[
{
"hierarchyField":"Company",
"value":"ABC Company"
},
{
"hierarchyField":"Department",
"value":"gfds"
},
{
"hierarchyField":"Project",
"value":"JKL-SDFGHJW"
},
{
"hierarchyField":"Division",
"value":"Silver RC"
},
{
"hierarchyField":"Site",
"value":"SQ06"
}
],
"locale":{
"id":1,
"dateFormat":"dd/MM/yyyy",
"languageTag":"en-UA"
},
"roles":[
"User"
],
"readAccessRoles":[
],
"preferredLanguage":"en-AU",
"prefName":"Christmas Bells",
"startDate":"None",
"firstName":"Bells",
"lastName":"Christmas",
"fullName":"Christmas Bells",
"lastModified":"2022-02-22T03:47:41.632Z",
"email":"None",
"docNo":"None",
"virtualSuperUser":false
},
{
"id":"xyz.abc#safe.net",
"unid":"98",
"fieldsToValues":{
"Email":"xyz.abc#safe.net",
"occupation":"SO1 Change",
"manager":"None",
"First Name":"Bells",
"employeeID":"21011.0",
"loginRequired":"false",
"superUser":"false",
"ldapSuperUser":"false",
"archived":"false",
"password":"None",
"externalUser":"false",
"Username":"xyz.abc#safe.net",
"affiliation":"",
"Phone":"+16 2222 222 222",
"unidDominoKey":"",
"externalUserActive":"false",
"secondaryOccupation":"SO1 Change",
"retypePassword":"None",
"Last Name":"Christmas"
},
"hierarchyFieldAccess":[
],
"userHierarchies":[
{
"hierarchyField":"Company",
"value":"ABC Company"
},
{
"hierarchyField":"Department",
"value":"PUHJ"
},
{
"hierarchyField":"Project",
"value":"RPOJ-SDFGHJW"
},
{
"hierarchyField":"Division",
"value":"Silver RC"
},
{
"hierarchyField":"Site",
"value":"SQ06"
}
],
"locale":{
"id":1,
"dateFormat":"dd/MM/yyyy",
"languageTag":"en-UA"
},
"roles":[
"User"
],
"readAccessRoles":[
],
"preferredLanguage":"en-AU",
"prefName":"Christmas Bells",
"startDate":"None",
"firstName":"Bells",
"lastName":"Christmas",
"fullName":"Christmas Bells",
"lastModified":"2022-03-16T05:04:13.085Z",
"email":"xyz.abc#safe.net",
"docNo":"None",
"virtualSuperUser":false
}
]
What I have tried.
def flattenjson(b, delim):
    """Flatten a nested dict into a single-level dict.

    Keys of nested dicts are joined to their parent key with ``delim``,
    e.g. ``{"a": {"b": 1}}`` becomes ``{"a__b": 1}`` for ``delim="__"``.
    Values that are not dicts (lists included) are copied through
    unchanged.

    Args:
        b: The dict to flatten. It must be a single record (a dict),
            not a list of records.
        delim: Separator placed between a parent key and a child key.

    Returns:
        A new flat dict; ``b`` itself is not modified.
    """
    val = {}
    for key, value in b.items():
        if isinstance(value, dict):
            # Flatten the child first, then prefix each of its keys.
            for child_key, child_value in flattenjson(value, delim).items():
                val[key + delim + child_key] = child_value
        else:
            val[key] = value
    # Printed on every call (nested calls too) so the result is visible
    # even when the caller ignores the return value.
    print(val)
    return val
json=[{Sample JSON String that mentioned above}]
flattenjson(json,"__")
I don't know whether this is the right way to deal with this problem or not.
My final aim is to output all of the above JSON data to a CSV file.
Based on this answer, you could loop through your list of json data and flatten each json with the given function (they always have the same structure?), then build a DataFrame and write the data to csv. That's the easiest way I can think of,
try this:
import pandas as pd
import json
import collections
def flatten(dictionary, parent_key=False, separator='__'):
    """Flatten nested mappings and lists into a single-level dict.

    Nested keys are joined with ``separator``; list elements use their
    position as the key, e.g. ``{"a": [{"b": 1}]}`` becomes
    ``{"a__0__b": 1}``. Empty mappings and empty lists contribute no
    entry to the result.

    Args:
        dictionary: The mapping to flatten.
        parent_key: Prefix for every produced key; a falsy value means
            "no prefix" (top level).
        separator: String placed between the parts of a flattened key.

    Returns:
        A new flat ``dict`` mapping flattened keys to leaf values.
    """
    # collections.MutableMapping was removed in Python 3.10; the ABC lives
    # in collections.abc. Imported locally so the function does not depend
    # on the attribute being reachable through a bare ``import collections``.
    from collections.abc import MutableMapping

    items = []
    for key, value in dictionary.items():
        new_key = f"{parent_key}{separator}{key}" if parent_key else key
        if isinstance(value, MutableMapping):
            items.extend(flatten(value, new_key, separator).items())
        elif isinstance(value, list):
            for index, element in enumerate(value):
                # Wrap the element in a one-key dict so scalars and
                # containers share the same code path. The separator must
                # be forwarded, otherwise list elements silently fall back
                # to the default one.
                items.extend(
                    flatten({str(index): element}, new_key, separator).items()
                )
        else:
            items.append((new_key, value))
    return dict(items)
# JSON text is UTF-8 by specification; do not rely on the platform default.
with open('your_json.json', encoding='utf-8') as f:
    data = json.load(f)  # data is the example you provided (list of dicts)

# One flat dict per record; records may have different key sets.
all_records = [flatten(jsn) for jsn in data]

# DataFrame builds the union of all keys as columns and fills gaps with NaN.
df = pd.DataFrame(all_records)
# NOTE: to_csv() returns None when it is given a path, so ``out`` is None.
out = df.to_csv('json_to_csv.csv')
Related
I have the below Json file which I need to query to get the values of the keys inside 'validations' in a list
for example the column_values_not_null output will need to be this:
['lu_name', 'transaction_amount']
"validation_file_name": "ctm",
"connection_type": "s3",
"low_threshold": 500000,
"high_threshold": 1000000,
"frequency": "weekly",
"validations": [
{
"columns_to_match_ordered_list" :[
"lu_name",
"site_name",
"transaction_date_time",
"margin",
"transaction_currency_code",
"reversal_indicator_description",
"reversal_amount",
"original_amount"
]
},
{
"column_values_not_null":[
"lu_name",
"transaction_amount"
]
},
{
"column_values_not_duplicate": [
"lu_name",
"response_code_description"
]
}
]
I am able to do the below but I need to do this without using the index value
f = open('test.json')
json_content = json.load(f)
print(json_content['validations'][1]['column_values_not_null'])
Get the list by querying the validations key. sum(..., []) is used to flatten the list of lists (as required by the condition "without using the index value", if I understood it correctly); for details, including its pros and cons, see the documentation.
data = #
def validations(data: dict, key_query: str) -> list:
    """Collect the values stored under ``key_query`` in ``data['validations']``.

    ``data['validations']`` is expected to be a list of dicts. Every dict
    that contains ``key_query`` contributes the items of its value, in
    order, so no positional index is needed to reach the right entry.

    Args:
        data: The parsed JSON document.
        key_query: Name of the validation whose column list is wanted.

    Returns:
        A flat list of the matching values; an empty list when there is no
        ``validations`` key or no entry with ``key_query``.
    """
    matches = (
        list(entry[key_query])
        for entry in data.get('validations', [])
        if key_query in entry
    )
    # sum(..., []) concatenates the per-entry lists into one flat list.
    return sum(matches, [])
print(validations(data, key_query='column_values_not_null'))
# ['lu_name', 'transaction_amount']
I want to remove some problematic $oid and everything that contains $ in a json file. I wrote:
import json
# Each line of the file is parsed as its own JSON document, so `data` is a list of parsed values.
with open('C:\\Windows\\System32\\files\\news.json', 'r', encoding="utf8") as handle:
data = [json.loads(line) for line in handle]
# Only the first parsed document (data[0]) is inspected.
for k,v in data[0].items():
#check if key has dict value
if type(v) == dict:
#find id with $
# NOTE: `data` is the list built above and `k` is a key of data[0]; data[k] therefore indexes a list with that key, which is the TypeError reported below. The dict being walked is data[0].
r = list(data[k].keys())[0]
#change value if $ occurs
if r[0] == '$':
data[k] = data[k][r]
print(data)
But I get TypeError: list indices must be integers or slices, not str. I know it is because the JSON dictionaries are made readable for Python, but how do I fix it?
Edit: the .json file in my computer looks like this:
{
"_id": {
"$oid": "5e7511c45cb29ef48b8cfcff"
},
"description": "some text",
"startDate": {
"$date": "5e7511c45cb29ef48b8cfcff"
},
"completionDate": {
"$date": "2021-01-05T14:59:58.046Z"
}
}
I believe this is because your k is a str and you try to index the list with it (data[k])?
It would be better if you showed the format of the JSON as well.
Updating with an answer.
This should work for the given JSON. But if you want to do this for a larger file, looping can be tricky, especially because you're trying to modify the keys of a dictionary.
import json
line = '{"_id": { "$oid": "5e7511c45cb29ef48b8cfcff" }, "description": "some text", "startDate": { "$date": "5e7511c45cb29ef48b8cfcff"},"completionDate": {"$date": "2021-01-05T14:59:58.046Z"}}'
data = [json.loads(line)]

# Rename every "$name" key of the nested dicts to "name", keeping its value.
for k, v in data[0].items():
    if isinstance(v, dict):
        # Iterate over a snapshot of the items: deleting and inserting keys
        # while iterating over the live dict is unsafe and can raise
        # RuntimeError.
        for k2, v2 in list(v.items()):
            # startswith() is also safe for an empty key, unlike k2[0].
            if k2.startswith('$'):
                formatted = k2[1:]
                del v[k2]
                v[formatted] = v2
print(data)
# import json
# with open('C:\\Windows\\System32\\files\\news.json', 'r', encoding="utf8") as handle:
# data = [json.loads(line) for line in handle]
data = [
    {
        "_id": {
            "$oid": "5e7511c45cb29ef48b8cfcff"
        },
        "description": "some text",
        "startDate": {
            "$date": "5e7511c45cb29ef48b8cfcff"
        },
        "completionDate": {
            "$date": "2021-01-05T14:59:58.046Z"
        }
    }
]

# Remove every key starting with "$" from the dict values of each record.
for d in data:
    for k, v in d.items():
        # check if key has dict value
        if isinstance(v, dict):
            # Collect the keys first; popping while iterating over ``v``
            # would change the dict in the middle of the iteration.
            del_keys = {i for i in v if i.startswith("$")}
            for key in del_keys:
                v.pop(key)
print(data)
# [{'_id': {}, 'description': 'some text', 'startDate': {}, 'completionDate': {}}]
{
"Logging": {
"LogLevel": {
"Default": "Information",
"Microsoft": "Warning",
"Microsoft.Hosting.Lifetime": "Information",
"Microsoft.AspNetCore": "Warning",
"System.Net.Http.HttpClient.Default.ClientHandler": "Warning",
"System.Net.Http.HttpClient.Default.LogicalHandler": "Warning"
}
},
"AllowedHosts": "*",
"AutomaticTransferOptions": {
"DateOffsetForDirectoriesInDays": -1,
"DateOffsetForPortfoliosInDays": -3,
"Clause": {
"Item1": "1"
}
},
"Authentication": {
"ApiKeys": [
{
"Key": "AB8E5976-2A7C-4EEE-92C1-7B0B4DC840F6",
"OwnerName": "Cron job",
"Claims": [
{
"Type": "http://schemas.microsoft.com/ws/2008/06/identity/claims/role",
"Value": "StressTestManager"
}
]
},
{
"Key": "B11D4F27-483A-4234-8EC7-CA121712D5BE",
"OwnerName": "Test admin",
"Claims": [
{
"Type": "http://schemas.microsoft.com/ws/2008/06/identity/claims/role",
"Value": "StressTestAdmin"
},
{
"Type": "http://schemas.microsoft.com/ws/2008/06/identity/claims/role",
"Value": "TestManager"
}
]
},
{
"Key": "EBF98F2E-555E-4E66-9D77-5667E0AA1B54",
"OwnerName": "Test manager",
"Claims": [
{
"Type": "http://schemas.microsoft.com/ws/2008/06/identity/claims/role",
"Value": "TestManager"
}
]
}
],
"LDAP": {
"Domain": "domain.local",
"MachineAccountName": "Soft13",
"MachineAccountPassword": "vixuUEY7884*",
"EnableLdapClaimResolution": true
}
},
"Authorization": {
"Permissions": {
"Roles": [
{
"Role": "TestAdmin",
"Permissions": [
"transfers.create",
"bindings.create"
]
},
{
"Role": "TestManager",
"Permissions": [
"transfers.create"
]
}
]
}
}
}
I have JSON above and need to parse it with output like this
Logging__LogLevel__Default
Authentication__ApiKeys__0__Claims__0__Type
Everything is ok, but I always get some strings with this output
Authentication__ApiKeys__0__Key
Authentication__ApiKeys__0__OwnerName
Authentication__ApiKeys__0__Claims__0__Type
Authentication__ApiKeys__0__Claims__0__Value
Authentication__ApiKeys__0__Claims__0
Authentication__ApiKeys__2
Authorization__Permissions__Roles__0__Role
Authorization__Permissions__Roles__0__Permissions__1
Authorization__Permissions__Roles__1__Role
Authorization__Permissions__Roles__1__Permissions__0
Authorization__Permissions__Roles__1
Why does my code adds not full strings like
Authentication__ApiKeys__0__Claims__0
Authentication__ApiKeys__2
Authorization__Permissions__Roles__1
And why it doesn't print every value from
Authorization__Permissions__Roles__0__Permissions__*
and from
Authorization__Permissions__Roles__1__Permissions__*
I have this code in python3:
# Print "__"-delimited key paths for the values nested under sub_key.
# sub_key: the container currently being walked; variable: the path prefix built so far.
# NOTE(review): the indentation of this snippet was lost; the notes below assume the nesting implied by the output quoted in the question — confirm against the original.
def checkdepth(sub_key, variable):
delmt = '__'
for item in sub_key:
try:
if isinstance(sub_key[item], dict):
sub_variable = variable + delmt + item
checkdepth(sub_key[item], sub_variable)
# NOTE: when sub_key is a str (an element of a list of plain strings), `item` is a single character and sub_key[item] raises TypeError, which is swallowed here — so nothing is printed for string list elements such as the Permissions entries.
except TypeError:
continue
if isinstance(sub_key[item], list):
sub_variable = variable + delmt + item
for it in sub_key[item]:
# NOTE: list.index(it) returns the position of the first element equal to `it`, so equal elements would get the same index.
sub_variable = variable + delmt + item + delmt + str(sub_key[item].index(it))
checkdepth(it, sub_variable)
# NOTE(review): presumably this print runs once after the loop over the list, emitting the path of the last index on its own (e.g. "...__Claims__0", "...__ApiKeys__2") — that would explain the incomplete lines in the quoted output.
print(sub_variable)
# Leaf values: only int and str are printed (bool is a subclass of int, so booleans are included).
if isinstance(sub_key[item], int) or isinstance(sub_key[item], str):
sub_variable = variable + delmt + item
print (sub_variable)
# Walk the top level of `data`: str values are printed as key=value, every other value is handed to checkdepth with its key as the path prefix.
for key in data:
if type(data[key]) is str:
print(key + '=' +str(data[key]))
else:
variable = key
checkdepth(data[key], variable)
I know that the problem is in the block where I process the list data type, but I don't know exactly where the problem is.
Use a recursive generator:
import json
# JSON text is UTF-8 by specification; do not rely on the platform default.
with open('input.json', encoding='utf-8') as f:
    data = json.load(f)
def strkeys(data):
    """Yield the ``__``-delimited key path of every leaf value in ``data``.

    Dict keys and list indices are joined with ``__``; one path is yielded
    per leaf (any value that is neither a dict nor a list). Empty dicts and
    empty lists yield nothing.

    Args:
        data: A parsed JSON value (dict, list or scalar).

    Yields:
        The path of each leaf. For a scalar ``data`` a single ``None`` is
        yielded, which the recursion uses as its "this is a leaf" marker.
    """
    if isinstance(data, dict):
        for k, v in data.items():
            for item in strkeys(v):
                # Compare against None explicitly: an empty-string child
                # key is a real path segment, not the leaf marker.
                yield k if item is None else f'{k}__{item}'
    elif isinstance(data, list):
        for i, v in enumerate(data):
            for item in strkeys(v):
                yield str(i) if item is None else f'{i}__{item}'
    else:
        yield None  # termination condition, not a list or dict
# Print one key path per line.
for s in strkeys(data):
    print(s)
Output:
Logging__LogLevel__Default
Logging__LogLevel__Microsoft
Logging__LogLevel__Microsoft.Hosting.Lifetime
Logging__LogLevel__Microsoft.AspNetCore
Logging__LogLevel__System.Net.Http.HttpClient.Default.ClientHandler
Logging__LogLevel__System.Net.Http.HttpClient.Default.LogicalHandler
AllowedHosts
AutomaticTransferOptions__DateOffsetForDirectoriesInDays
AutomaticTransferOptions__DateOffsetForPortfoliosInDays
AutomaticTransferOptions__Clause__Item1
Authentication__ApiKeys__0__Key
Authentication__ApiKeys__0__OwnerName
Authentication__ApiKeys__0__Claims__0__Type
Authentication__ApiKeys__0__Claims__0__Value
Authentication__ApiKeys__1__Key
Authentication__ApiKeys__1__OwnerName
Authentication__ApiKeys__1__Claims__0__Type
Authentication__ApiKeys__1__Claims__0__Value
Authentication__ApiKeys__1__Claims__1__Type
Authentication__ApiKeys__1__Claims__1__Value
Authentication__ApiKeys__2__Key
Authentication__ApiKeys__2__OwnerName
Authentication__ApiKeys__2__Claims__0__Type
Authentication__ApiKeys__2__Claims__0__Value
Authentication__LDAP__Domain
Authentication__LDAP__MachineAccountName
Authentication__LDAP__MachineAccountPassword
Authentication__LDAP__EnableLdapClaimResolution
Authorization__Permissions__Roles__0__Role
Authorization__Permissions__Roles__0__Permissions__0
Authorization__Permissions__Roles__0__Permissions__1
Authorization__Permissions__Roles__1__Role
Authorization__Permissions__Roles__1__Permissions__0
Using json_flatten this can be converted to pandas, but it's not clear if that's what you want. Also, when you do convert it can use df.iloc[0] to see why each column is being provided (ie you see the value for that key).
Note: you need to pass a list so I just wrapped your json above in [].
# https://github.com/amirziai/flatten
# NOTE(review): `flatten` is presumably the function from the project linked above (installed as `flatten_json`, i.e. `from flatten_json import flatten`) and `pd` is pandas — confirm; neither import is shown here.
dic = your json from above
dic =[dic] # put it in a list
dic_flattened = (flatten(d, '__') for d in dic) # add your delimiter
df = pd.DataFrame(dic_flattened)
# First (and only) row: one entry per flattened key with its value.
df.iloc[0]
Logging__LogLevel__Default Information
Logging__LogLevel__Microsoft Warning
Logging__LogLevel__Microsoft.Hosting.Lifetime Information
Logging__LogLevel__Microsoft.AspNetCore Warning
Logging__LogLevel__System.Net.Http.HttpClient.Default.ClientHandler Warning
Logging__LogLevel__System.Net.Http.HttpClient.Default.LogicalHandler Warning
AllowedHosts *
AutomaticTransferOptions__DateOffsetForDirectoriesInDays -1
AutomaticTransferOptions__DateOffsetForPortfoliosInDays -3
AutomaticTransferOptions__Clause__Item1 1
Authentication__ApiKeys__0__Key AB8E5976-2A7C-4EEE-92C1-7B0B4DC840F6
Authentication__ApiKeys__0__OwnerName Cron job
Authentication__ApiKeys__0__Claims__0__Type http://schemas.microsoft.com/ws/2008/06/identi...
Authentication__ApiKeys__0__Claims__0__Value StressTestManager
Authentication__ApiKeys__1__Key B11D4F27-483A-4234-8EC7-CA121712D5BE
Authentication__ApiKeys__1__OwnerName Test admin
Authentication__ApiKeys__1__Claims__0__Type http://schemas.microsoft.com/ws/2008/06/identi...
Authentication__ApiKeys__1__Claims__0__Value StressTestAdmin
Authentication__ApiKeys__1__Claims__1__Type http://schemas.microsoft.com/ws/2008/06/identi...
Authentication__ApiKeys__1__Claims__1__Value TestManager
Authentication__ApiKeys__2__Key EBF98F2E-555E-4E66-9D77-5667E0AA1B54
Authentication__ApiKeys__2__OwnerName Test manager
Authentication__ApiKeys__2__Claims__0__Type http://schemas.microsoft.com/ws/2008/06/identi...
Authentication__ApiKeys__2__Claims__0__Value TestManager
Authentication__LDAP__Domain domain.local
Authentication__LDAP__MachineAccountName Soft13
Authentication__LDAP__MachineAccountPassword vixuUEY7884*
Authentication__LDAP__EnableLdapClaimResolution true
Authorization__Permissions__Roles__0__Role TestAdmin
Authorization__Permissions__Roles__0__Permissions__0 transfers.create
Authorization__Permissions__Roles__0__Permissions__1 bindings.create
Authorization__Permissions__Roles__1__Role TestManager
Authorization__Permissions__Roles__1__Permissions__0 transfers.create
OK, I looked at your code and it's hard to follow. The purpose of your variable and function names is not easy to understand. That is fine, because everyone has to learn best practices and all the little tips and tricks in Python. So hopefully I can help you out.
You have a recursive-ish function, which is definitely the best way to handle a situation like this. However, your code is part recursive and part not. If you go recursive to solve a problem, you have to go 100% recursive.
Also the only time you should print in a recursive function is for debugging. Recursive functions should have an object that is passed down the function and gets appended to or altered and then passed back once it gets to the end of the recursion.
When you get a problem like this, think about which data you actually need or care about. In this problem we don't care about the values that are stored in the object, we just care about the keys. So we should write code that doesn't even bother looking at the value of something except to determine its type.
Here is some code I wrote up that should work for what you're wanting to do. But take note that because I did purely a recursive function my code base is small. Also my function uses a list that is passed around and added to and then at the end I return it so that we can use it for whatever we need. If you have questions just comment on this question and I'll answer the best I can.
def _collect_key_paths(obj, path, delimiter, keys_list):
    """Append the joined path of every leaf under ``obj`` to ``keys_list``."""
    if isinstance(obj, dict):
        for k, v in obj.items():
            _collect_key_paths(v, path + (str(k),), delimiter, keys_list)
    elif isinstance(obj, list):
        for i, v in enumerate(obj):
            _collect_key_paths(v, path + (str(i),), delimiter, keys_list)
    else:
        keys_list.append(delimiter.join(path))


def convert_to_delimited_keys(obj, parent_key='', delimiter='__', keys_list=None):
    """Return the delimited key path of every leaf value in ``obj``.

    Dict keys and list indices are joined with ``delimiter``. Only keys are
    inspected; values matter solely for deciding whether to recurse. Empty
    dicts and lists produce no path.

    Args:
        obj: A parsed JSON value (dict, list or scalar).
        parent_key: Optional prefix for every path; empty means no prefix.
        delimiter: String placed between path segments.
        keys_list: Optional list to append to; a new list is created when
            omitted.

    Returns:
        ``keys_list`` (or the new list) with one path appended per leaf.
    """
    if keys_list is None:
        keys_list = []
    # Paths are built as tuples and joined once at the leaf. Slicing a
    # leading delimiter off the finished string instead would also cut into
    # a caller-supplied ``parent_key``.
    prefix = (parent_key,) if parent_key else ()
    _collect_key_paths(obj, prefix, delimiter, keys_list)
    return keys_list
# Print one key path per line.
for item in convert_to_delimited_keys(data):
    print(item)
I have a json file in this format,
{
"details": {
"hawk_branch": {
"tandem": {
"value": "4210bnd72"
}
},
"uclif_branch": {
"tandem": {
"value": "e2nc712nma89",
"value": "23s24212",
"value": "12338cm82",
}
}
}
}
The problem is that I need to keep all the values; however, when I use json.load to load this file I only get one value, which makes sense since a dict can only keep unique keys.
Here is the expected output,
{ "hawk_branch": ["4210bnd72"] }
{ "uclif_branch": ["e2nc712nma89" , "23s24212", "12338cm82"] }
I have read this answer, Python json parser allow duplicate keys to use object_pairs_hook like this,
def parse_object_pairs(pairs):
    """Return the key/value pairs of a JSON object unchanged.

    Intended as an ``object_pairs_hook``: every JSON object is then
    represented by its list of ``(key, value)`` pairs instead of a dict,
    so duplicate keys survive, but the whole document comes back as
    nested lists.
    """
    return pairs
# f is file
json.load(f, object_pairs_hook=parse_object_pairs)
but it returns the entire JSON file as a list.
I think it's possible to do this using a lambda as the object_pairs_hook, but I can't understand how I can use it.
Can someone please guide me?
You can use a custom duplicate key resolver function that turns the values of the value keys into a list:
def value_resolver(pairs):
    """Resolve duplicate ``value`` keys of a JSON object into a list.

    Meant to be passed as ``object_pairs_hook`` to ``json.load``.

    Args:
        pairs: The ``(key, value)`` pairs of one JSON object, in document
            order.

    Returns:
        A list of the values when the object is non-empty and every key is
        ``'value'``; otherwise a regular dict (later duplicates win).
    """
    # ``all()`` is True for an empty sequence, so check for emptiness first;
    # otherwise an empty JSON object ``{}`` would be turned into ``[]``.
    if pairs and all(k == 'value' for k, _ in pairs):
        return [v for _, v in pairs]
    return dict(pairs)
so that:
json.load(f, object_pairs_hook=value_resolver)
returns:
{'details': {'hawk_branch': {'tandem': ['4210bnd72']}, 'uclif_branch': {'tandem': ['e2nc712nma89', '23s24212', '12338cm82']}}}
And to dump the new data structure back to the original JSON format by converting lists to dicts with duplicate value keys, you can use a custom json.JSONEncoder subclass:
class restore_value(json.JSONEncoder):
    """Encoder that writes lists back as objects with repeated ``value`` keys.

    A list ``[a, b]`` is emitted as ``{"value": a, "value": b}``, the
    inverse of reading duplicate ``value`` keys into a list. Dicts and
    lists are serialised by hand with ``", "`` and ``": "`` separators;
    all other values are delegated to the stock encoder.
    """

    def encode(self, o):
        """Return the JSON text for ``o``.

        Raises:
            TypeError: If a dict key is not a string.
        """
        if isinstance(o, dict):
            members = []
            for key, value in o.items():
                if not isinstance(key, str):
                    raise TypeError(
                        'keys must be str, not %s' % type(key).__name__
                    )
                # Keys go through the stock encoder so that options such
                # as ensure_ascii apply to them exactly as they do to
                # string values.
                members.append(
                    '%s: %s' % (super().encode(key), self.encode(value))
                )
            return '{%s}' % ', '.join(members)
        if isinstance(o, list):
            return '{%s}' % ', '.join(
                ['"value": %s' % self.encode(item) for item in o]
            )
        return super().encode(o)
so that:
d = {'details': {'hawk_branch': {'tandem': ['4210bnd72']}, 'uclif_branch': {'tandem': ['e2nc712nma89', '23s24212', '12338cm82']}}}
print(json.dumps(d, cls=restore_value))
would output:
{"details": {"hawk_branch": {"tandem": {"value": "4210bnd72"}}, "uclif_branch": {"tandem": {"value": "e2nc712nma89", "value": "23s24212", "value": "12338cm82"}}}}
With the given script I am able to get the output shown in the screenshot,
but there is a column named cve.description.description_data which is itself in JSON format. I want to extract that data as well.
import json
import pandas as pd
from pandas.io.json import json_normalize
# load json object
# JSON text is UTF-8 by specification; do not rely on the platform default.
with open('nvdcve-1.0-modified.json', encoding='utf-8') as f:
    d = json.load(f)

# The records to normalise live under the top-level 'CVE_Items' key.
# NOTE: json_normalize imported from pandas.io.json is deprecated in newer
# pandas releases in favour of pandas.json_normalize.
nycphil = json_normalize(d['CVE_Items'])
nycphil.head(3)
works_data = json_normalize(data=d['CVE_Items'], record_path='cve')
works_data.head(3)
nycphil.to_csv("test4.csv")
If I change works_data = json_normalize(data=d['CVE_Items'], record_path='cve.descr') it gives this error:
"result = result[spec] KeyError: 'cve.description'"
JSON format as follows:
{
"CVE_data_type":"CVE",
"CVE_data_format":"MITRE",
"CVE_data_version":"4.0",
"CVE_data_numberOfCVEs":"1000",
"CVE_data_timestamp":"2018-04-04T00:00Z",
"CVE_Items":[
{
"cve":{
"data_type":"CVE",
"data_format":"MITRE",
"data_version":"4.0",
"CVE_data_meta":{
"ID":"CVE-2001-1594",
"ASSIGNER":"cve#mitre.org"
},
"affects":{
"vendor":{
"vendor_data":[
{
"vendor_name":"gehealthcare",
"product":{
"product_data":[
{
"product_name":"entegra_p&r",
"version":{
"version_data":[
{
"version_value":"*"
}
]
}
}
]
}
}
]
}
},
"problemtype":{
"problemtype_data":[
{
"description":[
{
"lang":"en",
"value":"CWE-255"
}
]
}
]
},
"references":{
"reference_data":[
{
"url":"http://apps.gehealthcare.com/servlet/ClientServlet/2263784.pdf?DOCCLASS=A&REQ=RAC&DIRECTION=2263784-100&FILENAME=2263784.pdf&FILEREV=5&DOCREV_ORG=5&SUBMIT=+ ACCEPT+"
},
{
"url":"http://www.forbes.com/sites/thomasbrewster/2015/07/10/vulnerable- "
},
{
"url":"https://ics-cert.us-cert.gov/advisories/ICSMA-18-037-02"
},
{
"url":"https://twitter.com/digitalbond/status/619250429751222277"
}
]
},
"description":{
"description_data":[
{
"lang":"en",
"value":"GE Healthcare eNTEGRA P&R has a password of (1) value."
}
]
}
},
"configurations":{
"CVE_data_version":"4.0",
"nodes":[
{
"operator":"OR",
"cpe":[
{
"vulnerable":true,
"cpe22Uri":"cpe:/a:gehealthcare:entegra_p%26r",
"cpe23Uri":"cpe:2.3:a:gehealthcare:entegra_p\\&r:*:*:*:*:*:*:*:*"
}
]
}
]
},
"impact":{
"baseMetricV2":{
"cvssV2":{
"version":"2.0",
"vectorString":"(AV:N/AC:L/Au:N/C:C/I:C/A:C)",
"accessVector":"NETWORK",
"accessComplexity":"LOW",
"authentication":"NONE",
"confidentialityImpact":"COMPLETE",
"integrityImpact":"COMPLETE",
"availabilityImpact":"COMPLETE",
"baseScore":10.0
},
"severity":"HIGH",
"exploitabilityScore":10.0,
"impactScore":10.0,
"obtainAllPrivilege":false,
"obtainUserPrivilege":false,
"obtainOtherPrivilege":false,
"userInteractionRequired":false
}
},
"publishedDate":"2015-08-04T14:59Z",
"lastModifiedDate":"2018-03-28T01:29Z"
}
]
}
I want to flatten all data.
Assuming the multiple URLs delineate the rows and all other metadata repeats, consider a recursive function call to extract every key-value pair in the nested JSON object, d.
The recursive function uses global to update the global objects that are later bound into a list of dictionaries for the pd.DataFrame() call. The last loop at the end updates a copy of the recursive function's dictionary, inner, to integrate the different URLs (stored in multi).
import json
import pandas as pd
# load json object
# JSON text is UTF-8 by specification; do not rely on the platform default.
with open('nvdcve-1.0-modified.json', encoding='utf-8') as f:
    d = json.load(f)
# Module-level accumulators filled by recursive_extract():
#   multi - the most recently visited list whose length is not 1
#   inner - every scalar key/value pair found while walking the document
multi = []
inner = {}


def recursive_extract(i):
    """Walk a nested JSON value and record its scalars in the globals.

    Scalar key/value pairs are stored in ``inner``; a later pair with the
    same key overwrites an earlier one. A list with exactly one element is
    treated as transparent and its element is walked; any other list is
    stored in ``multi`` as a whole and not descended into.

    Args:
        i: A parsed JSON value (dict or list); other types are ignored.
    """
    global multi, inner
    if isinstance(i, list):
        if len(i) != 1:
            multi = i
            return
        # Single-element list: continue with its only element. Elements
        # that are not dicts fall through and are ignored.
        i = i[0]
    if isinstance(i, dict):
        for k, v in i.items():
            if isinstance(v, (list, dict)):
                recursive_extract(v)
            else:
                inner[k] = v
recursive_extract(d['CVE_Items'])

# One output row per entry of ``multi``: the shared scalar fields from
# ``inner`` plus that entry's own fields (which win on a key clash).
data_dict = []
for entry in multi:
    row = inner.copy()
    row.update(entry)
    data_dict.append(row)

df = pd.DataFrame(data_dict)
df.to_csv('Output.csv')
Output (all columns the same except for URL, widened for emphasis)