Create DataFrame from a list of nested dictionary objects

Create DataFrame from a list of nested dictionary objects - python

I have list of nested dictionary objects in a JSON file. I am trying to create a DataFrame of this file.
Here are the first 2 objects:
data= [ {
"model": "class",
"pk": 48,
"fields": {
"unique_key": "9f030ed1d5e56523",
"name": "john",
"follower_count": 2395,
"profile_image": " "
} } ,{
"model": "class",
"pk": 49,
"fields": {
"unique_key": "0e8256ad7f27270eb",
"name": "dais",
"follower_count": 264,
"profile_image": " "
} }, .....]
If I try something like:
df = pd.DataFrame(data)
This is what I get.
I was looking for help and I found this, but the problem is the list does not have a keys() function.

It looks like this is data you could flatten using a for loop:
new_data = []
for item in data:
new_entry = {}
for k,v in item.items():
# a dictionary will return True for isinstance(v, dict)
if not isinstance(v, dict):
# v is not a dictionary here
new_entry[k] = v
else:
# v is a dictionary, so we flatten it
for a,b in v.items():
new_entry[a] = b
new_data.append(new_entry)
df = pd.DataFrame(new_data)
The inner loop is a more generalized approach to using something like if k=='Fields', which would be more specific to your problem

Assuming you only have 1 level of nested dictionaries and you know the key name:
for d in data:
d.update(d.pop('fields'))
You only need to "pop" the element out of the dictionary and add the inner key-value data in the base level. The update method will do the latter as an inplace operation.
Now you can create your pandas dataframe with the columns you were expecting:
In [5]: pd.DataFrame(data)
Out[5]:
follower_count model name pk profile_image unique_key
0 2395 class john 48 9f030ed1d5e56523
1 264 class dais 49 0e8256ad7f27270eb

Related

Convert Nested JSON list API data into CSV using PYTHON

Want to convert Sample JSON data into CSV file using python. I am retrieving JSON data from API.
As my JSON has nested objects, so it normally cannot be directly converted to CSV.I don't want to do any hard coding and I want to make a python code fully dynamic.
So, I have written a function that flatten my JSON Data but I am not able to work out how to iterate all records, finding relevant column names and then output those data into CSV.
In the Sample JSON file I have mentioned only 2 records but in actual there are 100 records.
Sample JSON Look like this:
[
{
"id":"Random_Company_57",
"unid":"75",
"fieldsToValues":{
"Email":"None",
"occupation":"SO1 Change",
"manager":"None",
"First Name":"Bells",
"employeeID":"21011.0",
"loginRequired":"true",
"superUser":"false",
"ldapSuperUser":"false",
"archived":"true",
"password":"None",
"externalUser":"false",
"Username":"Random_Company_57",
"affiliation":"",
"Phone":"+16 22 22 222",
"unidDominoKey":"",
"externalUserActive":"false",
"secondaryOccupation":"SO1 Change",
"retypePassword":"None",
"Last Name":"Christmas"
},
"hierarchyFieldAccess":[
],
"userHierarchies":[
{
"hierarchyField":"Company",
"value":"ABC Company"
},
{
"hierarchyField":"Department",
"value":"gfds"
},
{
"hierarchyField":"Project",
"value":"JKL-SDFGHJW"
},
{
"hierarchyField":"Division",
"value":"Silver RC"
},
{
"hierarchyField":"Site",
"value":"SQ06"
}
],
"locale":{
"id":1,
"dateFormat":"dd/MM/yyyy",
"languageTag":"en-UA"
},
"roles":[
"User"
],
"readAccessRoles":[
],
"preferredLanguage":"en-AU",
"prefName":"Christmas Bells",
"startDate":"None",
"firstName":"Bells",
"lastName":"Christmas",
"fullName":"Christmas Bells",
"lastModified":"2022-02-22T03:47:41.632Z",
"email":"None",
"docNo":"None",
"virtualSuperUser":false
},
{
"id":"xyz.abc#safe.net",
"unid":"98",
"fieldsToValues":{
"Email":"xyz.abc#safe.net",
"occupation":"SO1 Change",
"manager":"None",
"First Name":"Bells",
"employeeID":"21011.0",
"loginRequired":"false",
"superUser":"false",
"ldapSuperUser":"false",
"archived":"false",
"password":"None",
"externalUser":"false",
"Username":"xyz.abc#safe.net",
"affiliation":"",
"Phone":"+16 2222 222 222",
"unidDominoKey":"",
"externalUserActive":"false",
"secondaryOccupation":"SO1 Change",
"retypePassword":"None",
"Last Name":"Christmas"
},
"hierarchyFieldAccess":[
],
"userHierarchies":[
{
"hierarchyField":"Company",
"value":"ABC Company"
},
{
"hierarchyField":"Department",
"value":"PUHJ"
},
{
"hierarchyField":"Project",
"value":"RPOJ-SDFGHJW"
},
{
"hierarchyField":"Division",
"value":"Silver RC"
},
{
"hierarchyField":"Site",
"value":"SQ06"
}
],
"locale":{
"id":1,
"dateFormat":"dd/MM/yyyy",
"languageTag":"en-UA"
},
"roles":[
"User"
],
"readAccessRoles":[
],
"preferredLanguage":"en-AU",
"prefName":"Christmas Bells",
"startDate":"None",
"firstName":"Bells",
"lastName":"Christmas",
"fullName":"Christmas Bells",
"lastModified":"2022-03-16T05:04:13.085Z",
"email":"xyz.abc#safe.net",
"docNo":"None",
"virtualSuperUser":false
}
]
What I have tried.
def flattenjson(b, delim):
val = {}
for i in b.keys():
if isinstance(b[i], dict):
get = flattenjson(b[i], delim)
for j in get.keys():
val[i + delim + j] = get[j]
else:
val[i] = b[i]
print(val)
return val
json=[{Sample JSON String that mentioned above}]
flattenjson(json,"__")
I don't know it is a right way to deal this problem or not?
My final aim is that all the above json data will output in a csv file.

Based on this answer, you could loop through your list of json data and flatten each json with the given function (they always have the same structure?), then build a DataFrame and write the data to csv. That's the easiest way I can think of,
try this:
import pandas as pd
import json
import collections
def flatten(dictionary, parent_key=False, separator='__'):
items = []
for key, value in dictionary.items():
new_key = str(parent_key) + separator + key if parent_key else key
if isinstance(value, collections.MutableMapping):
items.extend(flatten(value, new_key, separator).items())
elif isinstance(value, list):
for k, v in enumerate(value):
items.extend(flatten({str(k): v}, new_key).items())
else:
items.append((new_key, value))
return dict(items)
with open('your_json.json') as f:
data = json.load(f) # data is a the example you provided (list of dicts)
all_records=[]
for jsn in data:
tmp = flatten(jsn)
all_records.append(tmp)
df = pd.DataFrame(all_records)
out = df.to_csv('json_to_csv.csv')

Querying a nested JSON file in Python without Indexing

I have the below Json file which I need to query to get the values of the keys inside 'validations' in a list
for example the column_values_not_null output will need to be this:
['lu_name', 'transaction_amount']
"validation_file_name": "ctm",
"connection_type": "s3",
"low_threshold": 500000,
"high_threshold": 1000000,
"frequency": "weekly",
"validations": [
{
"columns_to_match_ordered_list" :[
"lu_name",
"site_name",
"transaction_date_time",
"margin",
"transaction_currency_code",
"reversal_indicator_description",
"reversal_amount",
"original_amount"
]
},
{
"column_values_not_null":[
"lu_name",
"transaction_amount"
]
},
{
"column_values_not_duplicate": [
"lu_name",
"response_code_description"
]
}
]
I am able to do the below but I need to do this without using the index value
f = open('test.json')
json_content = json.load(f)
print(json_content['validations'][1]['column_values_not_null'])

Get a list by querying the validations key. The sum( ,[]) are used to flat the list (as required by the condition "without using the index value" if got it right), for details about it with pros and cons see doc.
data = #
def validations(data: dict, key_query: str) -> list:
for k, v in data.items():
if k == 'validations':
return sum(sum([list(d.values()) for d in v if key_query in d], []), [])
print(validations(data, query='column_values_not_null'))
# ['lu_name', 'transaction_amount']

How do I extract a list item from nested json in Python?

I have a json object and I'm trying to extract a couple of values from a nested list. Then print them in markup. I'm getting and error - AttributeError: 'list' object has no attribute 'get'
I understand that it's a list and I can't preform a get. I've been searching for the proper method for a few hours now and I'm running out of steam. I'm able to get the Event, but not Value1 and Value2.
This is the json object
{
"resource": {
"data": {
"event": "qwertyuiop",
"eventVersion": "1.05",
"parameters": {
"name": "sometext",
"othername": [
""
],
"thing": {
"something": {
"blah": "whatever"
},
"abc": "123",
"def": {
"xzy": "value"
}
},
"something": [
"else"
]
},
"whatineed": [{
"value1": "text.i.need",
"value2": "text.i.need.also"
}]
}
}
}
And this is my function
def parse_json(json_data: dict) -> Info:
some_data = json_data.get('resource', {})
specific_data = some_data.get('data', {})
whatineed_data = specific_data.get('whatineed', {})
formatted_json = json.dumps(json_data, indent=2)
description = f'''
h3. Details
*Event:* {some_data.get('event')}
*Value1:* {whatineed_data('value1')}
*Value2:* {whatineed_data('value2')}
'''

From the data structure, whatineed is a list with a single item, which in turn is a dictionary. So, one way to access it would be:
whatineed_list = specific_data.get('whatineed', [])
whatineed_dict = whatineed_list[0]
At this point you can do:
value1 = whatineed_dict.get('value1')
value2 = whatineed_dict.get('value2')

You can change your function to the following:
def parse_json(json_data: dict) -> Info:
some_data = json_data.get('resource')
specific_data = some_data.get('data', {})
whatineed_data = specific_data.get('whatineed', {})
formatted_json = json.dumps(json_data, indent=2)
description = '''
h3. Details
*Event:* {}
*Value1:* {}
*Value2:* {}
'''.format(some_data.get('data').get('event'),whatineed_data[0]['value1'], whatineed_data[0]['value2'])
Since whatineed_data is a list, you need to index the element first

Python handles json as strings unless they are coming directly from a file. This could be the source for some of your problems. Also this article might help.

Assuming that "whatineed" attribute is really a list, and it's elements are dicts, you can't call whatineed.get asking for Value1 or Value2 as if they are attributes, because it is a list and it don't have attributes.
So, you have two options:
If whatineed list has a single element ever, you can access this element directly and than access the element attributes:
element = whatineed[0]
v1 = element.get('value1', {})
v2 = element.get('value2', {})
Or, if whatineed list can have more items, so, you will need to iterate over this list and access those elements:
for element in whatineed:
v1 = element.get('value1', {})
v2 = element.get('value2', {})
## Do something with values

How to convert any nested json into a pandas dataframe

I'm currently working on a project that will be analyzing multiple data sources for information, other data sources are fine but I am having a lot of trouble with json and its sometimes deeply nested structure. I have tried to turn the json into a python dictionary, but with not much luck as it can start to struggle as it gets more complicated. For example with this sample json file:
{
"Employees": [
{
"userId": "rirani",
"jobTitleName": "Developer",
"firstName": "Romin",
"lastName": "Irani",
"preferredFullName": "Romin Irani",
"employeeCode": "E1",
"region": "CA",
"phoneNumber": "408-1234567",
"emailAddress": "romin.k.irani#gmail.com"
},
{
"userId": "nirani",
"jobTitleName": "Developer",
"firstName": "Neil",
"lastName": "Irani",
"preferredFullName": "Neil Irani",
"employeeCode": "E2",
"region": "CA",
"phoneNumber": "408-1111111",
"emailAddress": "neilrirani#gmail.com"
}
]
}
after converting to dictionary and doing dict.keys() only returns "Employees".
I then resorted to instead opt for a pandas dataframe and I could achieve what I wanted by calling json_normalize(dict['Employees'], sep="_") but my problem is that it must work for ALL jsons and looking at the data beforehand is not an option so my method of normalizing this way will not always work. Is there some way I could write some sort of function that would take in any json and convert it into a nice pandas dataframe? I have searched for about 2 weeks for answers bt with no luck regarding my specific problem. Thanks

I've had to do that in the past (Flatten out a big nested json). This blog was really helpful. Would something like this work for you?
Note, like the others have stated, for this to work for EVERY json, is a tall task, I'm merely offering a way to get started if you have a wider range of json format objects. I'm assuming they will be relatively CLOSE to what you posted as an example with hopefully similarly structures.)
jsonStr = '''{
"Employees" : [
{
"userId":"rirani",
"jobTitleName":"Developer",
"firstName":"Romin",
"lastName":"Irani",
"preferredFullName":"Romin Irani",
"employeeCode":"E1",
"region":"CA",
"phoneNumber":"408-1234567",
"emailAddress":"romin.k.irani#gmail.com"
},
{
"userId":"nirani",
"jobTitleName":"Developer",
"firstName":"Neil",
"lastName":"Irani",
"preferredFullName":"Neil Irani",
"employeeCode":"E2",
"region":"CA",
"phoneNumber":"408-1111111",
"emailAddress":"neilrirani#gmail.com"
}]
}'''
It flattens out the entire json into single rows, then you can put into a dataframe. In this case it creates 1 row with 18 columns. Then iterates through those columns, using the number values within those column names to reconstruct into multiple rows. If you had a different nested json, I'm thinking it theoretically should work, but you'll have to test it out.
import json
import pandas as pd
import re
def flatten_json(y):
out = {}
def flatten(x, name=''):
if type(x) is dict:
for a in x:
flatten(x[a], name + a + '_')
elif type(x) is list:
i = 0
for a in x:
flatten(a, name + str(i) + '_')
i += 1
else:
out[name[:-1]] = x
flatten(y)
return out
jsonObj = json.loads(jsonStr)
flat = flatten_json(jsonObj)
results = pd.DataFrame()
columns_list = list(flat.keys())
for item in columns_list:
row_idx = re.findall(r'\_(\d+)\_', item )[0]
column = item.replace('_'+row_idx+'_', '_')
row_idx = int(row_idx)
value = flat[item]
results.loc[row_idx, column] = value
print (results)
Output:
print (results)
Employees_userId ... Employees_emailAddress
0 rirani ... romin.k.irani#gmail.com
1 nirani ... neilrirani#gmail.com
[2 rows x 9 columns]

d={
"Employees" : [
{
"userId":"rirani",
"jobTitleName":"Developer",
"firstName":"Romin",
"lastName":"Irani",
"preferredFullName":"Romin Irani",
"employeeCode":"E1",
"region":"CA",
"phoneNumber":"408-1234567",
"emailAddress":"romin.k.irani#gmail.com"
},
{
"userId":"nirani",
"jobTitleName":"Developer",
"firstName":"Neil",
"lastName":"Irani",
"preferredFullName":"Neil Irani",
"employeeCode":"E2",
"region":"CA",
"phoneNumber":"408-1111111",
"emailAddress":"neilrirani#gmail.com"
}]
}
import pandas as pd
df=pd.DataFrame([x.values() for x in d["Employees"]],columns=d["Employees"][0].keys())
print(df)
Output
userId jobTitleName firstName ... region phoneNumber emailAddress
0 rirani Developer Romin ... CA 408-1234567 romin.k.irani#gmail.com
1 nirani Developer Neil ... CA 408-1111111 neilrirani#gmail.com
[2 rows x 9 columns]

For the particular JSON data given. My approach, which uses pandas package only, follows:
import pandas as pd
# json as python's dict object
jsn = {
"Employees" : [
{
"userId":"rirani",
"jobTitleName":"Developer",
"firstName":"Romin",
"lastName":"Irani",
"preferredFullName":"Romin Irani",
"employeeCode":"E1",
"region":"CA",
"phoneNumber":"408-1234567",
"emailAddress":"romin.k.irani#gmail.com"
},
{
"userId":"nirani",
"jobTitleName":"Developer",
"firstName":"Neil",
"lastName":"Irani",
"preferredFullName":"Neil Irani",
"employeeCode":"E2",
"region":"CA",
"phoneNumber":"408-1111111",
"emailAddress":"neilrirani#gmail.com"
}]
}
# get the main key, here 'Employees' with index '0'
emp = list(jsn.keys())[0]
# when you have several keys at this level, i.e. 'Employers' for example
# .. you need to handle all of them too (your task)
# get all the sub-keys of the main key[0]
all_keys = jsn[emp][0].keys()
# build dataframe
result_df = pd.DataFrame() # init a dataframe
for key in all_keys:
col_vals = []
for ea in jsn[emp]:
col_vals.append(ea[key])
# add a new column to the dataframe using sub-key as its header
# it is possible that values here is a nested object(s)
# .. such as dict, list, json
result_df[key]=col_vals
print(result_df.to_string())
Output:
userId lastName jobTitleName phoneNumber emailAddress employeeCode preferredFullName firstName region
0 rirani Irani Developer 408-1234567 romin.k.irani#gmail.com E1 Romin Irani Romin CA
1 nirani Irani Developer 408-1111111 neilrirani#gmail.com E2 Neil Irani Neil CA

How to parse key values from nested dictionaries in this example?

Please see the JSON below taken from an API.
my_json =
{
"cities":[
{
"portland":[
{"more_info":[{"rank": "3", "games_played": "5"}
],
"team_name": "blazers"
},
{
"cleveland":[
{"more_info":[{"rank": "2", "games_played": "7"}
],
"team_name": "cavaliers"
}
]
}
I would like to create a new dictionary from this my_json with "team_name" as the key and "rank" as the value.
Like this: {'Blazers': 3, 'Cavaliers': 2, 'Bulls': 7}
I'm not sure how to accomplish this... I can return a list of cities, and I can return a list of ranks, but they end up being two separate lists with no relation, I'm not sure how to relate the two.
Any help would be appreciated (I'm also open to organizing this info in a list rather than dict if that is easier).
If I run this:
results_dict = {}
cities = my_json.get('cities', [])
for x in cities:
for k,v in x.items():
print k, v
it returns:
team_name blazers
portland [{"rank": "3", "games_played": "5"}
team_name cavaliers
cavaliers [{"rank": "2", "games_played": "7"}

If you want to take your cities list and your ranks list and combine them, you could use zip() and a dictionary comprehension:
output = {city: rank for city, rank in zip(cities, ranks)}

Valid JSON looks like:
{
"cities":[
{
"portland":[
{"more_info":
[{"rank": "3", "games_played": "5"}],
"team_name":
"blazers"
}
]
},
{
"cleveland":[
{"more_info":
[{"rank": "2", "games_played": "7"}],
"team_name":
"cavaliers"
}
]
}
]
}
This part of code returns all you want, but I'll try to write more readable code instead of this:
results_dict = {}
cities = my_json.get('cities', [])
for x in cities:
for k,v in x.items():
for element in v:
team = element.get('team_name', '')
meta_data = element.get('more_info', [])
for item in meta_data:
rank = item.get('rank')
results_dict.update({team: rank})
>>> results_dict
{'blazers': '3', 'cavaliers': '2'}

What API is that? The JSON structure (if pivanchy got it right) seems to be unnecessarily nested in lists. (Can a city have more than one team? Probably yes. Can a team have more than one rank, though?)
But just for sports, here is a gigantic dictionary comprehension to extract the data you want:
{ team['team_name']: team['more_info'][0]['rank']
for ((team,),) in (
city.values() for city in my_json['cities']
)
}

The json seemed to be missing some closing brackets. After adding them I got this:
my_json = {
"cities": [
{"portland":[
{"more_info":[{"rank": "3", "games_played": "5"}],"team_name": "blazers"}]},
{"cleveland":[{"more_info":[{"rank": "2", "games_played": "7"}],"team_name": "cavaliers"}]}
]
}
Given that structure, which is extremely nested, the following code will extract the data you want, but its very messy:
results = {}
for el in my_json["cities"]:
name = el.keys()[0]
rank = el.values()[0][0]["more_info"][0]["rank"]
results[name] = rank
print results
Which will give you:
{'portland': '3', 'cleveland': '2'}

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Create DataFrame from a list of nested dictionary objects - python

Related

Convert Nested JSON list API data into CSV using PYTHON

Querying a nested JSON file in Python without Indexing

How do I extract a list item from nested json in Python?

How to convert any nested json into a pandas dataframe

How to parse key values from nested dictionaries in this example?

Categories

Resources