Create a dataframe in pandas from a multilevel dict dynamically - Python

I am fetching data from an API and trying to write the response to CSV, but the catch is that the response is a multilevel dict/JSON, so when I convert it to CSV most of the columns end up looking like dicts or lists of dicts.
I am trying to use this:
def expand(data):
    d = pd.Series(data)
    t = d.index
    for i in t:
        if type(d[i]) in (list, dict):
            expend_s = pd.Series(d[i])
            t.append(expend_s.index)
            d = d.append(expend_s)
            d = d.drop([i])
    return d

df['person'].apply(expand)
but this solution is not working. If we look at the person column, it holds multiple dicts and lists of dicts, like:
"birthDate": "0000-00-00",
"genderCode": {
"codeValue": "M",
"shortName": "Male",
"longName": "Male"
},
"maritalStatusCode": {
"codeValue": "M",
"shortName": "Married"
},
"disabledIndicator": False,
"preferredName": {},
"ethnicityCode": {
"codeValue": "4",
"shortName": "4",
"longName": "Not Hispanic or Latino"
},
"raceCode": {
"identificationMethodCode": {},
"codeValue": "1",
"shortName": "White",
"longName": "White"
},
"militaryClassificationCodes": [],
"governmentIDs": [
{
"itemID": "9200037107708_4385",
"idValue": "XXX-XX-XXXX",
"nameCode": {
"codeValue": "SSN",
"longName": "Social Security Number"
},
"countryCode": "US"
}
],
"legalName": {
"givenName": "Jack",
"middleName": "C",
"familyName1": "Abele",
"formattedName": "Abele, Jack C"
},
"legalAddress": {
"nameCode": {
"codeValue": "Personal Address 1",
"shortName": "Personal Address 1",
"longName": "Personal Address 1"
},
"lineOne": "1932 Keswick Lane",
"cityName": "Concord",
"countrySubdivisionLevel1": {
"subdivisionType": "StateTerritory",
"codeValue": "CA",
"shortName": "California"
},
"countryCode": "US",
"postalCode": "94518"
},
"communication": {
"mobiles": [
{
"itemID": "9200037107708_4389",
"nameCode": {
"codeValue": "Personal Cell",
"shortName": "Personal Cell"
},
"countryDialing": "1",
"areaDialing": "925",
"dialNumber": "6860589",
"access": "1",
"formattedNumber": "(925) 686-0589"
}
]
}
}
Your suggestions and advice would be very helpful.

I think we can handle the nested dicts by reading with pd.json_normalize, and the lists of dicts with the functions below. First we get the columns that contain lists:
import ast

import pandas as pd


def df_list_and_dict_col(explode_df: pd.DataFrame, primary_key: str,
                         col_name: str, folder: str) -> None:
    """Convert a column holding lists of dicts into clean CSV files.

    Keyword arguments:
    -----------------
    explode_df -- dataframe containing the column we have to expand
    primary_key -- column used to keep the expanded rows joinable
    col_name -- name of the column that holds the nested data
    folder -- output folder for the generated CSV files

    Writes the expanded dataframe(s) to CSV rather than returning them.
    """
    explode_df[col_name] = explode_df[col_name].replace('', '[]', regex=True)
    explode_df[col_name] = explode_df[col_name].fillna('[]')
    # make sure the entire column is string before literal_eval
    explode_df[col_name] = explode_df[col_name].astype('string')
    explode_df[col_name] = explode_df[col_name].apply(ast.literal_eval)
    explode_df = explode_df.explode(col_name)
    explode_df = explode_df.reset_index(drop=True)
    normalized_df = pd.json_normalize(explode_df[col_name])
    explode_df = explode_df.join(
        other=normalized_df,
        lsuffix="_left",
        rsuffix="_right"
    )
    explode_df = explode_df.drop(columns=col_name)
    # recurse into any columns that still contain lists
    type_df = explode_df.applymap(type)
    col_list = []
    for col in type_df.columns:
        if (type_df[col] == list).any():
            col_list.append(col)
    if len(col_list) != 0:
        for col in col_list:
            df_list_and_dict_col(explode_df[[primary_key, col]].copy(),
                                 primary_key, col, folder)
            explode_df.drop(columns=col, inplace=True)
            print(f'{col}.csv is done')
    explode_df.to_csv(f'{folder}/{col_name}.csv')
First we get the list columns and pass them to the function one by one; the function checks whether there is any list left inside a column, recurses if so, and saves each level to CSV:
type_df = df.applymap(type)
col_list = []
for col in type_df.columns:
    if (type_df[col] == list).any():
        col_list.append(col)

for col in col_list:
    # print(col, df[['associateOID', col]])
    df_list_and_dict_col(df[['primary_key', col]].copy(), 'primary_key',
                         col, folder='worker')
    df.drop(columns=col, inplace=True)
Now you have multiple CSVs in normalized form.
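For the dict-valued fields themselves, here is a minimal sketch of the pd.json_normalize step mentioned above; persons is a hypothetical list of payloads shaped like the example:

import pandas as pd

# hypothetical list of person payloads shaped like the example above
persons = [{
    "birthDate": "0000-00-00",
    "genderCode": {"codeValue": "M", "shortName": "Male"},
    "legalName": {"givenName": "Jack", "familyName1": "Abele"},
}]

# nested dicts become dotted columns: genderCode.codeValue, legalName.givenName, ...
flat = pd.json_normalize(persons)
print(flat.columns.tolist())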

Related

Python, How to denormalise CSV file to Nested Json

I currently have a CSV file with the header:
productCode | code | dataFields.0.category | dataFields.0.name | dataFields.0.code | ... with dataFields[n] up to 9.
When I convert the CSV to JSON I get:
{
    "Example": [
        {
            "exCode": "example_code",
            "name": "ex",
            "code": "ex_2",
            "dataFields.0.category": "EXAMPLE",
            "dataFields.0.name": "exampl",
            "dataFields.0.code": "exampl",
            "dataFields.0.unit": "v",
            "dataFields.1.category": "EXAMPLE",
            "dataFields.1.name": "exampl2",
            "dataFields.1.code": "exampl2",
            "dataFields.1.unit": "e",
            "dataFields.2.category": "EXAMPLE2",
            "dataFields.2.name": null,
            "dataFields.2.code": null,
            "dataFields.2.unit": "e"
        }
    ]
}
However, I'm trying to convert the CSV to look like:
{
    "Example": [
        {
            "exCode": "example_code",
            "name": "exampl",
            "code": "exampl",
            "dataFields": [
                {
                    "category": "EXAMPLE",
                    "name": "exampl",
                    "code": "exampl",
                    "unit": "v"
                },
                {
                    "category": "EXAMPLE",
                    "name": "exampl2",
                    "code": "exampl2",
                    "unit": "e"
                }
            ]
        }
    ]
}
I have been writing this project in Python without using recursion, looking at most 2 levels deep into the nested JSON to remove the normalised fields and add denormalised (nested) fields in their place. However, my main problem is that "dataFields" gets overwritten instead of accumulating multiple elements.
This is what I have so far:
changes_arr = []

def denormalize_json(json_list):
    for children in json_list:
        for inner_children in json_list[children]:
            # print("Arr: ", inner_children)
            for innest_child in inner_children:
                split_norm = innest_child.split('.')
                if len(split_norm) == 2:  # only a singular nested field
                    changes_arr.append([children, inner_children,
                                        split_norm[0], split_norm[1], innest_child])
                elif len(split_norm) == 3:  # normalized with more than one field
                    changes_arr.append([children, inner_children,
                                        split_norm[0], split_norm[2], innest_child])
    print(changes_arr)
    # Make changes to json_list: change inner children to correct fields
    # and add the correct fields
    for i in range(len(changes_arr)):
        changes = changes_arr[i]
        try:
            inner_children = changes[1]
            inner_children[changes[2]].append(changes[3].append(inner_children[changes[4]]))
            del inner_children[changes[4]]
            json_list[changes[0]] = inner_children
        except Exception as e:
            print(e)
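For reference, here is a minimal sketch of one way to regroup flattened dataFields.N.* keys into a nested list without recursion; record is a hypothetical dict shaped like the flat JSON above:

import re

# hypothetical flat record shaped like the JSON above
record = {
    "exCode": "example_code",
    "dataFields.0.category": "EXAMPLE",
    "dataFields.0.name": "exampl",
    "dataFields.1.category": "EXAMPLE",
    "dataFields.1.name": "exampl2",
}

nested = {}
groups = {}  # index -> dict of that entry's inner fields
for key, value in record.items():
    m = re.match(r"dataFields\.(\d+)\.(\w+)$", key)
    if m:
        idx, field = int(m.group(1)), m.group(2)
        groups.setdefault(idx, {})[field] = value
    else:
        nested[key] = value

# build the list in index order so earlier entries are not overwritten
nested["dataFields"] = [groups[i] for i in sorted(groups)]
print(nested)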

Create a nested data dictionary in Python

I have the data as below
{
    "employeealias": "101613177",
    "firstname": "Lion",
    "lastname": "King",
    "date": "2022-04-21",
    "type": "Thoughtful Intake",
    "subject": "Email: From You Success Coach"
}
{
    "employeealias": "101613177",
    "firstname": "Lion",
    "lastname": "King",
    "date": "2022-04-21",
    "type": null,
    "subject": "Call- CDL options & career assessment"
}
I need to create a dictionary like the below:
You have to create a new dictionary with a list and use a for-loop to check whether an entry with the same employeealias, firstname, and lastname already exists, appending the other information to its sublist. If no such entry exists, you create a new item with employeealias, firstname, lastname and the other information.
data = [
    {"employeealias": "101613177", "firstname": "Lion", "lastname": "King",
     "date": "2022-04-21", "type": "Thoughtful Intake",
     "subject": "Email: From You Success Coach"},
    {"employeealias": "101613177", "firstname": "Lion", "lastname": "King",
     "date": "2022-04-21", "type": None,
     "subject": "Call- CDL options & career assessment"},
]

result = {'interactions': []}

for row in data:
    found = False
    for item in result['interactions']:
        if (row["employeealias"] == item["employeealias"]
                and row["firstname"] == item["firstname"]
                and row["lastname"] == item["lastname"]):
            item["activity"].append({
                "date": row["date"],
                "subject": row["subject"],
                "type": row["type"],
            })
            found = True
            break
    if not found:
        result['interactions'].append({
            "employeealias": row["employeealias"],
            "firstname": row["firstname"],
            "lastname": row["lastname"],
            "activity": [{
                "date": row["date"],
                "subject": row["subject"],
                "type": row["type"],
            }]
        })
print(result)
EDIT:
You read the lines as normal text, but you have to convert each line of text to a dictionary using the json module:
import json

data = []
with open("/Users/Downloads/amazon_activity_feed_0005_part_00.json") as a_file:
    for line in a_file:
        line = line.strip()
        dictionary = json.loads(line)
        data.append(dictionary)

print(data)
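Note that this line-by-line json.loads approach assumes one JSON object per line (JSON Lines). If the file instead held a single top-level JSON array, a minimal sketch would be:

import json

with open("/Users/Downloads/amazon_activity_feed_0005_part_00.json") as a_file:
    data = json.load(a_file)  # parse the whole file as one JSON document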
You can create a nested dictionary in Python like this (note that a nested dict needs its own key; school_info here is just an illustrative name):
student = {"name": "Suman", "age": 20, "gender": "male",
           "school_info": {"class": 11, "roll_no": 12}}
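Nested values are then reached by chaining keys, e.g.:

print(student["school_info"]["class"])  # 11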

Convert Pandas Dataframe into multi level nested JSON

I have a dataframe that I need to convert into a nested json format. I can get one level of grouping done, but I don't know how to do a second grouping as well as a nesting beneath that.
I have looked at a lot of different examples, but nothing really gets me to the output I posted below.
import json

import pandas as pd

data = {'Name': ['TEST01', 'TEST02'],
        'Type': ['Tent', 'Tent'],
        'Address': ['123 Happy', '456 Happy'],
        'City': ['Happytown', 'Happytown'],
        'State': ['WA', 'NY'],
        'PostalCode': ['89985', '85542'],
        'Spot': ['A', 'A'],
        'SpotAssigment': ['123', '456'],
        'Cost': [900, 500]
        }

df = pd.DataFrame(data)

# 'records' spelled out -- the old 'r' abbreviation is deprecated
j = (df.groupby(['Name', 'Type'])
       .apply(lambda x: x[['Address', 'City', 'State', 'PostalCode']].to_dict('records'))
       .reset_index(name='addresses')
       .to_json(orient='records'))

print(json.dumps(json.loads(j), indent=2, sort_keys=True))
I want it to look like the below.
[
    {
        "Name": "TEST01",
        "Type": "Tent",
        "addresses": [
            {
                "Address": "123 Happy",
                "City": "Happytown",
                "PostalCode": "89985",
                "State": "WA"
            }
        ],
        "spots": [
            {
                "Spot": "A",
                "SpotAssignments": [
                    {
                        "SpotAssignment": "123",
                        "Cost": 900
                    }
                ]
            }
        ]
    },
    {
        "Name": "TEST02",
        "Type": "Tent",
        "addresses": [
            {
                "Address": "456 Happy",
                "City": "Happytown",
                "PostalCode": "85542",
                "State": "NY"
            }
        ],
        "spots": [
            {
                "Spot": "A",
                "SpotAssignments": [
                    {
                        "SpotAssignment": "456",
                        "Cost": 500
                    }
                ]
            }
        ]
    }
]
try this:
j = (df.groupby(['Name', 'Type'])
       .apply(lambda x: x[['Address', 'City', 'State', 'PostalCode']].to_dict('records'))
       .reset_index(name='addresses'))

k = (df.groupby(['Name', 'Type', 'Spot'])
       .apply(lambda x: x[['SpotAssigment', 'Cost']].to_dict('records'))
       .reset_index(name='SpotAssignments'))

h = (k.groupby(['Name', 'Type'])
      .apply(lambda x: x[['Spot', 'SpotAssignments']].to_dict('records'))
      .reset_index(name='spots'))

m = j.merge(h, how='inner', on=['Name', 'Type'])
result = m.to_dict(orient='records')

from pprint import pprint as pp
pp(result)
This result is a Python list of dicts in the same format that you want; you should be able to dump it as JSON directly.
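For instance, a minimal sketch of serializing that result (recent pandas versions return native Python scalars from to_dict, so it dumps directly):

import json

print(json.dumps(result, indent=2))   # pretty-printed JSON string

with open('nested.json', 'w') as f:   # or write straight to a file
    json.dump(result, f, indent=2)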

Convert multiple string stored in a variable into a single list in python

I hope everyone is doing well.
I need a little help: I need to get all the strings from a variable and store them in a single list in Python.
For example -
I have a JSON file from which I am getting IDs, and all the IDs are stored in a variable called id; when I run print(id) I get:
17298626-991c-e490-bae6-47079c6e2202
17298496-19bd-2f89-7b5f-881921abc632
17298698-3e17-7a9b-b337-aacfd9483b1b
172986ac-d91d-c4ea-2e50-d53700480dd0
172986d0-18aa-6f51-9c62-6cb087ad31e5
172986f4-80f0-5c21-3aee-12f22a5f4322
17298712-a4ac-7b36-08e9-8512fa8322dd
17298747-8cc6-d9d0-8d05-50adf228c029
1729875c-050f-9a99-4850-bb0e6ad35fb0
1729875f-0d50-dc94-5515-b4891c40d81c
17298761-c26b-3ce5-e77e-db412c38a5b4
172987c8-2b5d-0d94-c365-e8407b0a8860
1729881a-e583-2b54-3a52-d092020d9c1d
1729881c-64a2-67cf-d561-6e5e38ed14cb
172987ec-7a20-7eb6-3ebe-a9fb621bb566
17298813-7ac4-258b-d6f9-aaf43f9147b1
17298813-f1ef-d28a-0817-5f3b86c3cf23
17298828-b62b-9ee6-248b-521b0663226e
17298825-7449-2fcb-378e-13671cb4688a
I want all of these values to be stored in a single list.
Can someone please help me out with this?
Below is the code I am using:
import json

with open('requests.json') as f:
    data = json.load(f)

print(type(data))

for i in data:
    if 'traceId' in i:
        id = i['traceId']
        newid = id.split()
        # print(type(newid))
        print(newid)
And below is what my JSON file looks like:
[
    {
        "id": "376287298-hjd8-jfjb-khkf-6479280283e9",
        "submittedTime": 1591692502558,
        "traceId": "17298626-991c-e490-bae6-47079c6e2202",
        "userName": "ABC",
        "onlyChanged": true,
        "description": "Not Required",
        "startTime": 1591694487929,
        "result": "NONE",
        "state": "EXECUTING",
        "paused": false,
        "application": {
            "id": "16b22a09-a840-f4d9-f42a-64fd73fece57",
            "name": "XYZ"
        },
        "applicationProcess": {
            "id": "dihihdosfj9279278yrie8ue",
            "name": "Deploy",
            "version": 12
        },
        "environment": {
            "id": "fkjdshkjdshglkjdshgldshldsh03r937837",
            "name": "DEV"
        },
        "snapshot": {
            "id": "djnglkfdglki98478yhgjh48yr844h",
            "name": "DEV_snapshot"
        }
    },
    {
        "id": "17298495-f060-3e9d-7097-1f86d5160789",
        "submittedTime": 1591692844597,
        "traceId": "17298496-19bd-2f89-7b5f-881921abc632",
        "userName": "UYT",
        "onlyChanged": true,
        "startTime": 1591692845543,
        "result": "NONE",
        "state": "EXECUTING",
        "paused": false,
        "application": {
            "id": "osfodsho883793hgjbv98r3098w",
            "name": "QA"
        },
        "applicationProcess": {
            "id": "owjfoew028r2uoieroiehojehfoef",
            "name": "EDC",
            "version": 5
        },
        "environment": {
            "id": "16cf69c5-4194-e557-707d-0663afdbceba",
            "name": "DTESTU"
        }
    }
]
This is the file I am trying to get the traceId values from.
You could use the simple split method like the following:
ids = '''17298626-991c-e490-bae6-47079c6e2202 17298496-19bd-2f89-7b5f-881921abc632 17298698-3e17-7a9b-b337-aacfd9483b1b 172986ac-d91d-c4ea-2e50-d53700480dd0 172986d0-18aa-6f51-9c62-6cb087ad31e5 172986f4-80f0-5c21-3aee-12f22a5f4322 17298712-a4ac-7b36-08e9-8512fa8322dd 17298747-8cc6-d9d0-8d05-50adf228c029 1729875c-050f-9a99-4850-bb0e6ad35fb0 1729875f-0d50-dc94-5515-b4891c40d81c 17298761-c26b-3ce5-e77e-db412c38a5b4 172987c8-2b5d-0d94-c365-e8407b0a8860 1729881a-e583-2b54-3a52-d092020d9c1d 1729881c-64a2-67cf-d561-6e5e38ed14cb 172987ec-7a20-7eb6-3ebe-a9fb621bb566 17298813-7ac4-258b-d6f9-aaf43f9147b1 17298813-f1ef-d28a-0817-5f3b86c3cf23 17298828-b62b-9ee6-248b-521b0663226e 17298825-7449-2fcb-378e-13671cb4688a'''
l = ids.split(" ")
print(l)
This will give the following result. I assumed that the separator needed is a single space; you can adjust it as appropriate:
['17298626-991c-e490-bae6-47079c6e2202', '17298496-19bd-2f89-7b5f-881921abc632', '17298698-3e17-7a9b-b337-aacfd9483b1b', '172986ac-d91d-c4ea-2e50-d53700480dd0', '172986d0-18aa-6f51-9c62-6cb087ad31e5', '172986f4-80f0-5c21-3aee-12f22a5f4322', '17298712-a4ac-7b36-08e9-8512fa8322dd', '17298747-8cc6-d9d0-8d05-50adf228c029', '1729875c-050f-9a99-4850-bb0e6ad35fb0', '1729875f-0d50-dc94-5515-b4891c40d81c', '17298761-c26b-3ce5-e77e-db412c38a5b4', '172987c8-2b5d-0d94-c365-e8407b0a8860', '1729881a-e583-2b54-3a52-d092020d9c1d', '1729881c-64a2-67cf-d561-6e5e38ed14cb', '172987ec-7a20-7eb6-3ebe-a9fb621bb566', '17298813-7ac4-258b-d6f9-aaf43f9147b1', '17298813-f1ef-d28a-0817-5f3b86c3cf23', '17298828-b62b-9ee6-248b-521b0663226e', '17298825-7449-2fcb-378e-13671cb4688a']
Edit:
You get a list of lists because on each iteration you read only one id. What you need to do is initialize an empty list and append each id to it, like this:
l = []
for i in data:
    if 'traceId' in i:
        id = i['traceId']
        l.append(id)
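Equivalently, a compact sketch of the same collection using a list comprehension:

ids = [i['traceId'] for i in data if 'traceId' in i]
print(ids)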
You can append the ids variable to a list like this:
# list declaration
l1 = []

# this must be inside your loop
l1.append(ids)
I'm assuming you get the id as a str value. Since each id in your example is separated by whitespace, id.split() will return all the ids in one single Python list:
id = """17298626-991c-e490-bae6-47079c6e2202 17298496-19bd-2f89-7b5f-881921abc632
17298698-3e17-7a9b-b337-aacfd9483b1b 172986ac-d91d-c4ea-2e50-d53700480dd0
172986d0-18aa-6f51-9c62-6cb087ad31e5 172986f4-80f0-5c21-3aee-12f22a5f4322
17298712-a4ac-7b36-08e9-8512fa8322dd 17298747-8cc6-d9d0-8d05-50adf228c029
1729875c-050f-9a99-4850-bb0e6ad35fb0 1729875f-0d50-dc94-5515-b4891c40d81c
17298761-c26b-3ce5-e77e-db412c38a5b4 172987c8-2b5d-0d94-c365-e8407b0a8860
1729881a-e583-2b54-3a52-d092020d9c1d 1729881c-64a2-67cf-d561-6e5e38ed14cb
172987ec-7a20-7eb6-3ebe-a9fb621bb566 17298813-7ac4-258b-d6f9-aaf43f9147b1
17298813-f1ef-d28a-0817-5f3b86c3cf23 17298828-b62b-9ee6-248b-521b0663226e
17298825-7449-2fcb-378e-13671cb4688a"""
id_list = id.split()
print(id_list)
Output:
['17298626-991c-e490-bae6-47079c6e2202', '17298496-19bd-2f89-7b5f-881921abc632',
'17298698-3e17-7a9b-b337-aacfd9483b1b', '172986ac-d91d-c4ea-2e50-d53700480dd0',
'172986d0-18aa-6f51-9c62-6cb087ad31e5', '172986f4-80f0-5c21-3aee-12f22a5f4322',
'17298712-a4ac-7b36-08e9-8512fa8322dd', '17298747-8cc6-d9d0-8d05-50adf228c029',
'1729875c-050f-9a99-4850-bb0e6ad35fb0', '1729875f-0d50-dc94-5515-b4891c40d81c',
'17298761-c26b-3ce5-e77e-db412c38a5b4', '172987c8-2b5d-0d94-c365-e8407b0a8860',
'1729881a-e583-2b54-3a52-d092020d9c1d', '1729881c-64a2-67cf-d561-6e5e38ed14cb',
'172987ec-7a20-7eb6-3ebe-a9fb621bb566', '17298813-7ac4-258b-d6f9-aaf43f9147b1',
'17298813-f1ef-d28a-0817-5f3b86c3cf23', '17298828-b62b-9ee6-248b-521b0663226e',
'17298825-7449-2fcb-378e-13671cb4688a']
split() splits on whitespace by default. You can pass the sep argument to use any other separator if needed.
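For example, a quick sketch with a comma-separated string:

csv_ids = "id1,id2,id3"
print(csv_ids.split(","))  # ['id1', 'id2', 'id3']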

How to parse JSON when there are NULL values inside?

I'm trying to parse JSON data, but when I have NULL in some branches of JSON Python gives me an error:
TypeError: 'NoneType' object is not subscriptable.
This situation is OK:
import json

x = '''[{"address":{"city": "city1","street": "street1"}},
        {"address":{"city": "city2","street": "street2"}}]'''

source = json.loads(x)

data = []
for s in source:
    data.append([s['address']['city'],
                 s['address']['street']])

print(data)
And this one gives me an error:
import json

x = '''[{"address":{"city": "city1","street": "street1"}},
        {"address": null},
        {"address":{"city": "city2","street": "street2"}}]'''

source = json.loads(x)

data = []
for s in source:
    data.append([s['address']['city'],
                 s['address']['street']])

print(data)
I would like to get NULL (None) values in the second case. What is the shortest way to do it?
Update #1:
I have a lot of other data, not only "address", and any of them can also be NULL. That is why I can't use if statements (there would be too many different combinations).
Update #2:
To make my question clearer (in the real case I have 25 different parameters, not 3 as below):
[
    {
        "address": {
            "city": "city1",
            "street": "street1"
        },
        "car": null,
        "person": {
            "age": "30",
            "name": "John"
        }
    },
    {
        "address": null,
        "car": {
            "color": "red",
            "year": "2015"
        },
        "person": {
            "age": "31",
            "name": "Peter"
        }
    },
    {
        "address": {
            "city": "city2",
            "street": "street2"
        },
        "car": {
            "color": "green",
            "year": "2017"
        },
        "person": null
    }
]
data.append([s['address']['city'],
             s['address']['street'],
             s['person']['name'],
             s['person']['age'],
             s['car']['year'],
             s['car']['color']])
Here's a generalized way to handle the situation when you have JSON objects nested one level deep that might have NULL values. It makes use of the optional object_hook= keyword argument to pass a callback function to json.loads() (json.load() accepts it as well). In this case, the function converts any None values in the top-level dicts into empty NoneDict dictionary subclass instances.
NoneDicts simply return None as the value of missing keys instead of raising a KeyError. Optimization note: if you never change these objects, i.e. they're read-only, you really only need to create a single global instance and always use it in the converter() function.
import json
from pprint import pprint

class NoneDict(dict):
    """ dict subclass that returns a value of None for missing keys instead
        of raising a KeyError. Note: doesn't add the item to the dictionary.
    """
    def __missing__(self, key):
        return None

def converter(decoded_dict):
    """ Convert any None values in the decoded dict into empty NoneDicts. """
    return {k: NoneDict() if v is None else v for k, v in decoded_dict.items()}

# The following JSON data is equivalent to what you have in Update #2 of your
# question, it's just formatted more compactly.
x = '''
[{"address": {"city": "city1", "street": "street1"},
  "car": null,
  "person": {"age": "30", "name": "John"}},
 {"address": null,
  "car": {"color": "red", "year": "2015"},
  "person": {"age": "31", "name": "Peter"}},
 {"address": {"city": "city2", "street": "street2"},
  "car": {"color": "green", "year": "2017"},
  "person": null}]
'''

source = json.loads(x, object_hook=converter)

data = []
for s in source:
    data.append([s['address']['city'],
                 s['address']['street'],
                 s['person']['name'],
                 s['person']['age'],
                 s['car']['year'],
                 s['car']['color']])

pprint(data)
Output:
[['city1', 'street1', 'John', '30', None, None],
[None, None, 'Peter', '31', '2015', 'red'],
['city2', 'street2', None, None, '2017', 'green']]
Note that the part near the very end could be written like this to make it more "data-driven":
items = (('address', 'city'),
         ('address', 'street'),
         ('person', 'name'),
         ('person', 'age'),
         ('car', 'year'),
         ('car', 'color'))

for s in source:
    data.append([s[k1][k2] for k1, k2 in items])
Handle the None case separately:
for s in source:
    address = s['address']
    data.append(
        [None, None] if address is None
        else [address['city'], address['street']]
    )
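As an alternative sketch, the same idea generalizes with dict.get by treating a null branch as an empty dict, so every leaf falls back to None:

for s in source:
    # a null branch becomes an empty dict, so .get returns None for its leaves
    address = s.get('address') or {}
    car = s.get('car') or {}
    data.append([address.get('city'), address.get('street'),
                 car.get('year'), car.get('color')])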
You'll have to check if address is None before trying to access things from it.
For example:
for s in source:
    if s['address']:
        data.append([s['address']['city'], s['address']['street']])
    else:
        pass  # whatever behaviour you want for None values
The problem is that in the second case s['address'] evaluates to None and it's not subscriptable. You should check that the value is not None and handle that case separately:
import json

x = '''[{"address":{"city": "city1","street": "street1"}},
        {"address": null},
        {"address":{"city": "city2","street": "street2"}}]'''

source = json.loads(x)

data = []
for s in source:
    if s['address'] is not None:
        data.append([s['address']['city'],
                     s['address']['street']])
    else:
        data.append(None)

print(data)
This will print: [['city1', 'street1'], None, ['city2', 'street2']]
Edit:
Try this:
import pandas as pd

# pd.io.json.json_normalize is the older spelling; it's pd.json_normalize since pandas 1.0
df = pd.json_normalize(source)
df = df.where(pd.notnull(df), None)
data = df[[column for column in df.columns if '.' in column]]
print(data.values.tolist())
Output:
[['city1', 'street1', None, None, '30', 'John'], [None, None, 'red', '2015', '31', 'Peter'], ['city2', 'street2', 'green', '2017', None, None]]
