open JSON file with pandas DataFrame - python

Sorry for this trivial question:
I have a JSON file first.json and I want to open it with pandas.read_json:
df = pandas.read_json('first.json') does not give me the format I need.
The result I need is one row, with the keys ('name', 'street', 'geo', 'servesCuisine', etc.) as columns. I tried different values for the "orient" parameter, but it doesn't help. How can I achieve the desired DataFrame format?
This is the data in my json file:
{
  "name": "La Continental (San Telmo)",
  "geo": {
    "longitude": "-58.371852",
    "latitude": "-34.616099"
  },
  "servesCuisine": "Italian",
  "containedInPlace": {},
  "priceRange": 450,
  "currenciesAccepted": "ARS",
  "address": {
    "street": "Defensa 701",
    "postalCode": "C1065AAM",
    "locality": "Autonomous City of Buenos Aires",
    "country": "Argentina"
  },
  "aggregateRatings": {
    "thefork": {
      "ratingValue": 9.3,
      "reviewCount": 3
    },
    "tripadvisor": {
      "ratingValue": 4,
      "reviewCount": 350
    }
  },
  "id": "585777"
}

You can try:
import json
import pandas as pd

with open("test.json") as fp:
    s = json.load(fp)

# flattened df, where nested keys -> column named `key1.key2.key_last`
df = pd.json_normalize(s)
# rename cols to the innermost key only (be sure you don't overwrite cols)
cols = {col: col.split(".")[-1] for col in df.columns}
df = df.rename(columns=cols)
output:
name servesCuisine priceRange currenciesAccepted id ... country ratingValue reviewCount ratingValue reviewCount
0 La Continental (San Telmo) Italian 450 ARS 585777 ... Argentina 9.3 3 4 350
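Note that in the output above, ratingValue and reviewCount each appear twice: two different nested paths end in the same innermost key, so the rename leaves duplicate column labels. A minimal guard (a sketch, not part of the original answer) that only shortens the names that stay unique:

from collections import Counter

# count how often each innermost key occurs across the dotted column names
last_keys = Counter(col.split(".")[-1] for col in df.columns)

# shorten only the names that stay unique; keep the full dotted path otherwise
cols = {col: col.split(".")[-1] for col in df.columns
        if last_keys[col.split(".")[-1]] == 1}
df = df.rename(columns=cols)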

You can read the JSON file with plain Python, convert it to a dict object, and then hand-pick data items to create a new dataframe from it.
import json
import pandas as pd

# open/read the json data file
with open("test11.json", "r") as fo:
    inp_json = json.load(fo)  # parse it into a dict (safer than eval)
# Or:
# inp_json = your_json_data

# prepare 1 row of data
axis1 = [[inp_json["name"],
          inp_json["address"]["street"],
          inp_json["geo"],
          inp_json["servesCuisine"],
          inp_json["aggregateRatings"]["tripadvisor"]["ratingValue"],
          inp_json["id"],
          ]]  # for data
axis0 = ['row_1']  # for index
heads = ["name", "add_.street", "geo", "servesCuisine",
         "agg_.tripadv_.ratingValue", "id"]

# create a dataframe using the prepped values above
df0 = pd.DataFrame(axis1, index=axis0, columns=heads)
# see data in selected columns only
df0[["name", "add_.street", "id"]]
name add_.street id
row_1 La Continental (San Telmo) Defensa 701 585777

Related

create dataframe in pandas from a multilevel dict dynamically

I am fetching an API and trying to write the response to CSV, but the catch is that the response is a multilevel dict/JSON; when I convert it to CSV, most of the fields end up looking like lists of dicts or nested dicts.
I am trying this:
def expand(data):
    d = pd.Series(data)
    t = d.index
    for i in t:
        if type(d[i]) in (list, dict):
            expend_s = pd.Series(d[i])
            t.append(expend_s.index)
            d = d.append(expend_s)
            d = d.drop([i])
    return d

df['person'].apply(expand)
but this solution is not working. If you look at the person column, there are multiple dicts or lists of dicts, like:
"birthDate": "0000-00-00",
"genderCode": {
"codeValue": "M",
"shortName": "Male",
"longName": "Male"
},
"maritalStatusCode": {
"codeValue": "M",
"shortName": "Married"
},
"disabledIndicator": False,
"preferredName": {},
"ethnicityCode": {
"codeValue": "4",
"shortName": "4",
"longName": "Not Hispanic or Latino"
},
"raceCode": {
"identificationMethodCode": {},
"codeValue": "1",
"shortName": "White",
"longName": "White"
},
"militaryClassificationCodes": [],
"governmentIDs": [
{
"itemID": "9200037107708_4385",
"idValue": "XXX-XX-XXXX",
"nameCode": {
"codeValue": "SSN",
"longName": "Social Security Number"
},
"countryCode": "US"
}
],
"legalName": {
"givenName": "Jack",
"middleName": "C",
"familyName1": "Abele",
"formattedName": "Abele, Jack C"
},
"legalAddress": {
"nameCode": {
"codeValue": "Personal Address 1",
"shortName": "Personal Address 1",
"longName": "Personal Address 1"
},
"lineOne": "1932 Keswick Lane",
"cityName": "Concord",
"countrySubdivisionLevel1": {
"subdivisionType": "StateTerritory",
"codeValue": "CA",
"shortName": "California"
},
"countryCode": "US",
"postalCode": "94518"
},
"communication": {
"mobiles": [
{
"itemID": "9200037107708_4389",
"nameCode": {
"codeValue": "Personal Cell",
"shortName": "Personal Cell"
},
"countryDialing": "1",
"areaDialing": "925",
"dialNumber": "6860589",
"access": "1",
"formattedNumber": "(925) 686-0589"
}
]
}
}
Your suggestions and advice would be very helpful.
I think we can handle the nested dicts by reading with pd.json_normalize, and the lists of dicts with the function below. First we find the columns that contain lists:
import ast
import pandas as pd

def df_list_and_dict_col(explode_df: pd.DataFrame, primary_key: str,
                         col_name: str, folder: str) -> pd.DataFrame:
    """Convert a column of lists of dicts into a clean dataframe.

    Keyword arguments:
    -----------------
    explode_df -- dataframe whose column we have to expand
    col_name   -- name of the column where most of the data is present
    Return: pd.DataFrame
        the cleaned/expanded dataframe
    """
    explode_df[col_name] = explode_df[col_name].replace('', '[]', regex=True)
    explode_df[col_name] = explode_df[col_name].fillna('[]')
    # make sure the entire column is string before literal_eval
    explode_df[col_name] = explode_df[col_name].astype('string')
    explode_df[col_name] = explode_df[col_name].apply(ast.literal_eval)
    explode_df = explode_df.explode(col_name)
    explode_df = explode_df.reset_index(drop=True)
    normalized_df = pd.json_normalize(explode_df[col_name])
    explode_df = explode_df.join(
        other=normalized_df,
        lsuffix="_left",
        rsuffix="_right"
    )
    explode_df = explode_df.drop(columns=col_name)
    # find columns that still contain lists and recurse into them
    type_df = explode_df.applymap(type)
    col_list = []
    for col in type_df.columns:
        if (type_df[col] == type([])).any():
            col_list.append(col)
    if len(col_list) != 0:
        for col in col_list:
            df_list_and_dict_col(explode_df[[primary_key, col]], primary_key,
                                 col, folder)
            explode_df.drop(columns=col, inplace=True)
            print(f'{col}.csv is done')
    explode_df.to_csv(f'{folder}/{col_name}.csv')
First we find the list columns and pass them to the function one by one; the function checks whether any list remains inside a column, recurses if so, and saves each level to CSV:
type_df = df.applymap(type)
col_list = []
for col in type_df.columns:
    if (type_df[col] == type([])).any():
        col_list.append(col)
for col in col_list:
    # print(col, df[['associateOID', col]])
    df_list_and_dict_col(df[['primary_key', col]].copy(), 'primary_key',
                         col, folder='worker')
    df.drop(columns=col, inplace=True)
Now you have multiple CSVs in normalized format.
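As a quick, self-contained illustration of the explode-then-normalize pattern the function relies on (the data here is made up for the sketch, not taken from the API above):

import pandas as pd

# a column holding lists of dicts, as API responses often do
df = pd.DataFrame({
    "primary_key": [1, 2],
    "phones": [
        [{"kind": "cell", "number": "555-0100"}, {"kind": "home", "number": "555-0199"}],
        [{"kind": "cell", "number": "555-0123"}],
    ],
})

# one row per list element, then flatten each dict into columns
exploded = df.explode("phones").reset_index(drop=True)
flat = pd.json_normalize(exploded["phones"])
result = exploded.drop(columns="phones").join(flat)
print(result)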

flattening JSON file using json_normalize and choosing specific elements to convert to an Excel sheet (Sample Attached)

{
  "currency": {
    "Wpn": {
      "units": "KB_per_sec",
      "type": "scalar",
      "value": 528922.0,
      "direction": "up"
    }
  },
  "catalyst": {
    "Wpn": {
      "units": "ns",
      "type": "scalar",
      "value": 70144.0,
      "direction": "down"
    }
  },
  "common": {
    "Wpn": {
      "units": "ns",
      "type": "scalar",
      "value": 90624.0,
      "direction": "down"
    }
  }
}
So I basically have to convert nested JSON into Excel. My approach was to flatten the JSON file using json_normalize, but as I am new to all this, I always seem to end up with a KeyError.
Here's my code so far, assuming the file is named json.json:
import requests
from pandas import json_normalize

with open('json.json', 'r') as f:
    data = json.load(f)
df = pd.DataFrame(sum([i[['Wpn'], ['value']] for i in data], []))
df.to_excel('Ai.xlsx')
I'm trying to get output in an Excel sheet consisting of currency and common along with their respective values.
I know there are a lot of similar questions, but trust me, I have tried most of them and still didn't get any desirable output... Please just help me with this.
Try:
import json
import pandas as pd
with open('json.json', 'r') as f:
    data = json.load(f)
data = [{'key': k, 'wpn_value': v['Wpn']['value']} for k, v in data.items()]
print(data)
# here, the variable data looks like
# [{'key': 'currency', 'wpn_value': 528922.0}, {'key': 'catalyst', 'wpn_value': 70144.0}, {'key': 'common', 'wpn_value': 90624.0}]
df = pd.DataFrame(data).set_index('key') # set_index() optional
df.to_excel('Ai.xlsx')
The result looks like
          wpn_value
key
currency   528922.0
catalyst    70144.0
common      90624.0
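As a variant, here is a minimal sketch under the same assumptions about the file layout: build the frame directly from the nested dict, keeping every Wpn field, then pick the columns you need:

import json
import pandas as pd

with open('json.json', 'r') as f:
    data = json.load(f)

# one row per top-level key (currency, catalyst, common);
# columns come from the inner "Wpn" dicts (units, type, value, direction)
df = pd.DataFrame({k: v['Wpn'] for k, v in data.items()}).T
df[['value']].to_excel('Ai.xlsx')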

How to normalize json file containing a list (that should be kept as a list) in Python | Pandas?

I'm trying to use the json_normalize function to convert a json file into a dataframe.
Source JSON
The json is a list of dictionaries that look something like this:
{
  "sport_key": "basketball_ncaab",
  "sport_nice": "NCAAB",
  "teams": [
    "Bryant Bulldogs",
    "Wagner Seahawks"
  ],
  "commence_time": 1608152400,
  "home_team": "Bryant Bulldogs",
  "sites": [
    {
      "site_key": "marathonbet",
      "site_nice": "Marathon Bet",
      "last_update": 1608156452,
      "odds": {
        "h2h": [
          1.28,
          3.54
        ]
      }
    },
    {
      "site_key": "sport888",
      "site_nice": "888sport",
      "last_update": 1608156452,
      "odds": {
        "h2h": [
          1.13,
          5.8
        ]
      }
    },
    {
      "site_key": "unibet",
      "site_nice": "Unibet",
      "last_update": 1608156434,
      "odds": {
        "h2h": [
          1.13,
          5.8
        ]
      }
    }
  ],
  "sites_count": 3
}
The problem is that one of the future columns contains a list (which should be the case), but including this column in the meta part of the json_normalize function throws the following error:
ValueError: operands could not be broadcast together with shape (22,) (11,)
The error appears when I try to add "teams" to the list in the following code:
pd.json_normalize(data, 'sites', ['sport_key', 'sport_nice', 'home_team', 'teams'])
Assuming data is a list of dictionaries, you can still use json_normalize, but you have to assign the teams column separately for each corresponding dictionary in data:
def normalize(d):
    return pd.json_normalize(d, 'sites', ['sport_key', 'sport_nice', 'home_team'])\
             .assign(teams=[d['teams']] * len(d['sites']))

df = pd.concat([normalize(d) for d in data], ignore_index=True)
Alternatively you can try:
data = [{**d, 'teams': ','.join(d['teams'])} for d in data]
df = pd.json_normalize(data, 'sites', ['sport_key', 'sport_nice', 'home_team', 'teams'])
df['teams'] = df['teams'].str.split(',')
Result:
site_key site_nice last_update odds.h2h sport_key sport_nice home_team teams
0 marathonbet Marathon Bet 1608156452 [1.28, 3.54] basketball_ncaab NCAAB Bryant Bulldogs [Bryant Bulldogs, Wagner Seahawks]
1 sport888 888sport 1608156452 [1.13, 5.8] basketball_ncaab NCAAB Bryant Bulldogs [Bryant Bulldogs, Wagner Seahawks]
2 unibet Unibet 1608156434 [1.13, 5.8] basketball_ncaab NCAAB Bryant Bulldogs [Bryant Bulldogs, Wagner Seahawks]
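Another workaround (a sketch, assuming data is the full list of such game records) is to tag every record with its own index, pass that index through meta, and merge the list-valued teams column back on afterwards:

import pandas as pd

# tag each record so its sites can be matched back to it
for i, d in enumerate(data):
    d['_rec'] = i

sites = pd.json_normalize(data, 'sites',
                          ['_rec', 'sport_key', 'sport_nice', 'home_team'])
teams = pd.DataFrame({'_rec': range(len(data)),
                      'teams': [d['teams'] for d in data]})
df = sites.merge(teams, on='_rec').drop(columns='_rec')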

How to loop over data in JSON?

I had a MySQL database stored this way: Company_name, employee1, employee2, employee3.
When I input a company name, the code looks for the company name in my database, then loops over employee1, employee2, and employee3 to check if one of them is free in my calendar.
This was my code to check the employees:
for i in range(3):
    employee = row[i+1]
How do I translate this loop so it can read a JSON structure?
Example of my structure:
[
  {
    "id": 1,
    "name_company": "Acier Michel",
    "inspecteur1": "Hou, L",
    "inspecteur2": "Caana, C",
    "inspecteur3": "Luc, C",
    "type": "Water",
    "location": "Laval"
  },
  {
    "id": 2,
    "name_company": "Aciers ABC Inc.",
    "inspecteur1": "Vali, M",
    "inspecteur2": "Alemane, K",
    "inspecteur3": "laszik, M",
    "type": "NA",
    "location": "St-Joseph de Sorel"
  }
]
I want to be able to iterate through inspecteur1, inspecteur2 and inspecteur3.
First, translate the JSON into a Python object:
import json

userList = json.loads(yourJsonString)
Then iterate over the list:
for user in userList:
    print(user)
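To mirror the original for i in range(3) loop over the three inspector fields, you can build the key names dynamically (a small sketch based on the structure shown above):

import json

companies = json.loads(yourJsonString)
for company in companies:
    for i in range(1, 4):  # inspecteur1 .. inspecteur3
        inspecteur = company[f"inspecteur{i}"]
        print(company["name_company"], inspecteur)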
The data is a list of dictionaries
Use pandas
This assumes your list of dictionaries is in a file
import pandas as pd
import json
from pathlib import Path

# path to file
p = Path(r'c:\path_to_file\test.json')

# read the file
with p.open('r', encoding='utf-8') as f:
    data = json.loads(f.read())

# load into pandas
df = pd.DataFrame(data)
print(df)
id name_company inspecteur1 inspecteur2 inspecteur3 type location
1 Acier Michel Hou, L Caana, C Luc, C Water Laval
2 Aciers ABC Inc. Vali, M Alemane, K laszik, M NA St-Joseph de Sorel
# search the dataframe
search = df[['inspecteur1', 'inspecteur2', 'inspecteur3']][df.name_company == 'Aciers ABC Inc.']
print(search)
inspecteur1 inspecteur2 inspecteur3
Vali, M Alemane, K laszik, M
Note, addressing the comment:
With search you have access to the desired values of inspecteur1-3.
search.values returns a numpy array, which can be iterated through.
There is not enough information in the question to offer a more comprehensive solution.
for name in search.values[0]:
    print(name)
Vali, M
Alemane, K
laszik, M
Additionally, the dataframe can be updated with additional columns and or rows and saved back in to a file.
df.to_json('test.json', orient='records')

python transform data from csv to array of dictionaries and group by field value

I have csv like this:
id,company_name,country,country_id
1,batstop,usa, xx
2,biorice,italy, yy
1,batstop,italy, yy
3,legstart,canada, zz
I want an array of dictionaries to import into Firebase. I need to group the different country information for the same company into a nested list of dictionaries. This is the desired output:
[{'id': '1', 'agency_name': 'batstop',
  'countries': [{'country': 'usa', 'country_id': 'xx'},
                {'country': 'italy', 'country_id': 'yy'}]},
 {'id': '2', 'agency_name': 'biorice',
  'countries': [{'country': 'italy', 'country_id': 'yy'}]},
 {'id': '3', 'agency_name': 'legstart',
  'countries': [{'country': 'canada', 'country_id': 'zz'}]}]
Recently I had a similar task; the groupby function from itertools and the itemgetter function from operator - both in the Python standard library - helped me a lot. Here's the code for your csv; note how important it is to define the primary keys of your csv dataset.
import csv
import json
from operator import itemgetter
from itertools import groupby

primary_keys = ['id', 'company_name']

# Start extraction
with open('input.csv', 'r') as file:
    # Read data from csv
    reader = csv.DictReader(file)
    # Sort data according to the primary keys
    reader = sorted(reader, key=itemgetter(*primary_keys))

# Create a list of tuples, each containing a dict of the group's
# primary keys and values, and a list of the group's ordered dicts
groups = [(dict(zip(primary_keys, _[0])), list(_[1]))
          for _ in groupby(reader, key=itemgetter(*primary_keys))]

# Create formatted dicts to be converted into firebase objects
group_dicts = []
for group in groups:
    group_dict = {
        "id": group[0]['id'],
        "agency_name": group[0]['company_name'],
        "countries": [
            dict(country=_['country'], country_id=_['country_id']) for _ in group[1]
        ],
    }
    group_dicts.append(group_dict)

print("\n".join([json.dumps(_, indent=2) for _ in group_dicts]))
Here's the output:
{
  "id": "1",
  "agency_name": "batstop",
  "countries": [
    {
      "country": "usa",
      "country_id": " xx"
    },
    {
      "country": "italy",
      "country_id": " yy"
    }
  ]
}
{
  "id": "2",
  "agency_name": "biorice",
  "countries": [
    {
      "country": "italy",
      "country_id": " yy"
    }
  ]
}
{
  "id": "3",
  "agency_name": "legstart",
  "countries": [
    {
      "country": "canada",
      "country_id": " zz"
    }
  ]
}
There's no external library needed - hope it suits you well!
You can try this; you may have to change a few parts to get it working with your csv, but I hope it's enough to get you started:
csv = [
    "1,batstop,usa, xx",
    "2,biorice,italy, yy",
    "1,batstop,italy, yy",
    "3,legstart,canada, zz"
]

output = {}  # dictionary useful to avoid searching the list for existing ids

# Parse each row
for line in csv:
    cols = line.split(',')
    id = int(cols[0])
    agency_name = cols[1]
    country = cols[2]
    country_id = cols[3]
    if id in output:
        # append a dict (not a list) so 'countries' stays a flat list of dicts
        output[id]['countries'].append({'country': country,
                                        'country_id': country_id})
    else:
        output[id] = {'id': id,
                      'agency_name': agency_name,
                      'countries': [{'country': country,
                                     'country_id': country_id}]
                      }

# Put into a list
json_output = []
for key in output.keys():
    json_output.append(output[key])

# Check output
for row in json_output:
    print(row)
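If pandas is already available, the same grouping fits in a few lines (a sketch assuming the CSV above is saved as input.csv; skipinitialspace strips the stray spaces before xx/yy/zz):

import pandas as pd

df = pd.read_csv('input.csv', dtype=str, skipinitialspace=True)

# collect each company's rows into a list of {country, country_id} dicts
countries = (df.groupby(['id', 'company_name'])[['country', 'country_id']]
               .apply(lambda g: g.to_dict('records')))

result = (countries.rename('countries')
                   .reset_index()
                   .rename(columns={'company_name': 'agency_name'})
                   .to_dict('records'))
print(result)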
