Parsing multidimensional JSON in Python - python

I have an issue with parsing a JSON file. Here is the format I have:
{'metadata': {'timezone': {'location': 'Etc/UTC'},
'serial_number': '123456',
'device_type': 'sensor'},
'timestamp': '2019-08-21T13:57:12.500Z',
'framenumber': '4866274',
'tracked_objects': [{'id': 2491,
'type': 'PERSON',
'position': {'x': -361,
'y': -2933,
'type': 'FOOT',
'coordinate_system': 'REAL_WORLD_IN_MILLIMETER'},
'person_data': {'height': 1295}},
{'id': 2492,
'type': 'PERSON',
'position': {'x': -733,
'y': -2860,
'type': 'FOOT',
'coordinate_system': 'REAL_WORLD_IN_MILLIMETER'},
'person_data': {'height': 1928}},
{'id': 2495,
'type': 'PERSON',
'position': {'x': -922,
'y': -3119,
'type': 'FOOT',
'coordinate_system': 'REAL_WORLD_IN_MILLIMETER'},
'person_data': {'height': 1716}}]}
And I am trying to get the following columns into a dataframe:
timezone, serial_number, id, x and y (which are part of position), and height.
This is the code i used so far:
# Import Dependencies
import pandas as pd
import json
from pandas.io.json import json_normalize
# Load the JSON file. In your case you would point the data stream into the
# json_raw variable. A with-block guarantees the handle is closed even on
# error -- the original `infile = open(...)` was never closed (resource leak).
with open("C:/Users/slavi/Documents/GIT/test2.json", encoding="utf-8") as infile:
    json_raw = json.load(infile)
# Function to flatten a multidimensional JSON file
def flatten_json(nested_json):
    """Flatten arbitrarily nested dicts/lists into a single-level dict.

    Each leaf value is keyed by the '_'-joined path that reaches it; list
    elements contribute their index. E.g. {'a': [{'b': 1}]} -> {'a_0_b': 1}.

    A bare scalar input yields {'': value}; an empty dict/list contributes
    no keys at all.
    """
    out = {}

    def flatten(x, name=''):
        # isinstance() instead of `type(x) is ...` so dict/list subclasses
        # are flattened too; enumerate() replaces the manual index counter.
        if isinstance(x, dict):
            for key in x:
                flatten(x[key], name + key + '_')
        elif isinstance(x, list):
            for i, item in enumerate(x):
                flatten(item, name + str(i) + '_')
        else:
            # name carries a trailing '_' separator; strip it for the key.
            out[name[:-1]] = x

    flatten(nested_json)
    return out
# Flatten the parsed JSON into a single-level {key_path: leaf_value} dict.
json_flat = flatten_json(json_raw)
# Build a one-column DataFrame: each flattened key path becomes an index
# label, its leaf value the single column (named 0 by default).
df = pd.DataFrame.from_dict(json_flat, orient='index')
# Round-trip the index through a column; the net effect is that the index
# gets the name 'index' while the data is unchanged.
df.reset_index(level=0, inplace=True)
df.set_index('index', inplace=True)
df
I used the function to flatten the JSON; however, when I run the code I am getting results like this:
So there should be 3 lines of data for each tracked object and i should retrieve those columns with 3 lines of data under.
Any suggestion on how to adjust my code?

For any kind of JSON parsing to a DataFrame, get acquainted with json_normalize:
import json
from pandas.io.json import json_normalize  # deprecated since pandas 1.0; prefer pd.json_normalize there

# Bug fix: the open mode was the bare name `r` (a NameError at runtime);
# it must be the string 'r' -- or simply omitted, since 'r' is the default.
with open('...', 'r') as f:
    json_raw = json.load(f)

# record_path explodes each tracked object into its own row; meta copies the
# file-level serial number and timestamp onto every row.
df = json_normalize(json_raw, record_path='tracked_objects', meta=[
    ['metadata', 'serial_number'],
    'timestamp'
])
Result:
id type position.x position.y position.type position.coordinate_system person_data.height metadata.serial_number timestamp
0 2491 PERSON -361 -2933 FOOT REAL_WORLD_IN_MILLIMETER 1295 123456 2019-08-21T13:57:12.500Z
1 2492 PERSON -733 -2860 FOOT REAL_WORLD_IN_MILLIMETER 1928 123456 2019-08-21T13:57:12.500Z
2 2495 PERSON -922 -3119 FOOT REAL_WORLD_IN_MILLIMETER 1716 123456 2019-08-21T13:57:12.500Z
Rename the columns as you wish.

Related

What's the fastest way to turn json results from an API into a dataframe?

Below is an example of sports betting app I'm working on.
games.json()['data'] - contains the game id for each sport event for that day. The API then returns the odds for that specific game.
What's the fastest option to take json and turn it into a panda dataframe? currently looking into msgspec.
Some games can have over 5K total bets
# NOTE(review): incrementally concatenating one-row frames is quadratic;
# collecting plain dicts and building the DataFrame once is much faster
# (see the answer below). This restates the original approach verbatim.
master_df = pd.DataFrame()
for game in games.json()['data']:
    odds_params = {'key': api_key, 'game_id': game['id'], 'sportsbook': sportsbooks}
    odds = requests.get(api_url, params=odds_params)
    for entry in odds.json()['data'][0]['odds']:
        temp = pd.DataFrame()
        temp['id'] = [game['id']]
        # Copy the game fields, except the id (already set) and is_live.
        for field, value in game.items():
            if field not in ('id', 'is_live'):
                temp[field] = value
        # Copy the odds fields, renaming its id to odds_id so both ids coexist.
        for field, value in entry.items():
            if field == 'id':
                temp['odds_id'] = value
            else:
                temp[field] = value
        if len(master_df) == 0:
            master_df = temp
        else:
            master_df = pd.concat([master_df, temp])
odds.json response snippet -
{'data': [{'id': '35142-30886-2023-02-08',
'sport': 'basketball',
'league': 'NBA',
'start_date': '2023-02-08T19:10:00-05:00',
'home_team': 'Washington Wizards',
'away_team': 'Charlotte Hornets',
'is_live': False,
'tournament': None,
'status': 'unplayed',
'odds': [{'id': '4BB426518ECF',
'sports_book_name': 'Betfred',
'name': 'Charlotte Hornets',
'price': 135.0,
'checked_date': '2023-02-08T11:46:12-05:00',
'bet_points': None,
'is_main': True,
'is_live': False,
'market_name': '1st Half Moneyline',
'home_rotation_number': None,
'away_rotation_number': None,
'deep_link_url': None,
'player_id': None},
....
By the end of this process, I usually have about 30K records in the dataframe
Here is what I would do.
def _create_record_(game: dict, odds: dict) -> dict:
"""
Warning: THIS MUTATES THE INPUT
"""
odds['id'] = "odds_id"
# the pipe | operator is only available in dicts in recent versions of python
# use dict(**game, **odds) if you get a TypeError
result = game | odds
result.pop("is_live")
return result
def _get_odds(game: dict) -> list:
    """Fetch the list of odds entries for one game from the odds API."""
    # api_key, api_url and sportsbooks come from the surrounding script.
    query = {'key': api_key, 'game_id': game['id'], 'sportsbook': sportsbooks}
    payload = requests.get(api_url, params=query).json()
    return payload['data'][0]['odds']
# Build the full list of flat records first, then create the DataFrame once.
# This avoids the quadratic cost of growing a frame with repeated pd.concat.
records = []
for game in games.json()['data']:
    for odds in _get_odds(game):
        records.append(_create_record_(game, odds))
df = pd.DataFrame(records)
The fact that it is in this list comprehension isn't relevant. An equivalent for-loop would work just as well; the point is you create a list of dicts first, then create your dataframe. This avoids the quadratic time behavior of incrementally creating a dataframe using pd.concat.

Group the data and convert to json data

I have a data frame with 150 rows and sample two rows mentioned below. Need to convert the data to json data like below.
Input:
artwork_id creator_id department_id art_work creator department
0 86508 29993 21 {'id': '86508', 'accession_number': '2015.584'... {'id': '29993', 'role': 'artist', 'description... {'id': '21', 'name': 'Prints'}
1 86508 68000 21 {'id': '86508', 'accession_number': '2015.584'... {'id': '68000', 'role': 'printer', 'descriptio... {'id': '21', 'name': 'Prints'}
desired output:
Attached as image
I have tried using below code
df.groupby(['artwork_id']).agg(lambda x: list(x))
df.to_json(orient = 'records')
Do you get the right format if you do the following:
result = df.to_json(orient="records")
parsed = json.loads(result)
json.dumps(parsed, indent=4)
or
grouped_art=df.groupby(['artwork_id']).agg(lambda x: list(x))
result = grouped_art.to_json(orient="records")
parsed = json.loads(result)
json.dumps(parsed, indent=4)

From list to nested dictionary

There are these lists:
data = ['man', 'man1', 'man2']
key = ['name', 'id', 'sal']
man_res = ['Alexandra', 'RST01', '$34,000']
man1_res = ['Santio', 'RST009', '$45,000']
man2_res = ['Rumbalski', 'RST50', '$78,000']
the expected output will be nested output:
Expected o/p:- {'man':{'name':'Alexandra', 'id':'RST01', 'sal':$34,000},
'man1':{'name':'Santio', 'id':'RST009', 'sal':$45,000},
'man2':{'name':'Rumbalski', 'id':'RST50', 'sal':$78,000}}
An easy way would be using a pandas DataFrame:
import pandas as pd
df = pd.DataFrame([man_res, man1_res, man2_res], index=data, columns=key)
print(df)
df.to_dict(orient='index')
name id sal
man Alexandra RST01 $34,000
man1 Santio RST009 $45,000
man2 Rumbalski RST50 $78,000
{'man': {'name': 'Alexandra', 'id': 'RST01', 'sal': '$34,000'},
'man1': {'name': 'Santio', 'id': 'RST009', 'sal': '$45,000'},
'man2': {'name': 'Rumbalski', 'id': 'RST50', 'sal': '$78,000'}}
Or you could manually merge them using dict + zip
# Build one {field: value} dict per person from `key` and each result list,
# then zip those records against the person names in `data`.
person_records = (dict(zip(key, res)) for res in (man_res, man1_res, man2_res))
d = dict(zip(data, person_records))
d
{'man': {'name': 'Alexandra', 'id': 'RST01', 'sal': '$34,000'},
'man1': {'name': 'Santio', 'id': 'RST009', 'sal': '$45,000'},
'man2': {'name': 'Rumbalski', 'id': 'RST50', 'sal': '$78,000'}}
# Save the three result lists in a 2D list; row i corresponds to data[i].
all_man_res = [man_res, man1_res, man2_res]
print(all_man_res)
# Add it into a dict output: {person: {field: value, ...}, ...}
output = {}
# Bug fix: the original iterated over an undefined name `l`; the person
# keys live in `data` (defined alongside `key` and the *_res lists).
for person, row in zip(data, all_man_res):
    details = {}
    for field, value in zip(key, row):
        details[field] = value
    output[person] = details
output
The pandas dataframe answer provided by NoThInG makes the most intuitive sense. If you are looking to use only the built in python tools, you can do
# Bug fix: the original had mismatched brackets -- `dict(` was never closed.
# Each person's record is built from the field names zipped with their values.
info_list = [dict(zip(key, man)) for man in (man_res, man1_res, man2_res)]
output = dict(zip(data, info_list))

How to convert json to pandas dataframe?

I am new at api programming. I am trying to download data from the moex api.
Here is the code I use:
import requests as re
from io import StringIO
import pandas as pd
import json
session = re.Session()  # NOTE: `re` is requests aliased at import, not the regex module
login = "aaaa"  # placeholder credentials
password = "bbbb"
# Authenticate against MOEX passport; the session stores the auth cookie.
session.get('https://passport.moex.com/authenticate', auth=(login, password))
cookies = {'MicexPassportCert': session.cookies['MicexPassportCert']}
def api_query(engine, market, session, secur, from_start, till_end):
    """Build the MOEX ISS daily-candles URL for one security and date range."""
    return f'https://iss.moex.com/iss/history/engines/{engine}/markets/{market}/sessions/{session}/securities/{secur}/candles.json?from={from_start}&till={till_end}&interval=24&start=0'
# Request daily (interval=24) candles for bond RU000A0JVWL2 over the window.
url = api_query('stock', 'bonds', 'session', 'RU000A0JVWL2', '2020-11-01', '2021-05-01')
response = re.get(url, cookies=cookies)
As a result I have got the following data (part of data)
'history.cursor': {'metadata': {'INDEX': {'type': 'int64'}, 'TOTAL': {'type': 'int64'}, 'PAGESIZE': {'type': 'int64'}}, 'columns': ['INDEX', 'TOTAL', 'PAGESIZE'], 'data': [[0, 32, 100]]}}
I need to convert json format into pandas dataframe. How to do it? As a result I should get dataframe with 1 row and 3 columns.
Thanks in advance
Assuming your json is properly encoded you could try something like this:
import pandas as pd
import numpy as np

# NOTE(review): this example variable shadows the stdlib `json` module name;
# fine for a standalone snippet, avoid in real code.
json = {
    'history.cursor': {
        'metadata': {'INDEX': {'type': 'int64'}, 'TOTAL': {'type': 'int64'}, 'PAGESIZE': {'type': 'int64'}},
        'columns': ['INDEX', 'TOTAL', 'PAGESIZE'],
        'data': [[0, 32, 100]]
    }
}
cursor = json['history.cursor']
columns = cursor['columns']
data = np.array(cursor['data'])
metadata = cursor['metadata']
# One typed column per name: slice column i and cast to its declared dtype.
d = {column: data[:, i].astype(metadata[column]['type'])
     for i, column in enumerate(columns)}
df = pd.DataFrame(d)
print(df)
You should use the pd.read_json() method.
your orientation would likely be 'split'
so
pd.read_json(json,orient='split') where split is your json in the form of dict like {index -> [index], columns -> [columns], data -> [values]}

Python - Extracting values from a nested list

I have a list as shown below:
[{'id': 'id_123',
'type': 'type_1',
'created_at': '2020-02-12T17:45:00Z'},
{'id': 'id_124',
'type': 'type_2',
'created_at': '2020-02-12T18:15:00Z'},
{'id': 'id_125',
'type': 'type_1',
'created_at': '2020-02-13T19:43:00Z'},
{'id': 'id_126',
'type': 'type_3',
'created_at': '2020-02-13T07:00:00Z'}]
I am trying to find how many times type : type_1 occurs and what is the earliest created_at timestamp in that list for type_1
We can achieve this in several steps.
To find the number of times type_1 occurs we can use the built-in filter in tandem with itemgetter.
from operator import itemgetter
def my_filter(item):
    """Return True for records whose 'type' field is 'type_1'."""
    return 'type_1' == item['type']
# Sort key: each record's ISO-8601 timestamp (lexicographic == chronological).
key = itemgetter('created_at')
# Keep only the type_1 records, ordered earliest first.
items = sorted(filter(my_filter, data), key=key)
print(f"Num records is {len(items)}")
print(f"Earliest record is {key(items[0])}")
Num records is 2
Earliest record is 2020-02-12T17:45:00Z
Conversely you can use a generator-comprehension and then sort the generator.
gen = (item for item in data if item['type'] == 'type_1')
items = sorted(gen, key=key)
# rest of the steps are the same...
You could use list comprehension to get all the sublists you're interested in, then sort by 'created_at'.
# The records to search.
l = [
    {'id': 'id_123', 'type': 'type_1', 'created_at': '2020-02-12T17:45:00Z'},
    {'id': 'id_124', 'type': 'type_2', 'created_at': '2020-02-12T18:15:00Z'},
    {'id': 'id_125', 'type': 'type_1', 'created_at': '2020-02-13T19:43:00Z'},
    {'id': 'id_126', 'type': 'type_3', 'created_at': '2020-02-13T07:00:00Z'},
]
# Keep only the type_1 records, earliest first: ISO-8601 timestamps sort
# chronologically when compared as plain strings.
ll = sorted(
    (entry for entry in l if entry['type'] == 'type_1'),
    key=lambda entry: entry['created_at'],
)
print(len(ll))
print(ll[0]['created_at'])
Output:
2
2020-02-12T17:45:00Z
This is one approach using filter and min.
Ex:
# Sample records.
data = [
    {'id': 'id_123', 'type': 'type_1', 'created_at': '2020-02-12T17:45:00Z'},
    {'id': 'id_124', 'type': 'type_2', 'created_at': '2020-02-12T18:15:00Z'},
    {'id': 'id_125', 'type': 'type_1', 'created_at': '2020-02-13T19:43:00Z'},
    {'id': 'id_126', 'type': 'type_3', 'created_at': '2020-02-13T07:00:00Z'},
]
# All records whose type is 'type_1'.
onlytype_1 = [record for record in data if record['type'] == 'type_1']
print(len(onlytype_1))
# Earliest of them: ISO-8601 strings compare chronologically.
print(min(onlytype_1, key=lambda record: record['created_at']))
Or:
# Group the records into buckets keyed by their 'type' field.
temp = {}
for record in data:
    bucket = temp.setdefault(record['type'], [])
    bucket.append(record)
print(len(temp['type_1']))
print(min(temp['type_1'], key=lambda record: record['created_at']))
Output:
2
{'id': 'id_123', 'type': 'type_1', 'created_at': '2020-02-12T17:45:00Z'}
You can just generate a list of all the type_1s using a list comprehension, and then use sort with datetime.strptime to sort the values accordingly:
from datetime import datetime
# Generate a list with only the type_1s' created_at values
type1s = [val['created_at'] for val in vals if val['type']=="type_1"]
# Sort them based on the timestamps
type1s.sort(key=lambda date: datetime.strptime(date, "%Y-%m-%dT%H:%M:%SZ"))
# Print the lowest value
print(type1s[0])
#'2020-02-12T17:45:00Z'
You can use the following function to get the desired output:
from datetime import datetime
def sol(l):
    """Count the 'type_1' records in `l` and locate the earliest one.

    Returns a tuple (count, earliest_created_at_datetime, id_of_earliest).
    Raises ValueError when no 'type_1' record is present (the original
    raised IndexError here via sorted(...)[0]).
    """
    # Collect (timestamp, id) pairs so duplicate ids are not silently lost
    # (the original keyed a dict by id, dropping records that share an id),
    # and use min() instead of a full sort plus a reverse lookup loop.
    type1 = [
        (datetime.strptime(x['created_at'], "%Y-%m-%dT%H:%M:%SZ"), x['id'])
        for x in l
        if x['type'] == 'type_1'
    ]
    if not type1:
        raise ValueError("no 'type_1' records in input")
    date, id_ = min(type1)
    return len(type1), date, id_
sol(l)
This function gives the number of times type ='type_1', corresponding minimum date and its id respectively.
Hope this helps!

Categories