python convert dictionary to dataframe - python

I am trying to convert JSON to a DataFrame, but I cannot get the result I want.
Here is the dictionary and the result I got:
{'20210df12820df1456-ssddsd': {'2': {'num': '2',
'product_name': 'apple',
'product_price': '20900'},
'order': {'add_info': None,
'basket_count': '2',
'deli_price': '2500',
'id': 'nhdd#abvc',
'is_member': 'MEMBER',
'mem_type': 'PERSON',
'order_date': '2021-01-28 20:14:56',
'ordernum': '20210df12820df1456-ssddsd',
'pay_price': '43100',
'reserve': '840',
'start_price': '43100',
'total_product_price': '41800',
'used_emoney': '0',
'used_reserve': '0'},
'pay_history': [{'add_price': '0',
'deli_price': '2500',
'discount_price': '-1200',
'order_price': '43100',
'pay_date': '2021-01-28 '
'20:15:14',
'pay_price': '43100',
'pay_type': 'creditcard',
'paymethod': 'C',
'total_price': '41800',
'used_emoney': '0',
'used_reserve': '0'}],
'payment': {'card_flag': '0000',
'card_partcancel_code': '00',
'card_state': 'Y',
'in_card_price': '43100',
'pay_date': '2021-01-28 20:15:14',
'pay_status': 'Y',
'paymethod': 'C',
'simple_pay': 'NPY'},
'product': {'1': {'num': '1',
'product_name': 'banana',
'product_price': '20900'}}}}
# `response` is defined outside this snippet (presumably an HTTP response
# object with a .json() method — confirm).
json_data = response.json()
# Keep only the value stored under the top-level 'list' key.
result =json_data['list']
# Build a frame from the nested dict, then swap rows and columns.
df = pd.DataFrame(result).transpose()
# Preview the first rows.
df.head()
I would like to turn this dictionary into four DataFrames,
but this is the result I get.
I expected something like the following:
# Desired split: one single-column selection per category.
# Double brackets select a one-column DataFrame rather than a Series.
order = df[['order']]
payment = df[['payment']]
pay_history = df[['pay_history']]
product = df[['product']]
something like this.
any ideas??

Since the data for each category (order, payment, pay history, product) are organized differently, you should consider iterating through each category, adding additional data (such as ordernum for indexing purpose) and putting them in their own list that you will later use to convert them into DataFrame object
import pandas as pd
# Sample payload keyed by order number; each order holds one entry per category.
json_data = {'20210df12820df1456-ssddsd': {'order': {'ordernum': '20210df12820df1456-ssddsd', 'order_date': '2021-01-28 20:14:56', 'is_member': 'MEMBER', 'start_price': '43100', 'pay_price': '43100', 'deli_price': '2500', 'total_product_price': '41800', 'basket_count': '2', 'id': 'nhdd#abvc', 'mem_type': 'PERSON', 'used_emoney': '0', 'used_reserve': '0', 'add_info': None, 'reserve': '840'}, 'payment': {'paymethod': 'C', 'pay_date': '2021-01-28 20:15:14', 'card_state': 'Y', 'pay_status': 'Y', 'simple_pay': 'NPY', 'card_flag': '0000', 'card_partcancel_code': '00', 'in_card_price': '43100'}, 'pay_history': [{'pay_date': '2021-01-28 20:15:14', 'pay_type': 'creditcard', 'total_price': '41800', 'deli_price': '2500', 'discount_price': '-1200', 'add_price': '0', 'order_price': '43100', 'pay_price': '43100', 'used_reserve': '0', 'used_emoney': '0', 'paymethod': 'C'}], 'product': {'1': {'num': '1', 'product_name': 'banana', 'product_price': '20900'}, '2': {'num': '2', 'product_name': 'apple', 'product_price': '20900'}}}}

# One flat list of row-dicts per category; each list becomes its own DataFrame.
order_data = []
payment_data = []
pay_history_data = []
product_data = []

for key in json_data:
    # The order dict already carries its own 'ordernum'.
    order_data.append(json_data[key]['order'])

    # Copy before tagging so the source payload is not mutated.
    payment = dict(json_data[key]['payment'])
    payment['ordernum'] = key
    payment_data.append(payment)

    # Test for the key BEFORE reading it; reading first would raise the very
    # KeyError this guard is meant to prevent.
    if 'pay_history' in json_data[key]:
        pay_history = json_data[key]['pay_history']
        for p in pay_history:
            p_clone = dict(p)
            p_clone['ordernum'] = key
            pay_history_data.append(p_clone)

    if 'product' in json_data[key]:
        product = json_data[key]['product']
        for product_key in product:
            p_clone = dict(product[product_key])
            p_clone['ordernum'] = key
            product_data.append(p_clone)

order_df = pd.DataFrame(order_data)
payment_df = pd.DataFrame(payment_data)
pay_history_df = pd.DataFrame(pay_history_data)
product_df = pd.DataFrame(product_data)
Edit: If you're getting a KeyError exception while iterating through pay_history, it could be that some orders have no pay_history key in their JSON data. You can avoid this by first checking that the key exists before iterating through pay_history (if 'pay_history' in json_data[key]:); the same can be done before iterating through product (if 'product' in json_data[key]:).

What you need is pandas json_normalize function.
I wrote some example codes.
import pandas as pd

# Sample payload keyed by order number.
data = {'20210df12820df1456-ssddsd': {'order': {'ordernum': '20210df12820df1456-ssddsd', 'order_date': '2021-01-28 20:14:56', 'is_member': 'MEMBER',
        'start_price': '43100', 'pay_price': '43100', 'deli_price': '2500', 'total_product_price': '41800',
        'basket_count': '2', 'id': 'nhdd#abvc', 'mem_type': 'PERSON', 'used_emoney': '0', 'used_reserve': '0', 'add_info': None, 'reserve': '840'},
        'payment': {'paymethod': 'C', 'pay_date': '2021-01-28 20:15:14', 'card_state': 'Y', 'pay_status': 'Y', 'simple_pay': 'NPY', 'card_flag': '0000', 'card_partcancel_code': '00', 'in_card_price': '43100'},
        'pay_history': [{'pay_date': '2021-01-28 20:15:14', 'pay_type': 'creditcard', 'total_price': '41800', 'deli_price': '2500', 'discount_price': '-1200', 'add_price': '0', 'order_price': '43100', 'pay_price': '43100', 'used_reserve': '0', 'used_emoney': '0', 'paymethod': 'C'}],
        'product': {'1': {'num': '1', 'product_name': 'banana', 'product_price': '20900'}}, '2': {'num': '2', 'product_name': 'apple', 'product_price': '20900'}}}

# One row per order number, one column per category.
df = pd.DataFrame(data).transpose()

# json_normalize is exposed at the top level as pd.json_normalize; the older
# `pandas.io.json` import location is deprecated. Passing a plain list of
# dicts expands each key of the 'order' dict into its own column.
order = pd.json_normalize(df['order'].tolist())
the result will look like:

Related

Nested dictionary parsing error JSON- TypeError: string indices must be integers

Image of Code
I'm trying to pull the key-value pairs from the dictionary associated with "awayBattingTotals". However, I'm encountering the error below, which I do not know how to fix.
Snippet of the JSON response is below
{
'namefield': '9 Lopez, N SS',
'ab': '3',
'r': '0',
'h': '1',
'doubles': '0',
'triples': '0',
'hr': '0',
'rbi': '0',
'sb': '0',
'bb': '0',
'k': '0',
'lob': '2',
'avg': '.248',
'ops': '.599',
'personId': 670032,
'battingOrder': '900',
'substitution': False,
'note': '',
'name': 'Lopez, N',
'position': 'SS',
'obp': '.305',
'slg': '.294'
}],
'awayBattingTotals': {
'namefield': 'Totals',
'ab': '33',
'r': '2',
'h': '7',
'hr': '1',
'rbi': '2',
'bb': '0',
'k': '8',
'lob': '13',
'avg': '',
'ops': '',
'obp': '',
'slg': '',
'name': 'Totals',
'position': '',
'note': '',
'substitution': False,
'battingOrder': '',
'personId': 0
},
'homeBattingTotals': {
'namefield': 'Totals',
'ab': '34',
'r': '4',
'h': '9',
'hr': '2',
'rbi': '4',
'bb': '1',
'k': '7',
'lob': '13',
'avg': '',
'ops': '',
'obp': '',
'slg': '',
'name': 'Totals',
'position': '',
'note': '',
'substitution': False,
'battingOrder': '',
'personId': 0
},
The below is obtained via
# The return value of this call is discarded (it is not assigned to a name).
statsapi.boxscore_data(662647)
# NOTE(review): `summary` comes from boxscore(), not boxscore_data(). If
# boxscore() returns formatted text rather than a dict, indexing it with a
# string key would raise the TypeError quoted below — confirm against the
# statsapi documentation.
summary = statsapi.boxscore(662647)
From the above, I'm trying to run
summary["awayBattingTotals"]["Totals"]
to pull the below values:
`'awayBattingTotals': {'namefield': 'Totals', 'ab': '33', 'r': '2', 'h': '7', 'hr': '1', 'rbi': '2', 'bb': '0', 'k': '8', 'lob': '13',`
but I keep getting the error below:
`TypeError: string indices must be integers`
As Barmar mentioned, it seemed like the data wasn't behaving as json...
Switching the single to double quotes in the json-like text of the response allows me to reach into it with json.loads() like so:
import json  # imported here so this snippet runs on its own

# Valid JSON: keys and string values are wrapped in double quotes.
mysecond = '''{"awayBattingTotals": {"namefield": "Totals", "ab": "33", "r": "2"}}'''
myload = json.loads(mysecond)
# print() as a function call; on Python 3 the parsed strings are displayed
# without the u'' prefix that Python 2 shows.
print(myload)
Result:
{u'awayBattingTotals': {u'r': u'2', u'ab': u'33', u'namefield': u'Totals'}}
This failed in the same way you described when I cut and pasted the json response you included in your question:
import json
# Same payload as the working example, but quoted with single quotes.
myjson = """{'awayBattingTotals': { 'namefield': 'Totals',
'ab': '33',
'r': '2'}}"""
# NOTE(review): single-quoted keys are not valid JSON, so json.loads is
# expected to reject this text with a ValueError-family decoding error rather
# than a TypeError — confirm the reported output below.
print json.loads(myjson)
result:
TypeError: string indices must be integers, not str

remove repeated values in dictionary

I want to remove the repeated entries from a list of dictionaries after extracting the data I need, which is 'rate' and 'genre'.
a=[{'movie': 'abc', 'rate': '9', 'origin': 'AU', 'genre': 'horror'},
{'movie': 'xyz', 'rate': '7', 'origin': 'NY', 'genre': 'romance'},
{'movie': 'jkl', 'rate': '9', 'origin': 'HK', 'genre': 'horror'},
{'movie': 'qwe', 'rate': '6', 'origin': 'HK', 'genre': 'comedy'},
{'movie': 'vbn', 'rate': '9', 'origin': 'BKK', 'genre': 'romance'}]
# Step 1: reduce every movie dict to just its 'rate' and 'genre' entries.
needed_data=[]
for test in a:
x={}
word=['rate','genre']
for key,value in test.items():
for words in word:
# Copy the pair only when its key is one of the wanted names.
if key == words:
x[key] = value
needed_data.append(x)
# Step 2 (the part that misbehaves): `results` is created once, before the
# loop, so every iteration below writes into the same dict object.
results = {}
filters=[]
for yy in needed_data:
for key,value in yy.items():
# The membership test looks at every value currently stored, whatever its key,
# and a passing value overwrites the entry for `key`.
if value not in results.values():
results[key] = value
# Appends a reference to the shared `results` dict, not a copy, so every list
# entry shows its final contents.
# NOTE(review): indentation was lost in this paste; the reported output (five
# identical dicts) is consistent with this append running once per `yy`.
filters.append(results)
print(filters)
the output from above code is
[{'rate': '9', 'genre': 'romance'},
{'rate': '9', 'genre': 'romance'},
{'rate': '9', 'genre': 'romance'},
{'rate': '9', 'genre': 'romance'},
{'rate': '9', 'genre': 'romance'}]
my desired output would be
[{'rate': '9', 'genre': 'horror'},
{'rate': '7', 'genre': 'romance'},
{'rate': '6', 'genre': 'comedy'},
{'rate': '9', 'genre': 'romance'}]
I would recommend to use pandas for data processing
import pandas as pd
# Load the list of movie dicts into a frame, one row per movie.
df = pd.DataFrame(a)
# Project onto the two columns of interest, then drop exact duplicate rows.
df_dd = df.loc[:, ["genre", "rate"]].drop_duplicates()
# Convert back to a list of {column: value} dicts, one per remaining row.
new_a = df_dd.to_dict("records")
print(new_a)
Output
[{'genre': 'horror', 'rate': '9.'},
{'genre': 'romance', 'rate': '7'},
{'genre': 'horror', 'rate': '9'},
{'genre': 'comedy', 'rate': '6'},
{'genre': 'romance', 'rate': '9'}]
Your data has the strings '9.' and '9'. Do you want it that way?
# Key each record by "<rate formatted to 2 decimals>-<genre>", so '9.' and '9'
# map to the same key; a later record replaces an earlier one stored under
# that key while the key keeps its original position.
z = {}
for x in needed_data:
    z[f"{float(x['rate']):.2f}-{x['genre']}"] = x
list(z.values())
Output
[{'rate': '9', 'genre': 'horror'},
{'rate': '7', 'genre': 'romance'},
{'rate': '6', 'genre': 'comedy'},
{'rate': '9', 'genre': 'romance'}]
This is the easy way to do your task:
# Sample input: one dict per movie.
a = [
    {'movie': 'abc', 'rate': '9.', 'origin': 'AU', 'genre': 'horror'},
    {'movie': 'xyz', 'rate': '7', 'origin': 'NY', 'genre': 'romance'},
    {'movie': 'jkl', 'rate': '9', 'origin': 'HK', 'genre': 'horror'},
    {'movie': 'qwe', 'rate': '6', 'origin': 'HK', 'genre': 'comedy'},
    {'movie': 'vbn', 'rate': '9', 'origin': 'BKK', 'genre': 'romance'},
]
# Keep only the 'rate' and 'genre' entries of every movie, in input order.
c = [{'rate': b['rate'], 'genre': b['genre']} for b in a]
print(c)
So the Output will be:
[{'rate': '9.', 'genre': 'horror'}, {'rate': '7', 'genre': 'romance'}, {'rate': '9', 'genre': 'horror'}, {'rate': '6', 'genre': 'comedy'}, {'rate': '9', 'genre': 'romance'}]

list of dictionary: aggregate value by grouping by inner dictionary key

I have this signature:
def aggregate_by_player_id(input, playerid, fields):
By 'fields', i mean fields to sum up grouping by 'playerID' within the 'input'.
I call the function like this:
aggregate_by_player_id(input, 'player', ['stat1','stat3'])
Input look like this:
[{'player': '1', 'stat1': '3', 'stat2': '4', 'stat3': '5'},
{'player': '1', 'stat1': '1', 'stat2': '4', 'stat3': '1'},
{'player': '2', 'stat1': '1', 'stat2': '2', 'stat3': '3'},
{'player': '2', 'stat1': '1', 'stat2': '2', 'stat3': '1'},
{'player': '3', 'stat1': '4', 'stat2': '1', 'stat3': '6'}]
My output structure is:
nested_dic = {value_of_playerid1: {'playerid': value_of_playerid1, 'stat1': value_of_stat1, 'stat2': value_of_stat2},
value_of_playerid2: {'playerid': value_of_playerid2, 'stat2': value_of_stat2, 'stat2': value_of_stat2},
value_of_playerid3: {'playerid': value_of_playerid3, 'stat3': value_of_stat3, 'stat3': value_of_stat3}}
Hence the output should look like:
{'1': {'player': '1', 'stat1': 4, 'stat3': 6},
'2': {'player': '2', 'stat1': 2, 'stat3': 4},
'3': {'player': '3', 'stat1': 4, 'stat3': 6}}
We can use itertools.groupby for this to group on playerid and then sum values across the fields.
from itertools import groupby
from operator import itemgetter
def aggregate_by_player_id(input_, playerid, fields):
    """Sum the given fields per player.

    Args:
        input_: iterable of dicts, each containing the ``playerid`` key.
        playerid: name of the key whose value identifies a player.
        fields: names of the keys to total; values are converted with
            ``int`` and a missing field counts as 0.

    Returns:
        Dict mapping each player id to ``{playerid: id, field: total, ...}``.

    Raises:
        KeyError: if a row lacks the ``playerid`` key.
    """
    player = itemgetter(playerid)
    output = {}
    # groupby only merges *adjacent* rows with equal keys, so sort first.
    # sorted() works on a copy, leaving the caller's sequence untouched.
    for k, v in groupby(sorted(input_, key=player), key=player):
        data = list(v)
        stats = {playerid: k}
        for field in fields:
            stats[field] = sum(int(d.get(field, 0)) for d in data)
        output[k] = stats
    return output
# Sort the rows by the grouping key before aggregating. `player` exists only
# inside aggregate_by_player_id, so the sort key is built here instead.
data.sort(key=itemgetter('player'))
results = aggregate_by_player_id(data, 'player', ['stat1', 'stat3'])
{'1': {'player': '1', 'stat1': 4, 'stat3': 6},
'2': {'player': '2', 'stat1': 2, 'stat3': 4},
'3': {'player': '3', 'stat1': 4, 'stat3': 6}}
Capturing the result you're after in a single comprehension might be possible, but is likely not very readable. Here's a simple function that does the work:
data = [
{'player': '1', 'stat1': '3', 'stat2': '4', 'stat3': '5'},
{'player': '1', 'stat1': '1', 'stat2': '4', 'stat3': '1'},
{'player': '2', 'stat1': '1', 'stat2': '2', 'stat3': '3'},
{'player': '2', 'stat1': '1', 'stat2': '2', 'stat3': '1'},
{'player': '3', 'stat1': '4', 'stat2': '1', 'stat3': '6'}
]
def aggregate_dicts(ds, id_field, aggr_fields):
    """Total the ``aggr_fields`` of every dict in ``ds``, grouped by ``id_field``.

    Returns a dict mapping each identifier (in order of first appearance) to a
    dict of ``{field: integer sum}``. Values are converted with ``int``; a
    missing key raises ``KeyError``.
    """
    totals = {}
    for record in ds:
        # First sighting of an identifier creates a zeroed accumulator for it.
        bucket = totals.setdefault(record[id_field], dict.fromkeys(aggr_fields, 0))
        for name in aggr_fields:
            bucket[name] += int(record[name])
    return totals
print(aggregate_dicts(data, 'player', ['stat1', 'stat3']))
Result:
{'1': {'stat1': 4, 'stat3': 6}, '2': {'stat1': 2, 'stat3': 4}, '3': {'stat1': 4, 'stat3': 6}}
If you want to repeat the identifier inside the dict, just add this line to the if block:
result[identifier][id_field] = identifier

Finding missing value in JSON using python

I am facing this problem, I want to separate the dataset that has completed and not complete.
So, I want to put flag like 'complete' in the JSON. Example as in output.
This is the data that i have
data=[{'id': 'abc001',
'demo':{'gender':'1',
'job':'6',
'area':'3',
'study':'3'},
'ex_data':{'fam':'small',
'scholar':'2'}},
{'id': 'abc002',
'demo':{'gender':'1',
'edu':'6',
'qual':'3',
'living':'3'},
'ex_data':{'fam':'',
'scholar':''}},
{'id': 'abc003',
'demo':{'gender':'1',
'edu':'6',
'area':'3',
'sal':'3'}
'ex_data':{'fam':'big',
'scholar':NaN}}]
Output
How can I put the flag and also detect NaN and NULL in JSON?
Output=[{'id': 'abc001',
'completed':'yes',
'demo':{'gender':'1',
'job':'6',
'area':'3',
'study':'3'},
'ex_data':{'fam':'small',
'scholar':'2'}},
{'id': 'abc002',
'completed':'no',
'demo':{'gender':'1',
'edu':'6',
'qual':'3',
'living':'3'},
'ex_data':{'fam':'',
'scholar':''}},
{'id': 'abc003',
'completed':'no',
'demo':{'gender':'1',
'edu':'6',
'area':'3',
'sal':'3'}
'ex_data':{'fam':'big',
'scholar':NaN}}]
Something like this should work for you:
data = [
{
'id': 'abc001',
'demo': {
'gender': '1',
'job': '6',
'area': '3',
'study': '3'},
'ex_data': {'fam': 'small',
'scholar': '2'}
},
{
'id': 'abc002',
'demo': {
'gender': '1',
'edu': '6',
'qual': '3',
'living': '3'},
'ex_data': {'fam': '',
'scholar': ''}},
{
'id': 'abc003',
'demo': {
'gender': '1',
'edu': '6',
'area': '3',
'sal': '3'},
'ex_data': {'fam': 'big',
'scholar': None}
}
]
def browse_dict(dico):
    """Flag a record as complete or incomplete, in place.

    Sets ``dico["completed"]`` to ``"no"`` when any top-level value, or any
    value of a dict nested one level down, is missing; otherwise ``"yes"``.
    A value is missing when it is ``None``, the empty string, or a float NaN.

    Args:
        dico: the record to inspect; it is mutated.

    Returns:
        None.
    """
    def _is_missing(value):
        # NaN is the only float that is not equal to itself; checking it this
        # way needs no import and never touches non-float values.
        if isinstance(value, float) and value != value:
            return True
        return value is None or value == ""

    def _values_to_check():
        for value in dico.values():
            yield value
            # Only one level of nesting is inspected.
            if isinstance(value, dict):
                yield from value.values()

    incomplete = any(_is_missing(v) for v in _values_to_check())
    # Written after the scan so the dict is not resized while being iterated.
    dico["completed"] = "no" if incomplete else "yes"
for d in data:
browse_dict(d)
print(d)
Output :
{'id': 'abc001', 'demo': {'gender': '1', 'job': '6', 'area': '3', 'study': '3'}, 'ex_data': {'fam': 'small', 'scholar': '2'}, 'completed': 'yes'}
{'id': 'abc002', 'demo': {'gender': '1', 'edu': '6', 'qual': '3', 'living': '3'}, 'ex_data': {'fam': '', 'scholar': ''}, 'completed': 'no'}
{'id': 'abc003', 'demo': {'gender': '1', 'edu': '6', 'area': '3', 'sal': '3'}, 'ex_data': {'fam': 'big', 'scholar': None}, 'completed': 'no'}
Note that I changed NaN to None, because what you are showing is most likely a Python dictionary, not a JSON file, since you are using data =
In a dictionary, the NaN value would be replaced by None.
If you have to convert your JSON to a dictionary, refer to the JSON module documentation.
Also, please check your dictionary syntax: several of the commas that separate entries are missing.
You should try
The Input is
data = [{'demo': {'gender': '1', 'job': '6', 'study': '3', 'area': '3'}, 'id': 'abc001', 'ex_data': {'scholar': '2', 'fam': 'small'}}, {'demo': {'living': '3', 'gender': '1', 'qual': '3', 'edu': '6'}, 'id': 'abc002', 'ex_data': {'scholar': '', 'fam': ''}}, {'demo': {'gender': '1', 'area': '3', 'sal': '3', 'edu': '6'}, 'id': 'abc003', 'ex_data': {'scholar': None, 'fam': 'big'}}]
Also, NaN will not work in Python, so we have used None instead of NaN.
# For each record: start by assuming it is complete, then look for a falsy value.
for item in data:
item["completed"] = 'yes'
# 'completed' was just added, so it is among the keys visited here; its value
# 'yes' is truthy and does not affect the result.
for key in item.keys():
if isinstance(item[key],dict):
# Nested dicts are inspected one level deep.
for inner_key in item[key].keys():
# `not value` is true for None and '' but also for 0, False and empty containers.
if (not item[key][inner_key]):
item["completed"] = "no"
break
# NOTE(review): indentation was lost in this paste; this `else` presumably
# pairs with the isinstance check above (non-dict values) — confirm.
else:
if (not item[key]):
item["completed"] = "no"
break
The Output will be
data = [{'demo': {'gender': '1', 'job': '6', 'study': '3', 'area': '3'}, 'completed': 'yes', 'id': 'abc001', 'ex_data': {'scholar': '2', 'fam': 'small'}}, {'demo': {'living': '3', 'edu': '6', 'qual': '3', 'gender': '1'}, 'completed': 'no', 'id': 'abc002', 'ex_data': {'scholar': '', 'fam': ''}}, {'demo': {'edu': '6', 'gender': '1', 'sal': '3', 'area': '3'}, 'completed': 'no', 'id': 'abc003', 'ex_data': {'scholar': None, 'fam': 'big'}}]

Merging nested dictionaries by keys preserving different values

I have two list of nested dictionaries with the same keys, but different values:
d1 = {
'distilled ': [{'water': '45'}, {'vodka': '9'}, {'vinegar': '7'}, {'beer': '6'}, {'alcohol': '5'}, {'whiskey': '5'}],
'planted': [{'tree': '30'}, {'seed': '28'}, {'flower': '20'}, {'plant': '7'}, {'bomb': '4'}, {'garden': '2'}]
}
and
d2 = {
'distilled ': [{'water': '14'}, {'vinegar': '9'}, {'wine': '8'}, {'alcohol': '8'}, {'liquid': '7'}, {'whiskey': '6'}, {'beer': '5'}],
'planted ': [{'flower': '28'}, {'tree': '18'}, {'seed': '9'}, {'vegetable': '4'}, {'bush': '3'}, {'grass': '3'}, {'garden': '3'}]
}
I want to merge them in a way that preserves the values and merges only the keys in the nested dictionaries. So that the outcome would look like:
{
'distilled ': [('water', '45', '14'), ('vodka', '9'), ('vinegar', '7', '9'), ('beer', '6', '5'), ('alcohol', '5'), ('whiskey', '5'), ('wine', '8')],
'planted': [('tree', '30', '18'), ('seed', '28', '9'), ('flower', '20', '7'), ('plant', '7'), ('bomb', '4'), ('garden', '2', '3')]
}
I tried merging the two using:
d_merged = { k: [ d1[k], d2_to_compare[k] ] for k in d1 }
but the in the outcome only the values of the first dictionary are presented, obviously. Do you have any ideas on how to fix this? Thank you very much in advance.
I am not sure which way to take from here. Would really appreciate any suggestions! Thanks a lot.
Using dicts that each hold only a single key-value pair is not a good idea, but anyway, we can work it out like this:
d1 = {
    'distilled': [{'water': '45'}, {'vodka': '9'}, {'vinegar': '7'}, {'beer': '6'}, {'alcohol': '5'}, {'whiskey': '5'}],
    'planted': [{'tree': '30'}, {'seed': '28'}, {'flower': '20'}, {'plant': '7'}, {'bomb': '4'}, {'garden': '2'}]
}
d2 = {
    'distilled': [{'water': '14'}, {'vinegar': '9'}, {'wine': '8'}, {'alcohol': '8'}, {'liquid': '7'}, {'whiskey': '6'}, {'beer': '5'}],
    'planted': [{'flower': '28'}, {'tree': '18'}, {'seed': '9'}, {'vegetable': '4'}, {'bush': '3'}, {'grass': '3'}, {'garden': '3'}]
}

# Result: category -> list of (name, value from d1, value from d2) tuples,
# with None standing in for a side that has no entry for that name.
d3 = {}
for k in d1:
    # Flatten each list of single-entry dicts into one {name: value} lookup.
    # Only the first pair of each dict is used; the slice replaces indexing,
    # because dict views cannot be indexed on Python 3.
    k1 = {name: val for d in d1[k] for name, val in list(d.items())[:1]}
    # A category missing from d2 contributes no entries.
    k2 = {name: val for d in d2.get(k, []) for name, val in list(d.items())[:1]}
    # dict.fromkeys keeps first-seen order (d1's names, then names new in d2),
    # which makes the output deterministic, unlike a set union.
    names = dict.fromkeys([*k1, *k2])
    d3[k] = [(name, k1.get(name), k2.get(name)) for name in names]
print(d3)

Categories