Loop through JSON object and store results in pandas dataframe - python

I have a JSON object that looks like this:
data = {'A': {'code': 'Ok',
'tracepoints': [None,
None,
{'alternatives_count': 0,
'location': [-122.419189, 37.753805],
'distance': 28.078003,
'hint': '5Qg7hUqpFQA2AAAAOgAAAAwAAAAPAAAAiVMWQq2VIEIAuABB7FgoQTYAAAA6AAAADAAAAA8AAAD4RAAACwi0-M0TQALvB7T4yRRAAgEAXwX5Wu6N',
'name': '23rd Street',
'matchings_index': 0,
'waypoint_index': 0},
{'alternatives_count': 0,
'location': [-122.417866, 37.75389],
'distance': 26.825184,
'hint': 'K8w6BRinFYAdAAAACwAAAA0AAAAAAAAAIxmmQTSs6kCiuRFBAAAAAB0AAAALAAAADQAAAAAAAAD4RAAANg20-CIUQAJNDbT4MRNAAgIAnxD5Wu6N',
'name': '23rd Street',
'matchings_index': 0,
'waypoint_index': 1},
{'alternatives_count': 0,
'location': [-122.416896, 37.75395],
'distance': 16.583412,
'hint': 'Jcw6BSzMOoUqAAAAQwAAABAAAAANAAAA0i_uQb3SOEKKPC9BG1EaQSoAAABDAAAAEAAAAA0AAAD4RAAAABG0-F4UQALyELT48xRAAgEAnxD5Wu6N',
'name': '23rd Street',
'matchings_index': 0,
'waypoint_index': 2},
{'alternatives_count': 7,
'location': [-122.415502, 37.754028],
'distance': 10.013916,
'hint': 'Jsw6hbN6kQBmAAAACAAAABAAAAANAAAAQOKOQg89nkCKPC9BEMcOQWYAAAAIAAAAEAAAAA0AAAD4RAAAcha0-KwUQAJ6FrT4UhRAAgEAbwX5Wu6N',
'name': '23rd Street',
'matchings_index': 0,
'waypoint_index': 3}],
'matchings': [{'duration': 50.6,
'distance': 325.2,
'weight': 50.6,
'geometry': 'y{h_gAh~znhF}#k[OmFMoFcAea#IeD[uMAYKsDMsDAe#}#u_#g#aTMwFMwFwAqq#',
'confidence': 0.374625,
'weight_name': 'routability',
'legs': [{'steps': [],
'weight': 18.8,
'distance': 116.7,
'annotation': {'nodes': [1974590926,
4763953263,
65359046,
4763953265,
5443374298,
2007343352]},
'summary': '',
'duration': 18.8},
{'steps': [],
'weight': 12.2,
'distance': 85.6,
'annotation': {'nodes': [5443374298,
2007343352,
4763953266,
65359043,
4763953269,
2007343354,
4763953270]},
'summary': '',
'duration': 12.2},
{'steps': [],
'weight': 19.6,
'distance': 122.9,
'annotation': {'nodes': [2007343354,
4763953270,
65334199,
4763953274,
2007343347]},
'summary': '',
'duration': 19.6}]}]},
'B': {'code': 'Ok',
'tracepoints': [{'alternatives_count': 0,
'location': [-122.387971, 37.727587],
'distance': 11.53267,
'hint': 'xHWRAEJ2kYALAAAArQAAAA4AAAAsAAAAnpH1QDVG8EJWgBdBa2v0QQsAAACtAAAADgAAACwAAAD4RAAA_YG0-GOtPwJKgrT4t60_AgIA3wf5Wu6N',
'name': 'Underwood Avenue',
'matchings_index': 0,
'waypoint_index': 0},
{'alternatives_count': 0,
'location': [-122.388563, 37.727175],
'distance': 13.565054,
'hint': 'w3WRgBuxOgVPAAAACAAAABMAAAASAAAA7ONaQo4CrUDv7U1BJdFAQU8AAAAIAAAAEwAAABIAAAD4RAAArX-0-MerPwIsgLT4gqs_AgIAbw35Wu6N',
'name': 'Jennings Street',
'matchings_index': 0,
'waypoint_index': 1},
{'alternatives_count': 1,
'location': [-122.388478, 37.725984],
'distance': 9.601917,
'hint': 't3WRABexOoWcAAAAbAAAABEAAAALAAAAdujYQqu4lUJXHD1B9-ruQJwAAABsAAAAEQAAAAsAAAD4RAAAAoC0-CCnPwJCgLT4Zqc_AgIAHxP5Wu6N',
'name': 'Wallace Avenue',
'matchings_index': 0,
'waypoint_index': 2}],
'matchings': [{'duration': 50,
'distance': 270.4,
'weight': 50,
'geometry': 'euu}fAd_~lhFoAlCMTuAvCvC|Bh#`#hXbUnAdADBhDzCzClCXVzZnW\\X~CnC~#qBLWnWej#',
'confidence': 1e-06,
'weight_name': 'routability',
'legs': [{'steps': [],
'weight': 17.8,
'distance': 84.8,
'annotation': {'nodes': [5443147626,
6360865540,
6360865536,
65307580,
6360865535,
6360865539,
6360865531]},
'summary': '',
'duration': 17.8},
{'steps': [],
'weight': 32.2,
'distance': 185.6,
'annotation': {'nodes': [6360865539,
6360865531,
6360865525,
65343521,
6360865527,
6360865529,
6360865523,
6360865520,
65321110,
6360865519,
6360865522,
6376329343]},
'summary': '',
'duration': 32.2}]}]},
'C': {'code': 'Ok',
'tracepoints': [None,
None,
{'alternatives_count': 0,
'location': [-122.443682, 37.713254],
'distance': 6.968076,
'hint': 'QXo6hUR6OgUAAAAANQAAAAAAAAAkAAAAAAAAAOCMMUEAAAAA_Z1yQQAAAAAbAAAAAAAAACQAAAD4RAAAXqiz-GZ1PwKiqLP4hnU_AgAAzxL5Wu6N',
'name': '',
'matchings_index': 0,
'waypoint_index': 0},
{'alternatives_count': 0,
'location': [-122.442428, 37.714335],
'distance': 16.488956,
'hint': 'E3o6BVRukYAJAAAAIgAAAGgAAAAUAAAA2RnSQL_5uUEPjI9CBTlaQQkAAAAiAAAAaAAAABQAAAD4RAAARK2z-J95PwKTrLP4b3k_AgEAXxX5Wu6N',
'name': 'Allison Street',
'matchings_index': 0,
'waypoint_index': 1},
{'alternatives_count': 1,
'location': [-122.441751, 37.712761],
'distance': 17.311636,
'hint': 'Fno6hRl6OgWZAAAANwAAAAAAAAAKAAAAH4vUQgKXFkIAAAAAXtbYQJkAAAA3AAAAAAAAAAoAAAD4RAAA6a-z-HlzPwKjsLP4q3M_AgAAHwr5Wu6N',
'name': 'Allison Street',
'matchings_index': 0,
'waypoint_index': 2}],
'matchings': [{'duration': 64.1,
'distance': 420.1,
'weight': 66.7,
'geometry': 'kuy|fAbyjphFcBxEmE`FqJkKiBqBuP}Qgc#ie#eAiAcB}ArA_Eb#mAjKkDnBo#fe#mOrw#kW',
'confidence': 7.3e-05,
'weight_name': 'routability',
'legs': [{'steps': [],
'weight': 40.1,
'distance': 235.2,
'annotation': {'nodes': [5440513673,
5440513674,
5440513675,
65363070,
1229920760,
65307726,
6906452420,
1229920717,
65361047,
1229920749,
554163599,
3978809925]},
'summary': '',
'duration': 37.5},
{'steps': [],
'weight': 26.6,
'distance': 184.9,
'annotation': {'nodes': [554163599, 3978809925, 65345518, 8256268328]},
'summary': '',
'duration': 26.6}]}]}}
I would like to extract the values under the key nodes per user (A, B and C) and store these values in a pandas dataframe, together with the corresponding user. Like below:
value user
1974590926 A
4763953263 A
65359046 A
4763953265 A
5443374298 A
2007343352 A
5443374298 A
2007343352 A
4763953266 A
65359043 A
4763953269 A
2007343354 A
4763953270 A
2007343354 A
4763953270 A
65334199 A
4763953274 A
2007343347 A
5443147626 B
6360865540 B
6360865536 B
65307580 B
6360865535 B
6360865539 B
6360865531 B
6360865539 B
6360865531 B
6360865525 B
65343521 B
6360865527 B
6360865529 B
6360865523 B
6360865520 B
65321110 B
6360865519 B
6360865522 B
6376329343 B
5440513673 C
5440513674 C
5440513675 C
65363070 C
1229920760 C
65307726 C
6906452420 C
1229920717 C
65361047 C
1229920749 C
554163599 C
3978809925 C
554163599 C
3978809925 C
65345518 C
8256268328 C
I am able to extract and store only the nodes belonging to user C in a pandas dataframe with the code below. However, I struggle to add the user column and to collect the other users' nodes with their corresponding user. Any ideas?
import pandas as pd

values_df = pd.DataFrame({'value': {}})
for leg in output['C']['matchings'][0]['legs']:
    result = leg['annotation']['nodes']
    values_temp = pd.DataFrame(result, columns=['value'])
    values_df = values_df.append(values_temp, ignore_index=True)
values_df.value = values_df.value.astype(int)
values_df
value
0 5440513673
1 5440513674
2 5440513675
3 65363070
4 1229920760
5 65307726
6 6906452420
7 1229920717
8 65361047
9 1229920749
10 554163599
11 3978809925
12 554163599
13 3978809925
14 65345518
15 8256268328

You can use json_normalize() with record_path and then concat() the users:
dfs = []
for user in output.keys():
    df = pd.json_normalize(output, record_path=[user, 'matchings', 'legs', 'annotation', 'nodes'])
    df['user'] = user
    dfs.append(df)
nodes_df = pd.concat(dfs).rename(columns={0: 'node'})
# node user
# 1974590926 A
# 4763953263 A
# 65359046 A
# ... ...
# 3978809925 C
# 65345518 C
# 8256268328 C
If there are some users with missing matchings, you can check if 'matchings' in output[user]:
dfs = []
for user in output.keys():
    if 'matchings' in output[user]:
        df = pd.json_normalize(output, record_path=[user, 'matchings', 'legs', 'annotation', 'nodes'])
        df['user'] = user
        dfs.append(df)
nodes_df = pd.concat(dfs).rename(columns={0: 'node'})
If the output keys are like ('2018-02-03', 'A') and you're iterating them as trip, you need to access its date and user as trip[0] and trip[1]:
dfs = []
for trip in output.keys():
    if 'matchings' in output[trip]:
        df = pd.json_normalize(output, record_path=[trip, 'matchings', 'legs', 'annotation', 'nodes'])
        df['date'] = trip[0]
        df['user'] = trip[1]
        dfs.append(df)
nodes_df = pd.concat(dfs).rename(columns={0: 'node'})
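If you prefer a single expression over the explicit loop, the same recombination can be written with a dict comprehension and one concat(); this is a sketch under the same assumptions (the question's data dict is what the answers here call output):
nodes_df = (
    pd.concat({user: pd.json_normalize(output, record_path=[user, 'matchings', 'legs', 'annotation', 'nodes'])
               for user in output})
      .droplevel(-1)          # drop the per-user row numbers
      .rename_axis('user')    # the concat keys become the user index
      .rename(columns={0: 'node'})
      .reset_index()
)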

We want to pull all the node values out of legs.
If you want the simplest way, with just a for loop:
nodes = []
user = []
for i in output.keys():
    for j in output[i]['matchings'][0]['legs']:
        for k in j['annotation']['nodes']:
            nodes.append(k)
            user.append(i)
d = {'nodes': nodes, 'user': user}
df = pd.DataFrame(data=d)
print(df)

You could use the jmespath module to extract the data before recombining it in the dataframe; you should get some speedup, since the iteration happens within the dictionary.
The short summary for jmespath: to access a key, use a dot; if the data is within a list, use [] to access it (a small demonstration follows the code and output below):
# pip install jmespath
import jmespath
from itertools import chain

query = {letter: jmespath.compile(f"{letter}.matchings[].legs[].annotation.nodes")
         for letter in ("A", "B", "C")}
result = {letter: pd.DataFrame(chain.from_iterable(expression.search(output)),
                               columns=['node'])
          for letter, expression in query.items()}
result = pd.concat(result).droplevel(-1).rename_axis(index='user').reset_index()
result.head(15)
user node
0 A 1974590926
1 A 4763953263
2 A 65359046
3 A 4763953265
4 A 5443374298
5 A 2007343352
6 A 5443374298
7 A 2007343352
8 A 4763953266
9 A 65359043
10 A 4763953269
11 A 2007343354
12 A 4763953270
13 A 2007343354
14 A 4763953270
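To make the dot / [] rule above concrete, here are a few probe expressions against the sample data (the return values are taken from the dict in the question):
jmespath.search("A.code", output)                       # 'Ok'
jmespath.search("A.matchings[0].confidence", output)    # 0.374625
jmespath.search("A.matchings[].legs[].weight", output)  # [18.8, 12.2, 19.6]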

Related

Change column format of DF, where some columns are dicts

I'm new to pandas and I need help. Below I describe my DF, which I need to change.
id title \
0 121852 {'en': 'Hard Fork'}
1 123209 {'en': 'Quarterly Public Meeting'}
2 122436 {'en': 'Luxy NFT Marketplace'}
3 122995 {'en': 'Poloniex Listing'}
4 123391 {'en': 'Staking 3.0 Release'}
5 123355 {'en': 'BitMart Listing'}
6 122819 {'en': 'Amazy IGO'}
7 123470 {'en': 'YouTube Live AMA'}
8 123392 {'en': 'AMA'}
9 123319 {'en': 'LBank Listing'}
10 123306 {'en': 'Community Call'}
11 123465 {'en': 'Digifinex Listing'}
12 123469 {'en': 'MEXC Global Listing'}
13 123512 {'en': 'Metarun & Fabwelt AMA'}
14 123460 {'en': 'Digifinex Listing'}
15 123489 {'en': 'BitMart Listing'}
coins \
0 [{'id': 'gxchain', 'coingecko_id': '', 'name': 'GXChain', 'rank': 442, 'symbol': 'GXC', 'fullname': 'GXChain (GXC)'}, {'id': 'rei-network', 'coingecko_id': '', 'name': 'REI Network', 'rank': 376, 'symbol': 'REI', 'fullname': 'REI Network (REI)'}]
1 [{'id': 'filecoin', 'coingecko_id': '', 'name': 'Filecoin', 'rank': 45, 'symbol': 'FIL', 'fullname': 'Filecoin (FIL)'}]
2 [{'id': 'luxy', 'coingecko_id': '', 'name': 'Luxy', 'rank': 0, 'symbol': 'LUXY', 'fullname': 'Luxy (LUXY)'}, {'id': 'syscoin', 'coingecko_id': '', 'name': 'Syscoin', 'rank': 240, 'symbol': 'SYS', 'fullname': 'Syscoin (SYS)'}]
3 [{'id': 'bitkub-coin', 'coingecko_id': '', 'name': 'Bitkub Coin', 'rank': 125, 'symbol': 'KUB', 'fullname': 'Bitkub Coin (KUB)'}]
4 [{'id': 'sidus', 'coingecko_id': '', 'name': 'Sidus', 'rank': 1231, 'symbol': 'SIDUS', 'fullname': 'Sidus (SIDUS)'}]
5 [{'id': 'solve', 'coingecko_id': '', 'name': 'SOLVE', 'rank': 693, 'symbol': 'SOLVE', 'fullname': 'SOLVE (SOLVE)'}]
6 [{'id': 'seedify-fund', 'coingecko_id': '', 'name': 'Seedify.fund', 'rank': 389, 'symbol': 'SFUND', 'fullname': 'Seedify.fund (SFUND)'}]
7 [{'id': 'oasis-network', 'coingecko_id': '', 'name': 'Oasis Network', 'rank': 134, 'symbol': 'ROSE', 'fullname': 'Oasis Network (ROSE)'}]
8 [{'id': 'dydx', 'coingecko_id': '', 'name': 'dYdX', 'rank': 157, 'symbol': 'DYDX', 'fullname': 'dYdX (DYDX)'}]
9 [{'id': 'grove', 'coingecko_id': '', 'name': 'Grove', 'rank': 0, 'symbol': 'GVR', 'fullname': 'Grove (GVR)'}]
10 [{'id': 'perpetual-protocol', 'coingecko_id': '', 'name': 'Perpetual Protocol', 'rank': 373, 'symbol': 'PERP', 'fullname': 'Perpetual Protocol (PERP)'}]
11 [{'id': 'new-paradigm-assets-solution', 'coingecko_id': '', 'name': 'New Paradigm Assets Solution', 'rank': 0, 'symbol': 'NPAS', 'fullname': 'New Paradigm Assets Solution (NPAS)'}]
12 [{'id': 'handy', 'coingecko_id': '', 'name': 'Handy', 'rank': 0, 'symbol': 'HANDY', 'fullname': 'Handy (HANDY)'}]
13 [{'id': 'fabwelt', 'coingecko_id': '', 'name': 'Fabwelt', 'rank': 2626, 'symbol': 'WELT', 'fullname': 'Fabwelt (WELT)'}, {'id': 'metarun', 'coingecko_id': '', 'name': 'Metarun', 'rank': 3092, 'symbol': 'MRUN', 'fullname': 'Metarun (MRUN)'}]
14 [{'id': 'dungeon', 'coingecko_id': '', 'name': 'Dungeon', 'rank': 0, 'symbol': 'DGN', 'fullname': 'Dungeon (DGN)'}]
15 [{'id': 'monetha', 'coingecko_id': '', 'name': 'Monetha', 'rank': 1967, 'symbol': 'MTH', 'fullname': 'Monetha (MTH)'}]
date_event can_occur_before created_date \
0 2022-07-13T00:00:00Z False 2022-06-27T14:39:15Z
1 2022-07-13T00:00:00Z False 2022-07-09T13:27:25Z
2 2022-07-13T00:00:00Z False 2022-07-02T06:10:09Z
3 2022-07-13T00:00:00Z False 2022-07-07T13:55:34Z
4 2022-07-13T00:00:00Z False 2022-07-11T18:42:01Z
5 2022-07-13T00:00:00Z False 2022-07-11T18:16:08Z
6 2022-07-13T00:00:00Z False 2022-07-06T06:55:16Z
7 2022-07-13T00:00:00Z False 2022-07-12T13:59:23Z
8 2022-07-13T00:00:00Z False 2022-07-11T18:43:02Z
9 2022-07-13T00:00:00Z False 2022-07-11T14:12:23Z
10 2022-07-13T00:00:00Z False 2022-07-11T14:11:47Z
11 2022-07-13T00:00:00Z False 2022-07-12T13:49:28Z
12 2022-07-13T00:00:00Z False 2022-07-12T14:05:15Z
13 2022-07-13T00:00:00Z False 2022-07-12T18:46:28Z
14 2022-07-13T00:00:00Z False 2022-07-12T13:48:55Z
15 2022-07-13T00:00:00Z False 2022-07-12T23:33:03Z
categories \
0 [{'id': 14, 'name': 'Fork/Swap'}]
1 [{'id': 16, 'name': 'Team Update'}]
2 [{'id': 4, 'name': 'Exchange'}]
3 [{'id': 4, 'name': 'Exchange'}]
4 [{'id': 17, 'name': 'Staking/Farming'}]
5 [{'id': 4, 'name': 'Exchange'}]
6 [{'id': 7, 'name': 'Other'}]
7 [{'id': 9, 'name': 'AMA'}]
8 [{'id': 9, 'name': 'AMA'}]
9 [{'id': 4, 'name': 'Exchange'}]
10 [{'id': 16, 'name': 'Team Update'}]
11 [{'id': 4, 'name': 'Exchange'}]
12 [{'id': 4, 'name': 'Exchange'}]
13 [{'id': 9, 'name': 'AMA'}]
14 [{'id': 4, 'name': 'Exchange'}]
15 [{'id': 4, 'name': 'Exchange'}]
I need to change the column "title": drop the key 'en' and keep only the values.
I need to change the column "coins": extract the keys as separate columns and put their values there.
I need to change the column "categories": drop the key "id" and its values, and drop the key "name" but keep its values.
For columns with lists in the rows, I would use pandas.explode.
For columns with dict rows, use .apply(pandas.Series),
then rename the columns that share a name (like 'id') if you want to use them, or reformat the dicts when you parse the JSON in the first place.
It should look like this:
import pandas

df = pandas.DataFrame({
    '1': [[{"id": 1, "a": 1}], [{"id": 2, "a": 1}]],
    '2': [1, 2],
    '3': [[{"id": 1, "name": "a"}], [{"id": 2, "name": "b"}]],
    '4': [{"en": "a"}, {"en": "b"}]
})
df = df.explode(["1", "3"])
pandas.concat([
    df.drop(["1", "3", "4"], axis=1),
    df['1'].apply(pandas.Series),
    df['3'].apply(pandas.Series),
    df['4'].apply(pandas.Series)
], axis=1)
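Applied to the question's columns, a rough sketch might look like the following; treat the exact column handling as an assumption, since it has not been run against the real data:
import pandas as pd

# 'title': keep only the value stored under 'en'
df['title'] = df['title'].str.get('en')

# 'categories': keep only the 'name' of the (single-item) category list
df['categories'] = df['categories'].str.get(0).str.get('name')

# 'coins': one row per coin; reset_index avoids duplicate labels after explode
df = df.explode('coins').reset_index(drop=True)
coins = df['coins'].apply(pd.Series).add_prefix('coin_')  # prefix avoids the 'id' clash
df = pd.concat([df.drop(columns='coins'), coins], axis=1)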

Concat 2 list of dictionaries with same id

I have 2 lists of dictionaries
a = [{'id':1, 'name':'John Doe'}, {'id':2, 'name':'Jane Doe'}, {'id':4, 'name':'Sample Doe'}]
b = [{'id':1, 'rating':9}, {'id':2, 'rating':7}, {'id':3, 'rating':8}]
Is there a way to concat b to a when an id in b matches an id in a, so that the result looks like this?
[{'id':1, 'name':'John Doe', 'rating':9}, {'id':2, 'name':'Jane Doe', 'rating':7}, {'id':4, 'name':'Sample Doe', 'rating':0}]
You could use the dictionary merge operator (|) introduced in Python 3.9:
>>> a = [{'id': 1, 'name': 'John Doe'}, {'id': 2, 'name': 'Jane Doe'}, {'id': 4, 'name': 'Sample Doe'}]
>>> b = [{'id': 1, 'rating': 9}, {'id': 2, 'rating': 7}, {'id': 3, 'rating': 8}]
>>> b_id_to_d = {d['id']: d for d in b} # Create for O(1) lookup time by id.
>>> b_id_to_d
{1: {'id': 1, 'rating': 9}, 2: {'id': 2, 'rating': 7}, 3: {'id': 3, 'rating': 8}}
>>> c = [d | b_id_to_d.get(d['id'], {'rating': 0}) for d in a]
>>> c
[{'id': 1, 'name': 'John Doe', 'rating': 9}, {'id': 2, 'name': 'Jane Doe', 'rating': 7}, {'id': 4, 'name': 'Sample Doe', 'rating': 0}]
For older versions of Python you can try use dict unpacking instead:
>>> c = [{**d, **b_id_to_d.get(d['id'], {'rating': 0})} for d in a]
>>> c
[{'id': 1, 'name': 'John Doe', 'rating': 9}, {'id': 2, 'name': 'Jane Doe', 'rating': 7}, {'id': 4, 'name': 'Sample Doe', 'rating': 0}]
This should work:
[{**item1, **item2} for item1 in a for item2 in b if item1['id'] == item2['id']]
It iterates over the two lists, so it is O(n^2), but it is clear and concise.
{**item1, **item2} adds the key-value pairs from item1, then the key-value pairs from item2.
Here, the results will be:
[{'id': 1, 'name': 'John Doe', 'rating': 9},
{'id': 2, 'name': 'Jane Doe', 'rating': 7}]
There is no direct built-in solution to this, but you can use the following code:
a = [{'id': 1, 'name': 'John Doe'}, {'id': 2, 'name': 'Jane Doe'}]
b = [{'id': 1, 'rating': 9}, {'id': 2, 'rating': 7}, {'id': 3, 'rating': 8}]

# map each id in a to its position in the list
key_pos_mapping = {d['id']: index for index, d in enumerate(a)}
# merge matching entries into a new list instead of mutating b while iterating over it
merged = [{**d, **a[key_pos_mapping[d['id']]]} for d in b if d['id'] in key_pos_mapping]
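Since the rest of this thread leans on pandas anyway, a merge-based alternative may be worth noting; this is my own sketch, assuming the a and b from the question:
import pandas as pd

# left-join b onto a by 'id'; ids missing from b get rating 0
merged = (
    pd.DataFrame(a)
      .merge(pd.DataFrame(b), on='id', how='left')
      .fillna({'rating': 0})
)
c = merged.to_dict('records')
# note: 'rating' comes back as float (9.0, 7.0, 0.0) because the left join introduces NaN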

Extract values from dicts inside lists

I'm trying to extract the values from this JSON file, but I'm having some trouble extracting the data inside the lists in the dict values. For example, for city and state I would like to get only the name values, create a Pandas DataFrame, and select only some keys, like this.
I tried some for-loop and get-method techniques, but without success.
{'birthday': ['1987-07-13T00:00:00.000Z'],
'cpf': ['9999999999999'],
'rg': [],
'gender': ['Feminino'],
'email': ['my_user@bol.com.br'],
'phone_numbers': ['51999999999'],
'photo': [],
'id': 11111111,
'duplicate_id': -1,
'name': 'My User',
'cnpj': [],
'company_name': '[]',
'city': [{'id': '0001', 'name': 'Porto Alegre'}],
'state': [{'id': 100, 'name': 'Rio Grande do Sul', 'fs': 'RS'}],
'type': 'Private Person',
'tags': [],
'pending_tickets_count': 0}
In [123]: data
Out[123]:
{'birthday': ['1987-07-13T00:00:00.000Z'],
'cpf': ['9999999999999'],
'rg': [],
'gender': ['Feminino'],
'email': ['my_user@bol.com.br'],
'phone_numbers': ['51999999999'],
'photo': [],
'id': 11111111,
'duplicate_id': -1,
'name': 'My User',
'cnpj': [],
'company_name': '[]',
'city': [{'id': '0001', 'name': 'Porto Alegre'}],
'state': [{'id': 100, 'name': 'Rio Grande do Sul', 'fs': 'RS'}],
'type': 'Private Person',
'tags': [],
'pending_tickets_count': 0}
In [124]: required = ['birthday', 'gender', 'id', 'name', 'city', 'state']; data2 = {k: v for k, v in data.items() if k in required}
In [125]: data2
Out[125]:
{'birthday': ['1987-07-13T00:00:00.000Z'],
'gender': ['Feminino'],
'id': 11111111,
'name': 'My User',
'city': [{'id': '0001', 'name': 'Porto Alegre'}],
'state': [{'id': 100, 'name': 'Rio Grande do Sul', 'fs': 'RS'}]}
In [126]: pd.DataFrame(data2).assign(
...: city_name=lambda x: x['city'].str.get('name'),
...: state_name=lambda x: x['state'].str.get('name'),
...: state_fs=lambda x: x['state'].str.get('fs')
...: ).drop(['state', 'city'], axis=1)
Out[126]:
birthday gender id name city_name state_name state_fs
0 1987-07-13T00:00:00.000Z Feminino 11111111 My User Porto Alegre Rio Grande do Sul RS
The reason data2 is required is that you can't have columns that differ in length, so pd.DataFrame(data) won't work here: rg has 0 items but birthday has 1 item.
Also worth a look, if you are dealing directly with JSON files, is pd.json_normalize.
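For instance, a minimal json_normalize sketch for this data (my addition, assuming data is the dict shown above); record_prefix sidesteps the id/name collisions:
import pandas as pd

city_df = pd.json_normalize(
    data,
    record_path='city',      # explode the list of city dicts
    meta=['id', 'name'],     # keep the top-level id and name
    record_prefix='city_',   # -> columns city_id, city_name
)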

Use the column of a dataframe that has a list of dictionaries to create other columns for the dataframe

I have a column in my dataframe of type object that has values like:
for i in df3['placeholders'][:10]:
    print(i)
Output:
[{'type': 'experience', 'label': '0-1 Yrs'}, {'type': 'salary', 'label': '1,00,000 - 1,25,000 PA.'}, {'type': 'location', 'label': 'Chennai'}]
[{'type': 'date', 'label': '08 October - 13 October'}, {'type': 'salary', 'label': 'Not disclosed'}, {'type': 'location', 'label': 'Chennai'}]
[{'type': 'education', 'label': 'B.Com'}, {'type': 'salary', 'label': 'Not disclosed'}, {'type': 'location', 'label': 'Mumbai Suburbs, Navi Mumbai, Mumbai'}]
[{'type': 'experience', 'label': '0-2 Yrs'}, {'type': 'salary', 'label': '50,000 - 2,00,000 PA.'}, {'type': 'location', 'label': 'Chennai'}]
[{'type': 'experience', 'label': '0-1 Yrs'}, {'type': 'salary', 'label': '2,00,000 - 2,25,000 PA.'}, {'type': 'location', 'label': 'Bengaluru(JP Nagar)'}]
[{'type': 'experience', 'label': '0-3 Yrs'}, {'type': 'salary', 'label': '80,000 - 2,00,000 PA.'}, {'type': 'location', 'label': 'Hyderabad'}]
[{'type': 'experience', 'label': '0-5 Yrs'}, {'type': 'salary', 'label': 'Not disclosed'}, {'type': 'location', 'label': 'Hyderabad'}]
[{'type': 'experience', 'label': '0-1 Yrs'}, {'type': 'salary', 'label': '1,25,000 - 2,00,000 PA.'}, {'type': 'location', 'label': 'Mumbai'}]
[{'type': 'date', 'label': '08 October - 17 October'}, {'type': 'salary', 'label': 'Not disclosed'}, {'type': 'location', 'label': 'Pune(Bavdhan)'}]
[{'type': 'experience', 'label': '0-2 Yrs'}, {'type': 'salary', 'label': 'Not disclosed'}, {'type': 'location', 'label': 'Jaipur'}]
[{'type': 'experience', 'label': '0-0 Yrs'}, {'type': 'salary', 'label': '1,00,000 - 1,50,000 PA.'}, {'type': 'location', 'label': 'Delhi NCR(Sector-81 Noida)'}]
I want to add more columns to my existing dataframe by extracting features from this column, such that:
value of "type" = column name
value of "label" = value under that column
The final expected output:
df.head(3)
Output:
..... experience, salary, location, date, education
..... 0-1 Yrs, 1,00,000 - 1,25,000 PA., Chennai, nan, nan
..... nan, 1,00,000 - 1,25,000 PA., Chennai, 08 October - 13 October, nan
..... nan, Not disclosed, Mumbai Suburbs, Navi Mumbai, Mumbai, nan, B.Com
The first answer worked.
[EDIT 2]
Later, I tried the same code suggested in the first response on a new dataset with the same issue, and got the following error:
<ipython-input-23-ad8e644044af> in <listcomp>(.0)
----> 1 new_columns = set([d['Name'] for l in dfr.RatingDistribution.values for d in l ])
      2 # Make a dict of dicts
      3 col_val_dict = {}
      4 for col_name in new_columns:
      5     col_val_dict[col_name] = {}
TypeError: 'float' object is not iterable
My Input column:
RatingDistribution
[{'Name': 'Work-Life Balance', 'count': 5}, {'Name': 'Skill Development', 'count': 5}, {'Name': 'Salary & Benefits', 'count': 5}, {'Name': 'Job Security', 'count': 5}, {'Name': 'Company Culture', 'count': 5}, {'Name': 'Career Growth', 'count': 5}, {'Name': 'Work Satisfaction', 'count': 5}]
[{'Name': 'Work-Life Balance', 'count': 4}, {'Name': 'Skill Development', 'count': 5}, {'Name': 'Salary & Benefits', 'count': 4}, {'Name': 'Job Security', 'count': 4}, {'Name': 'Company Culture', 'count': 3}, {'Name': 'Career Growth', 'count': 3}, {'Name': 'Work Satisfaction', 'count': 5}]
[{'Name': 'Work-Life Balance', 'count': 3}, {'Name': 'Skill Development', 'count': 4}, {'Name': 'Salary & Benefits', 'count': 5}, {'Name': 'Job Security', 'count': 4}, {'Name': 'Company Culture', 'count': 5}, {'Name': 'Career Growth', 'count': 4}, {'Name': 'Work Satisfaction', 'count': 4}]
[{'Name': 'Work-Life Balance', 'count': 5}, {'Name': 'Skill Development', 'count': 5}, {'Name': 'Salary & Benefits', 'count': 5}, {'Name': 'Job Security', 'count': 5}, {'Name': 'Company Culture', 'count': 5}, {'Name': 'Career Growth', 'count': 5}, {'Name': 'Work Satisfaction', 'count': 5}]
[{'Name': 'Work-Life Balance', 'count': 3}, {'Name': 'Skill Development', 'count': 5}, {'Name': 'Salary & Benefits', 'count': 3}, {'Name': 'Job Security', 'count': 3}, {'Name': 'Company Culture', 'count': 3}, {'Name': 'Career Growth', 'count': 3}, {'Name': 'Work Satisfaction', 'count': 4}]
[{'Name': 'Work-Life Balance', 'count': 3}, {'Name': 'Skill Development', 'count': 5}, {'Name': 'Salary & Benefits', 'count': 5}, {'Name': 'Job Security', 'count': 1}, {'Name': 'Company Culture', 'count': 3}, {'Name': 'Career Growth', 'count': 1}, {'Name': 'Work Satisfaction', 'count': 1}]
My code:
new_columns = set([d['Name'] for l in dfr.RatingDistribution.values for d in l])
# Make a dict of dicts
col_val_dict = {}
for col_name in new_columns:
    col_val_dict[col_name] = {}
    # For each column name look to see if a row has that as a type
    # If so, get the label for that dict
    # otherwise fill it with NaN
    for i, l in enumerate(dfr.RatingDistribution.values):
        the_label = [d['count'] for d in l if d['Name'] == col_name]
        if the_label:
            col_val_dict[col_name][i] = the_label[0]
        else:
            col_val_dict[col_name][i] = np.NaN
# Merge this new dfa with the old one
merged_dfa = pd.concat([dfr, pd.DataFrame(col_val_dict)], axis='columns')
dfr.shape
I'm getting the error on the very first line, and I can't figure out why it is throwing the float error.
PLEASE HELP
import numpy as np
import pandas as pd

# Get the unique types (column names)
new_columns = set([d['type'] for l in df3.placeholders.values for d in l])
# Make a dict of dicts
col_val_dict = {}
for col_name in new_columns:
    col_val_dict[col_name] = {}
    # For each column name look to see if a row has that as a type
    # If so, get the label for that dict
    # otherwise fill it with NaN
    for i, l in enumerate(df3.placeholders.values):
        the_label = [d['label'] for d in l if d['type'] == col_name]
        if the_label:
            col_val_dict[col_name][i] = the_label[0]
        else:
            col_val_dict[col_name][i] = np.NaN
# Merge this new df with the old one
merged_df = pd.concat([df3, pd.DataFrame(col_val_dict)], axis='columns')
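As for the TypeError in [EDIT 2]: 'float' object is not iterable on that first line usually means some cells of the column contain NaN (which is a float) instead of a list of dicts. A hedged fix, assuming dfr.RatingDistribution is the offending column:
# skip rows whose cell is NaN (a float) rather than a list of dicts
new_columns = set(
    d['Name']
    for l in dfr.RatingDistribution.values
    if isinstance(l, list)
    for d in l
)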

how to create a list that will store many values from a list of dictionaries

I have a list of Dictionaries in which airbnb[0] is
{
'room_id': '1133718',
'survey_id': '1280',
'host_id': '6219420',
'room_type': 'Shared room',
'country': '',
'city': 'Singapore',
'borough': '',
'neighborhood': 'MK03',
'reviews': 9.0,
'overall_satisfaction': 4.5,
'accommodates': '12',
'bedrooms': '1.0',
'bathrooms': '',
'price': 74.0,
'minstay': '',
'last_modified': '2017-05-17 09:10:25.431659',
'latitude': 1.293354,
'longitude': 103.769226,
'location': '0101000020E6100000E84EB0FF3AF159409C69C2F693B1F43F'
}
How do I go about getting a list consisting of only the room_id value and the price for each dictionary in my list of dictionaries, so that I can compile those lists into my new_list?
Not sure if this is what you're after but you can make a dictionary where the key is the room_id and the value the price for each property like so:
room_prices = { room['room_id'] : room['price'] for room in airbnb }
Then you access the price for a given room like so:
room_id = '1133718'
room_price = room_prices[room_id]
If you want them as tuples:
new_list = [(x['room_id'], x['price']) for x in airbnb]
# returns
[('1133718', 74.0)]
or a dict:
new_list = [{'room_id': x['room_id'], 'price': x['price']} for x in airbnb]
# returns
[{'room_id': '1133718', 'price': 74.0}]
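If you prefer the standard library for the selection itself, operator.itemgetter does the same job; a small sketch of my own, reusing the airbnb list from the question:
from operator import itemgetter

get_fields = itemgetter('room_id', 'price')   # pulls both keys as a tuple
new_list = [get_fields(room) for room in airbnb]
# [('1133718', 74.0)]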
A list comprehension selecting target keys in your list of dictionaries should do the job, assuming your list contains multiple dictionaries.
room_info =[{
'room_id': '1133718',
'survey_id': '1280',
'host_id': '6219420',
'room_type': 'Shared room',
'country': '',
'city': 'Singapore',
'borough': '',
'neighborhood': 'MK03',
'reviews': 9.0,
'overall_satisfaction': 4.5,
'accommodates': '12',
'bedrooms': '1.0',
'bathrooms': '',
'price': 74.0,
'minstay': '',
'last_modified': '2017-05-17 09:10:25.431659',
'latitude': 1.293354,
'longitude': 103.769226,
'location': '0101000020E6100000E84EB0FF3AF159409C69C2F693B1F43F'
},
{
'room_id': '1133718',
'survey_id': '1280',
'host_id': '6219420',
'room_type': 'Shared room',
'country': '',
'city': 'Singapore',
'borough': '',
'neighborhood': 'MK03',
'reviews': 9.0,
'overall_satisfaction': 4.5,
'accommodates': '12',
'bedrooms': '1.0',
'bathrooms': '',
'price': 74.0,
'minstay': '',
'last_modified': '2017-05-17 09:10:25.431659',
'latitude': 1.293354,
'longitude': 103.769226,
'location': '0101000020E6100000E84EB0FF3AF159409C69C2F693B1F43F'
}]
[[i['room_id'],i['price']] for i in room_info]
>>[['1133718', 74.0], ['1133718', 74.0]]
The result will return a nested list where each individual list contains the room_id and price detail.
It's easy to extract one element of the dict into a new list:
room_ids = [item.get('room_id') for item in airbnb]
Do that for all the interesting keys and generate a new list of dicts if you don't want separate lists. Or you can do all of that in one loop:
newlist = [{'room_id': item.get('room_id'), 'price': item.get('price')} for item in airbnb]
EDIT: Or a bit more verbose but more general:
mylist = [{'a': 1, 'b': 2, 'c': 1}, {'a': 2, 'b': 2, 'c': 1}, {'a': 5, 'b': 2, 'c': 1}, {'b': 5}]
interesting_keys = ['a', 'b']
newlist = []
for item in mylist:
    d = dict()
    for i in interesting_keys:
        d[i] = item.get(i)
    newlist.append(d)
print(newlist)
will output:
[{'a': 1, 'b': 2}, {'a': 2, 'b': 2}, {'a': 5, 'b': 2}, {'a': None, 'b': 5}]
