Add values from a nested JSON to a pandas dataframe - python

I have the following JSON object:
{"code":"Ok","matchings":[{"confidence":0.025755,"geometry":"qnp{bBww{kH??~D_I}E_J{EaJ{E{I{AsCoJgQfKuTjJwNtF}HdBuBnAgBpFsF~EeEzAsAt#i#lA}#x#q#lEmCjDuBdDoAvFmAfYmEtAUrJyDj#_#h#m#`#u#T}#J{#B_A?gAGmAM}#Su#]u#wN{QwI{KcA}Aa#gASiAWsBOwCGmDCoJ??cEH?{FA{HgIXuG`#eHrAsLdDkI|CkIfDq#VoDlB_GzDaE`D_A|#kA`AeAx#sI~G}DlDk#j#mClCiOrQwGvJiGxJoFdK_HjP{Pne#aLt\\sK~]oKb_#sG~TeJ`_#q#fD{#dEoBlMwBxQaAbI{Dh\\wKrfAiRbvBy#`KaLjwAyHj_AANM~AUxC}#tKi#bHe#jGfBj#t#V|#\\TFjAXz#HhASxAy#vCcBjX~GvG`BlEjAv\\xJfBf#dThG~Ad#nFrBnCbBdCvBzB`DbCfEr{#b~A","legs":[{"annotation":{"nodes":[330029575,5896466632,330029575,5896466588,5896466587,5896466586,5896466637,330029340,330029339,330029338,1497356855,1880770263,46388213,1880770262,1880770257,2021835257,3306177380,46387099,2021835255,6909770873,46385948,6909770874,46384887,46382454]},"steps":[],"distance":332.2,"duration":93.1,"summary":"","weight":93.1},{"annotation":{"nodes":[46384887,46382454,5888264001,6909802199,3296872014,6909802198,5888264003,6909802197,3296872012,6909802194,6909802195,6909802193,6909802196,3296872013,3296872015]},"steps":[],"distance":88.1,"duration":13.5,"summary":"","weight":13.5},{"annotation":{"nodes":[3296872013,3296872015,6909802186,6909802187,6909770884,3296872017,6909802185,4904066416,3296872018,1614187163]},"steps":[],"distance":62.3,"duration":12.4,"summary":"","weight":12.4},{"annotation":{"nodes":[3296872018,1614187163,2054127599,1614187129,5896479942,6909802219,46384372,1027299576,6909802220,46389815]},"steps":[],"distance":144,"duration":25.2,"summary":"","weight":25.2},{"annotation":{"nodes":[6909802220,46389815,6296436095,6296436094,298079716,6296436096,46391324,1083528076,6909802221,6909802222,46393158]},"steps":[],"distance":90.6,"duration":10.1,"summary":"","weight":10.1},{"annotation":{"nodes":[6909802222,46393158,46393795,6909802223,1027299602,6909802224,46396846,46398397,2054127645,46399502,46400708,1027299589,6712474212,6903665704,46402805,46403163,4374153462]},"steps":[],"distance":422.9,"duration":40.1,"summary"
:"","weight":40.1},{"annotation":{"nodes":[46403163,4374153462,46404084,1027299603,364146312,2262500170]},"steps":[],"distance":273.6,"duration":24.7,"summary":"","weight":24.7},{"annotation":{"nodes":[364146312,2262500170,5289718695]},"steps":[],"distance":170.9,"duration":15.3,"summary":"","weight":15.3},{"annotation":{"nodes":[2262500170,5289718695,2054127657,1693195716,46408565,6913837768,1693195721,2262500247,1693195714,2262500104,1693195717]},"steps":[],"distance":56.9,"duration":14.2,"summary":"","weight":14.2},{"annotation":{"nodes":[46397705,46401323,46405521]},"steps":[],"distance":86.6,"duration":12.6,"summary":"","weight":12.6},{"annotation":{"nodes":[46401323,46405521,46410773]},"steps":[],"distance":156.5,"duration":22.5,"summary":"","weight":22.5},{"annotation":{"nodes":[46405521,46410773,452003319,452003320]},"steps":[],"distance":95.4,"duration":13.8,"summary":"","weight":13.8},{"annotation":{"nodes":[452003319,452003320,46411428,46414457,46419384,46421801]},"steps":[],"distance":226.4,"duration":32.6,"summary":"","weight":32.6},{"annotation":{"nodes":[46419384,46421801,46421802,46421735]},"steps":[],"distance":69.2,"duration":10,"summary":"","weight":10},{"annotation":{"nodes":[46421802,46421735,46421416]},"steps":[],"distance":34.1,"duration":4.9,"summary":"","weight":4.9},{"annotation":{"nodes":[46421735,46421416,46420466]},"steps":[],"distance":2.7,"duration":0.3,"summary":"","weight":0.3},{"annotation":{"nodes":[46421416,46420466]},"steps":[],"distance":31.4,"duration":4.6,"summary":"","weight":4.6},{"annotation":{"nodes":[46421416,46420466,452003307,452003308,46421260,46422467,5761752102,46423905]},"steps":[],"distance":135.5,"duration":25,"summary":"","weight":25},{"annotation":{"nodes":[5761752102,46423905,46424346,5777055555,5713213408,46425605,5777055050,5777346784,5777055556,5713221227,46426685,46427741,3175895442,3183752428,5826014405,46428227]},"steps":[],"distance":106.5,"duration":14.9,"summary":"","weight":14.9},{"annotation":{"nodes
":[5826014405,46428227,3175895443,5826014406,3175895444,5826014368,5826014369,5826014374,46429570,5826014373,5826014375,5826014372,5826014358,5826014371,5826014370,5826014376]},"steps":[],"distance":172.7,"duration":15.7,"summary":"","weight":15.7},{"annotation":{"nodes":[2054127660,2054127638,2054127605,6296435009,2054127599,6909770882,3296872018,4904066416,6909802185,3296872017,6909770884,6909802187,6909802186,3296872015,3296872013,6909802196,6909802193,6909802195,6909802194,3296872012,6909802197,5888264003,6909802198,3296872014,6909802199,5888264001,46382454,46384887,6909770874,46385948,6909770873,2021835255,46387099,3306177380,2021835257]},"steps":[],"distance":317.7,"duration":46.1,"summary":"","weight":46.1},{"annotation":{"nodes":[3306177380,2021835257,1880770257,1880770262,46388213,1880770263,1497356855,330029338,330029339,330029340,5896466637]},"steps":[],"distance":150.4,"duration":29.4,"summary":"","weight":29.4}],"distance":80317.8,"duration":10983.5,"weight_name":"duration","weight":10983.5}],"tracepoints":[{"alternatives_count":0,"waypoint_index":0,"matchings_index":0,"location":[4.929932,52.372217],"name":"Willem Theunisse Blokstraat","distance":10.791613,"hint":"CAkHgHAJBwAlAAAAAAAAAAAAAAAAAAAALCd0QQAAAAAAAAAAAAAAACUAAAAAAAAAAAAAAAAAAAABAAAAjDlLAPkiHwP3OEsAGiMfAwAArxMz7Ejh"},null,{"alternatives_count":0,"waypoint_index":1,"matchings_index":0,"location":[4.932506,52.3709],"name":"Frans de Wollantstraat","distance":11.915926,"hint":"pwUBAPYEAYAHAAAARwAAAAAAAAAAAAAA3_qaQE0JPUIAAAAAAAAAAAcAAABHAAAAAAAAAAAAAAABAAAAmkNLANQdHwPtQksAxB0fAwAA_xUz7Ejh"},{"alternatives_count":0,"waypoint_index":472,"matchings_index":0,"location":[4.932745,52.373288],"name":"Piet Heinkade","distance":0.98867,"hint":"gwUBgMgFAQAFAAAADQAAABoBAABYAAAAQMS3QHTNW0HsWZ1DmZ2WQgUAAAANAAAAGgEAAFgAAAABAAAAiURLACgnHwN9REsAIycfAwoADwkz7Ejh"},null,null,{"alternatives_count":1,"waypoint_index":473,"matchings_index":0,"location":[4.934022,52.371637],"name":"Piet 
Heinkade","distance":2.713742,"hint":"NA8HADsPB4ACAAAADwAAADoAAAA-AAAAjU82QIAqg0FUpSdCLoWJQgIAAAAPAAAAOgAAAD4AAAABAAAAhklLALUgHwNfSUsAsCAfAwQAvxUz7Ejh"},null,null,{"alternatives_count":1,"waypoint_index":474,"matchings_index":0,"location":[4.93213,52.371794],"name":"Frans de Wollantstraat","distance":10.337677,"hint":"AgUBgAcFAQABAAAABAAAAAwAAAAAAAAA1paeP-KrBUAomAdBAAAAAAEAAAAEAAAADAAAAAAAAAABAAAAIkJLAFIhHwOrQksAeiEfAwIA7xQz7Ejh"},{"alternatives_count":1,"waypoint_index":475,"matchings_index":0,"location":[4.93074,52.372528],"name":"Isaac Titsinghkade","distance":0.65222,"hint":"AwkHgAYJBwA5AAAACwAAAAAAAACMAAAA_Fe_QWP_k0AAAAAA33FqQjkAAAALAAAAAAAAAIwAAAABAAAAtDxLADAkHwOtPEsANCQfAwAADw4z7Ejh"},null,null]}
I want to add all values that belong to the key "nodes" to one column in a pandas dataframe.
When I run:
for i in output["matchings"][0]['legs']:
result = i['annotation']['nodes']
df = pd.DataFrame(result, columns=['node'])
df
only a fraction gets added to the dataframe. What am I doing wrong?

Your loop overwrites df on every iteration, so at the end of the for loop df only holds the 'nodes' list of the last leg in your JSON. You have to collect the 'nodes' values of all legs into a single dataframe instead.
Extending your code:
# Gather the node ids of every leg first, then build the frame in one go.
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a
# loop re-copies all previously collected rows on every iteration.
nodes = []
for leg in output["matchings"][0]['legs']:
    nodes.extend(leg['annotation']['nodes'])
# A single column named 'node' with a fresh 0..n-1 index; this also yields an
# empty frame (instead of an error) when there are no legs.
df = pd.DataFrame(nodes, columns=['node'])

Related

Understanding initializing an empty dictionary

I really do not understand how the check (if entry in langs_count) can work when the dictionary was initialized to be empty. What is inside the dictionary, and how did it get there? I'm really confused.
`
import pandas as pd
# Import Twitter data as DataFrame: df
df = pd.read_csv("tweets.csv")
# Initialize an empty dictionary: langs_count
langs_count = {}
# Extract column from DataFrame: col
col = df['lang']
# Iterate over lang column in DataFrame
for entry in col:
# If the language is in langs_count, add 1
if entry in langs_count.keys():
langs_count[entry]+=1
# Else add the language to langs_count, set the value to 1
else:
langs_count[entry]=1
# Print the populated dictionary
print(langs_count)
`
You can implement the count functionality using groupby.
import pandas as pd
# Import Twitter data as DataFrame: df
df = pd.read_csv("tweets.csv")
# Populate dictionary with count of occurrences in 'lang' column
langs_count = dict(df.groupby(['lang']).size())
# Print the populated dictionary
print(langs_count)

Dataframe with empty column in the data

I have a list of lists with an header row and then the different value rows.
It could happen that in some cases the last "column" has an empty value for all the rows (if just one row has a value it works fine), but DataFrame is not happy about that, as the number of columns differs from the header.
I'm thinking of adding a None value to the first list that has no value before creating the DF, but I'm wondering if there is a better way to handle this case?
data = [
["data1", "data2", "data3"],
["value11", "value12"],
["value21", "value22"],
["value31", "value32"]]
headers = data.pop(0)
dataframe = pandas.DataFrame(data, columns = headers)
You could do this:
import pandas as pd
data = [
["data1", "data2", "data3"],
["value11", "value12"],
["value21", "value22"],
["value31", "value32"]
]
# create dataframe
df = pd.DataFrame(data)
# set new column names
# this will use ["data1", "data2", "data3"] as new columns, because they are in the first row
df.columns = df.iloc[0].tolist()
# now that you have the right column names, just jump the first line
df = df.iloc[1:].reset_index(drop=True)
df
data1 data2 data3
0 value11 value12 None
1 value21 value22 None
2 value31 value32 None
Is this what you want?
You can use the DataFrame.reindex method to add missing columns. You can possibly do something like this:
import pandas as pd
df = pd.DataFrame(data)
# To prevent throwing exception.
df.columns = headers[:df.shape[1]]
df = df.reindex(headers,axis=1)

Cannot assign to function call when looping through and converting excel files

With this code:
xls = pd.ExcelFile('test.xlsx')
sn = xls.sheet_names
for i,snlist in list(zip(range(1,13),sn)):
'df{}'.format(str(i)) = pd.read_excel('test.xlsx',sheet_name=snlist, skiprows=range(6))
I get this error:
'df{}'.format(str(i)) = pd.read_excel('test.xlsx',sheet_name=snlist,
skiprows=range(6))
^ SyntaxError: cannot assign to function call
I can't understand the error or how to solve it. What's the problem?
df+str(i) also returns an error.
I want the result to be:
df1 = pd.read_excel.. list1...
df2 = pd.read_excel... list2....
You can't assign the result of pd.read_excel to 'df{}'.format(str(i)) -- which is an expression that evaluates to a string like "df1", "df2", "df3" etc., not a variable name. That is why you get this error message. The error message is probably confusing since it's treating this as an assignment to a "function call".
It seems like you want a list or a dictionary of DataFrames instead.
To do this, assign the result of pd.read_excel to a variable, e.g. df, and then append that to a list, or add it to a dictionary of DataFrames.
As a list:
dataframes = []
xls = pd.ExcelFile('test.xlsx')
sn = xls.sheet_names
for i, snlist in list(zip(range(1, 13), sn)):
df = pd.read_excel('test.xlsx', sheet_name=snlist, skiprows=range(6))
dataframes.append(df)
As a dictionary:
dataframes = {}
xls = pd.ExcelFile('test.xlsx')
sn = xls.sheet_names
for i, snlist in list(zip(range(1, 13), sn)):
df = pd.read_excel('test.xlsx', sheet_name=snlist, skiprows=range(6))
dataframes[i] = df
In both cases, you can access the DataFrames by indexing like this:
for i in range(len(dataframes)):
print(dataframes[i])
# Note indexes will start at 0 here instead of 1
# You may want to change your `range` above to start at 0
Or more simply:
for df in dataframes:
print(df)
In the case of the dictionary, you'd probably want:
for i, df in dataframes.items():
print(i, df)
# Here, `i` is the key and `df` is the actual DataFrame
If you really do want df1, df2 etc as the keys, then do this instead:
dataframes[f'df{i}'] = df

Separate column data with a comma to two columns for dataframe

The data set I pulled from an API return looks like this:
([['Date', 'Value']],
[[['2019-08-31', 445000.0],
['2019-07-31', 450000.0],
['2019-06-30', 450000.0]]])
I'm trying to create a DataFrame with two columns from the data:
Date & Value
Here's what I've tried:
df = pd.DataFrame(city_data, index =['a', 'b'], columns =['Names'] .
['Names1'])
city_data[['Date','Value']] =
city_data['Date'].str.split(',',expand=True)
city_data
city_data.append({"header": column_value,
"Value": date_value})
city_data = pd.DataFrame()
This code was used to create the dataset. I pulled the lists from the API return:
column_value = data["dataset"]["column_names"]
date_value = data["dataset"]["data"]
city_data = ([column_value], [date_value])
city_data
Instead of creating a dataframe with two columns from the data, in most cases I get the "TypeError: list indices must be integers or slices, not str"
Is this what you are looking for?
d = ([['Date', 'Value']],
[[['2019-08-31', 445000.0],
['2019-07-31', 450000.0],
['2019-06-30', 450000.0]]])
pd.DataFrame(d[1][0], columns=d[0][0])
This returns:
         Date     Value
0  2019-08-31  445000.0
1  2019-07-31  450000.0
2  2019-06-30  450000.0

Save data frame from inside for loop

I have a function that takes in a dataframe and returns a (reduced) dataframe, e.g. like this:
def transforming_data(dataframe, col_1, col_2, normalized=True):
    """Tabulate how the values of ``col_1`` are distributed within each ``col_2`` group.

    Args:
        dataframe: Input frame containing both columns; it is not modified.
        col_1: Name of the column whose values are counted.
        col_2: Name of the column used to form the groups.
        normalized: Passed to ``value_counts(normalize=...)``. When true each
            row holds relative frequencies, otherwise raw counts.

    Returns:
        A new DataFrame with one row per ``col_2`` group and one column per
        distinct ``col_1`` value; combinations that never occur are 0.
    """
    grouped = dataframe[col_1].groupby(dataframe[col_2])
    counts = grouped.value_counts(normalize=normalized)
    # Return the reduced table, not the untouched input frame.
    return counts.unstack(fill_value=0)
For the following code, this gives me:
import pandas as pd
import numpy as np
np.random.seed(12)
def transforming_data(df, col_1, col_2, normalized=True):
    """Tabulate how the values of ``col_1`` are distributed within each ``col_2`` group.

    Args:
        df: Input frame containing both columns; it is not modified.
        col_1: Name of the column whose values are counted.
        col_2: Name of the column used to form the groups.
        normalized: Passed to ``value_counts(normalize=...)``. When true each
            row holds relative frequencies, otherwise raw counts.

    Returns:
        A new DataFrame with one row per ``col_2`` group and one column per
        distinct ``col_1`` value; combinations that never occur are 0.
    """
    # Read from the ``df`` argument, not from a module-level ``dataframe``,
    # so the function works on whatever frame the caller passes in.
    grouped = df[col_1].groupby(df[col_2])
    counts = grouped.value_counts(normalize=normalized)
    return counts.unstack(fill_value=0)
numrows = 1000
dataframe = pd.DataFrame({'Numerical': np.random.randn(numrows),
'Category': np.random.choice(['Panda', 'Elephant', 'Anaconda'], numrows),
'Response 1': np.random.choice(['Yes', 'Maybe', 'No', 'Don\'t know'], numrows),
'Response 2': np.random.choice(['Very Much', 'Much', 'A bit', 'Not at all'], numrows)})
test = transforming_data(dataframe, 'Response 1', 'Category')
print(test)
# Output
# Response 1 Don't know Maybe No Yes
# Category
# Anaconda 0.275229 0.232416 0.217125 0.275229
# Elephant 0.220588 0.270588 0.255882 0.252941
# Panda 0.258258 0.222222 0.273273 0.246246
So far, so good.
Now I want to use the function transforming_data inside a for loop for every column in dataframe (as I have lots of columns, not just two) and save the resulting dataframe to a new dataframe, e.g. test_response_1 and test_response_2 for this example.
Can someone point me in the right direction - i.e. how to implement the loop correctly?
So far, I am using something like this - but cannot figure out how to save the data frame
for column in dataframe.columns.tolist():
temp_df = transforming_data(dataframe, column, 'Category')
# here, I need to save tmp_df outside of the loop but don't know how to
Thanks a lot for pointers and help. (Note: the most similar question I found does not talk about actually saving the data frame, so it doesn't help me with this.)
If you want to save (in memory) all of the temp_df's from your loop, you can append them to a list that you can then index afterwards:
temp_dfs = []
for column in dataframe.columns.tolist(): #you don't actually need the tolist() method here
temp_df = transforming_data(dataframe, column, 'Category')
temp_dfs.append(temp_df)
If you would rather be able to access these temp_df's by the column name that was used to transform them, then you could store each one in a dictionary, using the column as the key:
temp_dfs = {}
for column in dataframe.columns.tolist():
temp_df = transforming_data(dataframe, column, 'Category')
temp_dfs[column] = temp_df
If by "save" you meant "write to disk", then you can use one of the many to_<file_format>() methods that pandas provides:
temp_dfs = {}
for column in dataframe.columns.tolist():
temp_df = transforming_data(dataframe, column, 'Category')
temp_df.to_csv('temp_df{}.csv'.format(column))
Here's the to_csv() docs.
The most simple solution would be to save the result dataframes into a list. Assuming that all columns that you want to loop over have the text Response in their column name:
result_dframes = []
for col_name in dataframe.filter(like='Response').columns:
result_dframe = transforming_data(dataframe, col_name, 'Category')
result_dframes.append(result_dframe)
Alternatively you can also obtain the exact same result with a list comprehension instead of a for-loop:
result_dframes = [
transforming_data(dataframe, col_name, 'Category')
for col_name in dataframe.filter(like='Response')
]

Categories