How do I get the groupby operation to work? - python

Can't get Pandas Groupby operation to work.
I suspect I need to convert the data to a pandas dataframe first? However, I can't seem to get that to work either.
import requests
import json
import pandas as pd
baseurl = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DA0321EN-SkillsNetwork/labs/module%201/datasets/githubposting.json"
response = requests.get(baseurl)
data = response.json()
print(data)
def get_number_of_jobs(technology):
number_of_jobs = 0
number_of_jobs=data.groupby('technology').sum().loc[technology,:][0]
return technology,number_of_jobs
print(get_number_of_jobs('python'))
Thanks

data is a list of dictionaries, not a DataFrame, so it doesn't have groupby. You don't really need groupby anyway: you can create the DataFrame using the values of the first entry in the JSON response as column names (in place of the A and B keys) and look up 'Python' there, since it is already a single entry.
baseurl = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DA0321EN-SkillsNetwork/labs/module%201/datasets/githubposting.json"
response = requests.get(baseurl)
# Parsed JSON payload: a list of dictionaries, not a DataFrame.
data = response.json()
# The values of the first entry are used as the column names; every remaining
# entry supplies the values of one row.
df = pd.DataFrame(columns=list(data[0].values()), data=[d.values() for d in data[1:]])
# Pick the 'number of job posting' value from the first row whose 'technology' is 'Python'.
number_of_jobs = df.loc[df['technology'] == 'Python', 'number of job posting'].iloc[0]
print(number_of_jobs) # 51

Related

Convert json data into dataframe

I am unable to flatten the JSON data from this API into a dataframe. I have tried using json_normalize, but it gives a NotImplemented error. Can someone help me with it? I need the columns stationId, start, timestep and temperature, where there are several values for temperature and the rest of the columns should have the same values.
import requests
import json
import pandas as pd
response_API = requests.get('https://dwd.api.proxy.bund.dev/v30/stationOverviewExtended?stationIds=10865,G005')
print(response_API.status_code)
data = response_API.text
json.loads(data)
df= ?
You can do it many ways, but your current approach should use json() instead of text
import requests
import json
import pandas as pd
response_API = requests.get('https://dwd.api.proxy.bund.dev/v30/stationOverviewExtended?stationIds=10865,G005')
print(response_API.status_code)
# Use .json() here: it parses the body into Python objects, whereas .text
# only returns the raw response string.
data = response_API.json()
print(data)
OR directly read json to df from the URL using read_json()
df = pd.read_json("https://dwd.api.proxy.bund.dev/v30/stationOverviewExtended?stationIds=10865,G005")
print(df)
Edit:
import requests
import json
import pandas as pd
response_API = requests.get('https://dwd.api.proxy.bund.dev/v30/stationOverviewExtended?stationIds=10865,G005')
print(response_API.status_code)
data = response_API.json()
# Walk every station in the response and keep only its 'forecast1' and
# 'forecast2' entries; the collected entries are handed to pd.DataFrame.
result = []
for station, value in data.items():
    for forecast, val in value.items():
        if forecast in ['forecast1', 'forecast2']:
            result.append(val)
df = pd.DataFrame(result)
print(df)

convert json to dataframe in for loops in python

I'm trying to call an API and build a dataframe from the returned JSON using a for loop. I am able to create the first dataframe, but my for loop only returns the first json -> dataframe. After struggling for a few days, I decided to ask for guidance from the experts here.
import requests
import json
import pandas as pd
# create an Empty DataFrame object
df = pd.DataFrame()
# api header
headers = {"Accept": "application/json","Authorization": "api_secret"}
#email for loops
email_list = ["abc#gmail.com", "xyz#gmail.com"]
#supposed to read 2 emails in the list and append each df but only reads the first one...#
for i in email_list:
querystring = {"where":i}
response = requests.request("GET", "https://example.com/api/2.0/export", headers=headers, params=querystring)
with open('test.jsonl', 'w') as writefile:
writefile.write(response.text)
data = [json.loads(line) for line in open('test.jsonl', 'r')]
FIELDS = ["event"]
df = pd.json_normalize(data)[FIELDS]
df = df.append(df)
I wonder if I need to change something in df append but I can't pinpoint where needs to be changed. thank you so much in advance!
df = pd.json_normalize(data)[FIELDS]
df = df.append(df)
overwrites the dataframe each time. Instead, create a new one before appending:
# Normalize this iteration's records into their own frame so the accumulated
# frame in df is not overwritten.
df2 = pd.json_normalize(data)[FIELDS]
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way
# to add the new rows to the accumulated frame.
df = pd.concat([df, df2])

Pandas function explode does not work on this DataSeries

The pandas explode function doesn't drop the object elements into rows like it should.
import pandas as pd
import requests
import io
from pandas.io.json import json_normalize
response = requests.request("GET", url, headers=headers, data = payload)
response_text = response.text.encode('utf8')
fundingRate = pd.read_json(response_text,orient='columns',typ='frame')
fundingC = pd.DataFrame(fundingRate['data'])
fundingC = fundingC.T
fundingC = fundingC.astype(object)
fundingdataMap = fundingC['dataMap']
fundingdataMap = fundingdataMap.astype(str)
fundingdataMap = fundingdataMap.str.slice(start=10)
fundingdataMap.explode()
fundingdataMap DataSeries
https://www.pythonanywhere.com/user/armaniallie93/files/home/armaniallie93/fundingdataMap.txt
output
data [0.07280400000000001, 0.013058, 0.01, 0.01, 0....
Name: dataMap, dtype: object
After converting the column elements to strings and slicing out the portion I want, there is no error, but explode still doesn't produce the expected result. Any insight as to why?
The reason for the error is quite simple. You have a dictionary which you are trying to explode, which would not work.
#Removing the first row with dictionary
df.iloc[1:].explode('data')
#Without removing first row
df.explode('data')
You will have to take a call on how you want to convert this dictionary into a list. That would require a lambda function.

How to create a dataframe from urlopen (csv)

My code:
# parse json returned from the API to Pandas DF
openUrl = urlopen(url)
r = openUrl.read()
openUrl.close()
#d = json.loads(r.decode())
#df = pd.DataFrame(d, index=[0])
df = pd.DataFrame(r, index=[0])
The error:
ValueError: DataFrame constructor not properly called!
Help would be appreciated.
The DataFrame constructor requires an nd-array like input (or dict, iterable).
You can use pandas.read_csv if you want to directly input a csv and get a DataFrame.
Try printing r to see what is actually inside the response.
pandas.read_csv has a lot of optional parameters to handle different types of csv; which ones you need of course depends on what you're getting from the url.
This snippet might help you.
import urllib.request
import pandas as pd
# The response object is a context manager, so the connection is closed as
# soon as the body has been read.
with urllib.request.urlopen('HERE GOES YOUR LINK') as r:
    x = r.read()
    # Fall back to UTF-8 when the server does not declare a charset.
    charset = r.headers.get_content_charset() or 'utf-8'
print(type(x))
# Decode the bytes into text; str(x) would produce the "b'...'" repr of the
# bytes object instead of the actual content.
y = x.decode(charset)
df = pd.DataFrame([y], columns=['string_values'])
print(df)

Generating Dataframe from JSON URL in a column in another DataFrame

I am trying to generate one dataframe based on Json Url in another Dataframe called Data
import requests
import pandas as pd
import numpy as np
resp = requests.get('https://financialmodelingprep.com/api/v3/company/stock/list')
txt = resp.json()
Data = pd.DataFrame(txt['symbolsList'])
Data = Data.assign(keymetric= 'https://financialmodelingprep.com/api/v3/company-key-metrics/'+ Data.symbol + '?period=quarter')
Data = Data.assign(profile= 'https://financialmodelingprep.com/api/v3/company/profile/'+ Data.symbol)
Data = Data.assign(financials= 'https://financialmodelingprep.com/api/v3/financial-statement-growth/'+ Data.symbol + '?period=quarter')
I have 3 problems:
1) when I download the JSON from a URL in the dataframe ('Data'), the symbol is missing from the output
('AAPL' in the code below)
resp = requests.get('https://financialmodelingprep.com/api/v3/company-key-metrics/AAPL?period=quarter')
txt = resp.json()
key= pd.DataFrame(txt['metrics'])
2) I don't know how to automate the code above, using the column 'keymetric' in the dataframe 'Data' as input
3) once the process is done I am trying to have just one dataframe instead of having one per each symbol
Expected output for keymetrics. Each column should be divided not all aggregated under one column called 'keymetric'
This code can work.
import pandas as pd
import requests
resp = requests.get('https://financialmodelingprep.com/api/v3/company/stock/list')
txt = resp.json()
Data = pd.DataFrame(txt['symbolsList'])
def get_value(symbol):
    """Fetch key metrics, profile and growth data for one symbol.

    Three endpoints are requested for ``symbol``. The first entry of the
    'metrics' list, the 'profile' object and the first entry of the 'growth'
    list are each turned into a one-row DataFrame, and the three frames are
    placed side by side.

    Args:
        symbol: Ticker symbol inserted into the three request URLs.

    Returns:
        The combined DataFrame, or None when a response is not valid JSON or
        does not contain the expected key / list entry.
    """
    resp_keymetric = requests.get(f'https://financialmodelingprep.com/api/v3/company-key-metrics/{symbol}?period=quarter')
    resp_profile = requests.get(f'https://financialmodelingprep.com/api/v3/company/profile/{symbol}?period=quarter')
    resp_financials = requests.get(f'https://financialmodelingprep.com/api/v3/financial-statement-growth/{symbol}?period=quarter')
    try:
        txt_keymetric = resp_keymetric.json()['metrics'][0]
        txt_profile = resp_profile.json()['profile']
        txt_financials = resp_financials.json()['growth'][0]
    except (KeyError, IndexError, TypeError, ValueError):
        # Missing key, empty list, unexpected payload shape or invalid JSON:
        # signal "no data for this symbol" instead of swallowing every
        # exception with a bare except.
        return None
    df_keymetric = pd.DataFrame([txt_keymetric])
    df_profile = pd.DataFrame([txt_profile])
    df_financials = pd.DataFrame([txt_financials])
    # axis=1 joins the three one-row frames column-wise.
    return pd.concat([df_keymetric, df_profile, df_financials], axis=1)
result = []
# Only the first five symbols are processed.
for symbol in Data['symbol'].values.tolist()[:5]:
    try:
        df = get_value(symbol)
    except requests.RequestException as exc:
        # A failed request for one symbol should not abort the whole run,
        # but it should not be hidden either.
        print(f'Skipping {symbol}: {exc}')
        continue
    # get_value yields None when a symbol has no usable data; keep only
    # actual DataFrames.
    if df is not None:
        result.append(df)
# pd.concat raises ValueError when given nothing to concatenate, so fall back
# to an empty frame if no symbol produced data.
result_df = pd.concat(result, axis=0) if result else pd.DataFrame()
print(result_df)
Expected output for keymetrics. Each column should be divided not all aggregated under one column called 'keymetric'
current output

Categories