How to append data to a pandas dataframe? - python

I have a fairly complex sequence of functions calling apis and having the result set be appended to a dataframe - the thing is when I print the dataframe during each loop of append, I see new values but at the end when the loop breaks, I only see what value for final_df ? Any thoughts as to why?
df = pd.DataFrame(columns = ['repo', 'number', 'title', 'branch', 'merged_at', 'created_at', 'authored_by', 'merged_by', 'from_version', 'to_version'] )
def get_prs(repo,pr_number):
response = requests.request("GET", pgv.github_pr_url + str(repo) + '/pulls/' + str(pr_number), headers=pgv.headers)
response = response.json()
return response
def get_commits(repo,from_version,to_version):
response = requests.request("GET", pgv.github_commits_url + str(repo) +'/compare/' + str(from_version) + '...' + str(to_version) , headers=pgv.headers)
response = response.json()
# print(len(response['commits']))
# print(response['commits'])
for i in range(0,len(response['commits'])):
# print(response['commits'][i])
# x = re.match(r"\AMerge pull request #(?P<number>\d+) from/(?P<branch>(.+)\s*$)", response['commits'][i].get('commit').get('message'))
x = re.search("\AMerge pull request #(?P<number>\d+) from/(?P<branch>.*)", response['commits'][i].get('commit').get('message'))
# print(x)
if x is None:
pass
else:
# return re.search("(\d+)",x.group(0)).group(0), response['commits'][i].get('branches_url')
return x.group('number'), x.group('branch')
# print(x.group('branch'))
#query GitHub to get all commits between from_version and to_version.
def return_deploy_events():
final_object = []
response = requests.request('POST',pgv.url, params = {'api_key' : pgv.key}, json = pgv.query_params)
response = response.json()
if "jobs" in response:
time.sleep(5)
else:
for i in range(0,len(response['query_result']['data']['rows'])):
# print(response['query_result']['data']['rows'])
# get_prs(response['query_result']['data']['rows'][i].get('REPO'),get_commits(response['query_result']['data']['rows'][i].get('REPO'),response['query_result']['data']['rows'][i].get('FROM_VERSION'), response['query_result']['data']['rows'][i].get('TO_VERSION'))).get('merged_at')
try:
repo = response['query_result']['data']['rows'][i].get('REPO')
from_version = response['query_result']['data']['rows'][i].get('FROM_VERSION')
to_version = response['query_result']['data']['rows'][i].get('TO_VERSION')
# print(get_prs(repo,get_commits(repo,from_version, to_version)))
pull_requests = get_prs(repo,get_commits(repo,from_version, to_version)[0])
##pack into all one return
final_df = df.append({
'repo':repo,
'title': pull_requests.get('title'),
'branch': get_commits(repo,from_version, to_version)[1],
'created_at': pull_requests.get('created_at'),
'merged_at': pull_requests.get('merged_at'),
'authored_by': pull_requests.get('user').get('login'),
'merged_by': pull_requests.get('merged_by').get('login'),
'number': get_commits(repo,from_version, to_version)[0],
'from_version': from_version,
'to_version': to_version,}, ignore_index = True)
# print(get_commits(repo,from_version, to_version))
**HERE, WHEN UNCOMMENTED, PRINTS ALL RECORDS I WANT APPENDED **
# print(final_df.head(10))
except Exception:
pass
# 'title':, 'branch',
# 'merged_at', 'created_at', 'authored_by', 'merged_by',
# 'from_version': response['query_result']['data']['rows'][i].get('FROM_VERSION'), 'to_version':response['query_result']['data']['rows'][i].get('TO_VERSION')},
# ignore_index = True)
**BELOW IS WHERE IT PRINTS ONLY 1 RECORD **
print(final_df)
# final_df = json.loads(final_df.to_json(orient = 'records'))
# gec.json_to_s3(final_df, glob_common_vars.s3_resource,glob_common_vars.s3_bucket_name, 'test/test.json.gzip')
return_deploy_events()

I think the problem is, you are assigning each rows to the same variable.
So the last row will be printed at the last. So try to append each rows to result list.
def return_deploy_events():
final_object = []
result = []
response = requests.request('POST',pgv.url, params = {'api_key' : pgv.key}, json = pgv.query_params)
response = response.json()
if "jobs" in response:
time.sleep(5)
else:
for i in range(0,len(response['query_result']['data']['rows'])):
# print(response['query_result']['data']['rows'])
# get_prs(response['query_result']['data']['rows'][i].get('REPO'),get_commits(response['query_result']['data']['rows'][i].get('REPO'),response['query_result']['data']['rows'][i].get('FROM_VERSION'), response['query_result']['data']['rows'][i].get('TO_VERSION'))).get('merged_at')
try:
repo = response['query_result']['data']['rows'][i].get('REPO')
from_version = response['query_result']['data']['rows'][i].get('FROM_VERSION')
to_version = response['query_result']['data']['rows'][i].get('TO_VERSION')
# print(get_prs(repo,get_commits(repo,from_version, to_version)))
pull_requests = get_prs(repo,get_commits(repo,from_version, to_version)[0])
##pack into all one return
final_df = df.append({
'repo':repo,
'title': pull_requests.get('title'),
'branch': get_commits(repo,from_version, to_version)[1],
'created_at': pull_requests.get('created_at'),
'merged_at': pull_requests.get('merged_at'),
'authored_by': pull_requests.get('user').get('login'),
'merged_by': pull_requests.get('merged_by').get('login'),
'number': get_commits(repo,from_version, to_version)[0],
'from_version': from_version,
'to_version': to_version,}, ignore_index = True)
# print(get_commits(repo,from_version, to_version))
**HERE, WHEN UNCOMMENTED, PRINTS ALL RECORDS I WANT APPENDED **
# print(final_df.head(10))
result.append(final_df) # append the current row to result
except Exception:
pass
**BELOW IS WHERE IT PRINTS ONLY 1 RECORD **
print(result) # print the final result
I just added two lines of code, but I hope it works.

Related

Create merged df based on the url list [pandas]

I was able to extract the data from url_query url, but additionally, I would like to get the data from the urls_list created based on the query['ids'] column from dataframe. Please see below the current logic:
url = 'https://instancename.some-platform.com/api/now/table/data?display_value=true&'
team = 'query=group_name=123456789'
url_query = url+team
dataframe: query
[ids]
0 aaabbb1cccdddeee4ffggghhhhh5iijj
1 aa1bbb2cccdddeee5ffggghhhhh6iijj
issue_list = []
for issue in query['ids']:
issue_list.append(f'https://instancename.some-platform.com/api/now/table/data?display_value=true&?display_value=true&query=group_name&sys_id={issue}')
response = requests.get(url_query, headers=headers,auth=auth, proxies=proxies)
data = response.json()
def api_response(k):
dct = dict(
event_id= k['number'],
created_time = k[‘created’],
status = k[‘status’],
created_by = k[‘raised_by’],
short_desc = k[‘short_description’],
group = k[‘team’]
)
return dct
raw_data = []
for p in data['result']:
rec = api_response(k)
raw_data.append(rec)
df = pd.DataFrame.from_records(raw_data)
df:
The url_query response extracts what I need, but the key is that I would like to add to the existing one 'df' add the data from the issue_list = []. I don't know how to put the issue_list = [] to the response. I've tried to add issue_list to the response = requests.get(issue_list, headers=headers,auth=auth, proxies=proxies) statement, but I've got invalid schema error.
You can create list of DataFrames with query q instead url_query and last join together by concat:
dfs = []
for issue in query['ids']:
q = f'https://instancename.some-platform.com/api/now/table/data?display_value=true&?display_value=true&query=group_name&sys_id={issue}'
response = requests.get(q, headers=headers,auth=auth, proxies=proxies)
data = response.json()
raw_data = [api_response(k) for p in data['result']]
df = pd.DataFrame.from_records(raw_data)
dfs.append(df)
df = pd.concat(dfs, ignore_index=True)

Store result from URL into Pandas Data frame

I am new to Pandas & Python . Have a requirement where..
I am passing 100 post codes to a URL using for loop & trying to extract the latitude & longitude for each of the post codes passed.
The result of it I need to save in data frame . Below is the code I have am using .
query_cust = "select custMasterID,Full_Name,POSTCODE from DMON.BANK_CUSTOMERS"
df_cust = pd.read_sql(query_cust, con=con_str)
df_cust["URL"] = "https://api.getthedata.com/postcode/" + df_cust['POSTCODE'].str.replace(" ", "")
for column in df_cust["URL"]:
# print(column)
response = requests.get(column)
response_text = response.text
#df = json.loads(response_text)['data']
parse_json = json.loads(response_text)
df_cust["Lat"] = pd.json_normalize(parse_json['data']['latitude'])
df_cust["Long"] = parse_json['data']['longitude']
print(df_cust)
Below is the error which is coming when i try running it .
df_cust["Lat"] = pd.json_normalize(parse_json['data']['latitude'])
in _json_normalize
raise NotImplementedError
NotImplementedError
You don't need to use json_normalize to get what you need from the response data. Just iterate through each row of the dataframe and update the values:
import pandas as pd
import json
import requests
pd.options.display.max_columns = None
pd.options.display.max_rows = None
df_cust = pd.DataFrame(columns=['POSTCODE'])
# Just appending some data
df_cust = df_cust.append({'POSTCODE': 'SW1A-1AA'}, ignore_index=True)
df_cust = df_cust.append({'POSTCODE': 'WC2B-4AB'}, ignore_index=True)
df_cust = df_cust.append({'POSTCODE': 'ASDF-QWE'}, ignore_index=True) # Wrong postal code
for i, row in df_cust.iterrows():
df_cust.at[i, 'URL'] = 'https://api.getthedata.com/postcode/' + row['POSTCODE'].replace('-','')
response = requests.get(df_cust.loc[i, 'URL'])
parse_json = json.loads(response.text)
if 'data' in parse_json:
if 'latitude' in parse_json['data']:
df_cust.at[i, 'LAT'] = parse_json['data']['latitude']
else:
df_cust.at[i, 'LAT'] = None
if 'longitude' in parse_json['data']:
df_cust.at[i, 'LON'] = parse_json['data']['longitude']
else:
df_cust.at[i, 'LON'] = None
else:
df_cust.at[i, 'LAT'] = None
df_cust.at[i, 'LON'] = None
print(df_cust)
Output:
POSTCODE URL LAT LON
0 SW1A-1AA https://api.getthedata.com/postcode/SW1A1AA 51.501009 -0.141588
1 WC2B-4AB https://api.getthedata.com/postcode/WC2B4AB 51.514206 -0.119893
2 ASDF-QWE https://api.getthedata.com/postcode/ASDFQWE None None

for loop only takes last value in python aws dynamodb

I am trying to insert records into a table, but only last record(result data) from the loop is inserting into the table
Here is the code i tried:
CDates = ['2020-05-10','2020-05-12','2020-05-13','2020-05-16','2020-05-20']
ResultData = {}
for date in CDates:
filterDate = Key('Date').eq(id)
appResponse = appTable.scan(FilterExpression = filterDate)
accResp = table.query(KeyConditionExpression = Key('PrimaryId').eq('Key'),FilterExpression = Key('Date').eq(date))
if len(accResp['Items']) == 0:
ResultData['PrimaryId'] = 'Key'
ResultData['CreatedDate'] = date
ResultData['Type'] = 'Appt'
ResultData['Id'] = str(uuid.uuid4())
print(ResultData)
table.put_item(Item=ResultData)
Not getting where did I go wrong
You assigned ResultData outside of the loop and changed the values for the same keys every time the loop ran. Try this:
CDates = ['2020-05-10', '2020-05-12', '2020-05-13', '2020-05-16', '2020-05-20']
for date in CDates:
filterDate = Key('Date').eq(id)
appResponse = appTable.scan(FilterExpression=filterDate)
accResp = table.query(
KeyConditionExpression=Key('PrimaryId').eq('Key'),
FilterExpression=Key('Date').eq(date))
if len(accResp['Items']) == 0:
ResultData = {
'PrimaryId': 'Key',
'CreationDate': date,
'Type': 'Appt',
'Id': str(uuid.uuid4())
}
print(ResultData)
table.put_item(Item=ResultData)

Webscraping data from a json source, why i get only 1 row?

I'am trying to get some information from a website with python, from a webshop.
I tried this one:
def proba():
my_url = requests.get('https://www.telekom.hu/shop/categoryresults/?N=10994&contractType=list_price&instock_products=1&Ns=sku.sortingPrice%7C0%7C%7Cproduct.displayName%7C0&No=0&Nrpp=9&paymentType=FULL')
data = my_url.json()
results = []
products = data['MainContent'][0]['contents'][0]['productList']['products']
for product in products:
name = product['productModel']['displayName']
try:
priceGross = product['priceInfo']['priceItemSale']['gross']
except:
priceGross = product['priceInfo']['priceItemToBase']['gross']
url = product['productModel']['url']
results.append([name, priceGross, url])
df = pd.DataFrame(results, columns = ['Name', 'Price', 'Url'])
# print(df) ## print df
df.to_csv(r'/usr/src/Python-2.7.13/test.csv', sep=',', encoding='utf-8-sig',index = False )
while True:
mytime=datetime.now().strftime("%H:%M:%S")
while mytime < "23:59:59":
print mytime
proba()
mytime=datetime.now().strftime("%H:%M:%S")
In this webshop there are 9 items, but i see only 1 row in the csv file.
Not entirely sure what you intend as end result. Are you wanting to update an existing file? Get data and write out all in one go? Example of latter shown below where I add each new dataframe to an overall dataframe and use a Return statement for the function call to provide each new dataframe.
import requests
from datetime import datetime
import pandas as pd
def proba():
my_url = requests.get('https://www.telekom.hu/shop/categoryresults/?N=10994&contractType=list_price&instock_products=1&Ns=sku.sortingPrice%7C0%7C%7Cproduct.displayName%7C0&No=0&Nrpp=9&paymentType=FULL')
data = my_url.json()
results = []
products = data['MainContent'][0]['contents'][0]['productList']['products']
for product in products:
name = product['productModel']['displayName']
try:
priceGross = product['priceInfo']['priceItemSale']['gross']
except:
priceGross = product['priceInfo']['priceItemToBase']['gross']
url = product['productModel']['url']
results.append([name, priceGross, url])
df = pd.DataFrame(results, columns = ['Name', 'Price', 'Url'])
return df
headers = ['Name', 'Price', 'Url']
df = pd.DataFrame(columns = headers)
while True:
mytime = datetime.now().strftime("%H:%M:%S")
while mytime < "23:59:59":
print(mytime)
dfCurrent = proba()
mytime=datetime.now().strftime("%H:%M:%S")
df = pd.concat([df, dfCurrent])
df.to_csv(r"C:\Users\User\Desktop\test.csv", encoding='utf-8')

Read data from OECD API into python (and pandas)

I'm trying to download data from OECD API (https://data.oecd.org/api/sdmx-json-documentation/) into python.
I managed to download data in SDMX-JSON format (and transform it to JSON) so far:
OECD_ROOT_URL = "http://stats.oecd.org/SDMX-JSON/data"
def make_OECD_request(dsname, dimensions, params = None, root_dir = OECD_ROOT_URL):
"""Make URL for the OECD API and return a response"""
"""4 dimensions: location, subject, measure, frequency"""
if not params:
params = {}
dim_args = ['+'.join(d) for d in dimensions]
dim_str = '.'.join(dim_args)
url = root_dir + '/' + dsname + '/' + dim_str + '/all'
print('Requesting URL ' + url)
return rq.get(url = url, params = params)
response = make_OECD_request('MEI'
, [['USA', 'CZE'], [], [], ['M']]
, {'startTime': '2009-Q1', 'endTime': '2010-Q1'})
if (response.status_code == 200):
json = response.json()
How can I transform the data set into pandas.DataFrame? I tried pandas.read_json() and pandasdmx library, but I was not able to solve this.
The documentation the original question points to does not (yet?) mention that the API accepts the parameter contentType, which may be set to csv. That makes it trivial to use with Pandas.
import pandas as pd
def get_from_oecd(sdmx_query):
return pd.read_csv(
f"https://stats.oecd.org/SDMX-JSON/data/{sdmx_query}?contentType=csv"
)
print(get_from_oecd("MEI_FIN/IRLT.AUS.M/OECD").head())
Update:
The function to automatically download the data from OECD API is now available in my Python library CIF (abbreviation for the Composite Indicators Framework, installable via pip):
from cif import cif
data, subjects, measures = cif.createDataFrameFromOECD(countries = ['USA'], dsname = 'MEI', frequency = 'M')
Original answer:
If you need your data in Pandas DataFrame format, it is IMHO better to send your request to OECD with additional parameter 'dimensionAtObservation': 'AllDimensions', which results in more comprehensive JSON file.
Use following functions to download the data:
import requests as rq
import pandas as pd
import re
OECD_ROOT_URL = "http://stats.oecd.org/SDMX-JSON/data"
def make_OECD_request(dsname, dimensions, params = None, root_dir = OECD_ROOT_URL):
# Make URL for the OECD API and return a response
# 4 dimensions: location, subject, measure, frequency
# OECD API: https://data.oecd.org/api/sdmx-json-documentation/#d.en.330346
if not params:
params = {}
dim_args = ['+'.join(d) for d in dimensions]
dim_str = '.'.join(dim_args)
url = root_dir + '/' + dsname + '/' + dim_str + '/all'
print('Requesting URL ' + url)
return rq.get(url = url, params = params)
def create_DataFrame_from_OECD(country = 'CZE', subject = [], measure = [], frequency = 'M', startDate = None, endDate = None):
# Request data from OECD API and return pandas DataFrame
# country: country code (max 1)
# subject: list of subjects, empty list for all
# measure: list of measures, empty list for all
# frequency: 'M' for monthly and 'Q' for quarterly time series
# startDate: date in YYYY-MM (2000-01) or YYYY-QQ (2000-Q1) format, None for all observations
# endDate: date in YYYY-MM (2000-01) or YYYY-QQ (2000-Q1) format, None for all observations
# Data download
response = make_OECD_request('MEI'
, [[country], subject, measure, [frequency]]
, {'startTime': startDate, 'endTime': endDate, 'dimensionAtObservation': 'AllDimensions'})
# Data transformation
if (response.status_code == 200):
responseJson = response.json()
obsList = responseJson.get('dataSets')[0].get('observations')
if (len(obsList) > 0):
print('Data downloaded from %s' % response.url)
timeList = [item for item in responseJson.get('structure').get('dimensions').get('observation') if item['id'] == 'TIME_PERIOD'][0]['values']
subjectList = [item for item in responseJson.get('structure').get('dimensions').get('observation') if item['id'] == 'SUBJECT'][0]['values']
measureList = [item for item in responseJson.get('structure').get('dimensions').get('observation') if item['id'] == 'MEASURE'][0]['values']
obs = pd.DataFrame(obsList).transpose()
obs.rename(columns = {0: 'series'}, inplace = True)
obs['id'] = obs.index
obs = obs[['id', 'series']]
obs['dimensions'] = obs.apply(lambda x: re.findall('\d+', x['id']), axis = 1)
obs['subject'] = obs.apply(lambda x: subjectList[int(x['dimensions'][1])]['id'], axis = 1)
obs['measure'] = obs.apply(lambda x: measureList[int(x['dimensions'][2])]['id'], axis = 1)
obs['time'] = obs.apply(lambda x: timeList[int(x['dimensions'][4])]['id'], axis = 1)
obs['names'] = obs['subject'] + '_' + obs['measure']
data = obs.pivot_table(index = 'time', columns = ['names'], values = 'series')
return(data)
else:
print('Error: No available records, please change parameters')
else:
print('Error: %s' % response.status_code)
You can create requests like these:
data = create_DataFrame_from_OECD(country = 'CZE', subject = ['LOCOPCNO'])
data = create_DataFrame_from_OECD(country = 'USA', frequency = 'Q', startDate = '2009-Q1', endDate = '2010-Q1')
data = create_DataFrame_from_OECD(country = 'USA', frequency = 'M', startDate = '2009-01', endDate = '2010-12')
data = create_DataFrame_from_OECD(country = 'USA', frequency = 'M', subject = ['B6DBSI01'])
data = create_DataFrame_from_OECD(country = 'USA', frequency = 'Q', subject = ['B6DBSI01'])
You can recover the data from the source using code like this.
from urllib.request import urlopen
import json
URL = 'http://stats.oecd.org/SDMX-JSON/data/MEI/USA+CZE...M/all'
response = urlopen(URL).read()
responseDict = json.loads(str(response)[2:-1])
print (responseDict.keys())
print (len(responseDict['dataSets']))
Here is the output from this code.
dict_keys(['header', 'structure', 'dataSets'])
1
If you are curious about the appearance of the [2:-1] (I would be) it's because for some reason unknown to me the str function leaves some extraneous characters at the beginning and end of the string when it converts the byte array passed to it. json.loads is documented to require a string as input.
This is the code I used to get to this point.
>>> from urllib.request import urlopen
>>> import json
>>> URL = 'http://stats.oecd.org/SDMX-JSON/data/MEI/USA+CZE...M/all'
>>> response = urlopen(URL).read()
>>> len(response)
9886387
>>> response[:50]
b'{"header":{"id":"1975590b-346a-47ee-8d99-6562ccc11'
>>> str(response[:50])
'b\'{"header":{"id":"1975590b-346a-47ee-8d99-6562ccc11\''
>>> str(response[-50:])
'b\'"uri":"http://www.oecd.org/contact/","text":""}]}}\''
I understand that this is not a complete solution as you must still crack into the dataSets structure for the data to put into pandas. It's a list but you could explore it starting with this sketch.
The latest release of pandasdmx (pandasdmx.readthedocs.io) fixes previous issues accessing OECD data in sdmx-json.

Categories