Create merged df based on the url list [pandas] - python

I was able to extract the data from the url_query URL, but I would additionally like to fetch the data from each URL in a list (issue_list) built from the query['ids'] column of a DataFrame. My current logic is shown below:
# Endpoint of the table API; the query string already carries display_value=true.
url = 'https://instancename.some-platform.com/api/now/table/data?display_value=true&'
# Query-string fragment selecting group_name 123456789.
team = 'query=group_name=123456789'
# Full request URL: the endpoint followed directly by the group fragment.
url_query = f'{url}{team}'
dataframe: query
[ids]
0 aaabbb1cccdddeee4ffggghhhhh5iijj
1 aa1bbb2cccdddeee5ffggghhhhh6iijj
# One request URL per id found in query['ids'].
issue_list = [
    f'https://instancename.some-platform.com/api/now/table/data?display_value=true&?display_value=true&query=group_name&sys_id={issue}'
    for issue in query['ids']
]

# Fetch the group-level query and decode its JSON body.
response = requests.get(
    url_query,
    headers=headers,
    auth=auth,
    proxies=proxies,
)
data = response.json()
def api_response(k):
    """Flatten one API result record into the row layout used for the DataFrame.

    Args:
        k: Mapping for a single record. It must provide the keys ``number``,
            ``created``, ``status``, ``raised_by``, ``short_description``
            and ``team``; any other keys are ignored.

    Returns:
        A dict with the keys ``event_id``, ``created_time``, ``status``,
        ``created_by``, ``short_desc`` and ``group``.

    Raises:
        KeyError: If one of the required keys is missing from ``k``.
    """
    # Plain ASCII quotes: the typographic quotes in the original are a syntax error.
    return {
        'event_id': k['number'],
        'created_time': k['created'],
        'status': k['status'],
        'created_by': k['raised_by'],
        'short_desc': k['short_description'],
        'group': k['team'],
    }
# Convert every record of the response into a flat row. The loop variable is
# p, so p (not the undefined name k) is what must be passed to api_response.
raw_data = [api_response(p) for p in data['result']]
# One DataFrame row per converted record.
df = pd.DataFrame.from_records(raw_data)
df:
The url_query response gives me what I need, but I would also like to add the data for the URLs in issue_list to the existing df. I don't know how to pass issue_list to the request. I tried passing the whole list, i.e. response = requests.get(issue_list, headers=headers, auth=auth, proxies=proxies), but I got an "invalid schema" error.

You can build a list of DataFrames by requesting each per-issue URL q (instead of url_query) and finally join them together with concat:
# Collect one DataFrame per id, then stack them into a single frame.
dfs = []
for issue in query['ids']:
    # Request the records belonging to this id.
    q = f'https://instancename.some-platform.com/api/now/table/data?display_value=true&?display_value=true&query=group_name&sys_id={issue}'
    response = requests.get(q, headers=headers, auth=auth, proxies=proxies)
    data = response.json()
    # The comprehension variable is p, so p is what api_response must receive
    # (the original passed k, which is not defined here).
    raw_data = [api_response(p) for p in data['result']]
    dfs.append(pd.DataFrame.from_records(raw_data))

# ignore_index=True renumbers the rows 0..n-1 across all per-id frames.
df = pd.concat(dfs, ignore_index=True)

Related

How to save the results of a function as a new CSV?

The code needs to take addresses from a CSV file and use a function to compute the corresponding latitudes and longitudes. I get the correct latitudes and longitudes, but I am unable to save them to a new CSV file.
import requests
import urllib.parse
import pandas as pd
# Look up the coordinates of one address and print them.
def lat_long(add):
    """Print the latitude and longitude of the first search result for ``add``.

    The address is URL-quoted and sent to the nominatim.openstreetmap.org
    search endpoint. The coordinates are only printed; nothing is returned.
    """
    quoted = urllib.parse.quote(add)
    url = 'https://nominatim.openstreetmap.org/search/' + quoted + '?format=json'
    matches = requests.get(url).json()
    print(matches[0]["lat"], matches[0]["lon"])
    return
# Read the CSV and run the lookup for the first five 'Address' values.
df = pd.read_csv('C:\\Users\\Umer Abbas\\Desktop\\lat_long.csv')
i = 0
print("Latitude", "", "Longitude")
for i in range(5):
    add = df._get_value(i, 'Address')
    lat_long(add)
Output is:
Latitude Longitude
34.0096961 71.8990106
34.0123846 71.5787458
33.6038766 73.048136
33.6938118 73.0651511
24.8546842 67.0207055
I want to save this output to a new file, but so far I have not been able to.
Just a small modification might help
def lat_long(add):
    """Print and return the coordinates of the first search result for ``add``.

    Returns:
        A ``(lat, lon)`` tuple holding the ``lat`` and ``lon`` values of the
        first entry in the JSON response.
    """
    url = 'https://nominatim.openstreetmap.org/search/' + urllib.parse.quote(add) + '?format=json'
    response = requests.get(url).json()
    first = response[0]
    print(first["lat"], first["lon"])
    return first["lat"], first["lon"]
# Collect the coordinates of the first five addresses and write them to a CSV.
Lat_List = []
Long_List = []
df = pd.read_csv('C:\\Users\\Umer Abbas\\Desktop\\lat_long.csv')
print("Latitude", "", "Longitude")
for i in range(0, 5):
    add = df.at[i, 'Address']
    # Call lat_long once per address: every call is a separate web request.
    Lat, Long = lat_long(add)
    Lat_List.append(Lat)
    Long_List.append(Long)

# Build the output frame once, after the loop, straight from the two lists.
df1 = pd.DataFrame({'Latitude': Lat_List, 'Longitude': Long_List})
df1.to_csv("LatLong.csv")
# Only the return statement differs from the version in the question.
def lat_long(add):
    """Print the coordinates of the first search result for ``add`` and return them.

    Returns:
        A ``(lat, lon)`` tuple taken from the first entry of the JSON response.
    """
    url = 'https://nominatim.openstreetmap.org/search/' + urllib.parse.quote(add) + '?format=json'
    response = requests.get(url).json()
    lat, lon = response[0]["lat"], response[0]["lon"]
    print(lat, lon)
    return lat, lon  # hand the coordinates back to the caller
# Read the addresses, collect one (lat, lon) tuple per address, write a CSV.
df = pd.read_csv('C:\\Users\\Umer Abbas\\Desktop\\lat_long.csv')
l = []  # collected (latitude, longitude) tuples
print("Latitude", "", "Longitude")
for i in range(0, 5):
    add = df._get_value(i, 'Address')
    l.append(lat_long(add))

# lat_long returns (lat, lon), so the column labels must be in that order;
# the original labelled them ['Longitude', 'Latitude'] and swapped the headers.
pd.DataFrame(l, columns=['Latitude', 'Longitude']).to_csv('test.csv', sep=' ')

cannot concatenate object of type '<class 'list'>' when converting from df.append to pd.concat

I have a little parser that gathers an RSS feed channel into a pandas DataFrame. Everything works as expected, but I get this warning:
The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead
After some research, I converted my dicts to lists and started to concatenate them, but now I get this error:
type '<class 'list'>'; only Series and DataFrame objs are valid
How can I rewrite my for loop to get the expected result?
working code with warning
# Version from the question that works but emits the deprecation warning.
df = pd.DataFrame(columns=['title', 'link'])
with response as r:
    for item in r.html.find('item', first=False):
        row = {
            'title': item.find('title', first=True).text,
            'link': item.find('guid', first=True).text,
        }
        # DataFrame.append is the call the warning refers to.
        df = df.append(row, ignore_index=True)
slightly modified, gives error
# Attempted rewrite from the question: rows are collected in a plain list and
# then handed to pd.concat.
df = pd.DataFrame(columns = ['title', 'link'])
tmp = []
with response as r:
items = r.html.find('item', first=False)
for item in items:
title = item.find('title', first=True).text
link = item.find('guid', first=True).text
# Each row is a plain Python list, not a Series or DataFrame.
row = [title, link]
tmp.append(row)
# NOTE(review): tmp holds plain lists at this point; the error quoted in the
# question says only Series and DataFrame objects are valid for concat.
df = pd.concat(tmp)
You can use pd.concat() only on pandas objects such as DataFrames. You just need to create your DataFrame from the tmp list. Maybe you could also get the data with pd.read_html; I have not checked.
# Gather [title, link] pairs first, then build the DataFrame in one call.
tmp = []
with response as r:
    for item in r.html.find('item', first=False):
        tmp.append([
            item.find('title', first=True).text,
            item.find('guid', first=True).text,
        ])
df = pd.DataFrame(tmp, columns=['title', 'link'])
pd.concat works to concatenate two or more pandas objects.
If you have successfully constructed a list of dicts containing your data (collected in the tmp variable below), then you can turn it into a DataFrame simply by using the default pd.DataFrame constructor:
# Collect one dict per feed item, then build the DataFrame in a single call.
# The original first created an empty df that was overwritten at the end.
tmp = []
with response as r:
    for item in r.html.find('item', first=False):
        tmp.append({
            'title': item.find('title', first=True).text,
            'link': item.find('guid', first=True).text,
        })

# Passing columns keeps 'title' and 'link' present even when no items were found.
df = pd.DataFrame(tmp, columns=['title', 'link'])
You need to change row to dict, e.g.:
row = {'col1': [title], 'col2': [link]}
and the append line to:
# Use pd.concat rather than the deprecated DataFrame.append; tmp must be a DataFrame here.
tmp = pd.concat([tmp, pd.DataFrame(row)], ignore_index=True)
Don't forget to initialise tmp as an empty DataFrame (instead of a list) before the loop:
tmp = pd.DataFrame()

How to append data to a pandas dataframe?

I have a fairly complex sequence of functions that call APIs and append each result set to a DataFrame. When I print the DataFrame inside the loop after each append, I see the new values, but after the loop finishes I only see one value in final_df. Any thoughts as to why?
# Empty frame whose columns name the fields recorded for each pull request.
df = pd.DataFrame(
    columns=[
        'repo',
        'number',
        'title',
        'branch',
        'merged_at',
        'created_at',
        'authored_by',
        'merged_by',
        'from_version',
        'to_version',
    ],
)
def get_prs(repo, pr_number):
    """Fetch a single pull request and return the decoded JSON body.

    The URL is ``pgv.github_pr_url`` followed by ``<repo>/pulls/<pr_number>``;
    the request is sent with ``pgv.headers``.
    """
    endpoint = pgv.github_pr_url + str(repo) + '/pulls/' + str(pr_number)
    return requests.request("GET", endpoint, headers=pgv.headers).json()
def get_commits(repo, from_version, to_version):
    """Find the first merge commit between two versions and parse its message.

    Requests ``<repo>/compare/<from_version>...<to_version>`` below
    ``pgv.github_commits_url`` and scans the returned ``commits`` in order.

    Returns:
        A ``(number, branch)`` tuple of strings captured from the first commit
        message that matches the merge pattern, or ``None`` when no commit
        message matches.
    """
    # Raw string so that "\A" and "\d" reach the regex engine as regex escapes
    # instead of being treated as (invalid) string escape sequences.
    merge_pattern = r"\AMerge pull request #(?P<number>\d+) from/(?P<branch>.*)"
    compare_url = (
        pgv.github_commits_url + str(repo) + '/compare/'
        + str(from_version) + '...' + str(to_version)
    )
    payload = requests.request("GET", compare_url, headers=pgv.headers).json()
    for commit in payload['commits']:
        x = re.search(merge_pattern, commit.get('commit').get('message'))
        if x is not None:
            return x.group('number'), x.group('branch')
    # No commit message matched the merge pattern.
    return None
#query GitHub to get all commits between from_version and to_version.
# Version from the question. Posts the stored query and, for every row of the
# query result, looks up the merge commit and the matching pull request.
# NOTE(review): the original indentation is lost, so the exact nesting of the
# final print relative to the loop cannot be confirmed from this text.
def return_deploy_events():
final_object = []
response = requests.request('POST',pgv.url, params = {'api_key' : pgv.key}, json = pgv.query_params)
response = response.json()
# When the response contains "jobs", the code only waits and builds no rows.
if "jobs" in response:
time.sleep(5)
else:
for i in range(0,len(response['query_result']['data']['rows'])):
# print(response['query_result']['data']['rows'])
# get_prs(response['query_result']['data']['rows'][i].get('REPO'),get_commits(response['query_result']['data']['rows'][i].get('REPO'),response['query_result']['data']['rows'][i].get('FROM_VERSION'), response['query_result']['data']['rows'][i].get('TO_VERSION'))).get('merged_at')
try:
repo = response['query_result']['data']['rows'][i].get('REPO')
from_version = response['query_result']['data']['rows'][i].get('FROM_VERSION')
to_version = response['query_result']['data']['rows'][i].get('TO_VERSION')
# print(get_prs(repo,get_commits(repo,from_version, to_version)))
pull_requests = get_prs(repo,get_commits(repo,from_version, to_version)[0])
##pack into all one return
# df.append returns a new frame and df itself is never reassigned, so
# final_df is rebuilt from df plus the current row on every pass.
final_df = df.append({
'repo':repo,
'title': pull_requests.get('title'),
'branch': get_commits(repo,from_version, to_version)[1],
'created_at': pull_requests.get('created_at'),
'merged_at': pull_requests.get('merged_at'),
'authored_by': pull_requests.get('user').get('login'),
'merged_by': pull_requests.get('merged_by').get('login'),
'number': get_commits(repo,from_version, to_version)[0],
'from_version': from_version,
'to_version': to_version,}, ignore_index = True)
# print(get_commits(repo,from_version, to_version))
**HERE, WHEN UNCOMMENTED, PRINTS ALL RECORDS I WANT APPENDED **
# print(final_df.head(10))
# Any exception raised while handling a row is swallowed and the row is skipped.
except Exception:
pass
# 'title':, 'branch',
# 'merged_at', 'created_at', 'authored_by', 'merged_by',
# 'from_version': response['query_result']['data']['rows'][i].get('FROM_VERSION'), 'to_version':response['query_result']['data']['rows'][i].get('TO_VERSION')},
# ignore_index = True)
**BELOW IS WHERE IT PRINTS ONLY 1 RECORD **
print(final_df)
# final_df = json.loads(final_df.to_json(orient = 'records'))
# gec.json_to_s3(final_df, glob_common_vars.s3_resource,glob_common_vars.s3_bucket_name, 'test/test.json.gzip')
return_deploy_events()
I think the problem is that you assign every row to the same variable,
so only the last row is left when you print at the end. Try collecting each row in a result list instead.
def return_deploy_events():
    """Collect one row per deploy event and return them as a single DataFrame.

    Posts the stored query and, for every row of the query result, looks up
    the merge commit between FROM_VERSION and TO_VERSION and then the matching
    pull request. Rows are gathered as plain dicts and turned into a DataFrame
    once at the end, so no row overwrites another.

    Returns:
        A DataFrame with the columns of the module-level ``df`` and one row
        per event that could be resolved. It is empty when the response
        contains ``"jobs"`` or when no event could be resolved.
    """
    response = requests.request(
        'POST', pgv.url, params={'api_key': pgv.key}, json=pgv.query_params
    )
    response = response.json()
    rows = []
    if "jobs" in response:
        # NOTE(review): as in the original, this branch only waits and builds
        # no rows; confirm whether a retry of the query is intended here.
        time.sleep(5)
    else:
        for event in response['query_result']['data']['rows']:
            repo = event.get('REPO')
            from_version = event.get('FROM_VERSION')
            to_version = event.get('TO_VERSION')
            try:
                # Resolve the merge commit once and reuse the result; the
                # original issued the same request three times per event.
                commit_info = get_commits(repo, from_version, to_version)
                if commit_info is None:
                    # No merge commit found between the two versions.
                    continue
                number, branch = commit_info
                pull_requests = get_prs(repo, number)
                rows.append({
                    'repo': repo,
                    'title': pull_requests.get('title'),
                    'branch': branch,
                    'created_at': pull_requests.get('created_at'),
                    'merged_at': pull_requests.get('merged_at'),
                    'authored_by': pull_requests.get('user').get('login'),
                    'merged_by': pull_requests.get('merged_by').get('login'),
                    'number': number,
                    'from_version': from_version,
                    'to_version': to_version,
                })
            except Exception as exc:
                # Still best effort (a failing event is skipped), but the
                # failure is reported instead of being silently discarded.
                print(f'skipping {repo} {from_version}...{to_version}: {exc!r}')

    # Build the frame once from all collected rows, in the column order of df.
    final_df = pd.DataFrame(rows, columns=df.columns)
    print(final_df)
    return final_df
I just added two lines of code, but I hope it works.

Store result from URL into Pandas Data frame

I am new to pandas and Python. My requirement is as follows:
I pass 100 postcodes to a URL in a for loop and try to extract the latitude and longitude for each postcode.
I need to save the results in a DataFrame. Below is the code I am using.
# Code from the question: read the customers, derive one lookup URL per row
# (spaces removed from POSTCODE), then request each URL.
query_cust = "select custMasterID,Full_Name,POSTCODE from DMON.BANK_CUSTOMERS"
df_cust = pd.read_sql(query_cust, con=con_str)
df_cust["URL"] = "https://api.getthedata.com/postcode/" + df_cust['POSTCODE'].str.replace(" ", "")
# Despite its name, `column` is one URL string per iteration.
for column in df_cust["URL"]:
# print(column)
response = requests.get(column)
response_text = response.text
#df = json.loads(response_text)['data']
parse_json = json.loads(response_text)
# This is the statement shown in the traceback quoted in the question.
df_cust["Lat"] = pd.json_normalize(parse_json['data']['latitude'])
# NOTE(review): assigning to df_cust["Long"] sets the whole column, not just
# the row that belongs to the current URL.
df_cust["Long"] = parse_json['data']['longitude']
print(df_cust)
Below is the error I get when I run it:
df_cust["Lat"] = pd.json_normalize(parse_json['data']['latitude'])
in _json_normalize
raise NotImplementedError
NotImplementedError
You don't need to use json_normalize to get what you need from the response data. Just iterate through each row of the dataframe and update the values:
import pandas as pd
import json
import requests
# Show every column and row when the frame is printed.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

# Sample data built in a single step; DataFrame.append no longer exists in
# pandas 2.x. The last entry is deliberately a wrong postal code.
df_cust = pd.DataFrame({'POSTCODE': ['SW1A-1AA', 'WC2B-4AB', 'ASDF-QWE']})

for i, row in df_cust.iterrows():
    lookup_url = 'https://api.getthedata.com/postcode/' + row['POSTCODE'].replace('-', '')
    df_cust.at[i, 'URL'] = lookup_url
    response = requests.get(lookup_url)
    parse_json = json.loads(response.text)
    # When 'data' or one of the coordinates is missing, .get() yields None,
    # which is what the explicit else-branches of the original assigned.
    data = parse_json.get('data') or {}
    df_cust.at[i, 'LAT'] = data.get('latitude')
    df_cust.at[i, 'LON'] = data.get('longitude')
print(df_cust)
Output:
POSTCODE URL LAT LON
0 SW1A-1AA https://api.getthedata.com/postcode/SW1A1AA 51.501009 -0.141588
1 WC2B-4AB https://api.getthedata.com/postcode/WC2B4AB 51.514206 -0.119893
2 ASDF-QWE https://api.getthedata.com/postcode/ASDFQWE None None

Webscraping data from a json source, why i get only 1 row?

I am trying to get some information from a webshop with Python.
I tried this:
# Version from the question: request the product list, collect
# [name, gross price, url] per product and write the rows to a CSV file.
# NOTE(review): the original indentation is lost, so it cannot be confirmed
# which statements sit inside the for loop; the reported "only 1 row" symptom
# may depend on exactly that.
def proba():
my_url = requests.get('https://www.telekom.hu/shop/categoryresults/?N=10994&contractType=list_price&instock_products=1&Ns=sku.sortingPrice%7C0%7C%7Cproduct.displayName%7C0&No=0&Nrpp=9&paymentType=FULL')
data = my_url.json()
results = []
products = data['MainContent'][0]['contents'][0]['productList']['products']
for product in products:
name = product['productModel']['displayName']
# Use the priceItemSale gross price; fall back to priceItemToBase when that lookup fails.
try:
priceGross = product['priceInfo']['priceItemSale']['gross']
except:
priceGross = product['priceInfo']['priceItemToBase']['gross']
url = product['productModel']['url']
results.append([name, priceGross, url])
df = pd.DataFrame(results, columns = ['Name', 'Price', 'Url'])
# print(df) ## print df
# Writes to the same path on every call.
df.to_csv(r'/usr/src/Python-2.7.13/test.csv', sep=',', encoding='utf-8-sig',index = False )
# Driver loop from the question: call proba() repeatedly while the current
# time string is below "23:59:59".
while True:
mytime=datetime.now().strftime("%H:%M:%S")
# Zero-padded HH:MM:SS strings are compared as text.
while mytime < "23:59:59":
# NOTE(review): `print mytime` is Python 2 statement syntax.
print mytime
proba()
mytime=datetime.now().strftime("%H:%M:%S")
The webshop has 9 items, but I see only 1 row in the CSV file.
I am not entirely sure what you intend as the end result. Do you want to update an existing file, or get the data and write it all out in one go? An example of the latter is shown below: the function returns each new DataFrame, and I add it to an overall DataFrame.
import requests
from datetime import datetime
import pandas as pd
def proba():
    """Fetch the product list and return it as a DataFrame.

    Returns:
        A DataFrame with the columns ``Name``, ``Price`` and ``Url`` and one
        row per entry of the ``products`` list in the JSON response.
    """
    my_url = requests.get('https://www.telekom.hu/shop/categoryresults/?N=10994&contractType=list_price&instock_products=1&Ns=sku.sortingPrice%7C0%7C%7Cproduct.displayName%7C0&No=0&Nrpp=9&paymentType=FULL')
    data = my_url.json()
    products = data['MainContent'][0]['contents'][0]['productList']['products']
    results = []
    for product in products:
        name = product['productModel']['displayName']
        try:
            priceGross = product['priceInfo']['priceItemSale']['gross']
        except (KeyError, TypeError):
            # Only a missing or null price entry triggers the fallback; a bare
            # except would also hide KeyboardInterrupt and unrelated bugs.
            priceGross = product['priceInfo']['priceItemToBase']['gross']
        url = product['productModel']['url']
        results.append([name, priceGross, url])
    return pd.DataFrame(results, columns=['Name', 'Price', 'Url'])
headers = ['Name', 'Price', 'Url']
df = pd.DataFrame(columns=headers)
# Keep the individual results in a list and concatenate once per write;
# calling pd.concat on every poll re-copies all rows collected so far.
# The empty, labelled frame is the first entry so concat always has input.
frames = [df]
# NOTE(review): the original indentation is lost; this assumes the CSV is
# written after the inner polling loop ends, inside the outer loop.
while True:
    mytime = datetime.now().strftime("%H:%M:%S")
    # Zero-padded HH:MM:SS strings compare in chronological order.
    while mytime < "23:59:59":
        print(mytime)
        frames.append(proba())
        mytime = datetime.now().strftime("%H:%M:%S")
    df = pd.concat(frames)
    df.to_csv(r"C:\Users\User\Desktop\test.csv", encoding='utf-8')

Categories