How to save the results of a function as a new CSV? - python

The code needs to take addresses from a CSV file and then use a function to compute the corresponding latitudes and longitudes. I get the correct latitudes and longitudes, but I am unable to save them to a new CSV file.
import requests
import urllib.parse
import pandas as pd

# function to get the coordinates:
def lat_long(add):
    url = 'https://nominatim.openstreetmap.org/search/' + urllib.parse.quote(add) + '?format=json'
    response = requests.get(url).json()
    print(response[0]["lat"], response[0]["lon"])
    return

# read the first 5 address values from the CSV file and pass each one to the function
df = pd.read_csv('C:\\Users\\Umer Abbas\\Desktop\\lat_long.csv')
print("Latitude", "", "Longitude")
for i in range(0, 5):
    add = df._get_value(i, 'Address')
    lat_long(add)
Output is:
Latitude Longitude
34.0096961 71.8990106
34.0123846 71.5787458
33.6038766 73.048136
33.6938118 73.0651511
24.8546842 67.0207055
I want to save this output to a new file, but I have not been able to get that working.

Just a small modification might help:
def lat_long(add):
    url = 'https://nominatim.openstreetmap.org/search/' + urllib.parse.quote(add) + '?format=json'
    response = requests.get(url).json()
    print(response[0]["lat"], response[0]["lon"])
    Lat = response[0]["lat"]
    Long = response[0]["lon"]
    return Lat, Long

Lat_List = []
Long_List = []
df = pd.read_csv('C:\\Users\\Umer Abbas\\Desktop\\lat_long.csv')
print("Latitude", "", "Longitude")
for i in range(0, 5):
    add = df._get_value(i, 'Address')
    Lat, Long = lat_long(add)   # unpack once instead of calling the function twice
    Lat_List.append(Lat)
    Long_List.append(Long)

df1 = pd.DataFrame(columns=['Latitude', 'Longitude'])
df1['Latitude'] = Lat_List
df1['Longitude'] = Long_List
df1.to_csv("LatLong.csv")

# one line of change here
def lat_long(add):
    url = 'https://nominatim.openstreetmap.org/search/' + urllib.parse.quote(add) + '?format=json'
    response = requests.get(url).json()
    print(response[0]["lat"], response[0]["lon"])
    return response[0]["lat"], response[0]["lon"]   # return the lat and long

# three lines added here
df = pd.read_csv('C:\\Users\\Umer Abbas\\Desktop\\lat_long.csv')
l = []   # define an empty list
print("Latitude", "", "Longitude")
for i in range(0, 5):
    add = df._get_value(i, 'Address')
    l.append(lat_long(add))   # append each (lat, lon) pair to l

# create a dataframe and write it out as csv
pd.DataFrame(l, columns=['Latitude', 'Longitude']).to_csv('test.csv', sep=' ')
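If you also want the original address next to each coordinate pair, a small variation of the same idea (again assuming the 'Address' column from the question and the five rows processed above; the output layout is just one option):
out = pd.DataFrame(l, columns=['Latitude', 'Longitude'])
out['Address'] = df['Address'].head(5).values   # keep the source address alongside the coordinates
out.to_csv('test.csv', index=False)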

Related

Create merged df based on the url list [pandas]

I was able to extract the data from the url_query URL, but I would additionally like to get the data from the list of URLs built from the query['ids'] column of the dataframe. Please see the current logic below:
url = 'https://instancename.some-platform.com/api/now/table/data?display_value=true&'
team = 'query=group_name=123456789'
url_query = url + team

dataframe: query
                                ids
0  aaabbb1cccdddeee4ffggghhhhh5iijj
1  aa1bbb2cccdddeee5ffggghhhhh6iijj

issue_list = []
for issue in query['ids']:
    issue_list.append(f'https://instancename.some-platform.com/api/now/table/data?display_value=true&?display_value=true&query=group_name&sys_id={issue}')

response = requests.get(url_query, headers=headers, auth=auth, proxies=proxies)
data = response.json()

def api_response(k):
    dct = dict(
        event_id = k['number'],
        created_time = k['created'],
        status = k['status'],
        created_by = k['raised_by'],
        short_desc = k['short_description'],
        group = k['team']
    )
    return dct

raw_data = []
for p in data['result']:
    rec = api_response(p)
    raw_data.append(rec)

df = pd.DataFrame.from_records(raw_data)
The df built from the url_query response contains what I need, but I would also like to append the data from issue_list to the existing df. I don't know how to feed issue_list into the request. I tried response = requests.get(issue_list, headers=headers, auth=auth, proxies=proxies), but I get an "invalid schema" error.
You can create a list of DataFrames, requesting each per-id query q instead of url_query, and then join them together with concat:
dfs = []
for issue in query['ids']:
    q = f'https://instancename.some-platform.com/api/now/table/data?display_value=true&?display_value=true&query=group_name&sys_id={issue}'
    response = requests.get(q, headers=headers, auth=auth, proxies=proxies)
    data = response.json()

    raw_data = [api_response(k) for k in data['result']]
    df = pd.DataFrame.from_records(raw_data)
    dfs.append(df)

df = pd.concat(dfs, ignore_index=True)
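If some ids come back with an empty payload, the same loop can skip them before concatenating. A small, purely illustrative variation using the same names as above (not tested against the real instance):
dfs = []
for issue in query['ids']:
    q = f'https://instancename.some-platform.com/api/now/table/data?display_value=true&query=group_name&sys_id={issue}'
    response = requests.get(q, headers=headers, auth=auth, proxies=proxies)
    rows = response.json().get('result', [])   # empty list when the id has no records
    if rows:
        dfs.append(pd.DataFrame.from_records(api_response(k) for k in rows))

df = pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()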

Retrieving data from the Air Quality Index (AQI) website through the API and only receiving a small number of stations

I'm working on a personal project and I'm trying to retrieve air quality data from the https://aqicn.org website using their API.
I've used this code, which I've copied and adapted for the city of Bucharest as follows:
import pandas as pd
import folium
import requests

# GET data from the AQI website through the API
base_url = "https://api.waqi.info"
path_to_file = "~/path"

# Got token from: https://aqicn.org/data-platform/token/#/
with open(path_to_file) as f:
    contents = f.readlines()
key = contents[0]

# (lat, lon) -> bottom left, (lat, lon) -> top right
latlngbox = "44.300264,25.920181,44.566991,26.297836"   # for Bucharest
trail_url = f"/map/bounds/?token={key}&latlng={latlngbox}"
my_data = pd.read_json(base_url + trail_url)   # joined parts of the URL
print('columns->', my_data.columns)   # 2 cols: 'status' and 'data' (JSON)

### Build a dataframe from the JSON response
all_rows = []
for each_row in my_data['data']:
    all_rows.append([each_row['station']['name'],
                     each_row['lat'],
                     each_row['lon'],
                     each_row['aqi']])
df = pd.DataFrame(all_rows, columns=['station_name', 'lat', 'lon', 'aqi'])

# Clean the DataFrame
df['aqi'] = pd.to_numeric(df.aqi, errors='coerce')   # invalid parsing becomes NaN
# Remove NaN entries in the column
df1 = df.dropna(subset=['aqi'])
Unfortunately, it only retrieves 4 stations, whereas there are many more available on the actual site. The only limitation I saw in the API documentation was "1,000 (one thousand) requests per second", so why can't I get more of them?
Also, I've tried to modify the lat-long values and managed to get more stations, but they were outside the city I was interested in.
Here is a view of the actual perimeter I've used in the embedded code.
If you have any suggestions as of how I can solve this issue, I'd be very happy to read your thoughts. Thank you!
Try using waqi via aqicn... it's not exactly a clean API, but I found it to work quite well:
import pandas as pd
import folium
from folium.plugins import HeatMap

url1 = 'https://api.waqi.info'
# Get token from: https://aqicn.org/data-platform/token/#/
token = 'XXX'
box = '113.805332,22.148942,114.434299,22.561716'   # polygon around Hong Kong via bboxfinder.com
url2 = f'/map/bounds/?latlng={box}&token={token}'
my_data = pd.read_json(url1 + url2)

all_rows = []
for each_row in my_data['data']:
    all_rows.append([each_row['station']['name'], each_row['lat'], each_row['lon'], each_row['aqi']])
df = pd.DataFrame(all_rows, columns=['station_name', 'lat', 'lon', 'aqi'])
From there it's easy to plot:
df['aqi'] = pd.to_numeric(df.aqi,errors='coerce')
print('with NaN->', df.shape)
df1 = df.dropna(subset = ['aqi'])
df2 = df1[['lat', 'lon', 'aqi']]
init_loc = [22.396428, 114.109497]
max_aqi = int(df1['aqi'].max())
print('max_aqi->', max_aqi)
m = folium.Map(location = init_loc, zoom_start = 5)
heat_aqi = HeatMap(df2, min_opacity = 0.1, max_val = max_aqi,
radius = 60, blur = 20, max_zoom = 2)
m.add_child(heat_aqi)
m
Or like this:
centre_point = [22.396428, 114.109497]
m2 = folium.Map(location=centre_point, tiles='Stamen Terrain', zoom_start=6)

for idx, row in df1.iterrows():
    lat = row['lat']
    lon = row['lon']
    station = row['station_name'] + ' AQI=' + str(row['aqi'])
    station_aqi = row['aqi']
    if station_aqi > 300:
        pop_color = 'red'
    elif station_aqi > 200:
        pop_color = 'orange'
    else:
        pop_color = 'green'
    folium.Marker(location=[lat, lon],
                  popup=station,
                  icon=folium.Icon(color=pop_color)).add_to(m2)
m2
Checking for stations within HK returns 19:
df[df['station_name'].str.contains('HongKong')]
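The same bounds call can be pointed back at the Bucharest box from the question. A minimal sketch (token placeholder and box copied from the posts above; the shape is printed just to see how many stations the API returns for that box):
import pandas as pd

token = 'XXX'   # your aqicn token
box = '44.300264,25.920181,44.566991,26.297836'   # Bucharest bounds from the question
my_data = pd.read_json(f'https://api.waqi.info/map/bounds/?latlng={box}&token={token}')
all_rows = [[r['station']['name'], r['lat'], r['lon'], r['aqi']] for r in my_data['data']]
df_bucharest = pd.DataFrame(all_rows, columns=['station_name', 'lat', 'lon', 'aqi'])
print(df_bucharest.shape)   # number of stations returned for this box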

Webscraping data from a JSON source: why do I get only 1 row?

I'm trying to get some information from a webshop with Python. I tried this:
def proba():
    my_url = requests.get('https://www.telekom.hu/shop/categoryresults/?N=10994&contractType=list_price&instock_products=1&Ns=sku.sortingPrice%7C0%7C%7Cproduct.displayName%7C0&No=0&Nrpp=9&paymentType=FULL')
    data = my_url.json()
    results = []
    products = data['MainContent'][0]['contents'][0]['productList']['products']
    for product in products:
        name = product['productModel']['displayName']
        try:
            priceGross = product['priceInfo']['priceItemSale']['gross']
        except:
            priceGross = product['priceInfo']['priceItemToBase']['gross']
        url = product['productModel']['url']
        results.append([name, priceGross, url])
    df = pd.DataFrame(results, columns=['Name', 'Price', 'Url'])
    # print(df)   ## print df
    df.to_csv(r'/usr/src/Python-2.7.13/test.csv', sep=',', encoding='utf-8-sig', index=False)

while True:
    mytime = datetime.now().strftime("%H:%M:%S")
    while mytime < "23:59:59":
        print mytime
        proba()
        mytime = datetime.now().strftime("%H:%M:%S")
In this webshop there are 9 items, but I only see 1 row in the CSV file.
I'm not entirely sure what you intend as the end result. Do you want to update an existing file, or get the data and write it all out in one go? An example of the latter is shown below, where I add each new dataframe to an overall dataframe and use a return statement in the function to provide each new dataframe.
import requests
from datetime import datetime
import pandas as pd

def proba():
    my_url = requests.get('https://www.telekom.hu/shop/categoryresults/?N=10994&contractType=list_price&instock_products=1&Ns=sku.sortingPrice%7C0%7C%7Cproduct.displayName%7C0&No=0&Nrpp=9&paymentType=FULL')
    data = my_url.json()
    results = []
    products = data['MainContent'][0]['contents'][0]['productList']['products']
    for product in products:
        name = product['productModel']['displayName']
        try:
            priceGross = product['priceInfo']['priceItemSale']['gross']
        except:
            priceGross = product['priceInfo']['priceItemToBase']['gross']
        url = product['productModel']['url']
        results.append([name, priceGross, url])
    df = pd.DataFrame(results, columns=['Name', 'Price', 'Url'])
    return df

headers = ['Name', 'Price', 'Url']
df = pd.DataFrame(columns=headers)

while True:
    mytime = datetime.now().strftime("%H:%M:%S")
    while mytime < "23:59:59":
        print(mytime)
        dfCurrent = proba()
        mytime = datetime.now().strftime("%H:%M:%S")
        df = pd.concat([df, dfCurrent])
        df.to_csv(r"C:\Users\User\Desktop\test.csv", encoding='utf-8')

Error while trying to get a dictionary into a pandas dataframe, Python

I got "Pandas ValueError Arrays Must be All Same Length"
Before I start, I checked answers to similar problems and folks suggest to use something like:
DataFrame(dict([ (k,Series(v)) for k,v in d.iteritems() ]))
if you have only two values in dictionary or,
a = {'Links' : lines ,'Titles' : titles , 'Singers': finalsingers , 'Albums':finalalbums , 'Years' : years}
df = pd.DataFrame.from_dict(a, orient='index')
df.transpose()
But neither of them worked for me. What my code does is: it goes to each file in a directory, captures the name and last-modified time, opens the file, and uses it in a function called phash that returns a value. I think there could be a problem with the phash function; maybe it sometimes returns a null value.
So in my case data is something like this:
raw_data = {}
hash_11 = []
time_1 = []
file_name_1 = []

for file in date_file_list:
    try:
        #print(file[1])
        y = file[1]
        file_name = os.path.basename(y)   # extract just the filename, or #file_name = os.path.split(file[1])
        file_name = file_name.split('_-_')[0]
        file_name_1.append(file_name)
        #print(file_name)

        # convert date tuple to MM/DD/YYYY HH:MM:SS format
        #file_date = time.strftime("%m/%d/%y %H:%M:%S", file[0])
        time = file[0]
        time_1.append(time)

        img = Image.open(str(file[1]))
        hash_1 = imagehash.dhash(img)
        hash_11.append(hash_1)
        #hash_1 = str(hash_1)

        #data = [hash_1, time, file_name]
        #hamming_list.append(data)
        #print(file_name, hash_1, file_date)
        data = {'hash_1': hash_11, 'time': time_1, 'file_name': file_name_1}
        raw_data.update(data)
    except:
        pass

df = pd.DataFrame(raw_data, columns=['hash_1', 'time', 'file_name'])
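For reference, the dict-of-Series pattern quoted at the top of this question does handle lists of unequal length (iteritems() is the Python 2 spelling of items()). A minimal sketch with made-up lists rather than the image-hash data above:
import pandas as pd

d = {'hash_1': ['a1', 'b2', 'c3'], 'time': [1, 2], 'file_name': ['x.jpg']}
df = pd.DataFrame({k: pd.Series(v) for k, v in d.items()})   # shorter columns are padded with NaN
print(df)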

Python / Pandas Dataframe: Automatically fill in missing rows

My goal is to ultimately create a scatter plot with the date on the x-axis and each candidate's won delegates on the y-axis. I'm unsure how to "fill in the blanks" when it comes to missing dates. I've attached a picture of the table I get.
For example, I'm trying to put March 1 as the date for Alaska, Arkansas, etc. to make it possible to plot the data.
# CREATE DATAFRAME WITH DELEGATE WON/TARGET INFORMATION
import requests
from lxml import html
import pandas

url = "http://projects.fivethirtyeight.com/election-2016/delegate-targets/"
response = requests.get(url)
doc = html.fromstring(response.text)
tables = doc.findall('.//table[@class="delegates desktop"]')
election = tables[0]
election_rows = election.findall('.//tr')

def extractCells(row, isHeader=False):
    if isHeader:
        cells = row.findall('.//th')
    else:
        cells = row.findall('.//td')
    return [val.text_content() for val in cells]

def parse_options_data(table):
    rows = table.findall(".//tr")
    header = extractCells(rows[1], isHeader=True)
    data = [extractCells(row, isHeader=False) for row in rows[2:]]

    trumpdata = "Trump Won Delegates"
    cruzdata = "Cruz Won Delegates"
    kasichdata = "Kasich Won Delegates"
    data = pandas.DataFrame(data, columns=["Date", "State or Territory", "Total Delegates", trumpdata, cruzdata, kasichdata, "Rubio"])

    data.insert(4, "Trump Target Delegates", data[trumpdata].str.extract(r'(\d{0,3}$)'))
    data.insert(6, "Cruz Target Delegates", data[cruzdata].str.extract(r'(\d{0,3}$)'))
    data.insert(8, "Kasich Target Delegates", data[kasichdata].str.extract(r'(\d{0,3}$)'))
    data = data.drop('Rubio', 1)

    data[trumpdata] = data[trumpdata].str.extract(r'(^\d{0,3})')
    data[cruzdata] = data[cruzdata].str.extract(r'(^\d{0,3})')
    data[kasichdata] = data[kasichdata].str.extract(r'(^\d{0,3})')
    return data

election_data = parse_options_data(election)
df = pandas.DataFrame(election_data)
df
You could do:
data.fillna('March 1')
I would advise you to go through the documentation:
http://pandas.pydata.org/pandas-docs/stable/10min.html
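To limit the fill to the date column only, a minimal sketch (assuming the missing dates show up as NaN; if they come through as empty strings, use the second line instead):
data['Date'] = data['Date'].fillna('March 1')        # blanks stored as NaN
data['Date'] = data['Date'].replace('', 'March 1')   # blanks stored as empty strings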
