Python / Pandas Dataframe: Automatically fill in missing rows - python

My goal is to ultimately create a scatter plot with date on the x-axis and won delegates (of each candidate) on the y-axis. I'm unsure of how to "fill in the blanks" when it comes to missing dates. I've attached a picture of the table I get.
For example, I'm trying to put March 1 as the date for Alaska, Arkansas, etc. to make it possible to plot the data.
# CREATE DATAFRAME WITH DELEGATE WON/TARGET INFORMATION
import requests
from lxml import html
import pandas
url = "http://projects.fivethirtyeight.com/election-2016/delegate-targets/"
response = requests.get(url)
doc = html.fromstring(response.text)
# BUG FIX: attribute predicates use '@class', not '#class' - the original
# path raised an ElementPath SyntaxError before any table was found.
tables = doc.findall('.//table[@class="delegates desktop"]')
election = tables[0]  # first (and only) matching table on the page
election_rows = election.findall('.//tr')
def extractCells(row, isHeader=False):
    """Return the text of every cell in *row*.

    Header rows are read from <th> elements when isHeader is True,
    ordinary data rows from <td> elements otherwise.
    """
    tag = './/th' if isHeader else './/td'
    return [cell.text_content() for cell in row.findall(tag)]
def parse_options_data(table):
    """Parse the fivethirtyeight delegate <table> into a DataFrame.

    Row 1 of the table holds the column titles and the remaining rows hold
    one state/territory each.  Each candidate cell arrives with the won and
    target counts fused together, so the trailing digits are split out into
    separate "<name> Target Delegates" columns and the leading digits are
    kept as the won count.
    """
    rows = table.findall(".//tr")
    data = [extractCells(row, isHeader=False) for row in rows[2:]]
    trumpdata = "Trump Won Delegates"
    cruzdata = "Cruz Won Delegates"
    kasichdata = "Kasich Won Delegates"
    data = pandas.DataFrame(data, columns=["Date", "State or Territory", "Total Delegates", trumpdata, cruzdata, kasichdata, "Rubio"])
    # Trailing run of digits = the target count; leading run = the won count.
    data.insert(4, "Trump Target Delegates", data[trumpdata].str.extract(r'(\d{0,3}$)'))
    data.insert(6, "Cruz Target Delegates", data[cruzdata].str.extract(r'(\d{0,3}$)'))
    data.insert(8, "Kasich Target Delegates", data[kasichdata].str.extract(r'(\d{0,3}$)'))
    # 'columns=' keyword: the positional axis argument was removed in pandas 2.x.
    data = data.drop(columns='Rubio')
    data[trumpdata] = data[trumpdata].str.extract(r'(^\d{0,3})')
    data[cruzdata] = data[cruzdata].str.extract(r'(^\d{0,3})')
    data[kasichdata] = data[kasichdata].str.extract(r'(^\d{0,3})')
    # BUG FIX: the original returned the undefined name ``df`` (NameError at
    # call time); the parsed frame is ``data``.  The unused ``header`` local
    # (extractCells(rows[1], isHeader=True)) was also dropped.
    return data
# Parse the scraped table; wrapping the result in DataFrame(...) makes an
# independent copy while the walrus keeps ``election_data`` bound as before.
df = pandas.DataFrame(election_data := parse_options_data(election))
df

You could do,
# Replaces every NaN with the string 'March 1'.  NOTE(review): fillna returns
# a new frame - assign it back (data = data.fillna(...)) to keep the result.
data.fillna('March 1')
I would advise you to go through the documentation
http://pandas.pydata.org/pandas-docs/stable/10min.html

Related

How to save the results of a function as a new CSV?

The code is required to take addresses from a CSV file and then use a function to compute the corresponding latitudes and longitudes. I get the correct latitudes and longitudes, but I am unable to save them to a new CSV file.
import requests
import urllib.parse
import pandas as pd
#function to get the Coordinates:
def lat_long(add):
    """Geocode *add* via Nominatim and return (lat, lon) as strings.

    BUG FIX: the original printed the coordinates but returned None, which
    is exactly why the results could not be collected and saved; return the
    pair so callers can store it.
    """
    url = 'https://nominatim.openstreetmap.org/search/'+urllib.parse.quote(add)+'?format=json'
    response = requests.get(url).json()
    print(response[0]["lat"], response[0]["lon"])
    return response[0]["lat"], response[0]["lon"]
#function is called to get the 5 Address Values from the CSV File and pass on to the function
df = pd.read_csv('C:\\Users\\Umer Abbas\\Desktop\\lat_long.csv')
print("Latitude","","Longitude")
# .at is the public scalar accessor; df._get_value is private pandas API and
# can vanish between releases.  The dead ``i = 0`` initialiser is dropped -
# the for statement rebinds i itself.
for i in range(0, 5):
    add = df.at[i, 'Address']
    lat_long(add)
Output is:
Latitude Longitude
34.0096961 71.8990106
34.0123846 71.5787458
33.6038766 73.048136
33.6938118 73.0651511
24.8546842 67.0207055
I want to save this output into a new file and I am unable to get the results.
Just a small modification might help
def lat_long(add):
    """Geocode *add* with Nominatim; echo the coordinates and return (Lat, Long)."""
    url = ('https://nominatim.openstreetmap.org/search/'
           + urllib.parse.quote(add) + '?format=json')
    response = requests.get(url).json()
    print(response[0]["lat"], response[0]["lon"])
    Lat, Long = response[0]["lat"], response[0]["lon"]
    return Lat, Long
Lat_List = []
Long_List = []
df = pd.read_csv('C:\\Users\\Umer Abbas\\Desktop\\lat_long.csv')
print("Latitude","","Longitude")
for i in range(0, 5):
    add = df.at[i, 'Address']  # public accessor; _get_value is private API
    # BUG FIX: call lat_long once per address and unpack - the original
    # called it twice per row, doubling the network requests.
    Lat, Long = lat_long(add)
    Lat_List.append(Lat)
    Long_List.append(Long)
# BUG FIX: the original passed an undefined name ``data`` to DataFrame and
# had unterminated string literals in both the column list ('Longitude]) and
# the output filename ("LatLong.csv) - build the frame from the two lists.
df1 = pd.DataFrame({'Latitude': Lat_List, 'Longitude': Long_List})
df1.to_csv("LatLong.csv")
#one line of change here
def lat_long(add):
    """Look *add* up on Nominatim, print the coordinates, and return them."""
    query = urllib.parse.quote(add)
    url = 'https://nominatim.openstreetmap.org/search/' + query + '?format=json'
    response = requests.get(url).json()
    print(response[0]["lat"], response[0]["lon"])
    return response[0]["lat"], response[0]["lon"] # return the lat and long
# three lines added here
df = pd.read_csv('C:\\Users\\Umer Abbas\\Desktop\\lat_long.csv')
l = []  # define empty list; collects one (lat, lon) tuple per address
print("Latitude","","Longitude")
for i in range(0, 5):
    add = df.at[i, 'Address']  # public accessor; _get_value is private API
    l.append(lat_long(add))  # append to the empty l
# BUG FIX: lat_long returns (lat, lon), so the columns must be labelled in
# that order - the original wrote ['Longitude', 'Latitude'], mislabelling
# every value in the output file.
pd.DataFrame(l, columns=['Latitude', 'Longitude']).to_csv('test.csv', sep=' ')

How would I convert this Python Script to PowerShell?

I'm really struggling to figure out how to translate a Python script to PowerShell.
The code is:
list_files = ['DC-JAN-2017.xlsx', 'DC-FEB-2017.xlsx', 'DC-MAR-2017.xlsx','DC-APR-2017.xlsx', 'DC-MAY-2017.xlsx', 'DC-JUN-2017.xlsx','DC-JUL-2017.xlsx', 'DC-AUG-2017.xlsx', 'DC-SEP-2017.xlsx','DC-OCT-2017.xlsx', 'DC-NOV-2017.xlsx', 'DC-DEC-2017.xlsx']
# Pair each filename with its 3-letter month code (characters 3-5, e.g. 'JAN').
zip_loop = zip(list_files, [i[3:6] for i in list_files])
# Final report DataFrame
df_report = pd.DataFrame()
for file_name, month in zip_loop:
    # Import and Clean Data
    # BUG FIX: the loop variable is ``file_name``; ``file_raw`` was undefined.
    df_clean = clean(file_name, month)
    # Build Monthly report
    df_month = process_month(df_clean, month)
    # Merge with previous Months report
    if df_report.empty:  # BUG FIX: the original dropped the ':' (SyntaxError)
        df_report = df_month
    else:
        df_report = df_report.merge(df_month, on='index')
# Save Final Report
df_report.to_excel('Final Report.xlsx')
What i've tried so far
# NOTE(review): this is Python syntax transliterated with '$' sigils - it is
# not valid PowerShell and will not parse.  Concrete mismatches are flagged
# below; a faithful port also needs an Excel library (ImportExcel or COM),
# since pandas has no PowerShell equivalent.
$list_files = ['DC-JAN-2017.xlsx', 'DC-FEB-2017.xlsx', 'DC-MAR-2017.xlsx','DC-APR-2017.xlsx', 'DC-MAY-2017.xlsx', 'DC-JUN-2017.xlsx',
'DC-JUL-2017.xlsx', 'DC-AUG-2017.xlsx', 'DC-SEP-2017.xlsx','DC-OCT-2017.xlsx', 'DC-NOV-2017.xlsx', 'DC-DEC-2017.xlsx']
# ^ PowerShell arrays are written @('a','b'), not [...]
$zip_loop = $zip($list_files, [$i[3:6] for $i in $list_files])
# ^ no zip() and no list comprehensions in PowerShell; use a foreach loop
#   and $name.Substring(3,3) for the month slice
# Final report DataFrame
$df_report = $pd.DataFrame()
# ^ $pd does not exist; there is no pandas in PowerShell
for $file_name, $month in $zip_loop:
# ^ the loop form is: foreach ($pair in $zip_loop) { ... } - 'in' only
#   appears inside the foreach parentheses, and the body needs braces
# Import and Clean Data
$df_clean = clean($file_raw, $month)
# ^ function calls take space-separated arguments: clean $file_name $month
#   (also $file_raw is undefined - the loop variable is $file_name)
# Build Monthly report
$df_month = process_month($df_clean, $month)
# Merge with previous Months report
if $df_report.empty
# ^ conditions need parentheses and a brace block: if ($df_report.empty) { ... }
$df_report = $df_month
else:
# ^ 'else' takes a brace block, not a colon
$df_report = $df_report.merge($df_month, on = 'index')
# ^ named arguments like on='index' are a pandas idiom, not PowerShell
# Save Final Report
$df_report.to_excel('Final Report.xlsx')
I'm not too sure what the "in" and "on" statements are supposed to be in PowerShell.

Folium put markers in marker clusters AND in layers based on a value

So, I'm working with a dataset of stores, each store with its lat, lng, name and category.
Since we are talking about several hundreds or even thousands of stores, I'm using marker clusters, and they are working fine...
Now, I need to also set these stores in different layers based on their category, so that when I click on say "electronics stores", I only get those stores in the map (and they should be removed from the marker cluster as well)
Consider this sample data:
# Sample data: one record per store as (lat, lng, name, category).
stores = [
    (-23.5578906, -46.6665546, 'store1', 'electronics'),
    (-23.562711, -46.674363, 'store2', 'home goods'),
    (-23.5642399, -46.6681833, 'store3', 'beauty'),
    (-23.584167, -46.678497, 'store4', 'electronics'),
    (-23.5956238, -46.6865377, 'store5', 'electronics'),
    (-23.5868682, -46.6773554, 'store6', 'home goods'),
    (-23.6011096, -46.6739275, 'store7', 'beauty'),
    (-23.6087354, -46.6973713, 'store8', 'home goods'),
    (-23.5943515, -46.6846959, 'store9', 'beauty'),
]
My code works ok for putting the markers in clusters, but when I try to also add them to layers based on their categories it doesn't work. I get no errors, and the map "loads", but the markers and clusters don't get displayed, and I get no layers on the map.
This is my code:
mymap = folium.Map(location=[y_map, x_map], zoom_start=11, tiles=None)
folium.TileLayer(name="Mapbox Bright", control=False).add_to(mymap)
markers_list = []
all_gp = []
# BUG FIX: the original omitted the 'in' keyword in this for statement,
# which is a SyntaxError before anything runs.
for lat, lng, name, category in zip(df_stores['LAT'],
                                    df_stores['LNG'],
                                    df_stores['NAME'],
                                    df_stores['CATEGORY']):
    html = '''NAME: ''' + name + '''<br>CATEGORY: ''' + category
    iframe = folium.IFrame(html,
                           width=300,
                           height=130)
    popup = folium.Popup(iframe,
                         max_width=300)
    lead_marker = folium.Marker(
        [lat, lng],
        popup=popup,
        icon=folium.Icon(color='purple', icon='glyphicon-cutlery', prefix='glyphicon')
    )
    markers_list.append(lead_marker)
    all_gp.append(category)
mCluster = MarkerCluster(name="Stores").add_to(mymap)
for pnt in markers_list:
    pnt.add_to(mCluster)
######################################################################
# Create one FeatureGroup per distinct category.
unique_gp = list(set(all_gp))
vlist = []
for i, k in enumerate(unique_gp):
    # NOTE(review): writing through locals() is fragile; vlist is the only
    # handle actually used afterwards.
    locals()[f'point_layer{i}'] = folium.FeatureGroup(name=k)
    vlist.append(locals()[f'point_layer{i}'])
# Match each marker (by category, in order) to its layer.
pl_group = []
for n in all_gp:
    for v in vlist:
        if n == vars(v)['layer_name']:
            pl_group.append(v)
# NOTE(review): a folium element keeps only one parent, so adding a marker
# to a FeatureGroup here detaches it from the cluster above - likely why
# nothing renders.  Using one MarkerCluster per category (as in the answer
# below the question) avoids the conflict entirely.
for pnt, pg in zip(markers_list, pl_group):
    pnt.add_to(pg)
    pg.add_to(mymap)
######################################################################
folium.LayerControl().add_to(mymap)
mymap.add_child(MeasureControl())
mymap.render()
mymap.save('stores.html')
The code between the lines of ############ I took from another post here (How to add categorical layered data to LayerControl() in python Folium map?) and adapted it to my code, but it seems I'm missing something. If I take out the last for cycle from the code, the map loads correctly with its clusters working ok, any suggestions?
I will answer with the understanding that the question is how to create a layer per category, add each marker to the layer it belongs to, and control showing/hiding with a layer control. First, read each row's columns from the data frame and build the pop-up information. Then, based on the row's category, add the marker to the pre-prepared per-category cluster layer.
import pandas as pd
import numpy as np
import folium
from folium.plugins import MarkerCluster

# One record per store: (lat, lng, name, category).
stores = [(-23.5578906, -46.6665546, 'store1', 'electronics'),
          (-23.562711, -46.674363, 'store2', 'home goods'),
          (-23.5642399, -46.6681833, 'store3', 'beauty'),
          (-23.584167, -46.678497, 'store4', 'electronics'),
          (-23.5956238, -46.6865377, 'store5', 'electronics'),
          (-23.5868682, -46.6773554, 'store6', 'home goods'),
          (-23.6011096, -46.6739275, 'store7', 'beauty'),
          (-23.6087354, -46.6973713, 'store8', 'home goods'),
          (-23.5943515, -46.6846959, 'store9', 'beauty')]
df = pd.DataFrame(stores, columns=['LAT', 'LNG', 'NAME', 'CATEGORY'])

mymap = folium.Map(location=[df['LAT'].mean(), df['LNG'].mean()], zoom_start=12)

# One MarkerCluster per category; LayerControl can then toggle each group
# independently while keeping the clustering behaviour.
clusters = {
    'home goods': MarkerCluster(name="home goods").add_to(mymap),
    'electronics': MarkerCluster(name="electronics").add_to(mymap),
    'beauty': MarkerCluster(name="beauty").add_to(mymap),
}

for row in df.itertuples():
    popup_html = '''NAME: ''' + row.NAME + '''<br>CATEGORY: ''' + row.CATEGORY
    frame = folium.IFrame(popup_html, width=300, height=130)
    marker = folium.Marker(
        location=(row.LAT, row.LNG),
        popup=folium.Popup(frame, max_width=300),
        icon=folium.Icon(color='purple', icon='glyphicon-cutlery', prefix='glyphicon'),
    )
    # Dispatch on category; rows with an unknown category are skipped,
    # matching the original if/elif chain.
    target = clusters.get(row.CATEGORY)
    if target is not None:
        target.add_child(marker)

folium.LayerControl().add_to(mymap)
mymap

Retrieving data from the Air Quality Index (AQI) website through the API and only receiving a small number of stations

I'm working on a personal project and I'm trying to retrieve air quality data from the https://aqicn.org website using their API.
I've used this code, which I've copied and adapted for the city of Bucharest as follows:
import pandas as pd
import folium
import requests

# GET data from AQI website through the API
base_url = "https://api.waqi.info"
path_to_file = "~/path"

# Got token from:- https://aqicn.org/data-platform/token/#/
with open(path_to_file) as f:
    contents = f.readlines()
# BUG FIX: readlines() keeps the trailing newline, so embedding contents[0]
# in the URL corrupted the query string - strip it.
key = contents[0].strip()

# (lat, long)-> bottom left, (lat, lon)-> top right
latlngbox = "44.300264,25.920181,44.566991,26.297836"  # For Bucharest
trail_url = f"/map/bounds/?token={key}&latlng={latlngbox}"
my_data = pd.read_json(base_url + trail_url)  # Joined parts of URL
print('columns->', my_data.columns)  # 2 cols: 'status' and 'data' (JSON)

### Built a dataframe from the json file
all_rows = []
for each_row in my_data['data']:
    all_rows.append([each_row['station']['name'],
                     each_row['lat'],
                     each_row['lon'],
                     each_row['aqi']])
df = pd.DataFrame(all_rows, columns=['station_name', 'lat', 'lon', 'aqi'])

# Cleaned the DataFrame
df['aqi'] = pd.to_numeric(df.aqi, errors='coerce')  # Invalid parsing to NaN
# Remove NaN entries in col
df1 = df.dropna(subset=['aqi'])
Unfortunately it only retrieves 4 stations whereas there are many more available on the actual site. In the API documentation the only limitation I saw was for "1,000 (one thousand) requests per second" so why can't I get more of them?
Also, I've tried to modify the lat-long values and managed to get more stations, but they were outside the city I was interested in.
Here is a view of the actual perimeter I've used in the embedded code.
If you have any suggestions as of how I can solve this issue, I'd be very happy to read your thoughts. Thank you!
Try using waqi through aqicn... not exactly a clean API but I found it to work quite well
import pandas as pd

url1 = 'https://api.waqi.info'
# Get token from:- https://aqicn.org/data-platform/token/#/
token = 'XXX'
box = '113.805332,22.148942,114.434299,22.561716' # polygon around HongKong via bboxfinder.com
url2 = f'/map/bounds/?latlng={box}&token={token}'
my_data = pd.read_json(url1 + url2)

# One row per station: name, position and the raw AQI reading.
all_rows = [[entry['station']['name'], entry['lat'], entry['lon'], entry['aqi']]
            for entry in my_data['data']]
df = pd.DataFrame(all_rows, columns=['station_name', 'lat', 'lon', 'aqi'])
From there it's easy to plot
# Coerce AQI to numeric; non-numeric readings (e.g. '-') become NaN.
df['aqi'] = pd.to_numeric(df.aqi,errors='coerce')
print('with NaN->', df.shape)
# Drop stations without a usable reading, keep only the plot columns.
df1 = df.dropna(subset = ['aqi'])
df2 = df1[['lat', 'lon', 'aqi']]
init_loc = [22.396428, 114.109497]  # map centre (Hong Kong)
max_aqi = int(df1['aqi'].max())
print('max_aqi->', max_aqi)
m = folium.Map(location = init_loc, zoom_start = 5)
# NOTE(review): HeatMap lives in folium.plugins - it must be imported
# separately (from folium.plugins import HeatMap) for this to run.
heat_aqi = HeatMap(df2, min_opacity = 0.1, max_val = max_aqi,
                   radius = 60, blur = 20, max_zoom = 2)
m.add_child(heat_aqi)
m
Or as such
centre_point = [22.396428, 114.109497]  # map centre (Hong Kong)
# NOTE(review): 'Stamen Terrain' tiles were retired upstream in 2023; recent
# folium versions may need a different tile set or explicit attribution.
m2 = folium.Map(location = centre_point,tiles = 'Stamen Terrain', zoom_start= 6)
# One marker per station, coloured by severity of the reading.
for idx, row in df1.iterrows():
    lat = row['lat']
    lon = row['lon']
    station = row['station_name'] + ' AQI=' + str(row['aqi'])
    station_aqi = row['aqi']
    # Colour bands: >300 red, >200 orange, otherwise green.
    if station_aqi > 300:
        pop_color = 'red'
    elif station_aqi > 200:
        pop_color = 'orange'
    else:
        pop_color = 'green'
    folium.Marker(location= [lat, lon],
                  popup = station,
                  icon = folium.Icon(color = pop_color)).add_to(m2)
m2
checking for stations within HK, returns 19
# Filter to stations whose name contains 'HongKong' (19 rows at time of writing).
df[df['station_name'].str.contains('HongKong')]

How to store the result of a function in a dataframe with the related column

I return data from a function as a dictionary and want to store it in a data frame.
When I run it using a for loop, I get an error.
import pyowm
from pyowm.utils import config
from pyowm.utils import timestamps

# OWM client; the placeholder below must be replaced with a real key.
owm = pyowm.OWM(" your free api key from OpenWeatherMap")
mgr = owm.weather_manager()  # manager used to query current weather per city
data =[]  # accumulator for one weather dict per city
# Create function to get weather details
def get_weather(city):
    """Fetch the current conditions for *city* and return them as a dict."""
    observation = mgr.weather_at_place(city)
    w = observation.weather
    return {
        "City": city,
        "Wind_Speed": w.wind()['speed'],
        "Temp": w.temperature('celsius')['temp'],
        "Max_temp": w.temperature('celsius')['temp_max'],
        "Min_temp": w.temperature('celsius')['temp_min'],
        "Humidity": w.humidity,
        "Pressure": w.pressure['press'],
    }
import pandas as pd

# BUG FIX: the original discarded get_weather's return value and then
# appended the still-empty ``data`` list to an undefined ``df``; it also
# used DataFrame.append, which was removed in pandas 2.x.  Collect each
# city's dict, then build the frame in one step.
for city in df_location['City']:
    data.append(get_weather(city))
df = pd.DataFrame(data)
Want to store that weather details in same df with relative city.
Current df_location is like:

Categories