How can I construct a bar chart using the plt functions - Python

I did a web scrape of PS4 games and I want to construct a bar chart with my product_name variable as the x-axis labels and my price variable as the y values. I also want a pie chart of the cheapest PS4 games, using my price variable to determine what percentage of my data these 5 brands (all found under my Brand variable) represent: PlayStation, Ubisoft, Activision, Sega, and Electronic Arts. The pie chart would show the name and percentage of each brand, and the rest of the percentage would be labelled "others".
Any hints on how I can start?
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
import pandas as pd
import numpy as np
from collections import defaultdict
import re
import matplotlib.pyplot as plt

url = 'https://www.newegg.com/PS4-Video-Games/SubCategory/ID-3141'
with uReq(url) as uClient:
    page = uClient.read()
# parsing
page_soup = soup(page, "html.parser")
# grab products
containers = page_soup.findAll("div", {"class": "item-container"})
# save to file
filename = "products.csv"
# create two empty dictionaries
d = defaultdict(list)
d1 = defaultdict(list)
# for loop fills the dicts
for container in containers:
    # brand name
    brand = container.div.div.a.img["title"]
    # product name
    title = container.findAll("a", {"class": "item-title"})
    product_name = title[0].text
    # shipping
    shipping_container = container.findAll("li", {"class": "price-ship"})
    shipping = shipping_container[0].text.strip()
    # price column
    pricec = container.find("li", {"class": "price-current"})
    # remove surrounding white space
    price = pricec.text.strip()
    d['Product'].append(product_name)
    d['shipping'].append(shipping)
    d1['Product'].append(product_name)
    d1['Brand'].append(brand)
    d1['price'].append(price)
# create dataframes
df = pd.DataFrame(d)    # product and shipping
df1 = pd.DataFrame(d1)  # product, brand and price
# clean shipping column
df['shipping'] = df['shipping'].apply(lambda x: 0 if x == 'Free Shipping' else x)
# clean price column: extract the number and convert string to float
df1['price'] = df1['price'].str.extract(r'(\d+\.?\d+)').astype(float)
df['shipping'] = df['shipping'].apply(lambda x: 0 if x == 'Special Shipping' else x)  # probably should be handled in a special way
df['shipping'] = df['shipping'].apply(lambda x: x if x == 0 else re.sub("[^0-9]", "", x))
df['shipping'] = df['shipping'].astype(float)
# save dataframes to csv files
df.to_csv('dataframe.csv', index=False)
df1.to_csv('dataframe1.csv', index=False)
df2 = pd.merge(df, df1, how='inner')  # merge the two dataframes on the shared Product column
plt.bar(brand, product_name, color='blue')
plt.xlabel("Product")
plt.ylabel("Prince")
plt.title("PopularitY of Programming Language\n" + "Worldwide, Oct 2017 compared to a year ago")
plt.xticks(brand, product_name)
# Turn on the grid
plt.minorticks_on()
plt.grid(which='major', linestyle='-', linewidth='0.5', color='red')
# Customize the minor grid
plt.grid(which='minor', linestyle=':', linewidth='0.5', color='black')
plt.show()

I used the dataframe you saved as dataframe1.csv in your code.
The bar graph is straightforward: the first two arguments of plt.bar() are the product names and the prices.
The pie chart is a little trickier. You need to manipulate the dataframe so you only have 6 rows (your 5 chosen brands, plus "others"), and create another column with the calculated percentage.
Refer to the matplotlib documentation and online tutorials to make it all look pretty.
import pandas as pd
import matplotlib.pyplot as plt
df1 = pd.read_csv('dataframe1.csv')
plt.bar(df1.Product, df1.price)
plt.title("Game prices")
plt.xlabel("Product name")
plt.xticks(rotation=90)
plt.ylabel("Price ($)")
plt.show()
top5 = ["PlayStation", "Ubisoft", "Electronic Arts", "Activision", "Sega"]
# Change all the other brands to "other"
df1.loc[~df1.Brand.isin(top5), "Brand"] = "others"
# Change every price so each brand has its own total
df1.price = df1.groupby("Brand").price.transform("sum")
# Remove duplicates from the Brand column so we can easily calculate percentages
df1 = df1.drop_duplicates(subset="Brand")
# Create a new Percentage column for the percentages
df1["Percentage"] = df1.price / df1.price.sum() * 100
plt.pie(df1.Percentage, labels=df1.Brand, autopct="%1.2f%%")
plt.show()
Output: (bar chart and pie chart images omitted)
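As a follow-up design note, the transform/drop_duplicates steps above can be collapsed into a single groupby aggregation, and plt.pie normalizes the totals itself, so the explicit Percentage column is optional. A minimal alternative sketch, assuming the same dataframe1.csv:
import pandas as pd
import matplotlib.pyplot as plt

df1 = pd.read_csv('dataframe1.csv')
top5 = ["PlayStation", "Ubisoft", "Electronic Arts", "Activision", "Sega"]
df1.loc[~df1.Brand.isin(top5), "Brand"] = "others"

# One total price per brand; plt.pie converts the values to percentages.
totals = df1.groupby("Brand")["price"].sum()
plt.pie(totals, labels=totals.index, autopct="%1.2f%%")
plt.show()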

Related

From Python to Excel - Building an Excel worksheet

With the help of some very kind people on here I finally got a working script to scrape some data. I now want to transfer this data from Python to Excel, in a specific format. I have tried multiple approaches, but did not manage to get the desired result.
My script is the following:
import requests
from bs4 import BeautifulSoup

def analyze(i):
    url = f"https://ktarena.com/fr/207-dofus-world-cup/match/{i}/1"
    page = requests.get(url)
    soup = BeautifulSoup(page.content, "html.parser")
    arena = soup.find("span", attrs=('name')).text
    title = soup.select_one("[class='team'] .name a").text
    point = soup.select(".result .points")[0].text
    image_titles = ', '.join([i['title'] for i in soup.select("[class$='dead'] > img")])
    title_ano = soup.select("[class='team'] .name a")[1].text
    point_ano = soup.select(".result .points")[1].text
    image_titles_ano = ', '.join([i['title'] for i in soup.select("[class='class'] > img")])
    print((title, point, image_titles), (title_ano, point_ano, image_titles_ano), arena)

for i in range(46270, 46394):
    analyze(i)
To summarize, I scrape a couple of things:
Team names (title & title_ano)
Image titles (image_titles & image_titles_ano)
Team points (point & point_ano)
A string of text (arena)
One line of output currently looks like this:
('Thunder', '0 pts', 'roublard, huppermage, ecaflip') ('Tweaps', '60 pts', 'steamer, feca, sacrieur') A10
My goal is to transfer this output to Excel, with one row per team and the arena, team name, image titles, and points in separate columns (the desired layout was shown in a screenshot, omitted here).
Currently I can manage to transfer my data to Excel, but I can't figure out how to format my data this way. Any help would be greatly appreciated :)
First of all, the code that you are using is not actually wholly correct. E.g.:
analyze(46275)
(('Grind', '10 pts', 'roublard, ecaflip'),
('SOLARY', '50 pts', 'enutrof, eniripsa, steamer, eliotrope'), 'A10')
Notice that the first player only has two image titles, and the second one has four. This is incorrect, and happens because your code assumes that img tags with the class ending in "dead" belong to the first player, and the ones that have a class named "class" belong to the second. This happens to be true for your first match (i.e. https://ktarena.com/fr/207-dofus-world-cup/match/46270), but very often it is not true at all. E.g. if I compare my result below with the same method applied to your analyze function, I end up with mismatches in 118 rows out of 248.
Here's a suggested rewrite:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def analyze_new(i):
    # You don't need `/1` at the end of the url
    url = f"https://ktarena.com/fr/207-dofus-world-cup/match/{i}"
    page = requests.get(url)
    soup = BeautifulSoup(page.content, "html.parser")
    arena = soup.find('span', class_='name').get_text()
    # find all teams, and look for info inside each team
    teams = soup.findAll('div', class_='team')
    my_teams = [tuple()] * 2
    for idx, team in enumerate(teams):
        my_teams[idx] = my_teams[idx] + (team.select(".name a")[0].get_text(),)
        my_teams[idx] = my_teams[idx] + (soup.select(".result .points")[idx].get_text(),)
        my_teams[idx] = my_teams[idx] + (', '.join([img['title'] for img in team.findAll('img')[1:]]),)
    # notice, we need `return` instead of `print` to use the data
    return *my_teams, arena

print(analyze_new(46275))
(('Grind', '10 pts', 'roublard, ecaflip, enutrof'),
('SOLARY', '50 pts', 'eniripsa, steamer, eliotrope'), 'A10')
Before writing this data to Excel, I would create a pd.DataFrame, which can then be exported very easily:
# capture info per player in a single row
rows = []
for i in range(46270, 46394):
    one, two, arena = analyze_new(i)
    # adding `i` to rows, as "Match" seems like a useful column to have!
    # but if not, you can delete `i` here below (N.B. do NOT delete the COMMA!)
    # and cut 'Match' twice below
    rows.append(one + (arena, i))
    rows.append(two + (arena, i))

cols = ['Team', 'Points', 'Images', 'Arena', 'Match']
# create df
df = pd.DataFrame(data=rows, columns=cols)

# split up the images strings in `df.Images` and make new columns for them;
# finally, drop the `df.Images` column itself
df = pd.concat([df,
                df.Images.str.split(',', expand=True)
                    .rename(columns={i: f'Image Title {i+1}' for i in range(3)})],
               axis=1).drop('Images', axis=1)

# strip " pts" from the strings in `df.Points` and convert the type to `int`
df['Points'] = df.Points.str.replace(' pts', '').astype(int)

# re-order the columns
df = df.loc[:, ['Match', 'Arena', 'Team', 'Image Title 1', 'Image Title 2',
                'Image Title 3', 'Points']]
print(df.head())
Match Arena Team Image Title 1 Image Title 2 Image Title 3 Points
0 46270 A10 Thunder roublard huppermage ecaflip 0
1 46270 A10 Tweaps steamer feca sacrieur 60
2 46271 A10 Shadow Zoo feca osamodas ouginak 0
3 46271 A10 UndisClosed eniripsa sram pandawa 60
4 46272 A10 Laugh Tale osamodas ecaflip iop 0
# Finally, write the `df` to an Excel file
df.to_excel('fname.xlsx')
Result: (Excel screenshot omitted)
If you dislike the default styles added to the header row and index column, you can write it away like so:
df.T.reset_index().T.to_excel('test.xlsx', index=False, header=False)
Result: (Excel screenshot omitted)
Incidentally, I assume you have a particular reason for wanting the function to return the relevant data as *my_teams,arena. If not, it would be better to let the function itself do most of the heavy lifting. E.g. we could write something like this, and return a df directly.
def analyze_dict(i):
    url = f"https://ktarena.com/fr/207-dofus-world-cup/match/{i}"
    page = requests.get(url)
    soup = BeautifulSoup(page.content, "html.parser")
    d = {'Match': [i] * 2,
         'Arena': [soup.find('span', class_='name').get_text()] * 2,
         'Team': [],
         'Image Title 1': [],
         'Image Title 2': [],
         'Image Title 3': [],
         'Points': [],
         }
    teams = soup.findAll('div', class_='team')
    for idx, team in enumerate(teams):
        d['Team'].append(team.select(".name a")[0].get_text())
        d['Points'].append(int(soup.select(".result .points")[idx].get_text().split(' ')[0]))
        for img_idx, img in enumerate(team.findAll('img')[1:]):
            d[f'Image Title {img_idx+1}'].append(img['title'])
    return pd.DataFrame(d)

print(analyze_dict(46275))
Match Arena Team Image Title 1 Image Title 2 Image Title 3 Points
0 46275 A10 Grind roublard ecaflip enutrof 10
1 46275 A10 SOLARY eniripsa steamer eliotrope 50
Now, we only need to do the following outside the function:
dfs = []
for i in range(46270, 46394):
    dfs.append(analyze_dict(i))
df = pd.concat(dfs, axis=0, ignore_index=True)
print(df.head())
Match Arena Team Image Title 1 Image Title 2 Image Title 3 Points
0 46270 A10 Thunder roublard huppermage ecaflip 0
1 46270 A10 Tweaps steamer feca sacrieur 60
2 46271 A10 Shadow Zoo feca osamodas ouginak 0
3 46271 A10 UndisClosed eniripsa sram pandawa 60
4 46272 A10 Laugh Tale osamodas ecaflip iop 0
With hardly any changes from your post, you can use the openpyxl library to write the output to an Excel file as shown below:
import requests
from openpyxl import Workbook
from bs4 import BeautifulSoup

def analyze(i):
    url = f"https://ktarena.com/fr/207-dofus-world-cup/match/{i}/1"
    page = requests.get(url)
    soup = BeautifulSoup(page.content, "html.parser")
    arena = soup.find("span", attrs=('name')).text
    title = soup.select_one("[class='team'] .name a").text
    point = soup.select(".result .points")[0].text
    image_titles = [i['title'] for i in soup.select("[class='team']:nth-of-type(1) [class^='class'] > img")]
    try:
        image_title_one = image_titles[0]
    except IndexError:
        image_title_one = ""
    try:
        image_title_two = image_titles[1]
    except IndexError:
        image_title_two = ""
    try:
        image_title_three = image_titles[2]
    except IndexError:
        image_title_three = ""
    ws.append([arena, title, image_title_one, image_title_two, image_title_three, point])
    title_ano = soup.select("[class='team'] .name a")[1].text
    point_ano = soup.select(".result .points")[1].text
    image_titles_ano = [i['title'] for i in soup.select("[class='team']:nth-of-type(2) [class^='class'] > img")]
    try:
        image_title_ano_one = image_titles_ano[0]
    except IndexError:
        image_title_ano_one = ""
    try:
        image_title_ano_two = image_titles_ano[1]
    except IndexError:
        image_title_ano_two = ""
    try:
        image_title_ano_three = image_titles_ano[2]
    except IndexError:
        image_title_ano_three = ""
    ws.append([arena, title_ano, image_title_ano_one, image_title_ano_two, image_title_ano_three, point_ano])
    print((title, point, image_titles), (title_ano, point_ano, image_titles_ano), arena)

if __name__ == '__main__':
    wb = Workbook()
    wb.remove(wb['Sheet'])
    ws = wb.create_sheet("result")
    ws.append(['Arena', 'Team', 'Image Title 1', 'Image Title 2', 'Image Title 3', 'Points'])
    for i in range(46270, 46290):
        analyze(i)
    wb.save("output.xlsx")
I've fixed the selectors to grab the right number of image titles.
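As a side note on the repeated try/except blocks: one common alternative (a sketch only, not part of the original answer) is to pad each list to exactly three entries so the unpacking always succeeds:
# Pad to exactly three titles; missing ones become "", extras are dropped.
image_title_one, image_title_two, image_title_three = (image_titles + [""] * 3)[:3]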

Retrieving data from the Air Quality Index (AQI) website through the API and only receiving a small number of stations

I'm working on a personal project and I'm trying to retrieve air quality data from the https://aqicn.org website using their API.
I've used this code, which I've copied and adapted for the city of Bucharest as follows:
import pandas as pd
import folium
import requests

# GET data from AQI website through the API
base_url = "https://api.waqi.info"
path_to_file = "~/path"

# Got token from:- https://aqicn.org/data-platform/token/#/
with open(path_to_file) as f:
    contents = f.readlines()
    key = contents[0]

# (lat, lon) -> bottom left, (lat, lon) -> top right
latlngbox = "44.300264,25.920181,44.566991,26.297836"  # for Bucharest
trail_url = f"/map/bounds/?token={key}&latlng={latlngbox}"
my_data = pd.read_json(base_url + trail_url)  # joined parts of URL
print('columns->', my_data.columns)  # 2 cols: 'status' and 'data' (JSON)

### Build a dataframe from the json file
all_rows = []
for each_row in my_data['data']:
    all_rows.append([each_row['station']['name'],
                     each_row['lat'],
                     each_row['lon'],
                     each_row['aqi']])
df = pd.DataFrame(all_rows, columns=['station_name', 'lat', 'lon', 'aqi'])

# Clean the DataFrame
df['aqi'] = pd.to_numeric(df.aqi, errors='coerce')  # invalid parsing to NaN
# Remove NaN entries in col
df1 = df.dropna(subset=['aqi'])
Unfortunately it only retrieves 4 stations, whereas there are many more available on the actual site. The only limitation I saw in the API documentation was "1,000 (one thousand) requests per second", so why can't I get more of them?
Also, I've tried to modify the lat-long values and managed to get more stations, but they were outside the city I was interested in.
Here is a view of the actual perimeter I've used in the code above (map screenshot omitted).
If you have any suggestions as of how I can solve this issue, I'd be very happy to read your thoughts. Thank you!
Try using waqi through aqicn... it's not exactly a clean API, but I found it to work quite well:
import pandas as pd
import folium
from folium.plugins import HeatMap

url1 = 'https://api.waqi.info'
# Get token from:- https://aqicn.org/data-platform/token/#/
token = 'XXX'
box = '113.805332,22.148942,114.434299,22.561716'  # polygon around Hong Kong via bboxfinder.com
url2 = f'/map/bounds/?latlng={box}&token={token}'
my_data = pd.read_json(url1 + url2)

all_rows = []
for each_row in my_data['data']:
    all_rows.append([each_row['station']['name'], each_row['lat'], each_row['lon'], each_row['aqi']])
df = pd.DataFrame(all_rows, columns=['station_name', 'lat', 'lon', 'aqi'])
From there it's easy to plot:
df['aqi'] = pd.to_numeric(df.aqi, errors='coerce')
print('with NaN->', df.shape)
df1 = df.dropna(subset=['aqi'])
df2 = df1[['lat', 'lon', 'aqi']]

init_loc = [22.396428, 114.109497]
max_aqi = int(df1['aqi'].max())
print('max_aqi->', max_aqi)

m = folium.Map(location=init_loc, zoom_start=5)
heat_aqi = HeatMap(df2, min_opacity=0.1, max_val=max_aqi,
                   radius=60, blur=20, max_zoom=2)
m.add_child(heat_aqi)
m
Or like this:
centre_point = [22.396428, 114.109497]
m2 = folium.Map(location=centre_point, tiles='Stamen Terrain', zoom_start=6)
for idx, row in df1.iterrows():
    lat = row['lat']
    lon = row['lon']
    station = row['station_name'] + ' AQI=' + str(row['aqi'])
    station_aqi = row['aqi']
    if station_aqi > 300:
        pop_color = 'red'
    elif station_aqi > 200:
        pop_color = 'orange'
    else:
        pop_color = 'green'
    folium.Marker(location=[lat, lon],
                  popup=station,
                  icon=folium.Icon(color=pop_color)).add_to(m2)
m2
Checking for stations within Hong Kong returns 19:
df[df['station_name'].str.contains('HongKong')]

Pandas how to search one df for a certain date and return that data

I have two data frames and I am trying to search each row by date in the User.csv file, find the corresponding date in the Raven.csv file, and then return the Price from df1 and the date and amount from df2.
This is working, but my Price comes back as a value like [[0.11465]]. Is there a way to remove these brackets, or a better way to do this?
import pandas as pd

df1 = pd.read_csv('Raven.csv')
df2 = pd.read_csv('User.csv')
df1 = df1.reset_index(drop=False)
df1.columns = ['index', 'Date', 'Price']
df2['Timestamp'] = pd.to_datetime(df2['Timestamp'], format="%Y-%m-%d %H:%M:%S").dt.date
df1['Date'] = pd.to_datetime(df1['Date'], format="%Y-%m-%d").dt.date

Looper = 0
Date = []
Price = []
amount = []
total_value = []
for x in df2['Timestamp']:
    search = df2['Timestamp'].values[Looper]
    Date.append(search)
    price = df1.loc[df1['Date'] == search, ['index']]
    value = df1['Price'].values[price]
    Price.append(value)
    payout = df2['Amount'].values[Looper]
    amount.append(payout)
    payout_value = value * payout
    total_value.append(payout_value)
    Looper = Looper + 1

dict = {'Date': Date, 'Price': Price, 'Payout': amount, "Total Value": total_value}
df = pd.DataFrame(dict)
df.to_csv('out.csv')
You can index into the nested list to get the value:
value = [[0.11465]][0][0]
print(value)
You get:
0.11465
I hope this is what you need.
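As for the "better way" part of the question, here is a hedged sketch of a merge-based approach that avoids both the manual loop and the nested brackets. The column names below follow the code in the question (Date/Price in Raven.csv, Timestamp/Amount in User.csv); treat them as assumptions.
import pandas as pd

df1 = pd.read_csv('Raven.csv')   # assumed columns: Date, Price
df2 = pd.read_csv('User.csv')    # assumed columns: Timestamp, Amount

# Normalize both to plain dates so rows can be matched directly.
df1['Date'] = pd.to_datetime(df1['Date']).dt.date
df2['Date'] = pd.to_datetime(df2['Timestamp']).dt.date

# One row per payout, with the matching price pulled in by the merge;
# the merged Price column holds scalars, so no nested brackets appear.
out = df2.merge(df1[['Date', 'Price']], on='Date', how='left')
out['Total Value'] = out['Price'] * out['Amount']
out[['Date', 'Price', 'Amount', 'Total Value']].to_csv('out.csv', index=False)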

My function doesn't calculate the means correctly for all the rows

I used this script to scrape some data:
import re
import json
import requests
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import datetime
import time
import random

root_url = 'https://www.tripadvisor.ca/Hotel_Review-g186338-d215539-Reviews-or'
urls = ['{root}{i}-OYO_Apollo_Hotel_Bayswater-London_England.html#REVIEWS'.format(root=root_url, i=i) for i in range(5, 440, 5)]

comms = []
notes = []
#datestostay = []
dates = []

for url in urls:
    results = requests.get(url)
    #time.sleep(20)
    soup = BeautifulSoup(results.text, "html.parser")
    commentary = soup.find_all('div', class_='_2wrUUKlw _3hFEdNs8')
    for container in commentary:
        comm = container.find('q', class_='IRsGHoPm').text.strip()
        comms.append(comm)
        comm1 = str(container.find("div", class_="nf9vGX55").find('span'))
        rat = re.findall(r'\d+', str(comm1))
        rat1 = (str(rat))[2]
        notes.append(rat1)
        datereal = container.find("div", class_="_2fxQ4TOx").text
        date = datereal[-9:]
        dates.append(date)

data = pd.DataFrame({
    'comms': comms,
    'notes': notes,
    'dates': dates
})
data['dates'] = pd.to_datetime(data['dates']).dt.date
data['dates'] = pd.to_datetime(data['dates'])
data['dates'] = data.dates.dt.strftime('%Y-%m')
data.to_csv('table4.csv', sep=';', index=False)
I load the data into my notebook: df4 = pd.read_csv('datatrip/table4.csv', sep=';')
Here's what my dataframe looks like right now: (screenshot omitted)
And I calculate some trigrams with these functions:
from sklearn.feature_extraction.text import CountVectorizer
# note: `stop_words` must be defined earlier in the notebook (e.g. a list of stop words)

def get_top_n_gram(corpus, ngram_range, n=None):
    vec = CountVectorizer(ngram_range=ngram_range, stop_words=stop_words).fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]

def process(corpus):
    corpus = pd.DataFrame(corpus, columns=['Text', 'count']).sort_values('count', ascending=False)
    return corpus

trigramlow = get_top_n_gram(df4['comms_clean'], (3, 3), 50)
trigramlow = process(trigramlow)
And here's the result (I only give some rows, not the entire dataframe): (screenshot omitted)
And so, here's my problem. When I use this function:
means = []
for i in range(0, 50):
    trigrambase = df4[df4['comms_clean'].str.contains(trigramlow.Text[i], regex=False, case=False, na=False)]
    mean = round(trigrambase['notes'].mean(), 2)
    means.append(mean)
trigramlow['means'] = means
it gives me this (I only give some rows, not the entire dataframe): (screenshot omitted)
Some of the means are not calculated correctly, and I don't understand why. For example:
df20 = df4[df4['comms_clean'].str.contains('queensway bayswater tube',regex=False, case=False, na=False)]
print(round(df20['notes'].mean(),2))
# 2.0
With the function, I obtained 1.0.
It seems that most of the means are calculated correctly though:
df20 = df4[df4['comms_clean'].str.contains('worst hotel ever',regex=False, case=False, na=False)]
print(round(df20['notes'].mean(),2))
# 1.0
df20 = df4[df4['comms_clean'].str.contains('hotel ever stayed',regex=False, case=False, na=False)]
print(round(df20['notes'].mean(),2))
# 1.11
I cannot figure out where the problem is.
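One hedged guess worth checking: process() sorts the dataframe without resetting its index, so trigramlow.Text[i] selects rows by index label, while trigramlow['means'] = means assigns values by position. If sort_values reorders rows with tied counts, labels and positions stop lining up and a mean can get attached to a neighbouring trigram. A quick way to rule this out:
# Hypothetical check: align index labels with positions after sorting, so that
# label-based lookups (trigramlow.Text[i]) and positional column assignment
# (trigramlow['means'] = means) refer to the same rows.
trigramlow = trigramlow.reset_index(drop=True)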

Python / Pandas Dataframe: Automatically fill in missing rows

My goal is to ultimately create a scatter plot with date on the x-axis and won delegates (for each candidate) on the y-axis. I'm unsure how to "fill in the blanks" when it comes to missing dates; I've attached a picture of the table I get (omitted here).
For example, I'm trying to put March 1 as the date for Alaska, Arkansas, etc. to make it possible to plot the data.
# CREATE DATAFRAME WITH DELEGATE WON/TARGET INFORMATION
import requests
from lxml import html
import pandas

url = "http://projects.fivethirtyeight.com/election-2016/delegate-targets/"
response = requests.get(url)
doc = html.fromstring(response.text)
tables = doc.findall('.//table[@class="delegates desktop"]')
election = tables[0]
election_rows = election.findall('.//tr')

def extractCells(row, isHeader=False):
    if isHeader:
        cells = row.findall('.//th')
    else:
        cells = row.findall('.//td')
    return [val.text_content() for val in cells]

def parse_options_data(table):
    rows = table.findall(".//tr")
    header = extractCells(rows[1], isHeader=True)
    data = [extractCells(row, isHeader=False) for row in rows[2:]]
    trumpdata = "Trump Won Delegates"
    cruzdata = "Cruz Won Delegates"
    kasichdata = "Kasich Won Delegates"
    data = pandas.DataFrame(data, columns=["Date", "State or Territory", "Total Delegates", trumpdata, cruzdata, kasichdata, "Rubio"])
    data.insert(4, "Trump Target Delegates", data[trumpdata].str.extract(r'(\d{0,3}$)'))
    data.insert(6, "Cruz Target Delegates", data[cruzdata].str.extract(r'(\d{0,3}$)'))
    data.insert(8, "Kasich Target Delegates", data[kasichdata].str.extract(r'(\d{0,3}$)'))
    data = data.drop('Rubio', axis=1)
    data[trumpdata] = data[trumpdata].str.extract(r'(^\d{0,3})')
    data[cruzdata] = data[cruzdata].str.extract(r'(^\d{0,3})')
    data[kasichdata] = data[kasichdata].str.extract(r'(^\d{0,3})')
    return data  # the original had `return df`, but `df` is not defined inside this function

election_data = parse_options_data(election)
df = pandas.DataFrame(election_data)
df
You could do:
data.fillna('March 1')
I would advise you to go through the documentation:
http://pandas.pydata.org/pandas-docs/stable/10min.html
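A more general sketch, under the assumption that the blank Date cells should inherit the date of the row above them (as they appear in the rendered table): treat empty strings as missing and forward-fill, rather than hardcoding 'March 1'.
import numpy as np

# Assumption: repeated dates come through as empty strings in the scraped
# table; treat them as missing, then carry the last seen date downward.
df['Date'] = df['Date'].replace('', np.nan).ffill()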
