How to fix Beautiful Soup 'list index out of range' - Python

I want to get specific information from the website. The first four URLs run fine, but when we run the fifth one, we get 'IndexError: list index out of range' at 'company = soup.select('.companyName')[0].get_text().strip()'.
The URL looks like:
https://www.indeed.com/jobs?q=data analyst&l=remote
# Number of postings to scrape
postings = 100
jn = 0
for i in range(0, postings, 10):
    driver.get(url + "&start=" + str(i))
    driver.implicitly_wait(3)
    jobs = driver.find_elements(By.CLASS_NAME, 'job_seen_beacon')
    for job in jobs:
        result_html = job.get_attribute('innerHTML')
        soup = BeautifulSoup(result_html, 'html.parser')
        jn += 1
        liens = job.find_elements(By.TAG_NAME, "a")
        links = liens[0].get_attribute("href")
        title = soup.select('.jobTitle')[0].get_text().strip()
        company = soup.select('.companyName')[0].get_text().strip()
        location = soup.select('.companyLocation')[0].get_text().strip()
        try:
            salary = soup.select('.salary-snippet-container')[0].get_text().strip()
        except:
            salary = 'NaN'
        try:
            rating = soup.select('.ratingNumber')[0].get_text().strip()
        except:
            rating = 'NaN'
        try:
            date = soup.select('.date')[0].get_text().strip()
        except:
            date = 'NaN'
        try:
            description = soup.select('.job-snippet')[0].get_text().strip()
        except:
            description = ''
        dataframe = pd.concat([dataframe, pd.DataFrame([{'Title': title,
                                                         "Company": company,
                                                         'Location': location,
                                                         'Rating': rating,
                                                         'Date': date,
                                                         "Salary": salary,
                                                         "Description": description,
                                                         "Links": links}])], ignore_index=True)
        print("Job number {0:4d} added - {1:s}".format(jn, title))

Generally, it's safer to check that select/find actually returned something before calling .get_text()/.get() on it. When you have to select-and-extract several pieces of information per element, it's more convenient to do it with a helper function driven by a loop.
[This is a simplified version of another function I often use when scraping; if interested, see an example with the full version.]
def extractAttr(tag, sel, attr='', defVal=None):
    s = tag.select_one(sel)
    if s is None:
        return defVal
    if attr == '':
        stxt = s.get_text(' ').strip()
        return stxt if stxt else defVal
    return s.get(attr, defVal)
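For example, on a made-up snippet (the HTML below is just for illustration, not real Indeed markup), the helper returns the text when the selector matches and the default otherwise:
from bs4 import BeautifulSoup

sample = BeautifulSoup('<div><span class="jobTitle">Data Analyst</span></div>', 'html.parser')
print(extractAttr(sample, '.jobTitle', '', '?'))      # 'Data Analyst'
print(extractAttr(sample, '.companyName', '', '?'))   # '?'  (no match, so the default is returned)
print(extractAttr(sample, 'a[href]', 'href', None))   # None (no link in the snippet)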
Then you just need to create a reference list with selectors for all the information you need:
selRef = [  # key, selector, attribute, default
    ('Title', '.jobTitle', '', '?'),
    ('Company', '.companyName', '', '?'),
    ('Location', '.companyLocation', '', '?'),
    ('Rating', '.ratingNumber', '', 'NaN'),
    ('Date', '.date', '', 'NaN'),
    ('Salary', '.salary-snippet-container', '', 'NaN'),
    ('Description', '.job-snippet', '', ''),
    ('Links', 'a[href]', 'href', None)
]  # be careful to have exactly 4 items in each tuple
and then you can simplify your loop to:
for job in jobs:
    result_html = job.get_attribute('innerHTML')
    soup = BeautifulSoup(result_html, 'html.parser')
    jn += 1
    jDict = {k: extractAttr(soup, s, a, d) for k, s, a, d in selRef}
    dataframe = pd.concat([dataframe, pd.DataFrame([jDict])], ignore_index=True)
    print("Job number {0:4d} added - {1:s}".format(jn, jDict['Title']))


Python function returning None when trying to display dataframe

I am trying to get a URL from a BBC recipe, extract the information, and put it into a dataframe. When I run the function I made, the result I get is 'None', and I am unsure why, because it worked before I tried to organise the code into functions.
columns_name = ['title', 'total_time', 'image', 'ingredients', 'rating_val',
                'rating_count',
                'category', 'cuisine', 'diet', 'vegan', 'vegetarian', 'url']

url = 'https://www.bbc.co.uk/food/recipes/avocado_pasta_with_peas_31700'

def print_dataframe(df):
    return df

def insert_df(name, totalTime, image, rating_count, rating_value, Category, Ingredients, diet, vegan, vegetarian, url, df):
    new_row = {'name': name, 'totalTime': totalTime, 'image': image, 'rating_count': rating_count, 'rating_value': rating_value, 'Category': Category, 'Ingredients': Ingredients, 'diet': diet, 'vegan': vegan, 'vegetarian': vegetarian, 'url': url}
    df = df.append(new_row, ignore_index=True)

def collect_page_data(url, columns_name):
    df = pd.DataFrame(columns=columns_name)
    page = requests.get(url)
    page_soup = BeautifulSoup(page.text, 'html.parser')
    res = page_soup.find("script", {"type": "application/ld+json"})
    data = json.loads(res.text)
    name = data['author']['name']
    image = data['image']
    rating_count = data['aggregateRating']['ratingCount']
    rating_value = data['aggregateRating']['ratingValue']
    Category = data['recipeCategory']
    Ingredients = data['recipeIngredient']
    diet = data['suitableForDiet'][1]
    vegan = data['suitableForDiet'][2]
    vegetarian = data['suitableForDiet'][3]
    prepTime = data['prepTime']
    cookTime = data['cookTime']
    l = ['P', 'T', 'M']
    for i in l:
        prepTime = prepTime.replace(i, "")
        cookTime = cookTime.replace(i, "")
    totalTime = int(prepTime) + int(cookTime)
    insert_df(name, totalTime, image, rating_count, rating_value, Category, Ingredients, diet, vegan, vegetarian, url, df)
    print_dataframe(df)

print(collect_page_data(url, columns_name))
You have a problem with two returns.
First:
In insert_df() you use
    df = df.append(...)
which creates a local df inside insert_df() - it doesn't change the external df.
You should instead use return:
    return df.append(...)
and call the function as
    df = insert_df(...)
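To make the point concrete, here is a minimal sketch of the same pitfall (the names are made up; pd.concat is used because DataFrame.append was removed in recent pandas versions):
import pandas as pd

def add_row_without_return(df):
    # rebinding the parameter only changes the local name;
    # the caller's DataFrame is untouched
    df = pd.concat([df, pd.DataFrame([{'a': 1}])], ignore_index=True)

def add_row_with_return(df):
    return pd.concat([df, pd.DataFrame([{'a': 1}])], ignore_index=True)

df = pd.DataFrame({'a': [0]})
add_row_without_return(df)
print(len(df))   # 1 - nothing was added
df = add_row_with_return(df)
print(len(df))   # 2 - the returned frame was assigned back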
Second:
At the end of collect_page_data() you run
    print_dataframe(df)
which takes df and only returns it back - that return value is never used, so nothing gets out of collect_page_data().
At the end of collect_page_data() you should instead run
    return df
And this is the full code.
In my version of BeautifulSoup I had to use res.string instead of res.text to get the text.
import pandas as pd
import requests
from bs4 import BeautifulSoup
import json

# --- functions ---

def insert_df(name, totalTime, image, rating_count, rating_value, Category, Ingredients, diet, vegan, vegetarian, url, df):
    new_row = {'name': name, 'totalTime': totalTime, 'image': image, 'rating_count': rating_count, 'rating_value': rating_value, 'Category': Category, 'Ingredients': Ingredients, 'diet': diet, 'vegan': vegan, 'vegetarian': vegetarian, 'url': url}
    return df.append(new_row, ignore_index=True)

def collect_page_data(url, columns_name):
    df = pd.DataFrame(columns=columns_name)
    page = requests.get(url)
    page_soup = BeautifulSoup(page.text, 'html.parser')
    res = page_soup.find("script", {"type": "application/ld+json"})
    #data = json.loads(res.text)
    data = json.loads(res.string)
    name = data['author']['name']
    image = data['image']
    rating_count = data['aggregateRating']['ratingCount']
    rating_value = data['aggregateRating']['ratingValue']
    Category = data['recipeCategory']
    Ingredients = data['recipeIngredient']
    diet = data['suitableForDiet'][1]
    vegan = data['suitableForDiet'][2]
    vegetarian = data['suitableForDiet'][3]
    prepTime = data['prepTime']
    cookTime = data['cookTime']
    l = ['P', 'T', 'M']
    for i in l:
        prepTime = prepTime.replace(i, "")
        cookTime = cookTime.replace(i, "")
    totalTime = int(prepTime) + int(cookTime)
    df = insert_df(name, totalTime, image, rating_count, rating_value, Category, Ingredients, diet, vegan, vegetarian, url, df)
    return df

# --- main ---

columns_name = [
    'title', 'total_time', 'image', 'ingredients', 'rating_val',
    'rating_count', 'category', 'cuisine', 'diet', 'vegan', 'vegetarian', 'url'
]

url = 'https://www.bbc.co.uk/food/recipes/avocado_pasta_with_peas_31700'

df = collect_page_data(url, columns_name)

print(df.iloc[0])
Result:
(I get only the first row - so I have a Series, which displays the data as a column)
title NaN
total_time NaN
image [https://food-images.files.bbci.co.uk/food/rec...
ingredients NaN
rating_val NaN
rating_count 22
category NaN
cuisine NaN
diet http://schema.org/LowCalorieDiet
vegan http://schema.org/VeganDiet
vegetarian http://schema.org/VegetarianDiet
url https://www.bbc.co.uk/food/recipes/avocado_pas...
Category Main course
Ingredients [375g/13oz pasta, such as penne or fusilli, 1 ...
name Nadiya Hussain
rating_value 4.363636
totalTime 40.0
Name: 0, dtype: object
EDIT:
In my opinion insert_df() is totally unnecessary and you could run its code directly in collect_page_data(). That makes for more readable code.
import pandas as pd
import requests
from bs4 import BeautifulSoup
import json

# --- functions ---

def collect_page_data(url, columns_name):
    # --- scraping ---
    page = requests.get(url)
    page_soup = BeautifulSoup(page.text, 'html.parser')
    res = page_soup.find("script", {"type": "application/ld+json"})
    #data = json.loads(res.text)
    data = json.loads(res.string)
    prep_time = data['prepTime']
    cook_time = data['cookTime']
    for char in ['P', 'T', 'M']:
        prep_time = prep_time.replace(char, "")
        cook_time = cook_time.replace(char, "")
    total_time = int(prep_time) + int(cook_time)
    # --- dataframe ---
    df = pd.DataFrame(columns=columns_name)
    df = df.append({
        'name': data['author']['name'],
        'total_time': total_time,
        'image': data['image'],
        'rating_count': data['aggregateRating']['ratingCount'],
        'rating_value': data['aggregateRating']['ratingValue'],
        'category': data['recipeCategory'],
        'ingredients': data['recipeIngredient'],
        'diet': data['suitableForDiet'][1],
        'vegan': data['suitableForDiet'][2],
        'vegetarian': data['suitableForDiet'][3],
        'url': url
    }, ignore_index=True)
    return df

# --- main ---

columns_name = [
    'title', 'name', 'total_time', 'image',
    'ingredients', 'rating_value', 'rating_count',
    'category', 'cuisine', 'diet', 'vegan', 'vegetarian', 'url'
]

url = 'https://www.bbc.co.uk/food/recipes/avocado_pasta_with_peas_31700'

df = collect_page_data(url, columns_name)

print(df.iloc[0])
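One caveat if you run this today: DataFrame.append was deprecated and removed in pandas 2.0, so on a recent pandas the df.append(...) calls above will fail. The equivalent with pd.concat is a one-line change (a sketch; row_dict stands for the dictionary that was passed to append):
# instead of: df = df.append(row_dict, ignore_index=True)
df = pd.concat([df, pd.DataFrame([row_dict])], ignore_index=True)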

How do I return multiple 'scorers' when scraping for football results using Python?

I'm just a few hours into learning Python, so please go easy on me! I just want to scrape scores and scorers off a website, and I've been able to do that. However, I'm only getting one scorer (if there is one!) - when there are multiple goal scorers I only get the first. I think the problem is in how I look for multiple scorers under '# Home Scorers'.
My code:
from bs4 import BeautifulSoup
import requests
import pandas as pd

url = "https://www.skysports.com/football-results"

match_results = {}
match_details = {}
match_no = 0

response = requests.get(url)
data = response.text
soup = BeautifulSoup(data, 'html.parser')
matches = soup.find_all('div', {'class': 'fixres__item'})

for match in matches:
    try:
        match_url_get = match.find('a', {'class': 'matches__item matches__link'}).get('href')
        match_url = match_url_get if match_url_get else "unknown"
        event_id = match_url[-6:]
        match_response = requests.get(match_url)
        match_data = match_response.text
        match_soup = BeautifulSoup(match_data, 'html.parser')
        # Match Details
        match_date = match_soup.find('time', {'class': 'sdc-site-match-header__detail-time'}).text
        match_location = match_soup.find('span', {'class': 'sdc-site-match-header__detail-venue'}).text
        match_info = match_soup.find('p', {'class': 'sdc-site-match-header__detail-fixture'}).text
        # Home Scores & Team
        home_details = match_soup.find_all('span', {'class': 'sdc-site-match-header__team-name sdc-site-match-header__team-name--home'})
        for home_detail in home_details:
            home_team = home_detail.find('span', {'class': 'sdc-site-match-header__team-name-block-target'}).text
            home_score_get = match_soup.find('span', {'class': 'sdc-site-match-header__team-score-block', 'data-update': 'score-home'})
            home_score = home_score_get.text if home_score_get else "none"
        # Home Scorers
        home_scorer_details = match_soup.find_all('ul', {'class': 'sdc-site-match-header__team-synopsis', 'data-update': 'synopsis-home'})
        for home_scorer_detail in home_scorer_details:
            goal_scorer_get = home_scorer_detail.find('li', {'class': 'sdc-site-match-header__team-synopsis-line'})
            goal_scorer = goal_scorer_get.text if goal_scorer_get else "none"
            goal_score_minute_get = home_scorer_detail.find('span', {'class': 'sdc-site-match-header__event-time'})
            goal_score_minute = goal_score_minute_get.text if goal_score_minute_get else "none"
        # Away Scores & Team
        away_details = match_soup.find_all('span', {'class': 'sdc-site-match-header__team-name sdc-site-match-header__team-name--away'})
        for away_detail in away_details:
            away_team = away_detail.find('span', {'class': 'sdc-site-match-header__team-name-block-target'}).text
            away_score_get = match_soup.find('span', {'class': 'sdc-site-match-header__team-score-block', 'data-update': 'score-away'})
            away_score = away_score_get.text if away_score_get else "none"
        # Away Scorers
        away_scorer_details = match_soup.find_all('ul', {'class': 'sdc-site-match-header__team-synopsis', 'data-update': 'synopsis-away'})
        for away_scorer_detail in away_scorer_details:
            away_goal_scorer_get = away_scorer_detail.find('li', {'class': 'sdc-site-match-header__team-synopsis-line'})
            away_goal_scorer = away_goal_scorer_get.text if away_goal_scorer_get else "none"
            away_goal_score_minute_get = away_scorer_detail.find('span', {'class': 'sdc-site-match-header__event-time'})
            away_goal_score_minute = away_goal_score_minute_get.text if away_goal_score_minute_get else "none"
        print("Match: ", event_id, "Match Date:", match_date, "Match Location:", match_location, "Match Info:", match_info, "\nResult: ", home_team, home_score, away_team, away_score)
        print("Home Scorer:", goal_scorer, "Minute:", goal_score_minute, "\nAway Scorer:", away_goal_scorer, "Minute:", away_goal_score_minute)
        print(match_date)
    except:
        pass
    match_no += 1
    match_results[match_no] = [event_id, home_team, home_score, away_team, away_score, match_url, match_date, match_location, match_info]
    match_details[match_no] = [event_id, goal_scorer, goal_score_minute, away_goal_scorer, away_goal_score_minute]

Period = "2021-22"
print("Total Matches: ", match_no)

match_results = pd.DataFrame.from_dict(match_results, orient='index', columns=['Event_ID:', 'Home Team:', 'Home Score:', 'Away Team:', 'Away Score:', 'Link:', 'Match Date:', 'Match Location:', 'Match Info:'])
match_results.to_csv("Python/FL/Premier League Results (SkySports.com) " + Period + ".csv")

match_details = pd.DataFrame.from_dict(match_details, orient='index', columns=['Event_ID:', 'Home Goal:', 'Home Goal Minute:', 'Away Goal:', 'Away Goal Minute:'])
match_details.to_csv("Python/FL/Premier League Details (SkySports.com) " + Period + ".csv")
So the bit that's not working correctly is:
# Home Scorers
home_scorer_details = match_soup.find_all('ul', {'class': 'sdc-site-match-header__team-synopsis', 'data-update': 'synopsis-home'})
for home_scorer_detail in home_scorer_details:
    goal_scorer_get = home_scorer_detail.find('li', {'class': 'sdc-site-match-header__team-synopsis-line'})
    goal_scorer = goal_scorer_get.text if goal_scorer_get else "none"
    goal_score_minute_get = home_scorer_detail.find('span', {'class': 'sdc-site-match-header__event-time'})
    goal_score_minute = goal_score_minute_get.text if goal_score_minute_get else "none"
Any ideas how I can return multiple rows for that bit?!
Thanks in advance :)
home_scorer_details only has 1 item: the unordered list itself.
To get all the scorers you need to get the <li> items inside that list.
The following code, which is pretty rough, will create a list of dictionaries where each dictionary has the name of the scorer and the minute(s) they scored.
You could use similar code to get all the away scorers.
Like I said, this code is rough and needs refining, but it should give you a start.
# Home Scorers
home_scorer_details = match_soup.find_all('ul', {'class': 'sdc-site-match-header__team-synopsis', 'data-update': 'synopsis-home'})
home_scorers = []
for home_scorer_detail in home_scorer_details[0].find_all('li'):
    goal_scorer = home_scorer_detail.text
    goal_score_minute_get = home_scorer_detail.find('span', {'class': 'sdc-site-match-header__event-time'})
    goal_score_minute = goal_score_minute_get.text if goal_score_minute_get else "none"
    home_scorers.append({'scorer': goal_scorer, 'minute': goal_score_minute})
print(home_scorers)
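For the away side, that "similar code" could look something like this - an untested sketch, assuming the 'synopsis-away' list uses the same markup as the home one:
# Away Scorers (same idea, different data-update attribute)
away_scorer_details = match_soup.find_all('ul', {'class': 'sdc-site-match-header__team-synopsis', 'data-update': 'synopsis-away'})
away_scorers = []
for away_scorer_detail in away_scorer_details[0].find_all('li'):
    away_goal_scorer = away_scorer_detail.text
    away_goal_score_minute_get = away_scorer_detail.find('span', {'class': 'sdc-site-match-header__event-time'})
    away_goal_score_minute = away_goal_score_minute_get.text if away_goal_score_minute_get else "none"
    away_scorers.append({'scorer': away_goal_scorer, 'minute': away_goal_score_minute})
print(away_scorers)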

Returning multiple values in Python and appending them to unique columns in a dataframe

Background:
I have a function that gets a bunch of attributes from a database. Here is the function:
def getData(key, full_name, address, city, state, zipcode):
    try:
        url = 'https://personator.melissadata.net/v3/WEB/ContactVerify/doContactVerify'
        payload = {
            'TransmissionReference': "test",  # used by you to keep track of reference
            'Actions': 'Check',
            # requested output columns (comma-delimited string)
            'Columns': 'Gender,DateOfBirth,DateOfDeath,EthnicCode,EthnicGroup,Education,PoliticalParty,MaritalStatus,HouseholdSize,ChildrenAgeRange,PresenceOfChildren,PresenceOfSenior,LengthOfResidence,OwnRent,CreditCardUser,Occupation,HouseholdIncome',
            'CustomerID': key,  # key
            'Records': [{'FullName': str(full_name), 'AddressLine1': str(address), 'City': str(city), 'State': str(state), 'PostalCode': str(zipcode)}]
        }
        headers = {'Content-Type': 'application/json; charset=utf-8', 'Accept': 'application/json', 'Host': 'personator.melissadata.net', 'Expect': '100-continue', 'Connection': 'Keep-Alive'}
        r = requests.post(url, data=json.dumps(payload), headers=headers)
        dom = json.loads(r.text)
        Gender = dom['Records'][0]['Gender']
        DateOfBirth = dom['Records'][0]['DateOfBirth']
        DateOfDeath = dom['Records'][0]['DateOfDeath']
        EthnicCode = dom['Records'][0]['EthnicCode']
        EthnicGroup = dom['Records'][0]['EthnicGroup']
        Education = dom['Records'][0]['Education']
        PoliticalParty = dom['Records'][0]['PoliticalParty']
        MaritalStatus = dom['Records'][0]['MaritalStatus']
        HouseholdSize = dom['Records'][0]['HouseholdSize']
        ChildrenAgeRange = dom['Records'][0]['ChildrenAgeRange']
        PresenceOfChildren = dom['Records'][0]['PresenceOfChildren']
        PresenceOfSenior = dom['Records'][0]['PresenceOfSenior']
        LengthOfResidence = dom['Records'][0]['LengthOfResidence']
        OwnRent = dom['Records'][0]['OwnRent']
        CreditCardUser = dom['Records'][0]['CreditCardUser']
        Occupation = dom['Records'][0]['Occupation']
        HouseholdIncome = dom['Records'][0]['HouseholdIncome']
        return Gender
    except:
        return None
To make a 'Gender' column I wrap the function in a lambda like so (with axis=1 so that apply passes each row):
df['Gender'] = df.apply(lambda row: getData(key, row['Full Name'], row['Address'], row['City'], row['State'], row['Zipcode']), axis=1)
Objective:
I want to do this process for all the other attributes you see below Gender at the same time. How can I do this in Python?
You can return a dictionary, then expand a series of dictionary objects:
fields = ['Gender', 'DateOfBirth', etc.]

def getData(key, full_name, address, city, state, zipcode):
    try:
        # your code as before
        dom = json.loads(r.text)
        return {k: dom['Records'][0][k] for k in fields}
    # modify below: good practice to specify exactly which error(s) to catch
    except:
        return {}
Then expand your series of dictionaries:
dcts = df.apply(lambda row: getData(key, row['Full Name'], row['Address'], row['City'],
                                    row['State'], row['Zipcode']), axis=1)

df = df.join(pd.DataFrame(dcts.tolist()))
As per @spaniard's comment, if you want all available fields, you can simply use:
return json.loads(r.text)['Records'][0]
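As a variation (not from the original answer), DataFrame.apply can also expand the returned dictionaries into columns directly via result_type='expand'; a sketch, assuming getData returns a dict as above:
expanded = df.apply(lambda row: getData(key, row['Full Name'], row['Address'], row['City'],
                                        row['State'], row['Zipcode']),
                    axis=1, result_type='expand')
df = df.join(expanded)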

BeautifulSoup fill missing information with "NA" in csv

I am working on a web scraper that creates a .csv file of all chemicals on the Sigma-Aldrich website. The .csv file has the chemical name followed by variables such as product number, CAS number, molecular weight and chemical formula, with one chemical plus its info per row.
The issue I'm having is that not all chemicals have all of these fields; many only have product and CAS numbers. This results in my .csv file being offset, with chemical rows picking up info that belongs to another chemical.
To right this wrong, I want to add 'N/A' if a field is empty.
Here is my scraping method:
def scraap(urlLi):
    for url in urlLi:
        content = requests.get(url).content
        soup = BeautifulSoup(content, 'lxml')
        containers = soup.find_all('div', {'class': 'productContainer-inner'})
        for c in containers:
            sub = c.find_all('div', {'class': 'productContainer-inner-content'})
            names = c.find_all('div', {'class': 'searchResultSubstanceBlock clearfix'})
            for n in names:
                hope = n.find("h2").text
                print(hope)
                nombres.append(hope.encode('utf-8'))
            for s in sub:
                info = s.find_all('ul', {'class': 'nonSynonymProperties'})
                proNum = s.find_all('div', {'class': 'product-listing-outer'})
                for p in proNum:
                    ping = p.find_all('div', {'class': 'row clearfix'})
                    for po in ping:
                        pro = p.find_all('li', {'class': 'productNumberValue'})
                        pnPp = []
                        for pri in pro:
                            potus = pri.get_text()
                            pnPp.append(potus.encode('utf-8'))
                        ProductNumber.append(pnPp)
                        print(pnPp)
                for i in info:
                    c = 1
                    for gling in i:
                        print(gling.get_text())
                        if c == 1:
                            formu.append(gling.get_text().encode('utf-8'))
                        elif c == 2:
                            molWei.append(gling.get_text().encode('utf-8'))
                        else:
                            casNum.append(gling.get_text().encode('utf-8'))
                        c += 1
                    c == 1
                print("---")
Here is my writing method:
def pipeUp():
    with open('sigma_pipe_out.csv', mode='wb') as csv_file:
        fieldnames = ['chem_name', 'productNum', 'formula', 'molWei', 'casNum']
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        # writer.writeheader()
        # csv_file.write(' '.join(fieldnames))
        for n, p, f, w, c in zip(nombres, ProductNumber, formu, molWei, casNum):
            # writer.writerow([n, p, f, w, c])
            writer.writerow({'chem_name': n, 'productNum': p, 'formula': f, 'molWei': w, 'casNum': c})
The issue arises in the 'for i in info:' section: the formu, molWei and casNum lists get out of step.
How can I add "N/A" if formu and molWei are missing information?
I'm assuming get_text() returns an empty string if there's no information on the formula, molecular weight, etc. In that case you can just add:
if not molWei:
    molWei = "N/A"
which sets molWei to "N/A" if the string is empty.
You cannot use the index to decide which value you are looking at (if c == 1:); check the string itself before adding it to the list.
Replace:
for i in info:
    ....
    ....
    print("---")
with:
rowNames = ['formu', 'molWei', 'casNum']
for li in info[0].find_all('li'):
    textVal = li.text.encode('utf-8')
    #print(textVal)
    if b'Formula' in textVal:
        formu.append(textVal)
        rowNames.remove('formu')
    elif b'Molecular' in textVal:
        molWei.append(textVal)
        rowNames.remove('molWei')
    else:
        casNum.append(textVal)
        rowNames.remove('casNum')
# add 'NA' for every field that was missing from this chemical
if rowNames:
    for item in rowNames:
        globals()[item].append('NA')
print("---")
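Another way to keep the three lists aligned (a sketch, not tested against the live site; the records list is a made-up name you would define once before looping over the chemicals) is to build one dict per chemical with 'N/A' defaults and only overwrite the fields that are actually present:
# one record per chemical, with 'N/A' defaults
record = {'formula': 'N/A', 'molWeight': 'N/A', 'casNumber': 'N/A'}
for li in info[0].find_all('li'):
    textVal = li.text
    if 'Formula' in textVal:
        record['formula'] = textVal
    elif 'Molecular' in textVal:
        record['molWeight'] = textVal
    else:
        record['casNumber'] = textVal
records.append(record)   # records = [] is defined before the loop over chemicals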

Error while grabbing the table data from a website

I am trying to grab some stock-related data from the web for my project. I encountered a couple of problems.
Problem 1:
I tried to grab the table from this site: http://sharesansar.com/c/today-share-price.html
It worked, but the columns aren't grabbed in order. For example, the column 'Company Name' has the values of 'Open price'. How can I solve this?
Problem 2:
I also tried to grab company-specific data from http://merolagani.com/CompanyDetail.aspx?symbol=ADBL under the 'Price History' tab.
This time I got an error while grabbing the table data. The error I got was:
    self.data[key].append(cols[index].get_text())
IndexError: list index out of range
The code is as shown below:
import logging
import requests
from bs4 import BeautifulSoup
import pandas

module_logger = logging.getLogger('mainApp.dataGrabber')

class DataGrabberTable:
    ''' Grabs the table data from a certain url. '''

    def __init__(self, url, csvfilename, columnName=[], tableclass=None):
        module_logger.info("Inside 'DataGrabberTable' constructor.")
        self.pgurl = url
        self.tableclass = tableclass
        self.csvfile = csvfilename
        self.columnName = columnName
        self.tableattrs = {'class': tableclass}  # to be passed in find()
        module_logger.info("Done.")

    def run(self):
        '''Call this to run the datagrabber. Returns 1 if error occurs.'''
        module_logger.info("Inside 'DataGrabberTable.run()'.")
        try:
            self.rawpgdata = (requests.get(self.pgurl, timeout=5)).text
        except Exception as e:
            module_logger.warning('Error occured: {0}'.format(e))
            return 1
        #module_logger.info('Headers from the server:\n {0}'.format(self.rawpgdata.headers))
        soup = BeautifulSoup(self.rawpgdata, 'lxml')
        module_logger.info('Connected and parsed the data.')
        table = soup.find('table', attrs=self.tableattrs)
        rows = table.find_all('tr')[1:]
        # initializing a dict in the format below
        # data = {'col1' : [...], 'col2' : [...], }
        # col1 and col2 are from the columnName list
        self.data = {}
        self.data = dict(zip(self.columnName, [list() for i in range(len(self.columnName))]))
        module_logger.info('Inside for loop.')
        for row in rows:
            cols = row.find_all('td')
            index = 0
            for key in self.data:
                if index > len(cols): break
                self.data[key].append(cols[index].get_text())
                index += 1
        module_logger.info('Completed the for loop.')
        self.dataframe = pandas.DataFrame(self.data)  # make pandas dataframe
        module_logger.info('writing to file {0}'.format(self.csvfile))
        self.dataframe.to_csv(self.csvfile)
        module_logger.info('written to file {0}'.format(self.csvfile))
        module_logger.info("Done.")
        return 0

    def getData(self):
        """Returns 'data' dictionary."""
        return self.data

# Usage example
def main():
    url = "http://sharesansar.com/c/today-share-price.html"
    classname = "table"
    fname = "data/sharesansardata.csv"
    cols = [str(i) for i in range(18)]  # make a list of columns
    '''cols = [
        'S.No', 'Company Name', 'Symbol', 'Open price', 'Max price',
        'Min price', 'Closing price', 'Volume', 'Previous closing',
        'Turnover', 'Difference',
        'Diff percent', 'Range', 'Range percent', '90 days', '180 days',
        '360 days', '52 weeks high', '52 weeks low']'''
    d = DataGrabberTable(url, fname, cols, classname)
    if d.run() == 1:
        print('Data grabbing failed!')
    else:
        print('Data grabbing done.')

if __name__ == '__main__':
    main()
A few suggestions would help. Thank you!
Your cols list is missing an element - there are 19 columns, not 18:
>>> len([str(i) for i in range(18)])
18
Besides, you seem to overcomplicate things. The following should do:
import requests
from bs4 import BeautifulSoup
import pandas as pd

price_response = requests.get('http://sharesansar.com/c/today-share-price.html')
price_table = BeautifulSoup(price_response.text, 'lxml').find('table', {'class': 'table'})
price_rows = [[cell.text for cell in row.find_all(['th', 'td'])] for row in price_table.find_all('tr')]
price_df = pd.DataFrame(price_rows[1:], columns=price_rows[0])

com_df = None
for symbol in price_df['Symbol']:
    comp_response = requests.get('http://merolagani.com/CompanyDetail.aspx?symbol=%s' % symbol)
    comp_table = BeautifulSoup(comp_response.text, 'lxml').find('table', {'class': 'table'})
    com_header, com_value = list(), list()
    for tbody in comp_table.find_all('tbody'):
        comp_row = tbody.find('tr')
        com_header.append(comp_row.find('th').text.strip().replace('\n', ' ').replace('\r', ' '))
        com_value.append(comp_row.find('td').text.strip().replace('\n', ' ').replace('\r', ' '))
    df = pd.DataFrame([com_value], columns=com_header)
    com_df = df if com_df is None else pd.concat([com_df, df])

print(price_df)
print(com_df)
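As a side note (not part of the original answer), for plain HTML tables pandas.read_html can often handle the row/column alignment for you, provided lxml or html5lib is installed and the site serves static HTML to scripted requests; a minimal sketch:
import pandas as pd

# read_html returns a list of DataFrames, one per <table> element on the page
tables = pd.read_html('http://sharesansar.com/c/today-share-price.html')
price_df = tables[0]
print(price_df.head())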
