Error while grabbing the table data from a website - python

I am trying to grab some stock-related data from the web for my project. I encountered a couple of problems.
Problem 1:
I tried to grab the table from this site: http://sharesansar.com/c/today-share-price.html
It worked, but the columns aren't grabbed in order. For example, the column 'Company Name' ends up holding the values of 'Open price'. How can I solve this?
Problem 2:
I also tried to grab company-specific data from http://merolagani.com/CompanyDetail.aspx?symbol=ADBL under the 'Price History' tab.
This time I got an error while grabbing the table data. The error I got was:
self.data[key].append(cols[index].get_text())
IndexError: list index out of range
The code is as shown below:
import logging
import requests
from bs4 import BeautifulSoup
import pandas

module_logger = logging.getLogger('mainApp.dataGrabber')


class DataGrabberTable:
    ''' Grabs the table data from a certain url. '''

    def __init__(self, url, csvfilename, columnName=[], tableclass=None):
        module_logger.info("Inside 'DataGrabberTable' constructor.")
        self.pgurl = url
        self.tableclass = tableclass
        self.csvfile = csvfilename
        self.columnName = columnName
        self.tableattrs = {'class': tableclass}  # to be passed in find()
        module_logger.info("Done.")

    def run(self):
        '''Call this to run the datagrabber. Returns 1 if error occurs.'''
        module_logger.info("Inside 'DataGrabberTable.run()'.")
        try:
            self.rawpgdata = (requests.get(self.pgurl, timeout=5)).text
        except Exception as e:
            module_logger.warning('Error occured: {0}'.format(e))
            return 1
        #module_logger.info('Headers from the server:\n {0}'.format(self.rawpgdata.headers))
        soup = BeautifulSoup(self.rawpgdata, 'lxml')
        module_logger.info('Connected and parsed the data.')

        table = soup.find('table', attrs=self.tableattrs)
        rows = table.find_all('tr')[1:]

        # initializing a dict in a format below
        # data = {'col1' : [...], 'col2' : [...], }
        # col1 and col2 are from columnName list
        self.data = {}
        self.data = dict(zip(self.columnName, [list() for i in range(len(self.columnName))]))

        module_logger.info('Inside for loop.')
        for row in rows:
            cols = row.find_all('td')
            index = 0
            for key in self.data:
                if index > len(cols): break
                self.data[key].append(cols[index].get_text())
                index += 1
        module_logger.info('Completed the for loop.')

        self.dataframe = pandas.DataFrame(self.data)  # make pandas dataframe
        module_logger.info('writing to file {0}'.format(self.csvfile))
        self.dataframe.to_csv(self.csvfile)
        module_logger.info('written to file {0}'.format(self.csvfile))
        module_logger.info("Done.")
        return 0

    def getData(self):
        """Returns 'data' dictionary."""
        return self.data


# Usage example
def main():
    url = "http://sharesansar.com/c/today-share-price.html"
    classname = "table"
    fname = "data/sharesansardata.csv"
    cols = [str(i) for i in range(18)]  # make a list of columns
    '''cols = [
        'S.No', 'Company Name', 'Symbol', 'Open price', 'Max price',
        'Min price', 'Closing price', 'Volume', 'Previous closing',
        'Turnover', 'Difference',
        'Diff percent', 'Range', 'Range percent', '90 days', '180 days',
        '360 days', '52 weeks high', '52 weeks low']'''
    d = DataGrabberTable(url, fname, cols, classname)
    if d.run() is 1:
        print('Data grabbing failed!')
    else:
        print('Data grabbing done.')


if __name__ == '__main__':
    main()
A few suggestions would help. Thank you!

Your col list is missing an element; there are 19 columns, not 18:
>>> len([str(i) for i in range(18)])
18
Besides, you seem to overcomplicate things. The following should do:
import requests
from bs4 import BeautifulSoup
import pandas as pd

price_response = requests.get('http://sharesansar.com/c/today-share-price.html')
price_table = BeautifulSoup(price_response.text, 'lxml').find('table', {'class': 'table'})
price_rows = [[cell.text for cell in row.find_all(['th', 'td'])] for row in price_table.find_all('tr')]
price_df = pd.DataFrame(price_rows[1:], columns=price_rows[0])

com_df = None
for symbol in price_df['Symbol']:
    comp_response = requests.get('http://merolagani.com/CompanyDetail.aspx?symbol=%s' % symbol)
    comp_table = BeautifulSoup(comp_response.text, 'lxml').find('table', {'class': 'table'})
    com_header, com_value = list(), list()
    for tbody in comp_table.find_all('tbody'):
        comp_row = tbody.find('tr')
        com_header.append(comp_row.find('th').text.strip().replace('\n', ' ').replace('\r', ' '))
        com_value.append(comp_row.find('td').text.strip().replace('\n', ' ').replace('\r', ' '))
    df = pd.DataFrame([com_value], columns=com_header)
    com_df = df if com_df is None else pd.concat([com_df, df])

print(price_df)
print(com_df)
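If you still want CSV files like the original script produced, a small follow-up (the paths below are just placeholders) writes both dataframes out:

price_df.to_csv('data/sharesansardata.csv', index=False)    # placeholder path
com_df.to_csv('data/company_details.csv', index=False)      # placeholder path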

Related

How to fix beautiful soup list index out of range

I want to get specific information from the website. It is okay to run the first four URLs, but when we run the fifth one, we get 'IndexError: list index out of range' at company = soup.select('.companyName')[0].get_text().strip().
We have a URL like
https://www.indeed.com/jobs?q=data analyst&l=remote
## Number of postings to scrape
postings = 100
jn = 0
for i in range(0, postings, 10):
    driver.get(url + "&start=" + str(i))
    driver.implicitly_wait(3)
    jobs = driver.find_elements(By.CLASS_NAME, 'job_seen_beacon')
    for job in jobs:
        result_html = job.get_attribute('innerHTML')
        soup = BeautifulSoup(result_html, 'html.parser')
        jn += 1
        liens = job.find_elements(By.TAG_NAME, "a")
        links = liens[0].get_attribute("href")
        title = soup.select('.jobTitle')[0].get_text().strip()
        company = soup.select('.companyName')[0].get_text().strip()
        location = soup.select('.companyLocation')[0].get_text().strip()
        try:
            salary = soup.select('.salary-snippet-container')[0].get_text().strip()
        except:
            salary = 'NaN'
        try:
            rating = soup.select('.ratingNumber')[0].get_text().strip()
        except:
            rating = 'NaN'
        try:
            date = soup.select('.date')[0].get_text().strip()
        except:
            date = 'NaN'
        try:
            description = soup.select('.job-snippet')[0].get_text().strip()
        except:
            description = ''
        dataframe = pd.concat([dataframe, pd.DataFrame([{'Title': title,
                                                         "Company": company,
                                                         'Location': location,
                                                         'Rating': rating,
                                                         'Date': date,
                                                         "Salary": salary,
                                                         "Description": description,
                                                         "Links": links}])], ignore_index=True)
        print("Job number {0:4d} added - {1:s}".format(jn, title))
Generally, it's safer to check that select/find returns something before applying .get_text() or .get(). When you have to select-and-get from multiple elements, it's more convenient to use a function in a loop.
[This is a simplified version of another function I often use when scraping; if interested, see an example with the full version.]
def extractAttr(tag, sel, attr='', defVal=None):
    s = tag.select_one(sel)
    if s is None: return defVal
    if attr == '':
        stxt = s.get_text(' ').strip()
        return stxt if stxt else defVal
    return s.get(attr, defVal)
Then you just need to create a reference list with selectors for all the information you need:
selRef = [  # key, selector, attribute, default
    ('Title', '.jobTitle', '', '?'),
    ('Company', '.companyName', '', '?'),
    ('Location', '.companyLocation', '', '?'),
    ('Rating', '.ratingNumber', '', 'NaN'),
    ('Date', '.date', '', 'NaN'),
    ('Salary', '.salary-snippet-container', '', 'NaN'),
    ('Description', '.job-snippet', '', ''),
    ('Links', 'a[href]', 'href', None)
]  # be careful to have exactly 4 items in each tuple
And you can simplify your loop to:
for job in jobs:
    result_html = job.get_attribute('innerHTML')
    soup = BeautifulSoup(result_html, 'html.parser')
    jn += 1
    jDict = {k: extractAttr(soup, s, a, d) for k, s, a, d in selRef}
    dataframe = pd.concat([dataframe, pd.DataFrame([jDict])])
    print("Job number {0:4d} added - {1:s}".format(jn, jDict['Title']))

Within a dataframe, how can we write one row at a time and one column at a time?

I'm looping through records in a dataframe column and trying to pull geocode data for each. Here's the code that I'm testing.
from geopy.geocoders import Nominatim

geolocator = Nominatim(user_agent="http")

for item in df_fin['market_address']:
    try:
        location = geolocator.geocode(item)
        df_fin.loc['address'] = location.address
        df_fin.loc['latitude'] = location.latitude
        df_fin.loc['longitude'] = location.longitude
        df_fin.loc['raw'] = location.raw
        print(location.raw)
    except:
        df_fin.loc['raw'] = 'no info for: ' + item
        print('no info for: ' + item)
I must be missing something simple, but I'm just not seeing what the issue is here.
UPDATE:
from geopy.geocoders import Nominatim

geolocator = Nominatim(user_agent="http")

for index, row in df_fin.market_address.iterrows():
    try:
        location = geolocator.geocode(row)
        row['address'] = location.address
        row['latitude'] = location.latitude
        row['longitude'] = location.longitude
        row['raw'] = location.raw
        print(location.raw)
    except:
        row['raw'] = 'no info for: ' + row
        print('no info for: ' + row)

df_fin.tail(10)
You can reference the code below; it looks up each row's market_address and writes the results back with df_fin.at so the values actually persist in the dataframe:
from geopy.geocoders import Nominatim

geolocator = Nominatim(user_agent="http")

for index, row in df_fin.iterrows():
    try:
        location = geolocator.geocode(row['market_address'])
        # use .at so the assignment lands in df_fin itself
        # (mutating `row` only changes a temporary copy)
        df_fin.at[index, 'address'] = location.address
        df_fin.at[index, 'latitude'] = location.latitude
        df_fin.at[index, 'longitude'] = location.longitude
        df_fin.at[index, 'raw'] = location.raw
        print(location.raw)
    except:
        df_fin.at[index, 'raw'] = 'no info for: ' + row['market_address']
        print('no info for: ' + row['market_address'])
And if you are more familiar with Pandas, you can use @DYZ's answer.
You should define a function that converts market_address into the address, lat, and long, and .apply that function to the DataFrame.
def locate(market_address):
    loc = geolocator.geocode(market_address)
    return pd.Series({'address': loc.address if loc else np.nan,
                      'latitude': loc.latitude if loc else np.nan,
                      'longitude': loc.longitude if loc else np.nan,
                      'raw': loc.raw if loc else np.nan})

df_fin.join(df_fin['market_address'].apply(locate))
Note that loc.raw is a dictionary. When you store a dictionary in a DataFrame, you are looking for trouble in the future.
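If you do want to keep the raw geocoder output, one option (a sketch, assuming every 'raw' cell holds a dict and no lookup failed) is to expand it into flat columns with pd.json_normalize instead of storing dicts in cells:

import pandas as pd

# expand each raw dict into scalar columns such as 'lat', 'lon', 'display_name'
raw_expanded = pd.json_normalize(df_fin['raw'].tolist()).add_prefix('raw_')
raw_expanded.index = df_fin.index   # keep the original row alignment
df_fin = df_fin.drop(columns='raw').join(raw_expanded)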

How to filter an API Search result in Python?

I am using the Edamam recipe API and have been trying to filter the response by only saving recipes whose calories are under the max input by the user. I keep getting an error. This is the code:
import requests
import pandas as pd


def recipe_search(ingredient):
    app_id = ''
    app_key = ''
    result = requests.get('https://api.edamam.com/search?q={}&app_id={}&app_key={}'.format(ingredient, app_id, app_key))
    data = result.json()
    return data['hits']


def run():
    ingredient = input('Enter an ingredient: ')
    max_no_of_calories = float(input('Enter the max amount of calories desired in recipe: '))
    data_label = []
    data_uri = []
    data_calories = []
    results = recipe_search(ingredient)
    for result in results:
        recipe = result['recipe']
        result['calories'] < max_no_of_calories
        data_label.append(recipe['label'])
        data_uri.append(recipe['uri'])
        data_calories.append(recipe['calories'])
    data = {'Label': data_label,
            'URL': data_uri,
            'No of Calories': data_calories
            }
    df = pd.DataFrame(data, columns=['Label', 'URL'])
    df.to_csv(r'C:\Users\name\Documents/cfg-python/export_dataframe.csv',
              index=False, header=True)


run()

df2 = pd.read_csv(r'C:\Users\name\Documents/cfg-python/export_dataframe.csv')
sorted_df = df2.sort_values(by=["calories"], ascending=True)
sorted_df.to_csv(r'C:\Users\name\Documents/cfg-python/export_dataframe.csv', index=False)
This is the error:
if result['calories'] < max_no_of_calories:
KeyError: 'calories'
Is anyone able to help? How could I rewrite this code so it keeps only recipes under max_no_of_calories? max_no_of_calories is input by the user.
You made a typo in your for loop:
for result in results:
    recipe = result['recipe']
    if recipe["calories"] < max_no_of_calories:
        print(recipe["calories"])
This will get rid of your KeyError.
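To answer the rewrite part of the question, a sketch of the loop with the filter applied (variable names taken from the question) could look like the following; note that the later sort_values(by=["calories"]) only works if the calorie values are actually written to the CSV under that column name:

for result in results:
    recipe = result['recipe']
    # keep only recipes under the user's calorie limit
    if recipe['calories'] < max_no_of_calories:
        data_label.append(recipe['label'])
        data_uri.append(recipe['uri'])
        data_calories.append(recipe['calories'])

data = {'Label': data_label,
        'URL': data_uri,
        'No of Calories': data_calories}
# include all three columns so the calorie values end up in the CSV
df = pd.DataFrame(data, columns=['Label', 'URL', 'No of Calories'])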

Python 3.7 KeyError

I would like to retrieve information from NewsApi and ran into an issue. Here is the code:
from NewsApi import NewsApi
import pandas as pd
import os
import datetime as dt
from datetime import date


def CreateDF(JsonArray, columns):
    dfData = pd.DataFrame()
    for item in JsonArray:
        itemStruct = {}
        for cunColumn in columns:
            itemStruct[cunColumn] = item[cunColumn]
        # dfData = dfData.append(itemStruct, ignore_index=True)
        # dfData = dfData.append({'id': item['id'], 'name': item['name'], 'description': item['description']},
        #                        ignore_index=True)
        # return dfData
    return itemStruct


def main():
    # access_token_NewsAPI.txt must contain your personal access token
    with open("access_token_NewsAPI.txt", "r") as f:
        myKey = f.read()[:-1]

    # myKey = 'a847cee6cc254d8495632f83d5c77d39'
    api = NewsApi(myKey)

    # get sources of news
    # columns = ['id', 'name', 'description']
    # rst_source = api.GetSources()
    # df = CreateDF(rst_source['sources'], columns)
    # df.to_csv('source_list.csv')
    #
    #
    # # get news for specific country
    # rst_country = api.GetHeadlines()
    # columns = ['author', 'publishedAt', 'title', 'description', 'content', 'url']
    # df = CreateDF(rst_country['articles'], columns)
    # df.to_csv('Headlines_country.csv')

    # get news for specific symbol
    symbol = "coronavirus"
    sources = 'bbc.co.uk'
    columns = ['author', 'publishedAt', 'title', 'description', 'content', 'source']
    limit = 500  # maximum requests per day
    i = 1
    startDate = dt.datetime(2020, 3, 1, 8)
    # startDate = dt.datetime(2020, 3, 1)
    df = pd.DataFrame({'author': [], 'publishedAt': [], 'title': [], 'description': [], 'content': [], 'source': []})

    while i < limit:
        endDate = startDate + dt.timedelta(hours=2)
        rst_symbol = api.GetEverything(symbol, 'en', startDate, endDate, sources)
        rst = CreateDF(rst_symbol['articles'], columns)
        df = df.append(rst, ignore_index=True)
        # DF.join(df.set_index('publishedAt'), on='publishedAt')
        startDate = endDate
        i += 1

    df.to_csv('Headlines_symbol.csv')


main()
I got the following error:
rst = CreateDF(rst_symbol['articles'], columns)
KeyError: 'articles'
In this line:
rst = CreateDF(rst_symbol['articles'], columns)
I think there is some problem regarding the key not being found or defined - does anyone have an idea how to fix that? I'm thankful for every hint!
MAiniak
EDIT:
I found the solution after I tried a few of your hints. Apparently, the error occurred when the NewsAPI API key ran into a request limit. This happened every time, until I changed the limit = 500 to limit = 20. For some reason, there is no error with a new API Key and reduced limit.
Thanks for your help guys!
Probably 'articles' is not one of the keys in your rst_symbol object.
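One way to confirm this (a sketch, reusing the names from the question) is to guard the call inside the while loop and print the response when 'articles' is missing; a rejected request, for example one that hits the daily rate limit, comes back with a status/message payload instead of articles:

rst_symbol = api.GetEverything(symbol, 'en', startDate, endDate, sources)

# the error payload has no 'articles' key, so stop and show what came back
if 'articles' not in rst_symbol:
    print('No articles in response: {}'.format(rst_symbol))
    break

rst = CreateDF(rst_symbol['articles'], columns)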
The Python documentation [2][3] doesn't mention any method named NewsApi() or GetEverything(), but rather NewsApiClient() and get_everything(), i.e.:
from newsapi import NewsApiClient

# Init
newsapi = NewsApiClient(api_key='xxx')

# /v2/top-headlines
top_headlines = newsapi.get_top_headlines(q='bitcoin',
                                          sources='bbc-news,the-verge',
                                          category='business',
                                          language='en',
                                          country='us')

# /v2/everything
all_articles = newsapi.get_everything(q='bitcoin',
                                      sources='bbc-news,the-verge',
                                      domains='bbc.co.uk,techcrunch.com',
                                      from_param='2017-12-01',
                                      to='2017-12-12',
                                      language='en',
                                      sort_by='relevancy',
                                      page=2)

# /v2/sources
sources = newsapi.get_sources()

Web scraping data from a JSON source, why do I get only 1 row?

I'm trying to get some information from a website with Python, from a webshop.
I tried this one:
def proba():
    my_url = requests.get('https://www.telekom.hu/shop/categoryresults/?N=10994&contractType=list_price&instock_products=1&Ns=sku.sortingPrice%7C0%7C%7Cproduct.displayName%7C0&No=0&Nrpp=9&paymentType=FULL')
    data = my_url.json()
    results = []
    products = data['MainContent'][0]['contents'][0]['productList']['products']
    for product in products:
        name = product['productModel']['displayName']
        try:
            priceGross = product['priceInfo']['priceItemSale']['gross']
        except:
            priceGross = product['priceInfo']['priceItemToBase']['gross']
        url = product['productModel']['url']
        results.append([name, priceGross, url])
    df = pd.DataFrame(results, columns=['Name', 'Price', 'Url'])
    # print(df)  ## print df
    df.to_csv(r'/usr/src/Python-2.7.13/test.csv', sep=',', encoding='utf-8-sig', index=False)


while True:
    mytime = datetime.now().strftime("%H:%M:%S")
    while mytime < "23:59:59":
        print mytime
        proba()
        mytime = datetime.now().strftime("%H:%M:%S")
In this webshop there are 9 items, but I see only 1 row in the CSV file.
Not entirely sure what you intend as the end result. Do you want to update an existing file, or gather the data and write it all out in one go? An example of the latter is shown below, where I add each new dataframe to an overall dataframe and use a return statement so each function call provides a new dataframe.
import requests
from datetime import datetime
import pandas as pd


def proba():
    my_url = requests.get('https://www.telekom.hu/shop/categoryresults/?N=10994&contractType=list_price&instock_products=1&Ns=sku.sortingPrice%7C0%7C%7Cproduct.displayName%7C0&No=0&Nrpp=9&paymentType=FULL')
    data = my_url.json()
    results = []
    products = data['MainContent'][0]['contents'][0]['productList']['products']
    for product in products:
        name = product['productModel']['displayName']
        try:
            priceGross = product['priceInfo']['priceItemSale']['gross']
        except:
            priceGross = product['priceInfo']['priceItemToBase']['gross']
        url = product['productModel']['url']
        results.append([name, priceGross, url])
    df = pd.DataFrame(results, columns=['Name', 'Price', 'Url'])
    return df


headers = ['Name', 'Price', 'Url']
df = pd.DataFrame(columns=headers)

while True:
    mytime = datetime.now().strftime("%H:%M:%S")
    while mytime < "23:59:59":
        print(mytime)
        dfCurrent = proba()
        mytime = datetime.now().strftime("%H:%M:%S")
        df = pd.concat([df, dfCurrent])
        df.to_csv(r"C:\Users\User\Desktop\test.csv", encoding='utf-8')
