Web scrape to obtain table data from GuruFocus site - Python

I want to scrape specific data from the GuruFocus website.
https://www.gurufocus.com/stock/AAHTF/summary?search=AAPICO
Currently I am fetching only the number value. For example, the Financial Strength value is "4" out of 10. Now I want to fetch the sub-component data as well.
Code to fetch only the number value:
for name in names:
    start_time = time.time()
    # getting the symbol
    URL = f'https://www.gurufocus.com/search?s={name}'
    driver = webdriver.Chrome(ChromeDriverManager().install())
    driver.get(URL)
    html_source = driver.page_source
    driver.close()
    soup = BeautifulSoup(html_source, 'html.parser')
    headers = soup.find_all("span")
    # saving only the first link
    for i, head in enumerate(headers):
        try:
            h = head.find("a").get("href")
            link = "https://www.gurufocus.com" + h
            break
        except:
            pass
    try:
        # loading the link page
        driver = webdriver.Chrome(ChromeDriverManager().install())
        driver.get(link)
        html_source = driver.page_source
        driver.close()
        soup = BeautifulSoup(html_source, 'html.parser')
        headers = soup.find_all("span", class_="t-default bold")
        ratings = []
        for head in headers:
            ratings.append(int(head.get_text()))
        if len(ratings) == 0:
            continue
        ratings_dict = {"Financial Strength": ratings[0],
                        "Growth Rank": ratings[1],
                        "Momentum Rank": ratings[2],
                        "Profitability Rank": ratings[3],
                        "GF Value Rank": ratings[4],
                        }
        print(ratings_dict)
        # ratings_dict = json.loads(ratings_dict)
        with open(f"output/gurufocus/{name}.json", 'w') as f:
            json.dump(str(ratings_dict), f)
        end_time = time.time()
        print("time taken for %s is: %.2f" % (name, (end_time - start_time)))
    except:
        print("no data found")
Output:
"{'Financial Strength': 6, 'Growth Rank': 4, 'Momentum Rank': 4, 'Profitability Rank': 7, 'GF Value Rank': 5}"
Expectation:
I want to fetch the full table data (shown in the image below) along with the rank into a data frame.
How do I need to change my code to obtain the other specific data?

You can use Pandas to write a clean solution for this problem:
import pandas as pd
import requests
import json
from bs4 import BeautifulSoup
from collections import ChainMap

tables = pd.read_html(
    requests.get(
        'https://www.gurufocus.com/stock/AAHTF/summary?search=AAPICO'
    ).text,
    header=0
)

sub_table_values = [[{record["Name"]: record["Current"]} for record in json.loads(e)]
                    for e in [i.to_json(orient="records") for i in tables]]
sub_formatted = [dict(ChainMap(*a)) for a in sub_table_values]
print(json.dumps(sub_formatted, indent=4))
Description:
First, I obtain all the tables and convert them to DataFrames (using pandas).
Then, I convert the DataFrames to JSON and extract only the desired fields (Name and Current).
Finally, I format the data.
It would return:
[
    {
        "WACC vs ROIC": null,
        "Beneish M-Score": "-2.28",
        "Altman Z-Score": "1.98",
        "Piotroski F-Score": "7/9",
        "Interest Coverage": "4.68",
        "Debt-to-EBITDA": "2.55",
        "Debt-to-Equity": "0.85",
        "Equity-to-Asset": "0.37",
        "Cash-To-Debt": "0.1"
    },
    {
        "Future 3-5Y Total Revenue Growth Rate": 13.71,
        "3-Year Book Growth Rate": 2.8,
        "3-Year FCF Growth Rate": 49.9,
        "3-Year EPS without NRI Growth Rate": -5.2,
        "3-Year EBITDA Growth Rate": 9.0,
        "3-Year Revenue Growth Rate": 9.6
    }...
]
However, this solution works because the page is structured with tables. For complex/irregular websites I prefer to use Scrapy, as we do at my job.
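Since the expectation was to get the full table data along with the ranks into a data frame, a minimal sketch of that last step is shown below. It only assumes what the output above already shows, namely that the summary page exposes HTML tables with "Name" and "Current" columns; the rank values from the original Selenium script could then be merged in as extra columns.

import pandas as pd
import requests

# Sketch: stack all sub-component rows from the summary page into one DataFrame.
# Assumes the page still serves its data as HTML tables with Name/Current columns.
url = 'https://www.gurufocus.com/stock/AAHTF/summary?search=AAPICO'
tables = pd.read_html(requests.get(url).text, header=0)

# Keep only tables that actually have the expected columns, then concatenate them.
frames = [t[["Name", "Current"]] for t in tables if {"Name", "Current"} <= set(t.columns)]
df = pd.concat(frames, ignore_index=True)
print(df)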

Related

Web scraping content of ::before using BeautifulSoup?

I am quite new to Python and have tried scraping some websites. A few of them worked well, but I have now stumbled upon one that is giving me a hard time. The URL I'm using is: https://www.drankdozijn.nl/groep/rum. I'm trying to get all product titles and URLs from this page, but since there is a ::before in the HTML code I am unable to scrape it. Any help would be very appreciated! This is the code I have so far:
try:
    source = requests.get(url)
    source.raise_for_status()
    soup = BeautifulSoup(source.text, 'html.parser')
    wachttijd = random.randint(2, 4)
    print("Succes! URL:", url, "Wachttijd is:", wachttijd, "seconden")
    productlist = soup.find('div', {'id': 'app'})
    for productinfo in productlist:
        productnaam = getTextFromHTMLItem(productinfo.find('h3', {'class': 'card-title lvl3'}))
        product_url = getHREFFromHTMLItem(productinfo.find('a', {'class': 'ptile-v2_link'}))
        # print info
        print(productnaam)
        # place the information in a sheet row
        print("Sheet append")
        sheet.append([productnaam])
    # time.sleep(1)
    time.sleep(wachttijd)
    print("Sheet opslaan")
    excel.save('C:/Python/Files/RumUrlsDrankdozijn.xlsx')
    return soup
except Exception as e:
    print(e)
The product details for that site are returned via a different URL using JSON. The HTML returned does not contain this. This could easily be accessed as follows:
import requests
import openpyxl

url = "https://es-api.drankdozijn.nl/products"
params = {
    "country": "NL",
    "language": "nl",
    "page_template": "groep",
    "group": "rum",
    "page": "1",
    "listLength": "20",
    "clientFilters": "{}",
    "response": "paginated",
    "sorteerOp": "relevance",
    "ascdesc": "asc",
    "onlyAvail": "false",
    "cacheKey": "1",
    "premiumMember": "N",
}

wb = openpyxl.Workbook()
ws = wb.active
ws.append(['Description', 'Price', 'URL', "Land", "AlcoholPercentage"])

for page in range(1, 11):
    params['page'] = page
    req = requests.get(url, params=params)
    req.raise_for_status()
    data = req.json()  # the product details come back as JSON, so no HTML parsing is needed
    for product in data['data']:
        land = "unknown"
        alcoholpercentage = "unknown"
        features = {feature["alias"]: feature["value"]["description"] for feature in product['features']}
        ws.append([
            product["description"],
            product["pricePerLiterFormatted"],
            product["structuredData"]["offers"]["url"],
            features.get("land", land),
            features.get("alcoholpercentage", alcoholpercentage)
        ])

wb.save('output.xlsx')
This gets the first 10 pages of details.
I recommend you print(data) to have a look at all of the information that is available.
The URL was found using the browser's network tools to watch the request it made whilst loading the page. An alternative approach would be to use something like Selenium to fully render the HTML, but this will be slower and more resource intensive.
openpyxl is used to create an output spreadsheet. You could modify the column widths and appearance if needed for the Excel output.
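If you do take the Selenium route mentioned above, a minimal sketch might look like the following. It is only an illustration under the assumption that the a.ptile-v2_link selector from the question matches the rendered product tiles; the selectors would need to be verified against the live page.

from selenium import webdriver
from selenium.webdriver.common.by import By

# Sketch: fully render the page in a real browser, then read product links from the DOM.
driver = webdriver.Chrome()
driver.get("https://www.drankdozijn.nl/groep/rum")
for tile in driver.find_elements(By.CSS_SELECTOR, "a.ptile-v2_link"):
    print(tile.get_attribute("href"))  # product URL; the title could be read similarly
driver.quit()

This is noticeably slower than the JSON endpoint above, which is why the API approach is preferred here.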

Web Scraping with Python BS

Trying to scrape some weather data off of Weather Underground. I haven't had any difficulty getting the data of interest until I came to getting the day/date, hi/lo temps, and forecast (i.e. "Partly Cloudy"). Each is in a div without a class. The parent of each is a div with class="obs-date" (see image below).
[WxUn HTML image][1]
Attempted code below with other options commented out. Each returns an empty list.
def get_wx(city, state):
    city = city.lower()
    state = state.lower()
    # get current conditions; 'weather' in url
    current_dict = get_current(city, state)
    # get forecast; 'forecast' in url
    f_url = f'https://www.wunderground.com/forecast/us/{state}/{city}'
    f_response = req.get(f_url)
    f_soup = BeautifulSoup(f_response.text, 'html.parser')
    cast_dates = f_soup.find_all('div', class_="obs-date")
    # cast_dates = f_soup.find_all('div', attrs={"class":"obs-date"})
    # cast_dates = f_soup.select('div.obs-date')
    print(cast_dates)

get_wx("Portland", "ME")
Any help with what I'm missing is appreciated.
As far as I can see, the whole block you're trying to parse is driven by JavaScript; that's why you're getting empty results using BeautifulSoup.
The ADDITIONAL CONDITIONS part, as well as everything below it, could be parsed completely using bs4. The table at the end could be parsed using pandas (see the sketch after the output below).
To scrape JavaScript content, you can use the requests-html or selenium libraries.
from requests_html import HTMLSession
import json

session = HTMLSession()
url = "https://www.wunderground.com/weather/us/me/portland"
response = session.get(url)
response.html.render(sleep=1)

data = []

current_date = response.html.find('.timestamp strong', first=True).text
weather_conditions = response.html.find('.condition-icon p', first=True).text
gusts = response.html.find('.medium-uncentered span', first=True).text
current_temp = response.html.find('.current-temp .is-degree-visible', first=True).text

data.append({
    "Last update": current_date,
    "Current weather": weather_conditions,
    "Temperature": current_temp,
    "Gusts": gusts,
})

print(json.dumps(data, indent=2, ensure_ascii=False))
Output:
[
    {
        "Last update": "1:27 PM EDT on April 14, 2021",
        "Current weather": "Fair",
        "Temperature": "49 F",
        "Gusts": "13 mph"
    }
]
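As for the table at the end of the page, a minimal sketch of the pandas approach is below. It assumes the rendered page actually contains at least one HTML table and that the forecast table is the first one; both assumptions would need checking against the live page.

import pandas as pd
from requests_html import HTMLSession

# Sketch: render the JavaScript-driven page, then let pandas parse any <table> elements.
session = HTMLSession()
response = session.get("https://www.wunderground.com/weather/us/me/portland")
response.html.render(sleep=1)

tables = pd.read_html(response.html.html)  # one DataFrame per <table> in the rendered HTML
print(tables[0])  # assumption: the forecast table is the first table on the page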

Scraping through all pages

I am trying to scrape this website: voxnews.info
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
import pandas as pd

web = 'https://voxnews.info'

def main(req, num, web):
    r = req.get(web+"/page/{}/".format(num))
    soup = BeautifulSoup(r.content, 'html.parser')
    goal = [(x.time.text, x.h1.a.get_text(strip=True), x.select_one("span.cat-links").get_text(strip=True), x.p.get_text(strip=True))
            for x in soup.select("div.site-content")]
    return goal

with ThreadPoolExecutor(max_workers=30) as executor:
    with requests.Session() as req:
        fs = [executor.submit(main, req, num, web) for num in range(1, 2)]  # need to scrape all the webpages in the website
        allin = []
        for f in fs:
            allin.extend(f.result())
        df = pd.DataFrame.from_records(
            allin, columns=["Date", "Title", "Category", "Content"])
        print(df)
But the code has two problems:
the first one is that I am not scraping all the pages (I currently put 1 and 2 in the range, but I would need all the pages);
it does not save the dates correctly.
If you could have a look at the code and tell me how to improve it in order to fix these two issues, it would be awesome.
Some minor changes.
First, it isn't necessary to use requests.Session() for single requests - you aren't trying to persist data between requests.
A minor change to how you had your with statement; I don't know if it's more correct or just how I do it, but you don't need all of the code to run with the executor still open.
I gave you two options for parsing the date, either as it's written on the site, a string in Italian, or as a datetime object.
I didn't see any "p" tag within the articles, so I removed that part. It seems that in order to get the "content" of the articles, you would have to actually navigate to and scrape them individually (see the sketch after the code below); I removed that line from the code.
In your original code, you weren't getting every single article on the page, just the first one on each, because there is only one "div.site-content" tag per page but multiple "article" tags. That's what that change is.
And finally, I prefer find over select, but that's just my style choice. This worked for me for the first three pages, I didn't try the entire site. Be careful when you do run this, 78 blocks of 30 requests might get you blocked...
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
import pandas as pd
import datetime

def main(num, web):
    r = requests.get(web+"/page/{}/".format(num))
    soup = BeautifulSoup(r.content, 'html.parser')
    html = soup.find("div", class_="site-content")
    articles = html.find_all("article")
    # Date as a string in Italian
    goal = [(x.time.get_text(), x.h1.a.get_text(strip=True), x.find("span", class_="cat-links").get_text(strip=True)) for x in articles]
    # OR as a datetime object
    goal = [(datetime.datetime.strptime(x.time["datetime"], "%Y-%m-%dT%H:%M:%S%z"), x.h1.a.get_text(strip=True), x.find("span", class_="cat-links").get_text(strip=True)) for x in articles]
    return goal

web = 'https://voxnews.info'
r = requests.get(web)
soup = BeautifulSoup(r.text, "html.parser")
last_page = soup.find_all("a", class_="page-numbers")[1].get_text()
last_int = int(last_page.replace(".", ""))

### BE CAREFUL HERE WITH TESTING, DON'T USE ALL 2,320 PAGES ###
with ThreadPoolExecutor(max_workers=30) as executor:
    fs = [executor.submit(main, num, web) for num in range(1, last_int)]

allin = []
for f in fs:
    allin.extend(f.result())
df = pd.DataFrame.from_records(
    allin, columns=["Date", "Title", "Category"])
print(df)
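If you do also want the article content, a minimal sketch of that extra per-article request is below. The div.entry-content selector and the use of x.h1.a["href"] from the listing as the article URL are assumptions about voxnews.info's markup and would need to be verified.

import requests
from bs4 import BeautifulSoup

def article_content(article_url):
    # Fetch a single article page and join its paragraph text.
    r = requests.get(article_url)
    s = BeautifulSoup(r.content, "html.parser")
    body = s.find("div", class_="entry-content")  # assumed article body container
    return " ".join(p.get_text(strip=True) for p in body.find_all("p")) if body else ""

Doing this for every article multiplies the number of requests, so it is worth throttling them.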
In order to fetch results from all pages, not just one or ten pages (i.e. hardcoded), the best solution is to use an infinite while loop and test for something (button, element) that will cause it to exit.
This solution is better than a hardcoded for loop since the while loop will go through all pages no matter how many there are until a certain condition is fulfilled. In our case, this is the presence of a button on the page (.next CSS selector):
if soup.select_one(".next"):
    page_num += 1
else:
    break
You can also add a limit on the number of pages, upon reaching which the loop will also stop:
limit = 20  # paginate through 20 pages

if page_num == limit:
    break
from bs4 import BeautifulSoup
import requests, json, lxml

# https://requests.readthedocs.io/en/latest/user/quickstart/#custom-headers
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36",
}

data = []
page_num = 1

limit = 20  # page limit

while True:
    html = requests.get(f"https://voxnews.info/page/{page_num}", headers=headers, timeout=30)
    soup = BeautifulSoup(html.text, "lxml")

    print(f"Extracting page: {page_num}")
    print("-" * 10)

    for result in soup.select(".entry-header"):
        title = result.select_one(".entry-title a").text
        category = result.select_one(".entry-meta:nth-child(1)").text.strip()
        date = result.select_one(".entry-date").text

        data.append({
            "title": title,
            "category": category,
            "date": date
        })

    # Condition for exiting the loop when the specified number of pages is reached.
    if page_num == limit:
        break

    if soup.select_one(".next"):
        page_num += 1
    else:
        break

print(json.dumps(data, indent=2, ensure_ascii=False))
Example output:
[
    {
        "title": "Italia invasa dai figli degli immigrati: “Italiani pezzi di merda” – VIDEO",
        "category": "BREAKING NEWS, INVASIONE, MILANO, VIDEO",
        "date": "Novembre 23, 2022"
    },
    {
        "title": "Soumahoro accusato di avere fatto sparire altri 200mila euro – VIDEO",
        "category": "BREAKING NEWS, POLITICA, VIDEO",
        "date": "Novembre 23, 2022"
    },
    {
        "title": "Città invase da immigrati: “Qui comandiamo noi” – VIDEO",
        "category": "BREAKING NEWS, INVASIONE, VENEZIA, VIDEO",
        "date": "Novembre 23, 2022"
    },
    # ...
]
There's a "13 ways to scrape any public data from any website" blog post if you want to know more about website scraping.

BeautifulSoup data-reactid Python

Tried using data-reactid markers to search Yahoo Finance for a number, but I get a SyntaxError: keyword can't be an expression. My code:
# Walmart stock
source = requests.get('https://finance.yahoo.com/quote/WMT?p=WMT&.tsrc=fin-srch').text
soup = BeautifulSoup(source, 'lxml')
price = soup.find('span', data-reactid_='35')
print("Walmart stock: " + price.text)
You're just doing it slightly the wrong way. In my view, it is more flexible to use a dict than something like class_=:
from bs4 import BeautifulSoup
import requests
source = requests.get('https://finance.yahoo.com/quote/WMT?p=WMT&.tsrc=fin-srch').text
soup = BeautifulSoup(source, 'lxml')
price = soup.find_all('span', {"data-reactid":True})
print(price)
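To target the specific element from the question rather than every span carrying the attribute, the same dict style (or the attrs= keyword) takes a concrete value. A minimal sketch follows; note that the '35' reactid comes from the question, and Yahoo's generated ids may not be stable between page loads.

from bs4 import BeautifulSoup
import requests

source = requests.get('https://finance.yahoo.com/quote/WMT?p=WMT&.tsrc=fin-srch').text
soup = BeautifulSoup(source, 'lxml')

# attrs= sidesteps the SyntaxError caused by the hyphen in data-reactid
price = soup.find('span', attrs={"data-reactid": "35"})
if price is not None:
    print("Walmart stock: " + price.text)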
Try it this way.
import quandl

quandl.ApiConfig.api_key = 'e6Rbk-YUCGHVbt5kDAh_'
# get the table for daily stock prices and
# filter the table for selected tickers, columns within a time range;
# set paginate to True because Quandl limits the tables API to 10,000 rows per call
data = quandl.get_table('WIKI/PRICES', ticker=['WMT'],
                        qopts={'columns': ['ticker', 'date', 'adj_close']},
                        date={'gte': '2015-12-31', 'lte': '2016-12-31'},
                        paginate=True)
print(data)
This is probably worth a look too.
https://www.quandl.com/api/v3/datasets/EOD/WMT.csv?api_key=your_api_key_goes_here
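That CSV endpoint can be read straight into pandas. A minimal sketch is below, assuming you substitute a valid API key and have access to the EOD dataset; the column names in the returned CSV are not verified here.

import pandas as pd

# Sketch: pull the Quandl EOD CSV endpoint directly into a DataFrame.
api_key = "your_api_key_goes_here"  # placeholder, not a real key
url = f"https://www.quandl.com/api/v3/datasets/EOD/WMT.csv?api_key={api_key}"
df = pd.read_csv(url)
print(df.head())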

How do I extract data from unspaced strings?

I need to extract data from four strings that have been parsed in BeautifulSoup. They are:
Arkansas72.21:59 AM76.29:04 AM5.22977.37:59 AM
Ashley71.93:39 AM78.78:59 AM0.53678.78:59 AM
Bradley72.64:49 AM77.28:59 AM2.41877.28:49 AM
Chicot-40.19:04 AM-40.19:04 AM2.573-40.112:09 AM
The data from the first string, for example, is Arkansas, 72.1, 1:59 AM, 76.2, 9:04 AM, 5.2, 29, 77.3, and 7:59 AM. Is there a simple way to do this?
Edit: full code
import urllib2
from bs4 import BeautifulSoup
import time

def scraper():
    # Arkansas State Plant Board Weather Web data
    url1 = 'http://170.94.200.136/weather/Inversion.aspx'
    # opens url and parses HTML into Unicode
    page1 = urllib2.urlopen(url1)
    soup1 = BeautifulSoup(page1, 'lxml')
    # print(soup.get_text()) gives a single Unicode string of relevant data in strings from the url
    # Without print(), returns everything without proper spacing
    sp1 = soup1.get_text()
    # datasp1 is the chunk with the website data in it so the search for Arkansas doesn't return the header
    # everything else finds locations for Unicode strings for the first four stations
    start1 = sp1.find('Today')
    end1 = sp1.find('new Sys.')
    datasp1 = sp1[start1:end1-10]
    startArkansas = datasp1.find('Arkansas')
    startAshley = datasp1.find('Ashley')
    dataArkansas = datasp1[startArkansas:startAshley-2]
    startBradley = datasp1.find('Bradley')
    dataAshley = datasp1[startAshley:startBradley-2]
    startChicot = datasp1.find('Chicot')
    dataBradley = datasp1[startBradley:startChicot-2]
    startCleveland = datasp1.find('Cleveland')
    dataChicot = datasp1[startChicot:startCleveland-2]
    print(dataArkansas)
    print(dataAshley)
    print(dataBradley)
    print(dataChicot)
Just improve the way you extract the tabular data. I would use pandas.read_html() to read it into a dataframe, which, I'm pretty sure, you would find convenient to work with:
import pandas as pd
df = pd.read_html("http://170.94.200.136/weather/Inversion.aspx", attrs={"id": "MainContent_GridView1"})[0]
print(df)
You need to use BeautifulSoup to parse the HTML page and retrieve your data:
from urllib2 import urlopen
from bs4 import BeautifulSoup

url1 = 'http://170.94.200.136/weather/Inversion.aspx'
# opens url and parses HTML into Unicode
page1 = urlopen(url1)
soup1 = BeautifulSoup(page1)

# get the table
table = soup1.find(id='MainContent_GridView1')

# find the headers
headers = [h.get_text() for h in table.find_all('th')]

# retrieve data
data = {}
tr_elems = table.find_all('tr')
for tr in tr_elems:
    tr_content = [td.get_text() for td in tr.find_all('td')]
    if tr_content:
        data[tr_content[0]] = dict(zip(headers[1:], tr_content[1:]))

print(data)
This example will show:
{
    "Greene West": {
        "Low Temp (\u00b0F)": "67.7",
        "Time Of High": "10:19 AM",
        "Wind Speed (MPH)": "0.6",
        "High Temp (\u00b0F)": "83.2",
        "Wind Dir (\u00b0)": "20",
        "Time Of Low": "6:04 AM",
        "Current Time": "10:19 AM",
        "Current Temp (\u00b0F)": "83.2"
    },
    "Cleveland": {
        "Low Temp (\u00b0F)": "70.8",
        "Time Of High": "10:14 AM",
        "Wind Speed (MPH)": "1.9",
        [.....]
}
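If a DataFrame is more convenient than a nested dict, the result above converts directly. A minimal sketch, assuming data is the dict built by the snippet above:

import pandas as pd

# Sketch: turn the {station: {column: value}} dict into a DataFrame with one row per station.
df = pd.DataFrame.from_dict(data, orient="index")
print(df.head())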
