Site data not populated as in browser, despite rendering with requests-html - python

I am experimenting with requests-html on various sites,
and I am having trouble extracting the price of a stock on this particular site:
https://www.morningstar.com/stocks/xnys/BABA/quote
I am using requests-html and calling html.render() to render the JavaScript.
Despite this, the data doesn't seem to be populated the way it is in the browser.
from requests_html import HTMLSession
from requests.exceptions import HTTPError
import requests_html

user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'
requests_html.DEFAULT_USER_AGENT = user_agent

def get_request(url):
    session = HTMLSession()
    print(url)
    res = session.get(url)
    try:
        res.raise_for_status()
    except HTTPError as e:
        raise Exception('Dead link') from e
    return res

def mstar():
    url = 'https://www.morningstar.com/stocks/xnys/BABA/quote'
    res = get_request(url)
    res.html.render()
    price = res.html.find('div#message-box-price.message-partial.fill.up')[0].text
    print(price)
    price = res.html.find('div.message-partial.fill.up')[0].text
    print(price)
    change = res.html.find('div#message-box-percentage')[0].text
    print(change)
The expected outcome is this data:
262.20
4.26 | 1.65%
However, I am just getting back symbols (- or %) but no actual prices.
Any suggestions?
Thank you.

The data is generated by a JSON API and then dynamically inserted into the website via JavaScript, hence Python requests cannot see it. You can verify it yourself by doing a curl https://www.morningstar.com/stocks/xnys/baba/quote and trying to find the 1.65% in it: it is not there, simply because it is not in the HTML source.
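To run the same check from Python instead of curl (expecting False, per the reasoning above):

import requests

res = requests.get('https://www.morningstar.com/stocks/xnys/baba/quote',
                   headers={'User-Agent': 'Mozilla/5.0'})
# The figure rendered in the browser never appears in the raw HTML source.
print('1.65%' in res.text)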
I would suggest using Selenium instead, and parsing the data along these lines:
from selenium.webdriver.common.by import By

elements = driver.find_elements(By.CSS_SELECTOR, "div#message-box-price.message-partial.fill.up")
for element in elements:
    print(element.text)
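Since the page populates these nodes asynchronously, it usually helps to wait explicitly for the element before reading it. A minimal sketch, reusing the question's (unverified) class selector:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

driver = webdriver.Chrome()
driver.get('https://www.morningstar.com/stocks/xnys/BABA/quote')

# Block for up to 10 seconds until the JS-rendered price widget exists.
price = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, 'div.message-partial.fill.up'))
)
print(price.text)
driver.quit()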

Related

Beautiful Soup only returning the first 10 listings using soup.select(). What could be the issue here?

import requests
import lxml
from bs4 import BeautifulSoup

LISTINGS_URL = 'https://shorturl.at/ceoAB'
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/95.0.4638.69 Safari/537.36 ",
    "Accept-Language": "en-US,en;q=0.9"
}
response = requests.get(LISTINGS_URL, headers=headers)
listings = response.text

class DataScraper:
    def __init__(self):
        self.soup = BeautifulSoup(listings, "html.parser")

    def get_links(self):
        for a in self.soup.select(".list-card-top a"):
            print(a)
        # listing_text = [link.getText() for link in links]

    def get_address(self):
        pass

    def get_prices(self):
        pass
I have used the correct CSS selectors, and have even tried to find the elements using attrs in find_all().
What I am trying to achieve is to parse all the anchor tags and then fetch the href links for the specific listings; however, it is only returning the first 10.
You can make a GET request to this endpoint and fetch the data you need.
https://www.zillow.com/search/GetSearchPageState.htm?searchQueryState={"pagination":{"currentPage":1},"mapBounds":{"west":-123.33522421253342,"east":-121.44008261097092,"south":37.041584214606814,"north":38.39290664366326},"isMapVisible":false,"filterState":{"price":{"max":872627},"beds":{"min":1},"isForSaleForeclosure":{"value":false},"monthlyPayment":{"max":3000},"isAuction":{"value":false},"isNewConstruction":{"value":false},"isForRent":{"value":true},"isForSaleByOwner":{"value":false},"isComingSoon":{"value":false},"isForSaleByAgent":{"value":false}},"isListVisible":true,"mapZoom":9}&wants={"cat1":["listResults"]}
Change the "currentPage" url parameter value in the above URL to fetch data from different pages.
Since the response is JSON, you can easily parse it and extract the information using json module.
The website is probably using lazy loading, so you can either use something like Selenium/Puppeteer or use the website's API (the easier way). To do the latter, make a GET request to a URL which starts with https://www.zillow.com/search/GetSearchPageState.htm (see your browser's dev tools), parse the JSON response, and you will find each href link under cat1.searchResults.listResults[index in array].detailUrl.
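A minimal sketch of that approach, assuming a stripped-down searchQueryState is accepted (in practice you may need the full query state from the URL above, plus browser-like headers):

import json
import requests

params = {
    "searchQueryState": json.dumps({
        "pagination": {"currentPage": 1},  # bump this to page through results
        "isMapVisible": False,
        "isListVisible": True,
    }),
    "wants": json.dumps({"cat1": ["listResults"]}),
}
headers = {"User-Agent": "Mozilla/5.0"}

res = requests.get("https://www.zillow.com/search/GetSearchPageState.htm",
                   params=params, headers=headers)
data = res.json()

# Path given in the answer: cat1.searchResults.listResults[i].detailUrl
for listing in data["cat1"]["searchResults"]["listResults"]:
    print(listing["detailUrl"])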

Collecting the data displayed in browser but not in response

The Situation
I am trying to scrape webpages to get some data.
For my application, I need the HTML data as a whole, as it is viewable in the browser.
The Problem
But when I scrape some URLs, I get data which is not viewable in the browser, even though it is there in the HTML code. So is there any way to scrape only the data which is viewable in the browser?
Code
from bs4 import BeautifulSoup
import requests
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.service import Service

options = webdriver.ChromeOptions()
options.add_argument("--headless")
service = Service("/home/nebu/selenium_drivers/chromedriver")

URL = "https://augustasymphony.com/event/top-of-the-world/"
try:
    driver = webdriver.Chrome(service=service, options=options)
    driver.get(URL)
    driver.implicitly_wait(2)
    html_content = driver.page_source
    driver.quit()
except WebDriverException:
    driver.quit()
    raise

soup = BeautifulSoup(html_content, "html.parser")
for each in ['header', 'footer']:
    s = soup.find(each)
    if s is None:
        continue
    else:
        s.extract()
text = soup.getText(separator=u' ')
print(text)
The Question
Where am I going wrong here?
How can I go about debugging this?
This is simply a case of you needing to extract the data in a more specific manner.
You really have two options:
Option 1 (in my opinion the better one, as it is faster and less resource-heavy):
import requests
from bs4 import BeautifulSoup as bs
headers = {'Accept': '*/*',
           'Connection': 'keep-alive',
           'User-Agent': 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683 Safari/537.36 OPR/57.0.3098.91'}
res = requests.get("https://augustasymphony.com/event/top-of-the-world/", headers=headers)
soup = bs(res.text, "lxml")
event_header = soup.find("h2", {"class": "rhino-event-header"}).text.strip()
time = soup.find("p", {"class": "rhino-event-time"}).text.strip()
You can use requests quite simply to find the data, as shown in the code above, specifically selecting the data you want and perhaps saving it in a dictionary. This is the normal way to go about it. The page may contain a lot of scripts, but it doesn't require JavaScript to load this data dynamically.
Option 2:
You can continue using Selenium and collect the entire body of the page using one of several selections:
driver.find_element(By.ID, 'wrapper').get_attribute('innerHTML')                    # entire body
driver.find_element(By.ID, 'tribe-events').get_attribute('innerHTML')               # the events list
driver.find_element(By.ID, 'rhino-event-single-content').get_attribute('innerHTML') # the single event
This second option is more a matter of taking the whole HTML and dumping it.
Personally, I would go with the first option, creating dictionaries of the cleaned data.
Edit:
To further illustrate my example:
import requests
from bs4 import BeautifulSoup as bs
headers = {'Accept': '*/*',
           'Connection': 'keep-alive',
           'User-Agent': 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683 Safari/537.36 OPR/57.0.3098.91'}
res = requests.get("https://augustasymphony.com/event/", headers=headers)
soup = bs(res.text, "lxml")
seedlist = {a["href"] for a in soup.find("div", {"id": "tribe-events-content-wrapper"}).find_all("a") if '?ical=1' not in a["href"]}

for seed in seedlist:
    res = requests.get(seed, headers=headers)
    soup = bs(res.text, "lxml")
    data = dict()
    data['event_header'] = soup.find("h2", {"class": "rhino-event-header"}).text.strip()
    data['time'] = soup.find("p", {"class": "rhino-event-time"}).text.strip()
    print(data)
Here I am generating a seedlist of event URLs and then going into each one to find information.
Some websites detect whether the request comes from a real web browser, and if it doesn't, they don't send the full HTML file back.
That's why the HTML you see in the browser is missing from the response.

Web scraping with Python not showing all tags

I'm new to web scraping. I was trying to make a script that gets data from a balance sheet (here is the site: https://www.sec.gov/ix?doc=/Archives/edgar/data/320193/000032019320000010/a10-qq1202012282019.htm). The problem is getting the data: when I look at the source code in my browser, I'm able to find the tag and the correct value. But once I write a script with bs4, I don't get anything.
I'm trying to get information from the balance sheet: Products, Services, Cost of sales... and the data contained in table 1 (I'm sorry, but I can't post the image; anyway, it's the first table you see when scrolling down).
Here's my code.
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
url = "https://www.sec.gov/ix?doc=/Archives/edgar/data/320193/000032019320000010/a10-qq1202012282019.htm"
req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
read_data = urlopen(req).read()
soup_data = BeautifulSoup(read_data,"lxml")
names = soup_data.find_all("td")
for name in names:
    print(name)
Thanks for your time.
Try this URL instead. The /ix?doc=... address loads SEC's interactive (iXBRL) viewer, which inserts the document with JavaScript, while the direct Archives URL below serves the document itself.
Also include the headers to get the data.
import requests
from bs4 import BeautifulSoup
url = "https://www.sec.gov/Archives/edgar/data/320193/000032019320000010/a10-qq1202012282019.htm"
headers = {"User-agent":"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36"}
req = requests.get(url, headers=headers)
soup_data = BeautifulSoup(req.text,"lxml")
You will be able to find the data you need.
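For example, once the direct URL returns the full document, the filing's tables can be lifted straight out of the HTML with pandas (a sketch; which list index holds the balance-sheet table is a guess you would adjust by inspecting the output):

from io import StringIO
import pandas as pd

# read_html parses every <table> element in the page into a DataFrame.
tables = pd.read_html(StringIO(req.text))
print(len(tables))       # see how many tables were found
print(tables[0].head())  # index 0 is a placeholder; pick the right table by eye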

Unable to parse two fields from a webpage using requests module

I'm trying to scrape two fields, product_title and item_code, from this webpage using the requests module. When I execute the script below, I always get an AttributeError instead of the result, as the data I'm after is not in the page source.
However, I've come across several solutions on here which are able to fetch data from JavaScript-heavy sites even when the data is not in the page source, so I suppose there should be some way to grab the two fields from the webpage using requests.
import requests
from bs4 import BeautifulSoup
link = 'https://www.sainsburys.co.uk/gol-ui/Product/persil-small---mighty-non-bio-laundry-liquid-21l-60-washes'
with requests.Session() as s:
    s.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36'
    res = s.get(link)
    soup = BeautifulSoup(res.text, "lxml")
    product_title = soup.select_one("h1[data-test-id='pd-product-title']").get_text(strip=True)
    item_code = soup.select_one("span#productSKU").get_text(strip=True)
    print(product_title, item_code)
Expected output:
Persil Non-Bio Laundry Liquid 1.43L
Item code: 7637944
How can I fetch the two fields from that site using requests?
Actually, the website calls an API, so you can use that directly to get the data:
import requests

r = requests.get('https://www.sainsburys.co.uk/groceries-api/gol-services/product/v1/product?filter[product_seo_url]=gb%2Fgroceries%2Fpersil-small---mighty-non-bio-laundry-liquid-21l-60-washes&include[ASSOCIATIONS]=true&include[PRODUCT_AD]=citrus')
products = r.json()['products']
for each_product in products:
    print(f"Item code: {each_product['product_uid']}")
    print(each_product['name'])

# Item code: 7637944
# Persil Non-Bio Laundry Liquid 1.43L
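The filter[product_seo_url] value appears to be just gb/groceries/ plus the slug from the product page's URL. If that assumption holds, the API URL can be built from any product link:

from urllib.parse import quote
import requests

link = 'https://www.sainsburys.co.uk/gol-ui/Product/persil-small---mighty-non-bio-laundry-liquid-21l-60-washes'

# Assumption: product_seo_url is "gb/groceries/" followed by the page slug.
slug = link.rstrip('/').rsplit('/', 1)[-1]
seo_url = quote(f'gb/groceries/{slug}', safe='')

api_url = ('https://www.sainsburys.co.uk/groceries-api/gol-services/product/v1/product'
           f'?filter[product_seo_url]={seo_url}')
r = requests.get(api_url)
for product in r.json()['products']:
    print(f"Item code: {product['product_uid']}")
    print(product['name'])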

This code should return the product title, but instead of the title, I am getting "None" in return

I am trying to make a price tracker for Amazon by following a YouTube tutorial. I am new to Python and web scraping. Somehow I wrote this code, and it should return the product name, but instead it's giving me "None" as output. Can you please help me with this?
I tried with different URLs; it's still not working.
import requests
from bs4 import BeautifulSoup

URL = 'https://www.amazon.com/Nike-Rival-Track-Field-Shoes/dp/B07HYNB7VV/'
headers = {"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                         'AppleWebKit/57.36 (HTML, like Gecko) Chrome/75.0.30.100 Safari/537.4'}
page = requests.get(URL, headers)
soup = BeautifulSoup(page.content, 'html.parser')
title = soup.find(id="productTitle")
print(title)
I was inspecting the returned HTML and realized that Amazon sends (somewhat malformed?) HTML that trips the default html.parser, but using lxml I was able to scrape the title just fine.
import requests
from bs4 import BeautifulSoup
def make_soup(url: str) -> BeautifulSoup:
    res = requests.get(url, headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:69.0) Gecko/20100101 Firefox/69.0'
    })
    res.raise_for_status()
    return BeautifulSoup(res.text, 'lxml')

def parse_product_page(soup: BeautifulSoup) -> dict:
    title = soup.select_one('#productTitle').text.strip()
    return {
        'title': title
    }

if __name__ == "__main__":
    url = 'https://www.amazon.com/Nike-Rival-Track-Field-Shoes/dp/B07HYNB7VV/'
    soup = make_soup(url)
    info = parse_product_page(soup)
    print(info)
output:
{'title': "Nike Men's Zoom Rival M 9 Track and Field Shoes"}
You can make your locator more specific using .select(). You need to change the parser as well.
Try this instead:
import requests
from bs4 import BeautifulSoup
URL = 'https://www.amazon.com/Nike-Rival-Track-Field-Shoes/dp/B07HYNB7VV/'
page = requests.get(URL,headers={"User-Agent":'Mozilla/5.0'})
soup = BeautifulSoup(page.text, 'lxml')  # make sure you use the "lxml" or "html5lib" parser instead of "html.parser"
title = soup.select_one("h1 > #productTitle").get_text(strip=True)
print(title)
Output:
Nike Men's Zoom Rival M 9 Track and Field Shoes
Bot detection is pretty pervasive these days. No major site with data worth mining, especially retail, is going to let you use requests on their site.
You're going to have to at the very least use Selenium/ChromeDriver to get a response from any reputable site. Even then, if they use something like Distil for bot detection, they will stop even Selenium.
Try a less popular site with Selenium, and you will get data back.
