Parsing Webpage with BeautifulSoup Doesn't Give Full Page Contents - python

I'm trying to parse the description 'Enjoy the power to create and control...' from this webpage: https://www.origin.com/zaf/en-us/store/the-sims/the-sims-4.
When I parse the page with Beautifulsoup, the page source doesn't include the description and I'm not sure why.
import requests
from bs4 import BeautifulSoup
from googlesearch import search  # assumption: `search` comes from the googlesearch package

handle = 'sims 4'
query = handle + " origin.com"  # enter query to search
print(query)
for topresult in search(query, tld="com", lang='en', num=10, stop=1, pause=2):
    print('Query Successful: ' + handle)
    page = requests.get(topresult)
    soup = BeautifulSoup(page.content, 'html.parser')
    print(soup)
Any help would be appreciated. I've been trying to figure this out for a couple days. I've also tried using Selenium and the Chrome driver but get a similar result.

Requests and BeautifulSoup will not work here because the page content is loaded dynamically with JavaScript; that is why you cannot find the description in the raw source. Selenium WebDriver should work just fine. I wrote some code to get the description.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Chrome()
driver.get('https://www.origin.com/zaf/en-us/store/the-sims/the-sims-4')
desc = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, '//p[@ng-bind-html="::$ctrl.description"]')))
print(desc.text)
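If you still want BeautifulSoup for the parsing itself, you can hand it the rendered source once the wait has succeeded. A minimal sketch continuing the session above; the ng-bind-html attribute is the same one the XPath in the answer targets:

from bs4 import BeautifulSoup

# after the wait above, the Angular template has rendered,
# so page_source now contains the description
soup = BeautifulSoup(driver.page_source, 'html.parser')
desc_tag = soup.find('p', attrs={'ng-bind-html': '::$ctrl.description'})
if desc_tag is not None:
    print(desc_tag.get_text(strip=True))
driver.quit()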


Get the total number of reviews of a permanently closed place on Google Maps without the API

I am learning Python as well as web scraping, and I want to get the number of reviews from Google Maps for a permanently closed restaurant, but I cannot do that. Would you please help? Thank you.
from bs4 import BeautifulSoup
url = 'https://www.google.com/maps?q=asia+halal+restaurant+aichi+japan+open+date&safe=strict&rlz=1C1GCEA_enID892ID892&sxsrf=ALeKk01NqaBLM8bXeVVS6M6tv9kAy0G6qQ:1616997971678&gs_lcp=Cgdnd3Mtd2l6EAM6BwgjELADECc6BQghEKABOgQIIRAVOgcIIRAKEKABUIUIWKojYOckaABwAHgAgAHHAogB7RGSAQcxLjUuNC4ymAEAoAEBqgEHZ3dzLXdpesgBAcABAQ&uact=5&um=1&ie=UTF-8&sa=X&ved=2ahUKEwjbhef-7NTvAhWa93MBHaFHCzYQ_AUoAXoECAEQAw'
import requests
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
ps = soup.find_all(string = 'クチコミ')
ps
I also tried to find by 'class' and by 'span aria-label' based on the Chrome developer tools (screenshot: browser picture for the HTML class), but that still doesn't work:
#ps = soup.find_all(class_='h0ySl-wcwwM-E70qVe-list')
#ps = soup.find_all('span aria-label')
#total_rev = ps.get_text()
#total_rev
Here is the code I tried using Selenium:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from bs4 import BeautifulSoup
driver = webdriver.Chrome('F:/Download/SW/chromedriver_win32/chromedriver.exe')
url = 'https://www.google.com/maps/place/%E3%82%A2%E3%83%83%E3%83%90%E3%82%B7+%E3%82%B9%E3%82%A4%E3%83%BC%E3%83%84/@35.0903185,136.8551766,17z/data=!3m1!4b1!4m5!3m4!1s0x600378381c4bb1f7:0x8e9d356b9ded5bcc!8m2!3d35.0903185!4d136.8573653'
driver.get(url)
I have managed to get the number of reviews using this code on a still-operating restaurant, but when it comes to a permanently closed one I cannot get the number of reviews:
span_review = WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.XPATH, '//button[contains(text(), "section-star")]'))).click()

# Find the total number of reviews
total_number_of_reviews = driver.find_element_by_xpath('//*[@id="pane"]/div/div[1]/div/div/div[2]/div[2]/div/div[2]/div[2]').text.split(" ")[0]
total_number_of_reviews = int(total_number_of_reviews.replace(',', '')) if ',' in total_number_of_reviews else int(total_number_of_reviews)

# Find the scroll layout
total_reviews = driver.find_element_by_class_name("h0ySl-wcwwM-E70qVe-list")
total_reviews  # = driver.get('aria-label')
total_reviews = total_reviews.get_text('aria-label')
total_reviews
total_number_of_reviews = total_reviews.text[0:]
total_number_of_reviews
Hopefully I can learn
Thanks!
I can't find your XPath in the HTML. There is no <button> with the text section-star, only <li class="section-star">.
Also, aria-label is not text but an attribute, so you have to use .get_attribute('aria-label').
But I found another XPath, //button[@jsaction="pane.rating.moreReviews"], and it works for me for both permanently closed and still-operating places.
Tested on Firefox and Chrome, on Linux.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
driver = webdriver.Chrome('F:/Download/SW/chromedriver_win32/chromedriver.exe')
#driver = webdriver.Chrome()
#driver = webdriver.Firefox()
all_urls = [
    # permanently closed
    'https://www.google.com/maps/place/%E3%82%A2%E3%83%83%E3%83%90%E3%82%B7+%E3%82%B9%E3%82%A4%E3%83%BC%E3%83%84/@35.0903185,136.8551766,17z/data=!3m1!4b1!4m5!3m4!1s0x600378381c4bb1f7:0x8e9d356b9ded5bcc!8m2!3d35.0903185!4d136.8573653',
    # still operating
    'https://www.google.com/maps/place/Seaside+Restaurant+Higashiyama+Garden+-+Port+Bldg./@35.0841323,136.8474088,14z/data=!3m1!5s0x6003790a61e056e7:0x7f307de064680a96!4m9!1m2!2m1!1srestaurants!3m5!1s0x600379a07cd9fcc7:0x89f84cc9f0422e30!8m2!3d35.0895485!4d136.8809243!15sCgtyZXN0YXVyYW50c1oNIgtyZXN0YXVyYW50c5IBCnJlc3RhdXJhbnQ',
]

for url in all_urls:
    driver.get(url)
    total_reviews = WebDriverWait(driver, 15).until(EC.element_to_be_clickable((By.XPATH, '//button[@jsaction="pane.rating.moreReviews"]')))
    total_reviews = total_reviews.get_attribute('aria-label')
    print(total_reviews)
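If you need the count as an integer rather than the whole label, you can pull the digits out of the aria-label string. A minimal sketch, assuming the label contains the review count as its first group of digits (the exact wording depends on the page language):

import re

# total_reviews holds the aria-label string from the loop above,
# e.g. something like "1,234 reviews"
match = re.search(r'\d[\d,]*', total_reviews)
if match:
    review_count = int(match.group().replace(',', ''))
    print(review_count)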

How to scrape text from a hidden element?

I am trying to scrape the text of the Swiss constitution from this link and convert it to Markdown. However, the page source is different from what I see in the inspector: the source only contains noscript warnings in various languages, with the "app-root" element hidden.
The inspector shows a .html file (served from a separate URL) with which I am able to get the desired result. However, using that file directly would not allow me to scrape subsequent revisions of the law automatically. Is there a way to extract the page source with the "app-root" element displayed?
This code returns "None" but works with the URL set to the .html file:
from selenium import webdriver
from webdriver_manager.firefox import GeckoDriverManager
from selenium.webdriver import FirefoxOptions
from bs4 import BeautifulSoup
from markdownify import markdownify
url = "https://www.fedlex.admin.ch/eli/cc/1999/404/en"
opts = FirefoxOptions()
opts.add_argument("--headless")
driver = webdriver.Firefox(executable_path=GeckoDriverManager().install(), options=opts)
driver.get(url)
html = driver.page_source
soup = BeautifulSoup(html, "html.parser")
div = soup.find("div", {"id": "lawcontent"})
content = markdownify(str(div))
print(content[:200])
Any help is much appreciated.
In your code, you're not giving the driver any time to render the contents, which results in incomplete source code.
Waits can be used to wait for the required elements to be visible, present, etc. The code below waits for the content div to be visible and then returns the page source.
Code snippet-
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
url = "https://www.fedlex.admin.ch/eli/cc/1999/404/en"
driver.get(url)
try:
    delay = 20  # 20-second delay
    WebDriverWait(driver, delay).until(EC.visibility_of_element_located((By.ID, 'lawcontent')))
    html = driver.page_source
    soup = BeautifulSoup(html, "html.parser")
    div = soup.find("div", {"id": "lawcontent"})
    content = markdownify(str(div))
    print(content[:200])
# raises TimeoutException if the element is not visible within the delay duration
except TimeoutException:
    print("Timeout!!!")

Web scraping BeautifulSoup find_all doesn't work on APEC

The source code of the page is as shown in the picture.
I want to find_all the div with class container-result, but it doesn't work; I get an empty list.
My code:
from bs4 import BeautifulSoup, NavigableString, Tag
import requests
import urllib.request

url = "https://www.apec.fr/candidat/recherche-emploi.html/emploi?page="
for page in range(0, 10, 1):
    r = requests.get(url + str(page))
    soup = BeautifulSoup(r.content, "html.parser")
    ancher = soup.find_all('div', attrs={'class': 'container-result'})
    print(ancher)
As the web page is rendered by JavaScript, requests/BeautifulSoup will not be able to retrieve the DOM elements needed, as they're only added some time after the page starts rendering. You could try to use Selenium for this purpose; here is an example:
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
# delay for selenium web driver wait
DELAY = 30
# create selenium driver
chrome_options = webdriver.ChromeOptions()
#chrome_options.add_argument('--headless')
#chrome_options.add_argument('--no-sandbox')
driver = webdriver.Chrome('<<PATH TO chromedriver>>', options=chrome_options)
# iterate over pages
for page in range(0, 10, 1):
    # open web page
    driver.get(f'https://www.apec.fr/candidat/recherche-emploi.html/emploi?page={page}')
    # wait for an element with class 'container-result' to be added
    container_result = WebDriverWait(driver, DELAY).until(EC.presence_of_element_located((By.CLASS_NAME, "container-result")))
    # scroll to container-result
    driver.execute_script("arguments[0].scrollIntoView();", container_result)
    # get the HTML source of the container-result element
    source = container_result.get_attribute('innerHTML')
    # print source
    print(source)
    # here you can continue to work with the source variable, either using the Selenium API or BeautifulSoup:
    # soup = BeautifulSoup(source, "html.parser")

# quit webdriver
driver.quit()
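As the comment suggests, each extracted fragment can be handed to BeautifulSoup (for example right after source is set inside the loop). A minimal sketch; the anchor-tag selector below is only an illustration, the real offer markup may differ, so inspect the page and adjust it:

from bs4 import BeautifulSoup

soup = BeautifulSoup(source, "html.parser")
# hypothetical example: list every link found inside the container-result fragment;
# replace the selector with whatever the actual offer elements look like
for a in soup.find_all("a", href=True):
    print(a.get_text(strip=True), "->", a["href"])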

BeautifulSoup scraping from a web page already opened by Selenium

I would like to scrape a web page that was opened by Selenium from a different webpage.
I entered a search term into a website using Selenium, and this landed me on a new page. My aim is to create the soup from this new page, but the soup is getting created from the previous page where I entered my search term. Help please!
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
driver = webdriver.Firefox()
driver.get('http://www.ratestar.in/')
inputElement = driver.find_element_by_css_selector("#txtStock")
inputElement.send_keys('GM Breweries')
inputElement.send_keys(Keys.ENTER)
driver.wait.until(staleness_of('txtStock')
source = driver.page_source
soup = BeautifulSoup(source)
You need to know the exact company name for your search. After using send_keys, you tried to check for the staleness of an element; I did not understand how that statement is supposed to work. I added a WebDriverWait for an element of the new page instead.
The following works for me regarding the Selenium part, up to getting the page source:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
driver = webdriver.Firefox()
driver.get('http://www.ratestar.in/')
inputElement = driver.find_element_by_css_selector("#txtStock")
inputElement.send_keys('GM Breweries Ltd.')
inputElement.send_keys(Keys.ENTER)
company = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, 'lblCompany')))
source = driver.page_source
You should add exception handling.
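For example, a minimal sketch of that exception handling, catching the TimeoutException raised when the wait above does not find the element in time:

from selenium.common.exceptions import TimeoutException

try:
    company = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, 'lblCompany')))
    source = driver.page_source
except TimeoutException:
    # the results page did not appear within 10 seconds; log it and move on
    print("Timed out waiting for the company page")
    source = None
finally:
    # release the browser whether or not the wait succeeded
    driver.quit()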
@Jens Dibbern has given a working solution, but it is not necessary to give the exact name of the company in the search. When you type a non-exact name, a drop-down pops up.
I have observed that the Enter key does not work until this drop-down is present. You can check this by going to the site, pasting the name, and pressing Enter as fast as possible without waiting: nothing happens.
You could instead wait for this drop-down to be visible and then send the Enter key. This also works perfectly. Note that this will select the first item in the drop-down if more than one is present.
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Firefox()
driver.get('http://www.ratestar.in/')
inputElement = driver.find_element_by_css_selector("#txtStock")
inputElement.send_keys('GM Breweries')
drop_down=driver.find_element_by_css_selector("#listPlacementStock")
WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, '#listPlacementStock:not([style*="display: none"])')))
inputElement.send_keys(Keys.ENTER)
WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, '//*[@id="CompanyLink"]')))
source = driver.page_source
soup = BeautifulSoup(source,'html.parser')
print(soup)
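If you only need a specific field rather than the whole document, you can pick it out of the soup built above. A minimal sketch reusing the CompanyLink element that the wait targets (whether it holds the value you are after is an assumption):

# soup is the BeautifulSoup object created from driver.page_source above
company_link = soup.find(id="CompanyLink")
if company_link is not None:
    print(company_link.get_text(strip=True))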

Source data doesn't match actual content when scraping dynamic content with Beautiful Soup + Selenium

I'm trying to teach myself how to scrape data and found a nice dynamic website to test this on (releases.com in this case).
Since it's dynamic, I figured I'd have to use Selenium to fetch its data.
However, the retrieved page source still only contains the initial HTML and its JS, not the actual elements shown in the browser.
Why does that happen?
I'm assuming it's because I'm fetching the page source, but what other option is there?
My code looks like this:
from bs4 import BeautifulSoup as soup
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium import webdriver
import chromedriver_binary
import time
#constants
my_url = "https://www.releases.com/l/Games/2018/1/"
# Start the WebDriver and load the page
wd = webdriver.Chrome()
wd.get(my_url)
# Wait for the elements to appear
time.sleep(10)
# And grab the page HTML source
html_page = wd.page_source
wd.quit()
#Make soup
pageSoup = soup(html_page, "html.parser")
#Get data
print(pageSoup.text)
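The same pattern as the other answers applies here: instead of a fixed time.sleep, wait for an element that only exists once the JavaScript has run, then read page_source. A minimal sketch; the class name used in the wait is a placeholder, not the site's real markup, so replace it with a selector you see in the dev tools:

from bs4 import BeautifulSoup as soup
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

my_url = "https://www.releases.com/l/Games/2018/1/"
wd = webdriver.Chrome()
wd.get(my_url)
# wait (up to 30 s) for an element that the JavaScript adds after load;
# "release-item" is a hypothetical class name, adjust it to the real markup
WebDriverWait(wd, 30).until(
    EC.presence_of_element_located((By.CLASS_NAME, "release-item")))
html_page = wd.page_source
wd.quit()
pageSoup = soup(html_page, "html.parser")
print(pageSoup.text[:500])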
