Python web crawling result is less than expected - python

I tried using Selenium to crawl the web data. It loads all 346 products after clicking the load-more button a few times in the browser, but the result only shows 96 / 346 products instead of 346 / 346. Any idea how to fix it? I have already put the crawling code right after the while True loop that clicks the load-more button.
from urllib.request import urlopen
import requests
import ast
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

driver = webdriver.Chrome('e:/Users/fungc1/Documents/chromedriver.exe')
options = Options()

url = "https://www.toysrus.com.sg/lego"
#data = soup.findAll('div',attrs={'class':'card-image-wrapper'})
#toc = soup.find_all('div',attrs={'class':'result-count text-center'})

driver.get(url)
driver.maximize_window()
time.sleep(5)
driver.find_element_by_link_text("STAY ON THE SINGAPORE SITE").click()

while True:
    try:
        driver.execute_script("var scrollingElement = (document.scrollingElement || document.body);scrollingElement.scrollTop = scrollingElement.scrollHeight;")
        wait = WebDriverWait(driver, 10)
        wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "button.btn[data-url*='www.toysrus.com']"))).click()
        time.sleep(5)
    except Exception as e:
        print(e)
        break

time.sleep(5)
response = requests.get(url)
response_text = response.text
soup = BeautifulSoup(response_text, 'lxml')
text = urlopen(url).read()
soup = BeautifulSoup(text)
data = soup.findAll('div', attrs={'class': 'card-image-wrapper'})
toc = soup.find_all('div', attrs={'class': 'result-count text-center'})
emptylist2 = []

for item in toc:
    print(item.text.strip()[:-1])

for div in data:
    links = div.findAll('a')
    for a in links:
        catalogueresult = ast.literal_eval("" + a['href'][1:-5][-7:])
        emptylist2.append(catalogueresult)

print(emptylist2)

You are mixing a few things.
You opened the browser with Selenium and loaded all the items by clicking the load-more button. But after that you use the requests library to request the HTML again from the URL, which has nothing to do with Selenium. So you are doing two separate things; in fact, even if you removed the Selenium code you would get the same result, because you are not using Selenium after loading all the products.
What you need to do is ask Selenium to return the HTML of the page with all 346 products loaded, so that you can hand it to BeautifulSoup for further parsing.
To do that, you don't need the first four lines after your while loop ends. Do something like this:
html = driver.page_source #will return the html code with all products
soup = BeautifulSoup(html, 'lxml')
With this you will get all 346 products.
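Put together, the tail of your script (everything after the while loop) could look roughly like this; it is a minimal sketch that simply reuses the selectors and parsing from your own code:

# take the HTML from the Selenium session itself, not from a new requests call
html = driver.page_source
soup = BeautifulSoup(html, 'lxml')

data = soup.find_all('div', attrs={'class': 'card-image-wrapper'})
toc = soup.find_all('div', attrs={'class': 'result-count text-center'})

for item in toc:
    print(item.text.strip()[:-1])   # should now report 346 / 346

emptylist2 = []
for div in data:
    for a in div.find_all('a'):
        catalogueresult = ast.literal_eval("" + a['href'][1:-5][-7:])
        emptylist2.append(catalogueresult)

print(emptylist2)
driver.quit()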

Related

How to scrape text from a hidden element?

I am trying to scrape the text of the Swiss constitution from Link and convert it to markdown. However, the page source is different from what I see in the inspector: the source only contains noscript warnings in various languages, with the element "app-root" hidden.
The inspector shows a .html file served from here, with which I am able to get the desired result. However, using this file directly would not allow me to scrape the subsequent revisions of the law automatically. Is there a way to extract the page source with the element "app-root" displayed?
This code returns "None" but works with the URL set to the .html file:
from selenium import webdriver
from webdriver_manager.firefox import GeckoDriverManager
from selenium.webdriver import FirefoxOptions
from bs4 import BeautifulSoup
from markdownify import markdownify
url = "https://www.fedlex.admin.ch/eli/cc/1999/404/en"
opts = FirefoxOptions()
opts.add_argument("--headless")
driver = webdriver.Firefox(executable_path=GeckoDriverManager().install(), options=opts)
driver.get(url)
html = driver.page_source
soup = BeautifulSoup(html, "html.parser")
div = soup.find("div", {"id": "lawcontent"})
content = markdownify(str(div))
print(content[:200])
Any help is much appreciated.
In your code, you're not giving the driver any time to render the contents, resulting in incomplete source code.
Waits can be used to wait for the required elements to be visible/present, etc. The code below waits for the div content to be visible and then returns the page source.
Code snippet:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException

url = "https://www.fedlex.admin.ch/eli/cc/1999/404/en"
driver.get(url)

try:
    delay = 20  # 20 second delay
    WebDriverWait(driver, delay).until(EC.visibility_of_element_located((By.ID, 'lawcontent')))
    html = driver.page_source
    soup = BeautifulSoup(html, "html.parser")
    div = soup.find("div", {"id": "lawcontent"})
    content = markdownify(str(div))
    print(content[:200])
# raises TimeoutException if the element is not visible within the delay duration
except TimeoutException:
    print("Timeout!!!")

Web scraping BeautifulSoup find_all doesn't work on APEC

The source code of the page is as in the picture.
I want to find_all the div with class container-result, but it doesn't work: I get an empty list.
My code:
from bs4 import BeautifulSoup, NavigableString, Tag
import requests
import urllib.request

url = "https://www.apec.fr/candidat/recherche-emploi.html/emploi?page="
for page in range(0, 10, 1):
    r = requests.get(url + str(page))
    soup = BeautifulSoup(r.content, "html.parser")
    ancher = soup.find_all('div', attrs={'class': 'container-result'})
    print(ancher)
As the web page is rendered by JavaScript, requests / BeautifulSoup will not be able to retrieve the DOM elements needed, since they are added some time after the page starts rendering. You could try to use Selenium for this purpose; here is an example:
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait

# delay for selenium web driver wait
DELAY = 30

# create selenium driver
chrome_options = webdriver.ChromeOptions()
#chrome_options.add_argument('--headless')
#chrome_options.add_argument('--no-sandbox')
driver = webdriver.Chrome('<<PATH TO chromedriver>>', options=chrome_options)

# iterate over pages
for page in range(0, 10, 1):
    # open web page
    driver.get(f'https://www.apec.fr/candidat/recherche-emploi.html/emploi?page={page}')
    # wait for element with class 'container-result' to be added
    container_result = WebDriverWait(driver, DELAY).until(EC.presence_of_element_located((By.CLASS_NAME, "container-result")))
    # scroll to container-result
    driver.execute_script("arguments[0].scrollIntoView();", container_result)
    # get source HTML of the container-result element
    source = container_result.get_attribute('innerHTML')
    # print source
    print(source)
    # here you can continue work with the source variable either using selenium API or using BeautifulSoup API:
    # soup = BeautifulSoup(source, "html.parser")

# quit webdriver
driver.quit()
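If you don't want a browser window to open, uncomment the --headless option (and --no-sandbox if you run in a restricted environment such as a container). To parse the extracted HTML with BeautifulSoup, add the corresponding import and use the commented-out soup line inside the loop.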

Python in Selenium/BeautifulSoup

I'm trying to extract real estate listing info from a site using Selenium and Beautiful Soup, following this tutorial: https://medium.com/@ben.sturm/scraping-house-listing-data-using-selenium-and-beautiful-soup-1cbb94ba9492
The aim is to gather all the href links from the first page, then find the 'next page' button, navigate to the next page, collect all links on that page, and so on.
I tried to achieve this with a single function and repeat it for each page, but I can't figure out why it's not working. I'm new to learning to code and this seems too trivial a question, but I haven't been able to find an answer yet. I would appreciate any help.
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import time
import sys
import numpy as np
import pandas as pd
import regex as re

driver = webdriver.Chrome()
url = "http://property.shw.co.uk/searchproperties/Level2-0/Level1-0-181-236-167-165/Units/Development-or-House-and-Flat-or-Investment-or-Land-or-Office-or-Other/UnitIds-0/For-Sale"
driver.get(url)

try:
    wait = WebDriverWait(driver, 3)
    wait.until(EC.presence_of_element_located((By.ID, "body1")))
    print("Page is Ready!")
except TimeoutException:
    print("page took too long to load")

def get_house_links(url, driver, pages=3):
    house_links = []
    driver.get(url)
    for i in range(pages):
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        listings = soup.find_all("a", class_="L")
        page_data = [row['href'] for row in listings]
        house_links.append(page_data)
        time.sleep(np.random.lognormal(0, 1))
        next_button = soup.find_all("a", class_="pageingBlock darkBorder")
        next_button_link = ['http://property.shw.co.uk' + row['href'] for row in next_button]
        if i < 3:
            driver.get(next_button_link[0])
    return house_links

get_house_links(url, driver)
class_="pageingBlock darkBorder" match the previous page button as well, so next_button_link[0] send you back to previous page. You need more precise locator
next_button = soup.select('img[src*="propNext"]')
if next_button:
    next_button = next_button[0].find_parent('a')
    next_button_link = 'http://property.shw.co.uk' + next_button['href']
    driver.get(next_button_link)
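For context, here is roughly how that locator could replace the paging lookup inside the question's get_house_links function; this is only a sketch that keeps the question's class names and stops when no next-page image is found:

def get_house_links(url, driver, pages=3):
    house_links = []
    driver.get(url)
    for i in range(pages):
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        listings = soup.find_all("a", class_="L")
        house_links.append([row['href'] for row in listings])
        # locate the next-page arrow image and climb to its parent link
        next_button = soup.select('img[src*="propNext"]')
        if not next_button:
            break  # no further pages
        next_button = next_button[0].find_parent('a')
        driver.get('http://property.shw.co.uk' + next_button['href'])
    return house_links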

Source data doesn't match actual content when scraping dynamic content with Beautiful Soup + Selenium

I'm trying to teach myself how to scrape data and found a nice dynamic website to test this on (releases.com in this case).
Since it's dynamic, I figured I'd have to use selenium to fetch its data.
However, the retrieved page source still only contains the initial HTML and its JS, not the actual elements shown in the browser.
Why does that happen?
I'm assuming it's because I'm fetching the page source, but what other option is there?
My code looks like this:
from bs4 import BeautifulSoup as soup
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium import webdriver
import chromedriver_binary
import time
#constants
my_url = "https://www.releases.com/l/Games/2018/1/"
# Start the WebDriver and load the page
wd = webdriver.Chrome()
wd.get(my_url)
# Wait for the elements to appear
time.sleep(10)
# And grab the page HTML source
html_page = wd.page_source
wd.quit()
#Make soup
pageSoup = soup(html_page, "html.parser")
#Get data
print(pageSoup.text)
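One thing to try, as a minimal sketch only: replace the fixed sleep with an explicit wait for an element that only exists once the page has rendered. The class name used below is a placeholder assumption for illustration, not taken from releases.com:

from bs4 import BeautifulSoup as soup
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium import webdriver
import chromedriver_binary

my_url = "https://www.releases.com/l/Games/2018/1/"

wd = webdriver.Chrome()
wd.get(my_url)

# wait up to 30 s for a rendered element instead of sleeping a fixed 10 s;
# "calendar-item" is a hypothetical class name used here for illustration
WebDriverWait(wd, 30).until(
    EC.presence_of_element_located((By.CLASS_NAME, "calendar-item")))

html_page = wd.page_source
wd.quit()

pageSoup = soup(html_page, "html.parser")
print(pageSoup.text)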

Implicit wait in Selenium with Python 2.7 not working

I am scraping public LinkedIn data from specific people.
Here is the code inside the while loop. For your information, I used time.sleep() for the first 400 profile URLs and it worked. However, it is not working anymore, as it makes my Firefox browser crash. I am pretty sure the bug comes from the time.sleep() function, which I tried to replace using implicitly_wait() and WebDriverWait. However, none of these attempts worked ;(
Here is the code inside the while loop with the time.sleep() function that worked for around 400 URLs:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time

browser = webdriver.Firefox()
browser.get("https://www.linkedin.com/uas/login")
time.sleep(4)

username = browser.find_element_by_id("session_key-login")
password = browser.find_element_by_id("session_password-login")
username.send_keys("yourmail")
password.send_keys("yourpassword")
login_attempt = browser.find_element_by_xpath("//*[@type='submit']")
login_attempt.submit()
time.sleep(4)

browser.get(the profile link I wanna scrap)
html = browser.page_source
soup = BeautifulSoup(html, "html.parser")
formation = soup.find_all('div', {'class': "education"})
nom = soup.find_all('span', {'class': "full-name"})
for a in nom:
    for b in formation:
        print(a.text, b.text)
time.sleep(4)
browser.close()
I tried to replace time.sleep() with implicitly_wait(), but it is not working: the browser does not wait at all.
I also tried this:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

browser = webdriver.Firefox()
browser.get("the profile url I wanna scrap")
delay = 30  # seconds
try:
    WebDriverWait(browser, delay).until(EC.presence_of_element_located(browser.find_element_by('education'))
    print("Page is ready!")
except TimeoutException:
    print("Loading took too much time!")
But it is still not working.
Do you have any idea how to solve the issue?
If I could make the browser wait without using time.sleep() (which makes my browser crash) and without any conditions, that would be amazing!
Another question: if I use Chrome instead of Firefox, do I have a chance of overcoming the problem?
Thanks for your answers,
Raphaël
With WebDriverWait, the browser waits, but Firefox crashes again. Here is the code:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup

browser = webdriver.Firefox()
browser.get("https://www.linkedin.com/uas/login")

username = browser.find_element_by_id("session_key-login")
password = browser.find_element_by_id("session_password-login")
username.send_keys("mail")
password.send_keys("password")
login_attempt = browser.find_element_by_xpath("//*[@type='submit']")
login_attempt.submit()

try:
    element = WebDriverWait(browser, 10).until(EC.presence_of_element_located((By.ID, "content")))
finally:
    browser.get("profil linkedin to scrap")

html = browser.page_source
soup = BeautifulSoup(html, "html.parser")
formation = soup.find_all('div', {'class': "education"})
nom = soup.find_all('span', {'class': "full-name"})
for a in nom:
    for b in formation:
        print(a.text, b.text)
browser.close()
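For reference, EC.presence_of_element_located expects a (By, value) locator tuple rather than the result of a find_element call, so the earlier failing wait can be written along these lines (a sketch only, reusing the "education" class name from the question's soup.find_all calls):

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException

delay = 30  # seconds
try:
    # browser is the webdriver.Firefox() instance created above
    # wait until the education section of the profile is present in the DOM
    WebDriverWait(browser, delay).until(
        EC.presence_of_element_located((By.CLASS_NAME, "education")))
    print("Page is ready!")
except TimeoutException:
    print("Loading took too much time!")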
