Selenium cannot get all elements of a page - python

I am using Selenium to search on Agoda and scrape all the hotel names on the page, but the output only returns 2 names.
Then I tried adding a line to scroll to the bottom; now the output gives me the first 2 names and the last 2 names (the first two from the top, the last two from the bottom).
I don't understand what the problem is. I added time.sleep() for each step, so the whole page should have loaded completely. Does Selenium limit scraping to the page view, so that it can only scrape elements that are in sight?
my code below:
# Scroll to the bottom of the page once, then pause for 30 seconds.
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(30)
# NOTE(review): indentation was lost in this paste; the lines below presumably form the body of scrape().
def scrape():
# Separate HTTP request; only its status code is checked. The elements themselves are read from the Selenium driver.
r = requests.get(current_page)
if r.status_code == requests.codes.ok:
print('start scraping!')
hotel = driver.find_elements_by_class_name('hotel-name')
hotels = []
for h in hotel:
# NOTE(review): this tests the list `hotel`, not the current element `h`.
if hotel:
hotels.append(h.text)
# Appends the list of names to output.txt; the file object is never explicitly closed.
print(hotels, file=open("output.txt", 'a', encoding="utf-8"))
scrape()
Here is the page i want to scrape

Try the script below, which scrolls the page down until no more results appear and then scrapes all available names:
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait as wait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.maximize_window()
driver.get('https://www.agoda.com/pages/agoda/default/DestinationSearchResult.aspx?asq=8wUBc629jr0%2B3O%2BxycijdcaVIGtokeWrEO7ShJumN8xsNvkFkEV9bUgNnbx6%2Bx22ncbzTLOPBjT84OgAAKXmu6quf8aEKRA%2FQH%2BGoyXgowLt%2BXyB8OpN1h2WP%2BnBM%2FwNPzD%2BpaeII93w%2Bs4dMWI4QPJNbZJ8DWvRiPsrPVVBJY7ilpMPlUermwV1UKIKfuyeis3BqRkJh9FzJOs0E98zXQ%3D%3D&city=9590&cid=-142&tick=636818018163&languageId=20&userId=3c2c4cb9-ba6d-4519-8ef4-c85dfd280b8f&sessionId=d4qzq2tgymjrwsf22lnadxpc&pageTypeId=1&origin=HK&locale=zh-TW&aid=130589&currencyCode=HKD&htmlLanguage=zh-tw&cultureInfoName=zh-TW&ckuid=3c2c4cb9-ba6d-4519-8ef4-c85dfd280b8f&prid=0&checkIn=2019-01-16&checkOut=2019-01-17&rooms=1&adults=2&children=0&priceCur=HKD&los=1&textToSearch=%E5%A4%A7%E9%98%AA&productType=-1&travellerType=1')

# Get initial list of names
hotels = wait(driver, 15).until(EC.presence_of_all_elements_located((By.CLASS_NAME, 'hotel-name')))
while True:
    # Scroll down to last name in list; this is what triggers loading of the next batch
    driver.execute_script('arguments[0].scrollIntoView();', hotels[-1])
    try:
        # Wait for more names to be loaded: poll until the page holds more
        # 'hotel-name' elements than we have already collected
        wait(driver, 15).until(
            lambda d: len(d.find_elements(By.CLASS_NAME, 'hotel-name')) > len(hotels)
        )
    except TimeoutException:
        # Break the loop in case no new names loaded after page scrolled down
        break
    # Update names list
    hotels = driver.find_elements(By.CLASS_NAME, 'hotel-name')

# Print names list
print([hotel.text for hotel in hotels])

Related

selenium: stale element reference: element is not attached to the page document

from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import chromedriver_autoinstaller
# Make a chromedriver available (via chromedriver_autoinstaller) before the browser is started.
chromedriver_autoinstaller.install()
TYPES = ['user', 'verified_audience', 'top_critics']
TYPE = TYPES[2]
URL = 'https://www.rottentomatoes.com/m/dunkirk_2017/reviews'
# Used as a countdown by the loop below.
PAGES = 2
driver = Chrome()
driver.get(URL)
data_reviews = []
# NOTE(review): indentation was lost in this paste; the nesting of the lines below cannot be recovered from the text alone.
while PAGES != 0:
wait = WebDriverWait(driver, 30)
# until() returns once find_elements yields a non-empty (truthy) list of review rows.
reviews = wait.until(lambda _driver: _driver.find_elements(
By.CSS_SELECTOR, '.review_table_row'))
# Extracting review data
for review in reviews:
if TYPE == 'top_critics':
critic_name_el = review.find_element(
By.CSS_SELECTOR, '[data-qa=review-critic-link]')
critic_review_text_el = review.find_element(
By.CSS_SELECTOR, '[data-qa=review-text]')
data_reviews.append(critic_name_el.text)
try:
next_button_el = driver.find_element(
By.CSS_SELECTOR, '[data-qa=next-btn]:not([disabled=disabled])'
)
# NOTE(review): find_element raises NoSuchElementException rather than returning a falsy value, so this check presumably never fires.
if not next_button_el:
PAGES = 0
next_button_el.click() # refresh new reviews
PAGES -= 1
# Any exception raised in the try block closes the browser; PAGES is not changed in this handler.
except Exception as e:
driver.quit()
Here, a Rotten Tomatoes review page is opened and the reviews are scraped, but when the next button is clicked and the new reviews are about to be scraped, this error pops up. I am guessing that the new reviews have not been loaded yet and that trying to access them is causing the problem. I tried driver.implicitly_wait, but that doesn't work either.
The error originates from line 33, data_reviews.append(critic_name_el.text)
Clicking the next-page button next_button_el starts loading the new page, but this takes some time, while your Selenium code continues immediately after the click. So the line reviews = wait.until(lambda _driver: _driver.find_elements(By.CSS_SELECTOR, '.review_table_row')) probably collects the elements that are still on the old page. The page is then refreshed, so some of the critic_name_el elements looked up after that (still based on the old page) no longer exist.
To make your code work, you need to introduce a short delay after clicking the next-page button, as follows:
import time

from selenium.common.exceptions import NoSuchElementException

# Collects the critic name of every review row, page by page.
# Relies on `driver`, `PAGES`, `TYPE`, `By` and `WebDriverWait` from the
# surrounding script.
data_reviews = []
wait = WebDriverWait(driver, 30)
while PAGES != 0:
    # Returns as soon as at least one review row is present.
    reviews = wait.until(lambda _driver: _driver.find_elements(
        By.CSS_SELECTOR, '.review_table_row'))
    # Extracting review data
    for review in reviews:
        if TYPE == 'top_critics':
            critic_name_el = review.find_element(
                By.CSS_SELECTOR, '[data-qa=review-critic-link]')
            critic_review_text_el = review.find_element(
                By.CSS_SELECTOR, '[data-qa=review-text]')
            data_reviews.append(critic_name_el.text)
    try:
        next_button_el = driver.find_element(
            By.CSS_SELECTOR, '[data-qa=next-btn]:not([disabled=disabled])'
        )
    except NoSuchElementException:
        # find_element raises (it never returns a falsy value) when there is
        # no enabled "next" button, i.e. the last page has been reached.
        break
    next_button_el.click()  # refresh new reviews
    PAGES -= 1
    # Give the new page time to replace the old rows; otherwise the next
    # iteration can pick up elements that are about to go stale.
    time.sleep(2)
Also, I'd suggest waiting for the elements' visibility, not just their presence, in this line:
# until() returns as soon as the lambda's result is truthy, i.e. find_elements found at least one row (presence only).
reviews = wait.until(lambda _driver: _driver.find_elements(By.CSS_SELECTOR, '.review_table_row'))
You also need to understand that driver.implicitly_wait does not introduce any actual pause. It just sets the timeout for the find_element and find_elements methods.

Python Selenium: unable to reach particular image in a flyer to scrape text

I am trying to scrape this website: https://www.longos.com/flyers.
I've been able to enter the postal code (ex.M5B 0B7 for people who may not be familiar with postal codes in this area) , click on the closest store and have the flyer pop up.
However, I am having troubles clicking into the specific flyer page, get the particular product and get the text information. The code below is my attempt.
# Probe the flyer images one by one using an absolute XPath with a 1-based index.
i = 1
# NOTE(review): indentation was lost in this paste; the lines below presumably form the loop body.
# No exit condition is visible here, so the loop ends only when the wait raises.
while True:
flyer_link = "/html/body/flipp-router/flipp-publication-page/div/flipp-sfml-component/sfml-storefront/div/sfml-linear-layout/sfml-flyer-image[{}]".format(i)
print(flyer_link)
# Waits up to 30 seconds for the element at that XPath to become visible.
flyer = WebDriverWait(driver, 30).until(EC.visibility_of_element_located((By.XPATH, flyer_link)))
print(flyer)
i+=1
It doesn't seem to be able to recognize the XPath.
Here is a snapshot of the html code for reference:
# j is the 1-based index of the flyer image used in the XPath below.
j = 1
# NOTE(review): indentation was lost in this paste; the nesting of the lines below is presumed, not shown.
for i in prdcts:
driver.execute_script("arguments[0].scrollIntoView();", i)
i.click()
time.sleep(3)
print(i)
# print(driver.page_source)
# NOTE(review): this rebinds `i`, the same name the enclosing for-loop uses for the current element.
i = 1
while True:
try:
button_link_to_text = '/html/body/flipp-router/flipp-publication-page/div/flipp-sfml-component/sfml-storefront/div/sfml-linear-layout/sfml-flyer-image[{}]/div/button[{}]'.format(j,i)
button = driver.find_element_by_xpath(button_link_to_text)
print(button.get_attribute("aria-label"))
i+=1
# Any exception (e.g. no button at that index) ends the inner loop.
except:
break
j+=1
Try this
from selenium.webdriver.common.action_chains import ActionChains
import time
from webdriver_manager.chrome import ChromeDriverManager
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Chrome(ChromeDriverManager().install())
driver.maximize_window()
driver.get('https://www.longos.com/flyers')
time.sleep(5)

# The postal-code form is looked up inside the "productframe" iframe, so
# switch into that frame first.
iframe = driver.find_element_by_xpath('//iframe[@class="flippiframe productframe"]')
driver.switch_to.frame(iframe)
postal_input = driver.find_element_by_xpath('//input[@id="postal-input"]')
postal_input.send_keys("M5B0B7")
# click() returns None, so its result is not kept.
driver.find_element_by_xpath('//button[@id="submit-postal-code"]').click()
time.sleep(2)
driver.find_element_by_xpath('//button [@aria-label="Select Elizabeth 111 Elizabeth Street Toronto ON, distance from store is <1 km"]').click()

# The flyer buttons are looked up inside a different iframe ("mainframe"):
# go back to the top-level document, then switch into it.
driver.switch_to.default_content()
time.sleep(5)
iframe = driver.find_element_by_xpath('//iframe[@class="flippiframe mainframe"]')
driver.switch_to.frame(iframe)
prdcts = driver.find_elements_by_xpath('//sfml-flyer-image//button')
print (prdcts)
for i in prdcts:
    # Bring each product button into view before reading it.
    driver.execute_script("arguments[0].scrollIntoView();", i)
    print(i.get_attribute("aria-label"))
    time.sleep(3)
    print(i)
And don't forget to switch back to the iframe with class="flippiframe productframe" to get the product details.
The main point is that you need to switch into the right iframe and scroll the element into view before you can interact with it.

Get element text with a partial string match using Selenium (Python)

I am trying to extract the text from within a <strong> tag that is deeply nested in the HTML content of this webpage: https://www.marinetraffic.com/en/ais/details/ships/imo:9854612
For example:
The strong tag is the only one on the webpage that will contain the string 'cubic meters'.
My objective is to extract the entire text, i.e., "138124 cubic meters Liquid Gas". When I try the following, I get an error:
url = "https://www.marinetraffic.com/en/ais/details/ships/imo:9854612"
driver.get(url)
# Fixed 3-second pause after navigation.
time.sleep(3)
# NOTE(review): an XPath expression is passed to a link-text locator here; the error quoted below reports "method":"link text".
element = driver.find_element_by_link_text("//strong[contains(text(),'cubic meters')]").text
print(element)
Error:
NoSuchElementException: Message: no such element: Unable to locate element: {"method":"link text","selector":"//strong[contains(text(),'cubic meters')]"}
What am I doing wrong here?
The following also throws an error:
# Same lookup via an XPath locator, matching any <strong> whose text contains 'cubic'.
element = driver.find_element_by_xpath("//strong[contains(text(),'cubic')]").text
Your code works on Firefox(), but not on Chrome().
The page uses lazy loading, so you have to scroll to Summary and then it loads the text with the expected strong.
I used a slightly slower method: I find all elements with class='lazyload-wrapper', and in a loop I scroll to each item and check whether the strong is there.
If there isn't any strong, I scroll to the next class='lazyload-wrapper' element.
import time

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.action_chains import ActionChains

#driver = webdriver.Firefox()
driver = webdriver.Chrome()
url = "https://www.marinetraffic.com/en/ais/details/ships/imo:9854612"
driver.get(url)
time.sleep(3)

actions = ActionChains(driver)
elements = driver.find_elements_by_xpath("//span[@class='lazyload-wrapper']")
for number, item in enumerate(elements):
    print('--- item', number, '---')
    # Moving to the wrapper scrolls it into view, which makes the page load
    # its lazy content.
    actions.move_to_element(item).perform()
    time.sleep(0.1)
    try:
        # NOTE: the leading "//" makes this XPath search the whole document,
        # not only the descendants of `item`.
        strong = item.find_element_by_xpath("//strong[contains(text(), 'cubic')]")
    except NoSuchElementException:
        # Not loaded yet: try again after scrolling to the next wrapper.
        continue
    print(strong.text)
    break
Result:
--- item 0 ---
--- item 1 ---
--- item 2 ---
173400 cubic meters Liquid Gas
The result shows that I could use elements[2] to skip two elements, but I wasn't sure if this text will be always in the third element.
Before I created my version I tested other versions and here is the full working code:
from selenium import webdriver
import time
# Shared setup for the experiments below: open the ship page once in Chrome
# (the Firefox alternative is kept commented out) and pause for 3 seconds.
#driver = webdriver.Firefox()
driver = webdriver.Chrome()
url = "https://www.marinetraffic.com/en/ais/details/ships/imo:9854612"
driver.get(url)
time.sleep(3)
def test0():
    """Print the text of every <strong>, then of the one containing 'cubic'.

    Uses the module-level ``driver``. No scrolling is done first, so the
    final lookup raises if the matching element is not in the page yet.
    """
    elements = driver.find_elements_by_xpath("//strong")
    for item in elements:
        print(item.text)
    print('---')
    item = driver.find_element_by_xpath("//strong[contains(text(), 'cubic')]")
    print(item.text)
def test1a():
    """Move to the last 'MuiTypography-body1' block's div and print its text.

    Uses the module-level ``driver``.
    """
    from selenium.webdriver.common.action_chains import ActionChains
    actions = ActionChains(driver)
    element = driver.find_element_by_xpath("//div[contains(@class,'MuiTypography-body1')][last()]//div")
    # Python's ActionChains has no build() method (that is the Java API);
    # perform() alone runs the queued actions.
    actions.move_to_element(element).perform()
    text = element.text
    print(text)
def test1b():
    """Scroll to the bottom of the page, then print the target <strong> text.

    Uses the module-level ``driver`` and ``time``.
    """
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    # Short pause after scrolling, before the lookup.
    time.sleep(0.5)
    text = driver.find_element_by_xpath("//div[contains(@class,'MuiTypography-body1')][last()]//strong").text
    print(text)
def test2():
    """Search the current page source for '<digits> cubic meters' strings.

    Parses ``driver.page_source`` with Beautiful Soup and prints the list of
    matching strings (the original computed the list but discarded it).
    """
    from bs4 import BeautifulSoup
    import re
    soup = BeautifulSoup(driver.page_source, "html.parser")
    print(soup.find_all(string=re.compile(r"\d+ cubic meters")))
def test3():
    """Scroll through the lazy-load wrappers until the 'cubic' text shows up.

    Moves to each ``span.lazyload-wrapper`` in turn, and after each move
    looks for a <strong> containing 'cubic'. Prints it and stops at the
    first hit. Uses the module-level ``driver`` and ``time``.
    """
    from selenium.common.exceptions import NoSuchElementException
    from selenium.webdriver.common.action_chains import ActionChains
    actions = ActionChains(driver)
    elements = driver.find_elements_by_xpath("//span[@class='lazyload-wrapper']")
    for number, item in enumerate(elements, 1):
        print('--- number', number, '---')
        actions.move_to_element(item).perform()
        time.sleep(0.1)
        try:
            # NOTE: the leading "//" makes this XPath search the whole
            # document, not only the descendants of `item`.
            strong = item.find_element_by_xpath("//strong[contains(text(), 'cubic')]")
        except NoSuchElementException:
            # Not loaded yet: continue with the next wrapper.
            continue
        print(strong.text)
        break
# Experiment selector: only test3() is run; the other calls are commented out.
#test0()
#test1a()
#test1b()
#test2()
test3()
You can use Beautiful Soup for this, and more precisely the string argument; from the documentation, "you can search for strings instead of tags".
As an argument, you can also pass a regex pattern.
>>> from bs4 import BeautifulSoup
>>> import re
>>> soup = BeautifulSoup(driver.page_source, "html.parser")
>>> soup.find_all(string=re.compile(r"\d+ cubic meters"))
['173400 cubic meters Liquid Gas']
If you're sure there is only one result, or you need just the first, you can also use find instead of find_all.
Your XPath expression is correct and works in Chrome. You get NoSuchElementException, because the element is not loaded within the 3 seconds you wait and does not exist.
To wait for the element, use the WebDriverWait class. It waits explicitly for a specific condition on the element, and in your case presence is enough.
In the code below, Selenium will wait up to 10 seconds for the element to be present in the HTML, polling every 500 milliseconds. You can read about WebDriverWait and conditions here.
Some useful information:
Not visible elements return an empty string. In such a case you need to wait for the visibility of the element, or if the element requires a scroll to scroll to it (example added).
You can also get the text from a not-visible element using JavaScript.
from selenium.webdriver.common.by import By
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec
from selenium import webdriver
url = "https://www.marinetraffic.com/en/ais/details/ships/imo:9854612"
locator = "//strong[contains(text(),'cubic meters')]"

# The context manager quits the browser when the block exits.
with webdriver.Chrome() as driver:
    wait = WebDriverWait(driver, 10)
    driver.get(url)
    # Blocks until the element exists in the DOM (up to 10 seconds).
    cubic = wait.until(ec.presence_of_element_located((By.XPATH, locator)))
    print(cubic.text)

    # The below examples are just for information
    # and are not needed for the case

    # Example with scroll. Scroll to the element to make it visible
    driver.execute_script("arguments[0].scrollIntoView();", cubic)
    print(cubic.text)

    # Example using JavaScript. Works for not visible elements.
    text = driver.execute_script("return arguments[0].textContent", cubic)
    print(text)
It would be correct to use the marinetraffic API.
I guess you should first scroll to that element, and only after that try to access it, including getting its text.
from selenium.webdriver.common.action_chains import ActionChains

actions = ActionChains(driver)
element = driver.find_element_by_xpath("//div[contains(@class,'MuiTypography-body1')][last()]//div")
# Python's ActionChains has no build() method (that is the Java API);
# perform() alone runs the queued actions.
actions.move_to_element(element).perform()
text = element.text
In case the above still not good enough you can scroll page height one time like this:
# Scroll to the bottom of the page, pause briefly, then read the text.
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(0.5)
the_text = driver.find_element_by_xpath("//div[contains(@class,'MuiTypography-body1')][last()]//strong").text

Next Page Iteration in Selenium/BeautfulSoup for Scraping E-Commerce Website

I'm scraping an e-commerce website, Lazada, using Selenium and bs4. I managed to scrape the first page, but I am unable to iterate to the next page. What I'm trying to achieve is to scrape all the pages for the categories I've selected.
Here what I've tried :
# Run the argument with incognito
option = webdriver.ChromeOptions()
# NOTE(review): the argument string starts with a space and an em dash; presumably '--incognito' was intended.
option.add_argument(' — incognito')
driver = webdriver.Chrome(executable_path='chromedriver', chrome_options=option)
driver.get('https://www.lazada.com.my/')
driver.maximize_window()
# Select category item #
element = driver.find_elements_by_class_name('card-categories-li-content')[0]
webdriver.ActionChains(driver).move_to_element(element).click(element).perform()
# Timeout, in seconds, for the visibility wait below.
t = 10
# NOTE(review): indentation was lost in this paste; the nesting of the lines below is presumed, not shown.
try:
WebDriverWait(driver,t).until(EC.visibility_of_element_located((By.ID,"a2o4k.searchlistcategory.0.i0.460b6883jV3Y0q")))
# On timeout: refresh the page and click the first category again.
except TimeoutException:
print('Page Refresh!')
driver.refresh()
element = driver.find_elements_by_class_name('card-categories-li-content')[0]
webdriver.ActionChains(driver).move_to_element(element).click(element).perform()
print('Page Load!')
#Soup and select element
def getData(np):
# Parse the current page source and walk the product containers.
soup = bs(driver.page_source, "lxml")
product_containers = soup.findAll("div", class_='c2prKC')
for p in product_containers:
title = (p.find(class_='c16H9d').text)#title
selling_price = (p.find(class_='c13VH6').text)#selling price
# "-1" is used as the placeholder whenever a field cannot be read.
try:
original_price=(p.find("del", class_='c13VH6').text)#original price
except:
original_price = "-1"
if p.find("i", class_='ic-dynamic-badge ic-dynamic-badge-freeShipping ic-dynamic-group-2'):
freeShipping = 1
else:
freeShipping = 0
try:
discount = (p.find("span", class_='c1hkC1').text)
except:
discount ="-1"
# NOTE(review): a single tuple is passed to find() here, rather than a tag name plus an attrs dict.
if p.find(("div", {'class':['c16H9d']})):
url = "https:"+(p.find("a").get("href"))
else:
url = "-1"
nextpage_elements = driver.find_elements_by_class_name('ant-pagination-next')[0]
# NOTE(review): perform() returns None, so `np` is None after this line.
np=webdriver.ActionChains(driver).move_to_element(nextpage_elements).click(nextpage_elements).perform()
print("- -"*30)
toSave = [title,selling_price,original_price,freeShipping,discount,url]
print(toSave)
writerows(toSave,filename)
# NOTE(review): no module-level `np` is defined in the visible code before this call.
getData(np)
The problem might be that the driver is trying to click the button before the element is even loaded correctly.
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Chrome(PATH, chrome_options=option)
# use this code after driver initialization
# implicitly_wait does not pause; it sets how long find_element and
# find_elements keep retrying (here up to 5 seconds) before giving up.
driver.implicitly_wait(5)
url = "https://www.lazada.com.ph/catalog/?q=phone&_keyori=ss&from=input&spm=a2o4l.home.search.go.239e359dTYxZXo"
driver.get(url)
# The class values are matched exactly, including the spaces inside the quotes.
next_page_path = "//ul[@class='ant-pagination ']//li[@class=' ant-pagination-next']"
# the following code will wait up to 5 seconds for the
# element to become clickable
# and then try clicking the element.
try:
    next_page = WebDriverWait(driver, 5).until(
        EC.element_to_be_clickable((By.XPATH, next_page_path)))
    next_page.click()
except Exception as e:
    # Best effort: report the failure (e.g. a timeout) and carry on.
    print(e)
EDIT 1
Changed the code to make the driver wait for the element to become clickable. You can add this code inside a while loop for iterating multiple times and break the loop if the button is not found and is not clickable.

How to avoid StaleElementReferenceError when getting elements from different page?

I want to get all the results from a race. The website shows 50 rows/page.
I navigate to the next page (same URL with suffix #page-x) using selenium, but I get a StaleElementReferenceException error whenever I try to find elements (cells of the table = td) on the next page.
I tried to close the driver between the steps to get just one list of elements at a time. I've also tried to load the pages separately with the URL+suffix, but it doesn't load correctly. I've tried building separate lists (at first I wanted one big list with all the results).
from selenium import webdriver
url = "https://tickets.justrun.ca/quidchrono.php?a=qcResult&raceid=8444"
#The block under works well and I get a list of cells as intended.
driver = webdriver.Chrome()
driver.maximize_window()
driver.get(url)
# Collect the text of every <td> on the first page.
elements = driver.find_elements_by_tag_name("td")
course = []
# NOTE(review): indentation was lost in this paste; the append below is presumably the loop body.
for i in range(len(elements)):
course.append(elements[i].text)
# Go to page 2 by clicking the pagination link whose text is "2".
to_2 = driver.find_element_by_link_text("2")
to_2.click()
print(driver.current_url)
#I'm trying similar code for the next chunk, but it doesn't work.
# NOTE(review): the cells are looked up right after the click, with no wait in between.
elements2 = driver.find_elements_by_tag_name("td")
print(len(elements2))
print(elements2[5].text)
course2 = []
for i in range(len(elements2)):
course2.append(elements2[i].text)
driver.close()
I would expect a new list (course2), with the results of the second page, but I get a stale element error. When I print the current URL, the result is as expected. When I print the len(elements2), it's also OK. Looks like the problem is when I try to get the text of an element.
Solution-1:
Using BeautifulSoup and Selenium. WebDriverWait waits for a certain condition to occur before proceeding further in the code. See the BeautifulSoup documentation for more details.
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

url = "https://tickets.justrun.ca/quidchrono.php?a=qcResult&raceid=8444"
driver = webdriver.Chrome()
driver.get(url)

# One entry per page; each entry is a list of rows, each row a list of cell texts.
data = []
while True:
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, "tableJustrun")))
    # Parse a snapshot of the page source, so no WebElement is kept across
    # page changes.
    page_soup = BeautifulSoup(driver.page_source, 'lxml')
    # get table data
    tbody = page_soup.find("tbody", {"id": "searchResultBoxParticipants"})
    course = []
    for row in tbody.find_all("tr"):
        course.append([td.text for td in row.find_all("td")])
    data.append(course)
    try:
        pagination = driver.find_element_by_class_name("simple-pagination")
        next_page = pagination.find_element_by_link_text("Suivant")
        # iterate next page
        next_page.click()
    except WebDriverException:
        # Any Selenium failure to find or click the "Suivant" link ends the loop.
        break

print(data)
Solution-2:
Using pandas library.
from io import StringIO

from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd

url = "https://tickets.justrun.ca/quidchrono.php?a=qcResult&raceid=8444"
driver = webdriver.Chrome()
driver.get(url)

# One DataFrame per page.
data = []
while True:
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, "tableJustrun")))
    # Wrap the HTML in StringIO: passing literal HTML text to read_html is
    # deprecated in recent pandas versions.
    tables = pd.read_html(StringIO(driver.page_source))
    #append Participants table data
    data.append(tables[0])
    try:
        pagination = driver.find_element_by_class_name("simple-pagination")
        next_page = pagination.find_element_by_link_text("Suivant")
        # iterate next page
        next_page.click()
    except WebDriverException:
        # Any Selenium failure to find or click the "Suivant" link ends the loop.
        break

#Concat dataframe object
result = pd.concat(data)
print(result)

Categories