selenium: stale element reference: element is not attached to the page document - python

from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import chromedriver_autoinstaller
chromedriver_autoinstaller.install()
TYPES = ['user', 'verified_audience', 'top_critics']
TYPE = TYPES[2]
URL = 'https://www.rottentomatoes.com/m/dunkirk_2017/reviews'
PAGES = 2
driver = Chrome()
driver.get(URL)
data_reviews = []
while PAGES != 0:
    wait = WebDriverWait(driver, 30)
    reviews = wait.until(lambda _driver: _driver.find_elements(
        By.CSS_SELECTOR, '.review_table_row'))
    # Extracting review data
    for review in reviews:
        if TYPE == 'top_critics':
            critic_name_el = review.find_element(
                By.CSS_SELECTOR, '[data-qa=review-critic-link]')
            critic_review_text_el = review.find_element(
                By.CSS_SELECTOR, '[data-qa=review-text]')
            data_reviews.append(critic_name_el.text)
    try:
        next_button_el = driver.find_element(
            By.CSS_SELECTOR, '[data-qa=next-btn]:not([disabled=disabled])'
        )
        if not next_button_el:
            PAGES = 0
        next_button_el.click()  # refresh new reviews
        PAGES -= 1
    except Exception as e:
        driver.quit()
Here, a Rotten Tomatoes review page is being opened and the reviews are being scraped, but when the Next button is clicked and the new reviews are about to be scraped, this error pops up. I am guessing that the new reviews have not been loaded and trying to access them is causing the problem. I tried driver.implicitly_wait, but that doesn't work either.
The error originates from line 33, data_reviews.append(critic_name_el.text)

By clicking the next page button next_button_el, the new page is loaded, but that takes some time, while your Selenium code continues immediately after the click. So the line reviews = wait.until(lambda _driver: _driver.find_elements(By.CSS_SELECTOR, '.review_table_row')) probably collects the elements on the old page; the page is then refreshed, so some of the elements collected after that (critic_name_el, still from the old page) are no longer there once the old page has been replaced.
To make your code work, you need to introduce a short delay after clicking the next page button, as follows:
import time  # needed for time.sleep below

data_reviews = []
while PAGES != 0:
    wait = WebDriverWait(driver, 30)
    reviews = wait.until(lambda _driver: _driver.find_elements(
        By.CSS_SELECTOR, '.review_table_row'))
    # Extracting review data
    for review in reviews:
        if TYPE == 'top_critics':
            critic_name_el = review.find_element(
                By.CSS_SELECTOR, '[data-qa=review-critic-link]')
            critic_review_text_el = review.find_element(
                By.CSS_SELECTOR, '[data-qa=review-text]')
            data_reviews.append(critic_name_el.text)
    try:
        next_button_el = driver.find_element(
            By.CSS_SELECTOR, '[data-qa=next-btn]:not([disabled=disabled])'
        )
        if not next_button_el:
            PAGES = 0
        next_button_el.click()  # refresh new reviews
        PAGES -= 1
        time.sleep(2)  # give the next page time to load before re-querying the reviews
    except Exception as e:
        driver.quit()
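If you prefer an explicit condition over a fixed delay, one option (a sketch, not part of the original answer) is to wait for one of the previously collected review elements to go stale before re-querying; EC is already imported in the question's snippet:
next_button_el.click()
PAGES -= 1
# instead of time.sleep(2): wait until a review element from the old page is detached
wait.until(EC.staleness_of(reviews[0]))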
I'd also suggest waiting for element visibility, not just presence, here:
reviews = wait.until(lambda _driver: _driver.find_elements(By.CSS_SELECTOR, '.review_table_row'))
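A minimal sketch of that visibility-based wait, using the expected_conditions module already imported as EC in the question's code:
reviews = wait.until(EC.visibility_of_all_elements_located(
    (By.CSS_SELECTOR, '.review_table_row')))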
Also, you need to understand that driver.implicitly_wait does not introduce any actual pause; it just sets the timeout for the find_element and find_elements methods.
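For example (a sketch; the 10 seconds is an arbitrary value):
driver.implicitly_wait(10)  # find_element/find_elements now poll up to 10 seconds
# returns as soon as at least one matching row is present; it does not pause for the full 10 seconds
rows = driver.find_elements(By.CSS_SELECTOR, '.review_table_row')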

Related

How to paginate using Python Selenium in Trip Advisor to extract reviews

I am trying to extract the reviews, with their respective titles, of a particular hotel on Trip Advisor, using web scraping techniques with Python and Selenium, but I have only been able to extract the reviews of a single page and I need to extract all or most of the reviews; the pagination is not working. What I do is click on the Next button, iterate over a range of pages, and extract the information.
The web scraping is from the home page https://www.tripadvisor.com/Hotel_Review-g562644-d1490165-Reviews-Parador_de_Alcala_de_Henares-Alcala_De_Henares.html
Notice that when the page changes, a fragment like -or20- is added to the URL (-or30- for the fourth page, -or40- for the fifth, and so on); each page always shows 10 reviews.
for example this is the third page: https://www.tripadvisor.com/Hotel_Review-g562644-d1490165-Reviews-or20-Parador_de_Alcala_de_Henares-Alcala_De_Henares.html
Basically this is what I do: read a CSV, open the page, (optionally) switch to all languages, expand the reviews, read the reviews, click on the Next button, write to the CSV, and iterate over a range of pages.
Any help, thanks in advance !!
Images: Trip Advisor HTML reviews; Trip Advisor HTML Next button
This is my code so far:
# Web scraping
# Imports needed by this snippet
from time import sleep
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from webdriver_manager.chrome import ChromeDriverManager

# Writing to csv
with open('reviews_hotel_9.csv', 'w', encoding="utf-8") as file:
    file.write('titles, reviews \n')

driver = webdriver.Chrome(ChromeDriverManager().install())
driver.get("https://www.tripadvisor.com/Hotel_Review-g562644-d1490165-Reviews-Parador_de_Alcala_de_Henares-Alcala_De_Henares.html")
sleep(3)

cookie = driver.find_element_by_xpath('//*[@id="onetrust-accept-btn-handler"]')  # cookies accept
try:
    cookie.click()
except:
    pass
print('ok')

for k in range(10):  # range pagination
    #container = driver.find_elements_by_xpath("//div[@data-reviewid]")
    try:
        # radio button all languages (optional)
        #driver.execute_script("arguments[0].click();", WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, '//*[@id="component_14"]/div/div[3]/div[1]/div[1]/div[4]/ul/li[1]/label/span[1]'))))
        # read more expand reviews
        driver.execute_script("arguments[0].click();", WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, '//span[@class="Ignyf _S Z"]'))))
        # review titles
        titles = driver.find_elements_by_xpath('//div[@class="KgQgP MC _S b S6 H5 _a"]/a/span')
        sleep(1)
        # reviews
        reviews = driver.find_elements_by_xpath('//q[@class="QewHA H4 _a"]/span')
        sleep(1)
    except TimeoutException:
        pass
    with open('reviews_hotel_9.csv', 'a', encoding="utf-8") as file:
        for i in range(len(titles)):
            file.write(titles[i].text + ";" + reviews[i].text + "\n")
    try:
        #driver.execute_script("arguments[0].click();", WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, '//a[@class="ui_button nav next primary "]'))))
        # click on Next button
        driver.execute_script("arguments[0].click();", WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.CLASS_NAME, 'ui_button nav next primary '))))
    except TimeoutException:
        pass

file.close()
driver.quit()
You can scrape data from each review page by clicking on the Next button. In the code below we run an infinite loop and keep clicking the Next button; we stop when we encounter an ElementClickInterceptedException. At the end we save the data to the file MyData.csv.
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.common.exceptions import ElementClickInterceptedException
import pandas as pd
import time

chrome_path = r"C:\Users\hpoddar\Desktop\Tools\chromedriver_win32\chromedriver.exe"
s = Service(chrome_path)
url = 'https://www.tripadvisor.com/Hotel_Review-g562644-d1490165-Reviews-Parador_de_Alcala_de_Henares-Alcala_De_Henares.html'

driver = webdriver.Chrome(service=s)
driver.get(url)

df = pd.DataFrame(columns=['title', 'review'])

while True:
    time.sleep(2)
    reviews = driver.find_elements(by=By.CSS_SELECTOR, value='.WAllg._T')
    for review in reviews:
        title = review.find_element(by=By.CSS_SELECTOR, value='.KgQgP.MC._S.b.S6.H5._a').text
        review = review.find_element(by=By.CSS_SELECTOR, value='.fIrGe._T').text
        df.loc[len(df)] = [title, review]
    try:
        driver.find_element(by=By.CSS_SELECTOR, value='.ui_button.nav.next.primary').click()
    except ElementClickInterceptedException:
        break

df.to_csv("MyData.csv")
This gives us the output saved in MyData.csv.

Next Page Iteration in Selenium/BeautifulSoup for Scraping E-Commerce Website

I'm scraping an e-commerce website, Lazada, using Selenium and bs4. I manage to scrape the 1st page, but I am unable to iterate to the next page. What I'm trying to achieve is to scrape all the pages of the categories I've selected.
Here is what I've tried:
# Imports needed by this snippet
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

# Run the browser in incognito mode
option = webdriver.ChromeOptions()
option.add_argument('--incognito')
driver = webdriver.Chrome(executable_path='chromedriver', chrome_options=option)
driver.get('https://www.lazada.com.my/')
driver.maximize_window()

# Select category item #
element = driver.find_elements_by_class_name('card-categories-li-content')[0]
webdriver.ActionChains(driver).move_to_element(element).click(element).perform()

t = 10
try:
    WebDriverWait(driver, t).until(EC.visibility_of_element_located((By.ID, "a2o4k.searchlistcategory.0.i0.460b6883jV3Y0q")))
except TimeoutException:
    print('Page Refresh!')
    driver.refresh()
    element = driver.find_elements_by_class_name('card-categories-li-content')[0]
    webdriver.ActionChains(driver).move_to_element(element).click(element).perform()
    print('Page Load!')

# Soup and select element
def getData(np):
    soup = bs(driver.page_source, "lxml")
    product_containers = soup.findAll("div", class_='c2prKC')
    for p in product_containers:
        title = (p.find(class_='c16H9d').text)  # title
        selling_price = (p.find(class_='c13VH6').text)  # selling price
        try:
            original_price = (p.find("del", class_='c13VH6').text)  # original price
        except:
            original_price = "-1"
        if p.find("i", class_='ic-dynamic-badge ic-dynamic-badge-freeShipping ic-dynamic-group-2'):
            freeShipping = 1
        else:
            freeShipping = 0
        try:
            discount = (p.find("span", class_='c1hkC1').text)
        except:
            discount = "-1"
        if p.find(("div", {'class': ['c16H9d']})):
            url = "https:" + (p.find("a").get("href"))
        else:
            url = "-1"
        nextpage_elements = driver.find_elements_by_class_name('ant-pagination-next')[0]
        np = webdriver.ActionChains(driver).move_to_element(nextpage_elements).click(nextpage_elements).perform()
        print("- -" * 30)
        toSave = [title, selling_price, original_price, freeShipping, discount, url]
        print(toSave)
        writerows(toSave, filename)  # writerows and filename come from elsewhere in my script

getData(np)
The problem might be that the driver is trying to click the button before the element is even loaded correctly.
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome(PATH, chrome_options=option)  # PATH and option as defined in your own code

# use this code after driver initialization:
# it makes the driver wait up to 5 seconds whenever it looks up an element.
driver.implicitly_wait(5)

url = "https://www.lazada.com.ph/catalog/?q=phone&_keyori=ss&from=input&spm=a2o4l.home.search.go.239e359dTYxZXo"
driver.get(url)

next_page_path = "//ul[@class='ant-pagination ']//li[@class=' ant-pagination-next']"

# the following code will wait 5 seconds for
# the element to become clickable
# and then try clicking the element.
try:
    next_page = WebDriverWait(driver, 5).until(
        EC.element_to_be_clickable((By.XPATH, next_page_path)))
    next_page.click()
except Exception as e:
    print(e)
EDIT 1
Changed the code to make the driver wait for the element to become clickable. You can put this code inside a while loop to iterate multiple times, and break the loop if the button is not found or not clickable.
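A minimal sketch of that loop, reusing next_page_path from the snippet above (the per-page scraping call is just a placeholder):
while True:
    # scrape the current page here, e.g. call getData()
    try:
        next_page = WebDriverWait(driver, 5).until(
            EC.element_to_be_clickable((By.XPATH, next_page_path)))
        next_page.click()
    except Exception:
        # Next button not found or not clickable: assume this was the last page
        break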

Can't get all the links connected to a hashtag from a lazy loading webpage

I've created a script in Python together with Selenium to scroll to the bottom of a lazy-loading webpage and parse the content from there. I'm trying to get all the links connected to a hashtag from Instagram. There are around 475 results out there, but my current attempt fetches only 38.
The script I've created can scroll to the bottom of that page, but I still get 38 results out of some 475.
Link to that webpage
I've tried so far with:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
tag = '#baltimorepizza'
hash_url = 'https://www.instagram.com/explore/tags/{}/'
def scroll_to_get_more():
check_height = driver.execute_script("return document.body.scrollHeight;")
while True:
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
try:
wait.until(lambda driver: driver.execute_script("return document.body.scrollHeight;") > check_height)
check_height = driver.execute_script("return document.body.scrollHeight;")
except TimeoutException:
break
def get_links(tag):
driver.get(hash_url.format(tag.strip("#").lower()))
scroll_to_get_more()
total_links = [item.get_attribute("href") for item in wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR,'.kIKUG > a')))]
print("Total link scraped:",len(total_links))
if __name__ == '__main__':
driver = webdriver.Chrome()
wait = WebDriverWait(driver,10)
get_links(tag)
driver.quit()
How can I get all the links connected to that specific hashtag from instagram?
Same as @KunduK, I can only gather 437, so I am wondering if this is the correct number; maybe you need to log in to see the remaining ones?
You are only getting ~38 because the page does not render all of the content in the DOM at once. So even though you scrolled, the data you queried is not all accessible; it only becomes accessible when you scroll back to it (images in view).
The solution here gets the data while scrolling.
We will scroll to the bottom first and make sure every request to load the images has been made, using your method scroll_to_get_more.
Then we will start scraping from top to bottom, so we need to scroll all the way back to the top using:
def scroll_to_header():
    el = driver.find_element_by_tag_name("header")
    driver.execute_script("arguments[0].scrollIntoView();", el)
Your get_links method will now look like this:
# Note: this also needs "from selenium.common.exceptions import StaleElementReferenceException"
def get_links(tag):
    driver.get(hash_url.format(tag.strip("#").lower()))
    scroll_to_get_more()
    scroll_to_header()
    total_links = []
    current_len = 0
    new_len = -1
    while current_len != new_len:
        current_len = len(total_links)
        try:
            links = []
            elements = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '.Nnq7C.weEfm [href]')))
            for el in elements:
                if el.get_attribute('href') not in total_links:
                    links.append(el.get_attribute('href'))
            total_links.extend(links)
        except StaleElementReferenceException:
            continue
        if len(elements):
            driver.execute_script("arguments[0].scrollIntoView();", el)
        new_len = len(total_links)
    print("Total link scraped:", len(total_links))
Basically, after every query we are scrolling to the last element, which will load in the DOM the next images.
Also, I was thinking that your scroll method was the reason I was getting 437 (scrolling and missing elements), so I have implemented a new method that uses the spinner as the element to scroll to, instead of the height of the page. Both are valid, but I think this one is faster (see results below):
def scroll_to_get_more():
    # Note: needs StaleElementReferenceException and NoSuchElementException from selenium.common.exceptions
    while True:
        try:
            spinner = driver.find_element_by_css_selector('.By4nA')
            driver.execute_script("arguments[0].scrollIntoView();", spinner)
        except StaleElementReferenceException:
            continue
        except NoSuchElementException:
            break
Output with scrolling method above:
Total link scraped: 437
Query took: 23.520002755
Output with your scrolling method:
Total link scraped: 437
Query took: 42.685470925
The main reason for the time difference is that, with the height-based method, you always wait out the full 10-second timeout once the page no longer needs to scroll.

Selenium cannot get all elements of a page

I am using Selenium to run a search on Agoda and scrape all the hotel names on the page, but the output only returns 2 names.
Then I tried adding a line to scroll to the bottom; now the output gives me the first 2 names and the last 2 names (the first two from the beginning, the last two from the bottom).
I don't understand what the problem is. I added time.sleep() for each step, so the whole page should have been loaded completely. Is Selenium limited by the page view, so that it can only scrape the elements in sight?
my code below:
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(30)

def scrape():
    r = requests.get(current_page)
    if r.status_code == requests.codes.ok:
        print('start scraping!')
        hotel = driver.find_elements_by_class_name('hotel-name')
        hotels = []
        for h in hotel:
            if hotel:
                hotels.append(h.text)
        print(hotels, file=open("output.txt", 'a', encoding="utf-8"))

scrape()
Here is the page I want to scrape.
Try using the script below to scroll the page down until no more results appear, and then scrape all the available names:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait as wait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Chrome()
driver.maximize_window()
driver.get('https://www.agoda.com/pages/agoda/default/DestinationSearchResult.aspx?asq=8wUBc629jr0%2B3O%2BxycijdcaVIGtokeWrEO7ShJumN8xsNvkFkEV9bUgNnbx6%2Bx22ncbzTLOPBjT84OgAAKXmu6quf8aEKRA%2FQH%2BGoyXgowLt%2BXyB8OpN1h2WP%2BnBM%2FwNPzD%2BpaeII93w%2Bs4dMWI4QPJNbZJ8DWvRiPsrPVVBJY7ilpMPlUermwV1UKIKfuyeis3BqRkJh9FzJOs0E98zXQ%3D%3D&city=9590&cid=-142&tick=636818018163&languageId=20&userId=3c2c4cb9-ba6d-4519-8ef4-c85dfd280b8f&sessionId=d4qzq2tgymjrwsf22lnadxpc&pageTypeId=1&origin=HK&locale=zh-TW&aid=130589&currencyCode=HKD&htmlLanguage=zh-tw&cultureInfoName=zh-TW&ckuid=3c2c4cb9-ba6d-4519-8ef4-c85dfd280b8f&prid=0&checkIn=2019-01-16&checkOut=2019-01-17&rooms=1&adults=2&children=0&priceCur=HKD&los=1&textToSearch=%E5%A4%A7%E9%98%AA&productType=-1&travellerType=1')
# Get initial list of names
hotels = wait(driver, 15).until(EC.presence_of_all_elements_located((By.CLASS_NAME, 'hotel-name')))
while True:
    # Scroll down to last name in list
    driver.execute_script('arguments[0].scrollIntoView();', hotels[-1])
    try:
        # Wait for more names to be loaded
        wait(driver, 15).until(lambda driver: len(wait(driver, 15).until(EC.presence_of_all_elements_located((By.CLASS_NAME, 'hotel-name')))) > len(hotels))
        # Update names list
        hotels = wait(driver, 15).until(EC.presence_of_all_elements_located((By.CLASS_NAME, 'hotel-name')))
    except:
        # Break the loop in case no new names loaded after page scrolled down
        break

# Print names list
print([hotel.text for hotel in hotels])

Python - Selenium next page

I am trying to make a scraping application to scrape Hants.gov.uk, and right now I am working on just clicking through the pages instead of scraping. When it gets to the last row on page 1 it just stops, so what I did was make it click the "Next Page" button, but first it has to go back to the original URL. It clicks through to page 2, but after page 2 is scraped it doesn't go to page 3; it just restarts page 2.
Can somebody help me fix this issue?
Code:
import time
import config # Don't worry about this. This is an external file to make a DB
import urllib.request
from bs4 import BeautifulSoup
from selenium import webdriver
url = "https://planning.hants.gov.uk/SearchResults.aspx?RecentDecisions=True"
driver = webdriver.Chrome(executable_path=r"C:\Users\Goten\Desktop\chromedriver.exe")
driver.get(url)
driver.find_element_by_id("mainContentPlaceHolder_btnAccept").click()
def start():
elements = driver.find_elements_by_css_selector(".searchResult a")
links = [link.get_attribute("href") for link in elements]
result = []
for link in links:
if link not in result:
result.append(link)
else:
driver.get(link)
goUrl = urllib.request.urlopen(link)
soup = BeautifulSoup(goUrl.read(), "html.parser")
#table = soup.find_element_by_id("table", {"class": "applicationDetails"})
for i in range(20):
pass # Don't worry about all this commented code, it isn't relevant right now
#table = soup.find_element_by_id("table", {"class": "applicationDetails"})
#print(table.text)
# div = soup.select("div.applicationDetails")
# getDiv = div[i].split(":")[1].get_text()
# log = open("log.txt", "a")
# log.write(getDiv + "\n")
#log.write("\n")
start()
driver.get(url)
for i in range(5):
driver.find_element_by_id("ctl00_mainContentPlaceHolder_lvResults_bottomPager_ctl02_NextButton").click()
url = driver.current_url
start()
driver.get(url)
driver.close()
try this:
import time
# import config  # Don't worry about this. This is an external file to make a DB
import urllib.request
from bs4 import BeautifulSoup
from selenium import webdriver

url = "https://planning.hants.gov.uk/SearchResults.aspx?RecentDecisions=True"

driver = webdriver.Chrome()
driver.get(url)
driver.find_element_by_id("mainContentPlaceHolder_btnAccept").click()

result = []

def start():
    elements = driver.find_elements_by_css_selector(".searchResult a")
    links = [link.get_attribute("href") for link in elements]
    result.extend(links)

def start2():
    for link in result:
        # if link not in result:
        #     result.append(link)
        # else:
        driver.get(link)
        goUrl = urllib.request.urlopen(link)
        soup = BeautifulSoup(goUrl.read(), "html.parser")
        #table = soup.find_element_by_id("table", {"class": "applicationDetails"})
        for i in range(20):
            pass  # Don't worry about all this commented code, it isn't relevant right now
            #table = soup.find_element_by_id("table", {"class": "applicationDetails"})
            #print(table.text)
            # div = soup.select("div.applicationDetails")
            # getDiv = div[i].split(":")[1].get_text()
            # log = open("log.txt", "a")
            # log.write(getDiv + "\n")
        #log.write("\n")

while True:
    start()
    element = driver.find_element_by_class_name('rdpPageNext')
    try:
        check = element.get_attribute('onclick')
        if check != "return false;":
            element.click()
        else:
            break
    except:
        break

print(result)
start2()
driver.get(url)
As per the URL https://planning.hants.gov.uk/SearchResults.aspx?RecentDecisions=True, to click through all the pages you can use the following solution:
Code Block:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
options = Options()
options.add_argument("start-maximized")
options.add_argument("disable-infobars")
options.add_argument("--disable-extensions")
driver = webdriver.Chrome(chrome_options=options, executable_path=r'C:\Utility\BrowserDrivers\chromedriver.exe')
driver.get('https://planning.hants.gov.uk/SearchResults.aspx?RecentDecisions=True')
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.ID, "mainContentPlaceHolder_btnAccept"))).click()
numLinks = len(WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "div#ctl00_mainContentPlaceHolder_lvResults_topPager div.rdpWrap.rdpNumPart>a"))))
print(numLinks)
for i in range(numLinks):
    print("Perform your scrapping here on page {}".format(str(i+1)))
    WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//div[@id='ctl00_mainContentPlaceHolder_lvResults_topPager']//div[@class='rdpWrap rdpNumPart']//a[@class='rdpCurrentPage']/span//following::span[1]"))).click()
driver.quit()
Console Output:
8
Perform your scrapping here on page 1
Perform your scrapping here on page 2
Perform your scrapping here on page 3
Perform your scrapping here on page 4
Perform your scrapping here on page 5
Perform your scrapping here on page 6
Perform your scrapping here on page 7
Perform your scrapping here on page 8
Hi @Feitan Portor, you have written the code absolutely perfectly. The only reason you are redirected back to the first page is that you have url = driver.current_url in the last for loop, where the URL remains static and it is only the JavaScript that triggers the next-click event. So just remove url = driver.current_url and driver.get(url) from that loop and you are good to go; I have tested it myself.
Also, to know which page your scraper is currently on, just add this part inside the for loop:
ss = driver.find_element_by_class_name('rdpCurrentPage').text
print(ss)
Hope this solves your confusion
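For reference, a sketch of what the final loop from the question looks like with those two lines removed (just illustrating the advice above, using the question's own locators):
start()
driver.get(url)
for i in range(5):
    driver.find_element_by_id("ctl00_mainContentPlaceHolder_lvResults_bottomPager_ctl02_NextButton").click()
    start()
    ss = driver.find_element_by_class_name('rdpCurrentPage').text  # shows which page the scraper is on
    print(ss)
driver.close()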
