I am scraping Amazon products, but first I want to click on each category. The code works only for the first category in the loop and then throws the error below. I searched for solutions and found many answers, but they didn't work inside a loop; all of them use XPath with a single element rather than a list of elements.
The first click (see_more) works; the problem is with the clicks inside the loop.
ERROR:
selenium.common.exceptions.StaleElementReferenceException: Message: stale element reference: element is not attached to the page document
(Session info: chrome=80.0.3987.149)
Here is the code.
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import ElementClickInterceptedException
from selenium.webdriver import ActionChains
from csv import writer

driver = webdriver.Chrome(executable_path='C:\\Users\\Compu City\\Desktop\\chromedriver.exe')
driver.get('https://www.amazon.com/international-sales-offers/b/?ie=UTF8&node=15529609011&ref_=nav_navm_intl_deal_btn')
time.sleep(10)

res = driver.execute_script("return document.documentElement.outerHTML", 'window.scrollBy(0,2000)')
soup = BeautifulSoup(res, 'lxml')

filter_con = driver.find_element_by_id('widgetFilters')  # main container of products
cats = driver.find_elements_by_css_selector('.a-expander-container .a-checkbox label .a-label')
see_more = driver.find_element_by_css_selector('#widgetFilters > div:nth-child(1) > div.a-row.a-expander-container.a-expander-inline-container > a > span')
ActionChains(driver).move_to_element(filter_con).click(see_more).perform()

cat = 0
while cat < len(cats):
    print(cat)
    print(cats[cat].text)
    ActionChains(driver).move_to_element(filter_con).click(cats[cat]).perform()
    cat += 1
The moment you click on a cat element, the page updates and Selenium gets a new set of references to the elements. Because you are still pointing to the older references, you get the stale element exception. Update your code as below.
Option 1: Fixing the existing code

cat = 0
while cat < len(cats):
    # re-locate the element by index on every pass so the reference is fresh
    currentCat = driver.find_elements_by_css_selector('.a-expander-container .a-checkbox label .a-label')[cat]
    print(cat)
    print(currentCat.text)
    ActionChains(driver).move_to_element(filter_con).click(currentCat).perform()
    cat += 1
Option 2: Using a for loop (without action chains)

for catNumber in range(len(cats)):
    # re-locate the element by index on every pass so the reference is fresh
    cat = driver.find_elements_by_css_selector('.a-expander-container .a-checkbox label .a-label')[catNumber]
    print(catNumber + 1)
    # scroll to the element
    cat.location_once_scrolled_into_view
    # click
    cat.click()
This is the link
https://www.unibet.eu/betting/sports/filter/football/matches
Using the Selenium driver, I access this link. The actual task for me is to click on each of the match links. I found all those matches by
elems = driver.find_elements_by_class_name('eb700')
When I did this:
for elem in elems:
    elem.click()
    time.sleep(2)
    driver.execute_script("window.history.go(-1)")
    time.sleep(2)
The first time it clicked, loaded the new page, went back to the previous page, and then gave the following error:
StaleElementReferenceException: Message: stale element reference: element is not attached to the page document
I also tried getting the href attribute from the elem, but it gave None. Is it possible to open the page in a new tab instead of clicking the elem?
You can retry clicking the element when the first attempt fails because it is no longer attached to the DOM.
Code:

from time import sleep

from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

driver = webdriver.Chrome("C:\\Users\\**\\Inc\\Desktop\\Selenium+Python\\chromedriver.exe")
driver.maximize_window()
wait = WebDriverWait(driver, 30)
driver.get("https://www.unibet.eu/betting/sports/filter/football/matches")
wait.until(EC.element_to_be_clickable((By.PARTIAL_LINK_TEXT, "OK"))).click()
sleep(2)
elements = driver.find_elements(By.XPATH, "//div[contains(@class,'_')]/div[@data-test-name='accordionLevel1']")
element_len = len(elements)
print(element_len)
counter = 0
while counter < element_len:
    attempts = 0
    while attempts < 2:
        try:
            ActionChains(driver).move_to_element(elements[counter]).click().perform()
        except:
            pass
        attempts = attempts + 1
    sleep(2)
    # driver.execute_script("window.history.go(-1)")  # maybe get the team name
    # using the //div[@data-test-name='teamName'] xpath
    sleep(2)
    # driver.refresh()
    sleep(2)
    counter = counter + 1
Since you move to the next page, the elements no longer exist in the DOM, so you get the stale element exception.
What you can do is re-fetch all the links (elems) when coming back to the same page, and use a while loop instead of a for loop.
elems = driver.find_elements_by_class_name('eb700')
i = 0
while i < len(elems):
    elems[i].click()
    time.sleep(2)
    driver.execute_script("window.history.go(-1)")
    time.sleep(2)
    # re-fetch the links after navigating back so the references are fresh
    elems = driver.find_elements_by_class_name('eb700')
    i += 1
Another solution is to remain on the same page, save all the href attributes in a list, and then use driver.get to open each match link.
matchLinks = []
elems = driver.find_elements_by_class_name('eb700')
for elem in elems:
    matchLinks.append(elem.get_attribute('href'))

for match in matchLinks:
    driver.get(match)
    # do whatever you want to do on the match page
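Since the question mentions that get_attribute('href') on the eb700 element returned None, the link may live on a descendant anchor rather than on the container itself. Here is a hedged sketch under that assumption (the nested a selector is a guess about the page structure, not confirmed):

matchLinks = []
for elem in driver.find_elements_by_class_name('eb700'):
    # assumption: the clickable container wraps an <a> that carries the href
    anchor = elem.find_element_by_css_selector('a')
    href = anchor.get_attribute('href')
    if href:
        matchLinks.append(href)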
I'm writing a script to scrape product names from a website, filtered by brands. Some search results may contain more than one page, and this is where the problem comes in. I'm able to scrape the first page but when the script clicks on the next page the error message selenium.common.exceptions.StaleElementReferenceException: Message: stale element reference: element is not attached to the page document shows. Below is my code:
def scrape():
    resultList = []
    currentPage = 1
    while currentPage <= 2:
        titleResults = WebDriverWait(driver, 10).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, 'h4.mt-0')))
        resultList.append(titleResults)
        checkNextPage = WebDriverWait(driver, 30).until(EC.visibility_of_all_elements_located((By.XPATH, "//div/nav/ul/li/a[@aria-label='Next']")))
        for cnp in checkNextPage:
            nextPageNumber = int(cnp.get_attribute("data-page"))
        currentPage += 1
        driver.find_element_by_xpath("//div/nav/ul/li/a[@aria-label='Next']").click()
    for result in resultList[0]:
        print("Result: {}".format(result.text))
I think the error got triggered when .click() was called. I've done a lot of searching on the internet before resorting to posting this question here because either I don't understand the solutions from other articles/posts or they don't apply to my case.
A stale element is an old element that is no longer available.
I think the error is caused by the last lines: you should extract the elements' text before the elements become unavailable.
def scrape():
    resultList = []
    currentPage = 1
    while currentPage <= 2:
        titleResults = WebDriverWait(driver, 10).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, 'h4.mt-0')))
        # extract the elements' text while they are still attached to the DOM
        results_text = [title.text for title in titleResults]
        resultList.extend(results_text)
        checkNextPage = WebDriverWait(driver, 30).until(EC.visibility_of_all_elements_located((By.XPATH, "//div/nav/ul/li/a[@aria-label='Next']")))
        for cnp in checkNextPage:
            nextPageNumber = int(cnp.get_attribute("data-page"))
        currentPage += 1
        driver.find_element_by_xpath("//div/nav/ul/li/a[@aria-label='Next']").click()
    print("Result: {}".format(resultList))
I am trying to scrape data from a number of pages on a website using Selenium in Python. The script runs and scrapes data successfully on the first page, but after the second page it can't find the click button and stops scraping. I checked the HTML of the webpage, and the element on the second page is the same as the one on the first page. I found this question related to the same issue. I think the problem is that the reference to the button is lost after the DOM changes, but I still can't fix the issue properly. I would appreciate any suggestions or solutions. The code and results are included below:
import time

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

browser = webdriver.Chrome(r"C:\Users\...\chromedriver.exe")
browser.get('https://fortune.com/global500/2019/walmart')
table = browser.find_element_by_css_selector('tbody')

data = []

# Use a counter for the page index
i = 1
while True:
    if i > 5:
        break
    try:
        print("Scraping Page no. " + str(i))
        i = i + 1
        # Select rows in the table
        for row in table.find_elements_by_css_selector('tr'):
            data.append([cell.text for cell in row.find_elements_by_css_selector('td')])
        try:
            WebDriverWait(browser, 10).until(EC.element_to_be_clickable((By.XPATH, '//span[@class="singlePagination__icon--2KbZn"]')))
            time.sleep(10)
        finally:
            browser.find_element_by_xpath('//span[@class="singlePagination__icon--2KbZn"]').click()
    except Exception as e:
        print(e)
        break

data1 = pd.DataFrame(data, columns=['Labels', 'Value'])
print(data1)
browser.close()
output:
Scraping Page no. 1
Scraping Page no. 2
Message: stale element reference: element is not attached to the page document
(Session info: chrome=....)
Labels Value
0 (...) (...)
1 (...) (...)
Move the table = browser.find_element_by_css_selector('tbody') line into your while loop, so that you get the latest reference to the table element on each iteration; then you should not see any stale element issue.
while True:
    table = browser.find_element_by_css_selector('tbody')
    if i > 5:
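For completeness, a hedged sketch of the corrected loop (same logic and selectors as in the question's code, just with the table located inside the loop):

i = 1
while True:
    if i > 5:
        break
    # locate the table on every pass so the reference matches the current DOM
    table = browser.find_element_by_css_selector('tbody')
    print("Scraping Page no. " + str(i))
    i = i + 1
    for row in table.find_elements_by_css_selector('tr'):
        data.append([cell.text for cell in row.find_elements_by_css_selector('td')])
    WebDriverWait(browser, 10).until(EC.element_to_be_clickable(
        (By.XPATH, '//span[@class="singlePagination__icon--2KbZn"]'))).click()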
I've created a script in Python together with Selenium to scroll to the bottom of a lazy-loading webpage and parse the content from there. I'm trying to get all the links connected to a hashtag from Instagram. There are around 475 results out there, but my current attempt fetches only 38.
The script can scroll to the bottom of that page, but I still get only 38 of some 475 results.
Link to that webpage
I've tried so far with:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

tag = '#baltimorepizza'
hash_url = 'https://www.instagram.com/explore/tags/{}/'

def scroll_to_get_more():
    check_height = driver.execute_script("return document.body.scrollHeight;")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        try:
            wait.until(lambda driver: driver.execute_script("return document.body.scrollHeight;") > check_height)
            check_height = driver.execute_script("return document.body.scrollHeight;")
        except TimeoutException:
            break

def get_links(tag):
    driver.get(hash_url.format(tag.strip("#").lower()))
    scroll_to_get_more()
    total_links = [item.get_attribute("href") for item in wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '.kIKUG > a')))]
    print("Total link scraped:", len(total_links))

if __name__ == '__main__':
    driver = webdriver.Chrome()
    wait = WebDriverWait(driver, 10)
    get_links(tag)
    driver.quit()
How can I get all the links connected to that specific hashtag from instagram?
Same as @KunduK, I can only gather 437, so I am wondering if this is the correct number; maybe you need to log in to see the remaining ones?
You are only getting ~38 because the page does not render all of the content into the DOM at once. So even though you scrolled, the data you queried is not all accessible; it only becomes accessible when you scroll back to it (images in view).
The solution here gets the data while scrolling.
We scroll to the bottom first, using your method scroll_to_get_more, to ensure all the queries that load the images have been made.
Then we start scraping from top to bottom, so we need to scroll all the way back to the top using:
def scroll_to_header():
    el = driver.find_element_by_tag_name("header")
    driver.execute_script("arguments[0].scrollIntoView();", el)
Your get_links method will now look like this:
def get_links(tag):
    # requires: from selenium.common.exceptions import StaleElementReferenceException
    driver.get(hash_url.format(tag.strip("#").lower()))
    scroll_to_get_more()
    scroll_to_header()
    total_links = []
    current_len = 0
    new_len = -1
    while current_len != new_len:
        current_len = len(total_links)
        try:
            links = []
            elements = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '.Nnq7C.weEfm [href]')))
            for el in elements:
                if el.get_attribute('href') not in total_links:
                    links.append(el.get_attribute('href'))
            total_links.extend(links)
        except StaleElementReferenceException:
            continue
        if len(elements):
            driver.execute_script("arguments[0].scrollIntoView();", el)
        new_len = len(total_links)
    print("Total link scraped:", len(total_links))
Basically, after every query we scroll to the last element, which loads the next images into the DOM.
Also, I was thinking your scroll method was the reason I was getting 437 (scrolling and missing elements), so I implemented a new method that uses the spinner as the element to scroll to, instead of the height of the page. Both are valid, but I think this one is faster (see the results below):
def scroll_to_get_more():
    # requires: from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException
    while True:
        try:
            spinner = driver.find_element_by_css_selector('.By4nA')
            driver.execute_script("arguments[0].scrollIntoView();", spinner)
        except StaleElementReferenceException:
            continue
        except NoSuchElementException:
            break
Output with scrolling method above:
Total link scraped: 437
Query took: 23.520002755
Output with your scrolling method:
Total link scraped: 437
Query took: 42.685470925
The main reason for the time difference is that with the height-based method you always wait the full 10 seconds once the page no longer needs to scroll.
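If you keep the height-based method, one hedged tweak (my assumption, not part of the original answer) is to give the height check its own shorter wait, so the final timeout does not cost the full 10 seconds:

def scroll_to_get_more():
    # a dedicated short wait just for detecting height growth; 3 seconds is an assumption
    scroll_wait = WebDriverWait(driver, 3)
    check_height = driver.execute_script("return document.body.scrollHeight;")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        try:
            scroll_wait.until(lambda d: d.execute_script("return document.body.scrollHeight;") > check_height)
            check_height = driver.execute_script("return document.body.scrollHeight;")
        except TimeoutException:
            break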
I am trying to fetch episode links from a website.
I need to enter each link and fetch the information for its episodes.
It works for 2-3 episodes and then crashes with the error written at the bottom.
I tried raising the sleep time, but it still crashes.
Error:
selenium.common.exceptions.StaleElementReferenceException: Message: stale element reference: element is not attached to the page document
import time
import datetime
import random
import string
import json
import requests
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from urllib.request import urlopen
import urllib.request

def send_imdb(id):
    try:
        r = requests.get('http://sdarot.bnlstudio.com/import.php?id=%s' % (id))
        json = r.json()
        if json['status'] == "success":
            return json['id']
    except:
        return("Err")

links = []
seasons = []
link = "http://www.tvil.me/"

chrome_options = Options()
chrome_options.add_extension('gighmmpiobklfepjocnamgkkbiglidom.crx')
driver = webdriver.Chrome(chrome_options=chrome_options)
driver.get(link)

# Getting post links and splitting them
page_right = driver.find_element_by_id("page-right")
posts = page_right.find_elements_by_xpath("//div[@class='index-episode-caption']/a")
for element in posts:
    href = element.get_attribute("href")
    splited = href.split("/")
    url = "%s//%s/view/%s/1/1/v/%s" % (splited[0], splited[2], splited[4], splited[8])
    links.append(url)
    ##print(url)

# Entering posts and getting the IMDB ID
E = 0
for link in links:
    driver.get(link)
    time.sleep(2)
    imdb_q = driver.find_element_by_xpath("//div[@id='view-trailer-imdb']/a")
    imdb = imdb_q.get_attribute("href")
    imdb_id = imdb.split("/")[4]
    post_id = send_imdb(imdb_id)
    print("Post_ID: %s" % (post_id))
    seasons_num = driver.find_elements_by_xpath("//*[contains(@id, 'change-season-')]/a")
    total_seasons_num = len(seasons_num)
    for i in range(1, total_seasons_num):
        print("Season: %i" % (i))
        season = driver.find_element_by_css_selector("#change-season-{num} a".format(num=i))
        season.click()
        episodes = driver.find_elements_by_xpath("//*[contains(@id, 'change-episode-')]/a")
        for episode in episodes:
            E += 1
            print("Episode: %i" % (E))
            time.sleep(3)
            episode.click()  # Break point
            time.sleep(3)
When the element is not attached to the DOM, a stale element exception occurs. Try finding the element again after the exception.
Consider the case where clicking an element refreshes the page.
Example:

element = driver.find_element_by_id("refreshButton")
element.click()  # the page is refreshed after clicking this button
element.click()  # a stale element exception is thrown here
Now, if you perform any action (click, send_keys, etc.) on the element, it will throw a StaleElementReferenceException because its reference is lost (the page has been refreshed).
To overcome this, find that element again after the page has refreshed, like:
element = driver.find_element_by_id("refreshButton")
element.click()  # the page is refreshed after clicking this button
element = driver.find_element_by_id("refreshButton")  # locate the element again after the refresh
element.click()
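Applied to the episode loop from the question, here is a hedged adaptation of the same idea (my sketch, not from the original answer): re-find the episode links by index on every pass, so the reference is always fresh even if the click replaced the DOM.

episode_xpath = "//*[contains(@id, 'change-episode-')]/a"
episodes = driver.find_elements_by_xpath(episode_xpath)
for index in range(len(episodes)):
    # re-find the links on every pass; the previous click may have refreshed the page
    episodes = driver.find_elements_by_xpath(episode_xpath)
    E += 1
    print("Episode: %i" % (E))
    episodes[index].click()
    time.sleep(3)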