How to avoid StaleElementReferenceError when getting elements from different page? - python

I want to get all the results from a race. The website shows 50 rows/page.
I navigate to the next page (same URL with suffix #page-x) using Selenium, but I get a StaleElementReferenceException whenever I try to find elements (cells of the table, i.e. td) on the next page.
I tried closing the driver between the steps so that I only get one list of elements at a time. I've also tried loading the pages separately with the URL + suffix, but they don't load correctly. I've tried building separate lists (at first I wanted one big list with all the results).
from selenium import webdriver
url = "https://tickets.justrun.ca/quidchrono.php?a=qcResult&raceid=8444"
#The block under works well and I get a list of cells as intended.
driver = webdriver.Chrome()
driver.maximize_window()
driver.get(url)
elements = driver.find_elements_by_tag_name("td")
course = []
for i in range(len(elements)):
    course.append(elements[i].text)
to_2 = driver.find_element_by_link_text("2")
to_2.click()
print(driver.current_url)
#I'm trying similar code for the next chunk, but it doesn't work.
elements2 = driver.find_elements_by_tag_name("td")
print(len(elements2))
print(elements2[5].text)
course2 = []
for i in range(len(elements2)):
    course2.append(elements2[i].text)
driver.close()
I would expect a new list (course2), with the results of the second page, but I get a stale element error. When I print the current URL, the result is as expected. When I print the len(elements2), it's also OK. Looks like the problem is when I try to get the text of an element.
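The root cause is that clicking "2" re-renders the results table, so the td elements found on page 1 no longer exist in the DOM. A minimal sketch of the usual fix, before the full solutions below, is to read each page's text first and re-locate everything after every navigation; the page-link texts ("2", "3") and the 10-second timeout are assumptions, not taken from the site:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

url = "https://tickets.justrun.ca/quidchrono.php?a=qcResult&raceid=8444"
driver = webdriver.Chrome()
driver.get(url)

courses = []
for next_link in ["2", "3", None]:  # assumed page links; None means "last page, stop here"
    # Read the current page's cells *before* navigating away from it.
    cells = WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located((By.TAG_NAME, "td")))
    courses.append([cell.text for cell in cells])
    if next_link is not None:
        # Navigating invalidates the old references, so find the link fresh each time
        # and wait for the old cells to go stale before reading the next page.
        driver.find_element(By.LINK_TEXT, next_link).click()
        WebDriverWait(driver, 10).until(EC.staleness_of(cells[0]))
driver.quit()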

Solution-1:
Using BeautifulSoup together with Selenium. WebDriverWait waits for a certain condition to occur before proceeding further in the code; see the BeautifulSoup documentation for more details on the parsing side.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
url = "https://tickets.justrun.ca/quidchrono.php?a=qcResult&raceid=8444"
driver = webdriver.Chrome()
driver.get(url)
data = []
while True:
    course = []
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, "tableJustrun")))
    page_soup = BeautifulSoup(driver.page_source, 'lxml')
    # get table data
    tbody = page_soup.find("tbody", {"id": "searchResultBoxParticipants"})
    rows = tbody.find_all("tr")
    for row in rows:
        rowData = []
        for td in row.find_all("td"):
            rowData.append(td.text)
        course.append(rowData)
    data.append(course)
    try:
        pagination = driver.find_element_by_class_name("simple-pagination")
        next_page = pagination.find_element_by_link_text("Suivant")
        # iterate to the next page
        next_page.click()
    except Exception as e:
        break
print(data)
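Since the original goal was one big list with all the results, the per-page lists collected in data can be flattened afterwards, for example:
all_rows = [row for page in data for row in page]
print(len(all_rows))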
Solution-2:
Using the pandas library and its read_html() to pull the table directly from the page source.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
url = "https://tickets.justrun.ca/quidchrono.php?a=qcResult&raceid=8444"
driver = webdriver.Chrome()
driver.get(url)
data = []
while True:
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, "tableJustrun")))
    tables = pd.read_html(driver.page_source)
    # append Participants table data
    data.append(tables[0])
    try:
        pagination = driver.find_element_by_class_name("simple-pagination")
        next_page = pagination.find_element_by_link_text("Suivant")
        # iterate to the next page
        next_page.click()
    except Exception as e:
        break
# concatenate the per-page DataFrame objects
result = pd.concat(data)
print(result)
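If the end goal is a results file, the concatenated DataFrame can be written straight to CSV; the file name here is only an illustration:
result.to_csv("race_results.csv", index=False)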

Related

Export multiple scraped items to a csv file in selenium

OK, so I have this code to scrape this link: https://datacentersupport.lenovo.com/gb/en/products/storage/lenovo-storage/s3200/70l8/parts/display/compatible. The code scrapes all the details perfectly, except the substitutes. Here's the full code. Have I missed something?
from selenium import webdriver
from time import sleep
import pandas as pd
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
# initializing webdriver
driver = webdriver.Chrome(executable_path="~~chromedriver.exe")
url = " https://datacentersupport.lenovo.com/gb/en/products/storage/lenovo-storage/s3200/70l8/parts/display/compatible."
driver.get(url)
sleep(5)
results = []
#getting breadcrumbs
bread1 = driver.find_element_by_xpath("//span[@class='prod-catagory-name']")
bread2 = driver.find_element_by_xpath("//span[@class='prod-catagory-name']/a")
#grabbing table data and navigating
pages = int(driver.find_element_by_xpath("//div[@class='page-container']/span[@class='icon-s-right active']/preceding-sibling::span[1]").text)
num = pages - 1
for _ in range(pages):
    rows = driver.find_elements_by_xpath("//table/tbody/tr/td[2]/div")
    for row in rows:
        parts = row.text
        results.append([url, bread1.text, parts])
    try:
        for element in WebDriverWait(driver, 10).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "span[class='icon-s-down']"))):
            driver.execute_script("arguments[0].click();", element)
            sleep(5)
        substitute = WebDriverWait(driver, 10).until(EC.visibility_of_all_elements_located((By.XPATH, "//span[@class='icon-s-up']//following::tr[3]/td[contains(@class,'enabled-border')]//div[text()]")))
        for sub in substitute:
            subs = sub.text
            results.append(subs)
    except TimeoutException:
        pass
    except NoSuchElementException:
        break
    finally:
        try:
            pagination = driver.find_element_by_xpath("//div[@class='page-container']/span[@class='icon-s-right active']").click()
            sleep(3)
        except NoSuchElementException:
            break
df = pd.DataFrame(results)
df.to_csv('datacenter2.csv', index=False)
driver.quit()
The results were far from pleasing. I know I am missing something within the loops, but I am not sure what. Any suggestion would be highly appreciated.
This is the result I get:
I need to print the code alongside the numbers in the last column for each row it scrapes.

Python Selenium - Extract all URLs in a table and iterate until the next button disappears

I am trying to extract all URLs and keep clicking the next button until there isn't a next button any more. I would then like to open each URL if that is possible. Could I be pointed in the right direction for this, please?
The website where you need to press the search button is here.
Link to the table of URLs that need to be extracted.
from selenium import webdriver
from selenium.webdriver.common.by import By
driver=webdriver.Chrome(executable_path=r"C:\Users\matt_\Documents\Python Scripts\Selenium\chromedriver.exe")
driver.get("https://publicaccess.aberdeencity.gov.uk/online-applications/search.do?action=monthlyList")
driver.find_element_by_xpath("/html/body/div/div/div[3]/div[3]/div/form/fieldset/div[5]/input[2]").click()
test = driver.find_elements(By.TAG_NAME,"a")
print(test)
Here is an example of what you are looking for:
from bs4 import BeautifulSoup as Soup
from selenium import webdriver
import pandas as pd
import time

driver = webdriver.Chrome()
driver.get("https://monerobenchmarks.info/")
final_list = []

def parsh_table():
    # re-parse the current page source each time so new pages are picked up
    page = Soup(driver.page_source, features='html.parser')
    table = page.find('table')
    table_rows = table.find_all('tr')
    for tr in table_rows:
        td = tr.find_all('td')
        row = [i.text for i in td]
        final_list.extend(row)

def next_bu():
    next_button = driver.find_element_by_xpath('//*[@id="cpu_next"]')
    next_button.click()

# put range of pages
for _ in range(1, 2):
    parsh_table()
    time.sleep(2)
    next_bu()

print(final_list)
You can check whether the element exists or not with simple logic like this:
if len(driver.find_elements_by_css_selector('.next')) > 0:
Try the below code:
driver.get('https://publicaccess.aberdeencity.gov.uk/online-applications/search.do?action=monthlyList')
search_btn = WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.CSS_SELECTOR, '.button.primary')))
search_btn.click()
condition = True
while condition:
    links = WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, 'li.searchresult a')))
    for link in links:
        print(link.get_attribute('href'))
    if len(driver.find_elements_by_css_selector('.next')) > 0:
        driver.find_element_by_css_selector('.next').click()
    else:
        condition = False
driver.quit()
The following imports are needed:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
Here you go
from selenium import webdriver
driver = webdriver.Chrome(executable_path=r"C:\Users\matt_\Documents\Python Scripts\Selenium\chromedriver.exe")
driver.get("https://publicaccess.aberdeencity.gov.uk/online-applications/search.do?action=monthlyList")
driver.find_element_by_css_selector("input[value='Search']").click()
def parse():
    links = driver.find_elements_by_xpath('//*[@id="searchresults"]/li/a')
    for link in links:
        print(link.text, link.get_attribute("href"))
    try:
        driver.find_element_by_class_name('next').click()
        parse()
    except:
        print('complete')

parse()
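One caveat with the recursive version: parse() calls itself once per result page, so a very long listing could eventually hit Python's recursion limit. An iterative sketch of the same idea, using the same selectors:
def parse_page():
    links = driver.find_elements_by_xpath('//*[@id="searchresults"]/li/a')
    for link in links:
        print(link.text, link.get_attribute("href"))

while True:
    parse_page()
    try:
        driver.find_element_by_class_name('next').click()
    except:
        print('complete')
        break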

Html Parser pulling from previous webpage

I have a script that loads a page and saves a bunch of data-ids from multiple containers. I then want to open new URLs, appending those data-ids onto the end of the URLs. For each URL I want to locate all the hrefs and compare them to a list of specific links, and if any of them match I want to save that link and a few other details to a table.
I have managed to get it to open the URL with the appended data-id, but when I try to search for elements in the new page it either pulls them from the first URL that was parsed (if I try to findAll from soup again) or I constantly get this error when I try to run another html.parser:
ResultSet object has no attribute 'findAll'. You're probably treating a list of items like a single item. Did you call find_all() when you meant to call find()?
Is it not possible to run another parser or am I just doing something wrong?
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup as soup
from selenium.webdriver.common.action_chains import ActionChains
url = "http://csgo.exchange/id/76561197999004010#x"
driver = webdriver.Firefox()
driver.get(url)
import time
time.sleep(15)
html = driver.page_source
soup = soup(html, "html.parser")
containers = soup.findAll("div",{"class":"vItem"})
print(len(containers))
data_ids = [] # Make a list to hold the data-id's
for container in containers:
    test = container.attrs["data-id"]
    data_ids.append(test)  # add data-id's to the list
    print(str(test))

for id in data_ids:
    url2 = "http://csgo.exchange/item/" + id
    driver.get(url2)
    import time
    time.sleep(2)
    soup2 = soup(html, "html.parser")
    containers2 = soup2.findAll("div", {"class": "bar"})
    print(str(containers2))

with open('scraped.txt', 'w', encoding="utf-8") as file:
    for id in data_ids:
        file.write(str(id) + '\n')  # write every data-id to a new line
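One likely reason the second lookup keeps returning data from the first page is that the loop re-parses the old html string (and the BeautifulSoup import has been shadowed by soup = soup(html, ...)), so the parser never sees the new page. A sketch of the usual fix, keeping the import name intact and parsing driver.page_source fresh on every item page (this reuses driver and data_ids from the code above):
import time
from bs4 import BeautifulSoup  # un-shadowed import name

for id in data_ids:
    driver.get("http://csgo.exchange/item/" + id)
    time.sleep(2)  # the answers below replace fixed sleeps with explicit waits
    page_soup = BeautifulSoup(driver.page_source, "html.parser")  # fresh source per page
    containers2 = page_soup.find_all("div", {"class": "bar"})
    print(len(containers2))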
I'm not sure exactly what you want from each page. You should add waits. I add waits looking for hrefs in the flow history section of each page (if present); it should illustrate the idea.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
url = 'http://csgo.exchange/id/76561197999004010'
driver = webdriver.Chrome()
driver.get(url)
ids = [item.get_attribute('data-id') for item in WebDriverWait(driver,10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "[data-id]")))]
results = []
baseURL = 'http://csgo.exchange/item/'
for id in ids:
    url = baseURL + id
    driver.get(url)
    try:
        flowHistory = [item.get_attribute('href') for item in WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#tab-history-flow [href]")))]
        results.append([id, flowHistory])
    except:
        print(url)
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
url = 'http://csgo.exchange/id/76561197999004010'
profile = webdriver.FirefoxProfile()
profile.set_preference("permissions.default.image", 2) # Block all images to load websites faster.
driver = webdriver.Firefox(firefox_profile=profile)
driver.get(url)
ids = [item.get_attribute('data-id') for item in WebDriverWait(driver,30).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "[data-id]")))]
results = []
baseURL = 'http://csgo.exchange/item/'
for id in ids:
    url = baseURL + id
    driver.get(url)
    try:
        pros = ['http://csgo.exchange/profiles/76561198149324950']
        flowHistory = [item.get_attribute('href') for item in WebDriverWait(driver, 3).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#tab-history-flow [href]")))]
        if flowHistory in pros:
            results.append([url, flowHistory])
            print(results)
    except:
        print()
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
urls = ['http://csgo.exchange/id/76561197999004010']
profile = webdriver.FirefoxProfile()
profile.set_preference("permissions.default.image", 2) # Block all images to load websites faster.
driver = webdriver.Firefox(firefox_profile=profile)
for url in urls:
    driver.get(url)
    ids = [item.get_attribute('data-id') for item in WebDriverWait(driver, 30).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "[data-id]")))]
    results = []
    pros = ['http://csgo.exchange/profiles/76561198149324950', 'http://csgo.exchange/profiles/76561198152970370']
    baseURL = 'http://csgo.exchange/item/'
    for id in ids:
        url = baseURL + id
        driver.get(url)
        try:
            flowHistory = [item.get_attribute('href') for item in WebDriverWait(driver, 2).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#tab-history-flow [href]")))]
            match = []
            for string in pros:
                if string in flowHistory:
                    match = string
                    break
            if match:
                results.append([url, match])
                print(results)
        except:
            print()
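To keep the matches in table form, as the question asks, the collected results pairs can be written out afterwards with the csv module; the file name and header names below are only illustrative:
import csv

with open("matches.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["item_url", "matched_profile"])  # hypothetical column names
    writer.writerows(results)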

Unable to parse data inside h4 tag: Python3

I am facing an issue while parsing data from the 'Literature' tab of the third table. The steps I took to reach the table:
Go to ibl.mdanderson.org/fasmic/#!
Type and select AKT1 (3 mutations) (NOTE: the 'GO' button doesn't work, please click the option from the drop-down)
Click on the green button with the text 'MS'; a new table will appear.
In this new table, there is a tab called Literature. I need the literature text and the PMID.
I tried the following code, but it gives an empty list:
xyz = driver.find_element_by_xpath("//*[contains(text(),'Literature')]").click()
for elements in driver.find_elements_by_xpath('//div[@class="tab-pane ng-scope active"]'):
    soup = BeautifulSoup(driver.page_source, 'lxml')
    table = soup.find('div', attrs={'id': "literature_div"})
    table_body = table.find('h4')
    rows = table.find_all('h4')
    for row in rows:
        cols = row.find_all('h4')
        # cols = [ele.text.strip() for ele in cols]
        litrature.append([ele for ele in cols if ele])  # Get rid of empty values
print("Data from COLUMN 1:")
print(litrature)
How can I resolve this?
UPDATE
When I try to click on the 'Next' button under the 'Literature' table, I get the following error:
"Message: The element reference of is stale; either the element is no longer attached to the DOM, it is not in the current frame context, or the document has been refreshed"
Following is the line I added to click on the "NEXT" button: driver.find_element_by_xpath('//a[@ng-click="selectPage(page + 1, $event)"]').click()
How can I resolve this?
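Regarding the update: the usual pattern for a paginated table is to re-locate the Next link on every iteration and wait for the old rows to go stale before reading the new ones (the last answer on this page uses the same staleness_of idea for a different site). A sketch, assuming driver is already on the Literature tab and reusing the #literature_div h4 rows and the selectPage link from the question:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

litrature = []
while True:
    rows = WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#literature_div h4")))
    litrature.extend(row.text for row in rows)
    try:
        # Re-locate the pagination link each time instead of reusing a stale reference.
        driver.find_element(By.XPATH, '//a[@ng-click="selectPage(page + 1, $event)"]').click()
        WebDriverWait(driver, 10).until(EC.staleness_of(rows[0]))
    except Exception:
        break
print(litrature)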
You need to wait three times:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Chrome()
driver.get('https://ibl.mdanderson.org/fasmic/#!/')
WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.XPATH , '//input')))
input = driver.find_element_by_xpath("//input")
input.send_keys("AKT1\n")
button = WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.CLASS_NAME , 'btn-tab-avail')))
button.click()
driver.find_element_by_xpath("//*[contains(text(),'Literature')]").click()
WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.CSS_SELECTOR, '#literature_div h4')))
rows = driver.find_elements_by_css_selector("#literature_div h4")
litrature = []
for item in rows:
    item = item.text
    litrature.append(item)

print("Data from COLUMN 1:")
print(litrature)
Like this? Someone with more knowledge of Python waits can certainly improve on my wait lines.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
url = "https://ibl.mdanderson.org/fasmic/#!/"
d = webdriver.Chrome()
wait = WebDriverWait(d, 10)
d.get(url)
d.find_element_by_css_selector('[type=text]').send_keys('AKT1 (3 mutations)')
d.find_element_by_css_selector("input[type='text']").send_keys(Keys.RETURN)
btn = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, ".btn.btn-default.btn-tab-avail")))
btn.click()
d.find_element_by_css_selector("[heading=Literature]").click()
ele = wait.until(EC.text_to_be_present_in_element((By.CSS_SELECTOR, "#literature_div [ng-repeat]"), "PMID"))
eles = d.find_elements_by_css_selector("#literature_div [ng-repeat]")
for item in eles:
    print(item.text, "\n")
d.quit()
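If only the PMID is needed, the text of each entry (collected before d.quit() is called) can be post-processed; the "PMID: 12345678" format is an assumption based on the wait for the text "PMID" above, so adjust the pattern to the real markup:
import re

texts = [item.text for item in eles]  # collect before quitting the driver
for text in texts:
    pmid = re.search(r"PMID\D*(\d+)", text)  # assumed format
    print(pmid.group(1) if pmid else "no PMID found", "-", text)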

Parsing a site where URL doesn't change with Selenium Python

I'm trying to scrape this site; its URL doesn't change when the next page is clicked on. So I used Selenium to click on the next page, but doing that doesn't help, as my driver keeps getting the old page even after the next page is clicked on. Is there any other way to get to the next page and scrape it?
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
driver = webdriver.Safari()
store_pages = []
#10306 is total number of pages.
for i in range(10306):
    Starting_url = 'site'
    driver.get(Starting_url)
    html = driver.page_source
    soup = BeautifulSoup(html, "lxml")
    print(store_pages.append(i))

    timeout = 20
    try:
        WebDriverWait(driver, timeout).until(EC.visibility_of_element_located((By.XPATH, "//*[@id='ctl00_SPWebPartManager1_g_d6877ff2_42a8_4804_8802_6d49230dae8a_lblDisclaimerMsg']")))
    except TimeoutException:
        print("Timed out waiting for page to load")
        driver.quit()

    nextpage_url = driver.find_element_by_name("ctl00$SPWebPartManager1$g_d6877ff2_42a8_4804_8802_6d49230dae8a$ctl00$imgbtnNext").click()
    timeout = 20
    wait = WebDriverWait(driver, 10).until(EC.text_to_be_present_in_element_value((By.CSS_SELECTOR, '#ctl00_SPWebPartManager1_g_d6877ff2_42a8_4804_8802_6d49230dae8a > div.act_search_results > div.act_search_header'), "206113 Record(s) | Page [2 of 10306]"))

    NGO_element = driver.find_element_by_class_name("faq-sub-content exempted-result")
    NGO_name = NGO_element.find_elements_by_tag_name("h1")
    NGO_name_pancard = driver.find_elements_by_class_name("pan-id")
    NGO_data = NGO_element.find_elements_by_tag_name("ul")
    NGO_sub_data = NGO_element.find_elements_by_tag_name("li")

    for i, p, t in zip(NGO_name, NGO_name_pancard, NGO_data):
        n_name = i.text.replace(p.text, '')
        n_data = t.text
        n_pan = p.text
        print("Name of NGO:", n_name, "Fields of NGO:", n_data, "Pancard number:", n_pan)

    nextpage_url = driver.find_element_by_name("ctl00$SPWebPartManager1$g_d6877ff2_42a8_4804_8802_6d49230dae8a$ctl00$imgbtnNext").click()
    #timeout = 2
#timeout = 2
You need to make sure that when you reach the next page, the content of the earlier page has become stale; otherwise, you will get a stale element error or keep scraping the same thing repeatedly. Try the approach below; it should get you there. The rest you can modify yourself.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Chrome()
wait = WebDriverWait(driver, 10)
driver.get("http://www.incometaxindia.gov.in/Pages/utilities/exempted-institutions.aspx")
while True:
    for elem in wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "[id^='arrowex']"))):
        print(elem.text)
    try:
        wait.until(EC.presence_of_element_located((By.ID, "ctl00_SPWebPartManager1_g_d6877ff2_42a8_4804_8802_6d49230dae8a_ctl00_imgbtnNext"))).click()
        wait.until(EC.staleness_of(elem))
    except:
        break
driver.quit()
