I have a question about looping with Selenium in Python. I want to iterate over a list of links collected with driver.find_elements_by_id and click them one by one, but each time I click a link ('linklist' in the code), the page is refreshed, so I get an error message saying:
'Message: The element reference is stale. Either the element is no longer attached to the DOM or the page has been refreshed.'
I know the reason is that the list of links disappears after the click. But how, in general, can I iterate over the list in Selenium even though the page no longer exists? I used driver.back() and apparently it doesn't work.
The error message pops up after this line in the code:
link.click()
The linklist is located at this URL (I want to click the Documents button and then download the first file after the refreshed page is displayed): 'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=0001467373&type=10-K&dateb=20101231&owner=exclude&count=40'
Can someone have a look at this problem?
Thank you!
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
import unittest
import os
import time
from bs4 import BeautifulSoup
from selenium.webdriver.common.keys import Keys
import requests
import html2text
class LoginTest(unittest.TestCase):
    def setUp(self):
        self.driver = webdriver.Firefox()
        self.driver.get("https://www.sec.gov/edgar/searchedgar/companysearch.html")
    def test_Login(self):
        driver = self.driver
        cikID = "cik"
        searchButtonID = "cik_find"
        typeID = "//*[@id='type']"
        priorID = "prior_to"
        cik = "00001467373"
        Type = "10-K"
        prior = "20101231"
        search2button = "//*[@id='contentDiv']/div[2]/form/table/tbody/tr/td[6]/input[1]"
        documentsbuttonid = "documentsbutton"
        formbuttonxpath = '//a[text()="d10k.htm"]'
        cikElement = WebDriverWait(driver, 30).until(lambda driver: driver.find_element_by_id(cikID))
        cikElement.clear()
        cikElement.send_keys(cik)
        searchButtonElement = WebDriverWait(driver, 20).until(lambda driver: driver.find_element_by_id(searchButtonID))
        searchButtonElement.click()
        typeElement = WebDriverWait(driver, 30).until(lambda driver: driver.find_element_by_xpath(typeID))
        typeElement.clear()
        typeElement.send_keys(Type)
        priorElement = WebDriverWait(driver, 30).until(lambda driver: driver.find_element_by_id(priorID))
        priorElement.clear()
        priorElement.send_keys(prior)
        search2Element = WebDriverWait(driver, 30).until(lambda driver: driver.find_element_by_xpath(search2button))
        search2Element.send_keys(Keys.SPACE)
        time.sleep(1)
        documentsButtonElement = WebDriverWait(driver, 20).until(lambda driver: driver.find_element_by_id(documentsbuttonid))
        a = driver.current_url
        window_be1 = driver.window_handles[0]
        linklist = driver.find_elements_by_id(documentsbuttonid)
        with open("D:/doc2/" + "a" + ".txt", mode="w", errors="ignore") as newfile:
            for link in linklist:
                link.click()
                formElement = WebDriverWait(driver, 30).until(lambda driver: driver.find_element_by_xpath(formbuttonxpath))
                formElement.click()
                time.sleep(1)
                t = driver.current_url
                r = requests.get(t)
                data = r.text
                newfile.write(html2text.html2text(data))
                driver.back()
                driver.back()
    def tearDown(self):
        self.driver.quit()
if __name__ == '__main__':
    unittest.main()
You should not keep a list of web elements, but a list of link URLs. Try something like this:
linklist = []
for link in driver.find_elements_by_xpath('//h4[@class="title"]/a'):
    linklist.append(link.get_attribute('href'))
And then you can iterate through list of links
for link in linklist:
    driver.get(link)
    # do some actions on page
If you want to physically click on each link, you might need to use
for link in linklist:
    driver.find_element_by_xpath('//h4[@class="title"]/a[@href="%s"]' % link).click()
    # do some actions on page
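Applied to the EDGAR page from the question, a minimal sketch of that pattern (collecting the hrefs of the Documents buttons up front, so navigating away can never stale them) might look like this:
# collect the hrefs first; plain strings cannot go stale
hrefs = [el.get_attribute('href')
         for el in driver.find_elements_by_id('documentsbutton')]
for href in hrefs:
    driver.get(href)
    # ...click the d10k.htm link and save the filing, as in the question...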
I am trying to write a script to automate job applications on Linkedin using selenium and python.
The steps are simple:
open the LinkedIn page, enter id password and log in
open https://linkedin.com/jobs and enter the search keyword and location, then click Search (directly opening links like https://www.linkedin.com/jobs/search/?geoId=101452733&keywords=python&location=Australia gets stuck loading, probably due to some POST data missing from the previous page)
the click opens the job search page, but this doesn't seem to update the driver, as it still searches on the previous page.
import selenium
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from bs4 import BeautifulSoup
import pandas as pd
import yaml
driver = webdriver.Chrome("/usr/lib/chromium-browser/chromedriver")
url = "https://linkedin.com/"
driver.get(url)
content = driver.page_source
stream = open("details.yaml", 'r')
details = yaml.safe_load(stream)
def login():
    username = driver.find_element_by_id("session_key")
    password = driver.find_element_by_id("session_password")
    username.send_keys(details["login_details"]["id"])
    password.send_keys(details["login_details"]["password"])
    driver.find_element_by_class_name("sign-in-form__submit-button").click()
def get_experience():
    return "1%C22"
login()
jobs_url = 'https://www.linkedin.com/jobs/'
driver.get(jobs_url)
keyword = driver.find_element_by_xpath("//input[starts-with(@id, 'jobs-search-box-keyword-id-ember')]")
location = driver.find_element_by_xpath("//input[starts-with(@id, 'jobs-search-box-location-id-ember')]")
keyword.send_keys("python")
location.send_keys("Australia")
driver.find_element_by_xpath("//button[normalize-space()='Search']").click()
WebDriverWait(driver, 10)
# content = driver.page_source
# soup = BeautifulSoup(content)
# with open("a.html", 'w') as a:
# a.write(str(soup))
print(driver.current_url)
driver.current_url returns https://linkedin.com/jobs/ instead of https://www.linkedin.com/jobs/search/?geoId=101452733&keywords=python&location=Australia as it should. I have tried printing the page content to a file; it is indeed from the previous jobs page, not from the search page. I have also tried to find elements from the results page, like the experience filter and the Easy Apply button, but the search results in a not-found error.
I am not sure why this isn't working.
Any ideas? Thanks in advance!
UPDATE
It works if I directly open something like https://www.linkedin.com/jobs/search/?f_AL=True&f_E=2&keywords=python&location=Australia but not https://www.linkedin.com/jobs/search/?f_AL=True&f_E=1%2C2&keywords=python&location=Australia
The difference between the two links is that one takes only one value for experience level while the other takes two (%2C is just a URL-encoded comma, so f_E=1%2C2 means experience levels 1,2). This means it's probably not a POST-values issue.
You are getting and printing the current URL immediately after clicking on the search button, before the page changed with the response received from the server.
This is why it outputs you with https://linkedin.com/jobs/ instead of something like https://www.linkedin.com/jobs/search/?geoId=101452733&keywords=python&location=Australia.
WebDriverWait(driver, 10) or wait = WebDriverWait(driver, 20) will not cause any kind of delay the way time.sleep(10) does.
WebDriverWait(driver, 20) only instantiates a wait object, an instance of the WebDriverWait class; on its own it waits for nothing until you call its until() method with a condition.
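A minimal sketch of actually waiting here, assuming the URL change is the condition you care about:
from selenium.webdriver.support import expected_conditions as EC
driver.find_element_by_xpath("//button[normalize-space()='Search']").click()
# block until the browser has really navigated to the results page
WebDriverWait(driver, 10).until(EC.url_contains("/jobs/search"))
print(driver.current_url)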
I am trying to access the text of an element using selenium with Python. I can access the elements themselves just fine, but when I try to get the text it doesn't work.
This is my code:
from selenium import webdriver
driver = webdriver.Chrome() # I removed the path for my post, but there is one that works in my actual code
URL = "https://www.costco.com/laptops.html"
driver.get(URL)
prices = driver.find_elements_by_class_name("price")
print([price.text for price in prices])
If I run this code I get: selenium.common.exceptions.StaleElementReferenceException: Message: stale element reference: element is not attached to the page document
However, if I were to print out the elements themselves, I have no problem.
I read some previous posts about the stale element exception, but I don't understand why it applies to me in this case. Why would the DOM change when I try to access the text? Why is this happening?
Turns out you just need to wait:
from selenium import webdriver
import time
driver = webdriver.Chrome() # I removed the path for my post, but there is one that works in my actual code
URL = "https://www.costco.com/laptops.html"
driver.get(URL)
time.sleep(3)
prices = driver.find_elements_by_class_name("price")
print([price.text for price in prices])
Output:
['$1,999.99', '$2,299.99', '', '', '$769.99', '', '$799.99', '$1,449.99', '$1,199.99', '$1,199.99', '$1,999.99', '$1,599.99', '$1,299.99', '$2,299.99', '$1,549.99', '$1,499.99', '$599.99', '$1,699.99', '$1,079.99', '$2,999.99', '$1,649.99', '$1,499.99', '$2,399.99', '$1,499.97', '$1,199.99', '$1,649.99', '$849.99', '']
The correct way to do this is to use an explicit wait with WebDriverWait instead of a fixed sleep.
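A minimal sketch of that approach, reusing the page and class name from above:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Chrome()
driver.get("https://www.costco.com/laptops.html")
# wait until the dynamically rendered price elements are actually in the DOM
prices = WebDriverWait(driver, 10).until(
    EC.presence_of_all_elements_located((By.CLASS_NAME, "price")))
print([price.text for price in prices])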
Old answer:
I am not entirely sure why that is happening. But I would suggest you try BeautifulSoup:
from selenium import webdriver
from bs4 import BeautifulSoup
driver = webdriver.Chrome() # I removed the path for my post, but there is one that works in my actual code
URL = "https://www.costco.com/laptops.html"
driver.get(URL)
soup = BeautifulSoup(driver.page_source, "html.parser")
divs = soup.find_all("div",{"class":"price"})
[div.text.replace("\t",'').replace("\n",'') for div in divs]
Output:
['$1,099.99',
'$399.99',
'$1,199.99',
'$599.99',
'$1,049.99',
'$799.99',
'$699.99',
'$949.99',
'$699.99',
'$1,999.99',
'$449.99',
'$2,699.99',
'$1,149.99',
'$1,599.99',
'$1,049.99',
'$1,249.99',
'$299.99',
'$1,799.99',
'$749.99',
'$849.99',
'$2,299.99',
'$999.99',
'$649.99',
'$799.99']
I'm working in selenium with Chrome.
The webpage I'm accessing updates dynamically.
I need the HTML that shows the results; I can access it when I do 'inspect element'.
But I don't understand how to access that HTML from my code. I always get the original HTML.
I tried this: Get HTML Source of WebElement in Selenium WebDriver using Python
browser.get('http://bijsluiters.fagg-afmps.be/?localeValue=nl')
searchform = browser.find_element_by_class_name('iceInpTxt')
searchform.send_keys('cefuroxim')
button = browser.find_element_by_class_name('iceCmdBtn').click()
element = browser.find_element_by_class_name('contentContainer')
html = element.get_attribute('innerHTML')
browser.close()
print(html)
It seems that it works after some delay. If I were you, I would experiment with the delay time.
from selenium import webdriver
import time
browser = webdriver.Chrome()
browser.get('http://bijsluiters.fagg-afmps.be/?localeValue=nl')
searchform = browser.find_element_by_class_name('iceInpTxt')
searchform.send_keys('cefuroxim')
button = browser.find_element_by_class_name('iceCmdBtn').click()
time.sleep(10)
element = browser.find_element_by_class_name('contentContainer')
html = element.get_attribute('innerHTML')
browser.close()
print(html)
Addition: a nicer way is to let the script proceed as soon as a given element is available (it can take a while before JS, for example, has added a specific element to the DOM). The element to look for in your example is the table with id iceDatTbl (from what I could find after a quick look).
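A sketch of that approach, assuming the iceDatTbl id is correct (the rest mirrors the code above):
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
browser = webdriver.Chrome()
browser.get('http://bijsluiters.fagg-afmps.be/?localeValue=nl')
browser.find_element_by_class_name('iceInpTxt').send_keys('cefuroxim')
browser.find_element_by_class_name('iceCmdBtn').click()
# proceed as soon as the results table is in the DOM instead of sleeping blindly
WebDriverWait(browser, 20).until(EC.presence_of_element_located((By.ID, 'iceDatTbl')))
element = browser.find_element_by_class_name('contentContainer')
html = element.get_attribute('innerHTML')
browser.close()
print(html)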
I cobbled together some code to log in to a website and navigate to the specific pages I want to scrape. That part works fine. Now, however, I'm searching for a specific element titled 'tspan', and I'm getting an error that reads:
AttributeError: 'str' object has no attribute 'descendants'
If I go to the URL, right-click the element I want to grab, and click 'Inspect Element', I can see the code behind the page. It looks like querying by 'g id' may work too.
So, I thought I could get all 'tspan' items, load them into a list, and write the list to a text file. However, I'm getting no 'tspan' elements at all. If I right-click the page and click 'View Page Source', I see no 'tspan' elements either. This is very weird! The code behind the page is definitely different from what's rendered on the page itself. Here's my code. What am I doing wrong?
from bs4 import BeautifulSoup as bs
import webbrowser
import requests
from lxml import html
from selenium import webdriver
profile = webdriver.FirefoxProfile()
profile.accept_untrusted_certs = True
import time
# selenium
wd = webdriver.Firefox(executable_path="C:/Utility/geckodriver.exe", firefox_profile=profile)
url = "https://corp-internal.com/admin/?page=0"
wd.get(url)
# set username
time.sleep(2)
username = wd.find_element_by_id("identifierId")
username.send_keys("my_email#email.com")
wd.find_element_by_id("identifierNext").click()
# set password
time.sleep(2)
password = wd.find_element_by_name("password")
password.send_keys("my_pswd")
wd.find_element_by_id("passwordNext").click()
all_text = []
# list of URLs
url_list = ['https://corp-internal.com/admin/graph?dag_id=emm1_daily_legacy',
'https://corp-internal.com/admin/graph?dag_id=eemm1_daily_legacy_history']
for link in url_list:
    #File = webbrowser.open(link)
    #File = requests.get(link)
    #data = File.text
    for link in bs.findAll('tspan'):
        alldata = all_text.append(link.get('tspan'))
outF = open('C:/Users/ryans/OneDrive/Desktop/test.txt', 'w')
outF.writelines(alldata)
outF.close()
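A likely culprit: findAll is being called on the BeautifulSoup class itself (imported as bs) rather than on a parsed document, which is exactly what produces the 'str' object has no attribute 'descendants' error. And since the SVG is drawn by JavaScript, the rendered DOM has to come from the driver, not from a static requests fetch. A sketch of the fix, untested against the internal site:
for link in url_list:
    wd.get(link)
    time.sleep(2)  # crude wait for the JS-drawn SVG; an explicit wait would be nicer
    soup = bs(wd.page_source, 'html.parser')  # parse the rendered DOM
    for tspan in soup.find_all('tspan'):
        all_text.append(tspan.get_text())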
I've been trying to write a script which will give me all the links to the episodes present on this page :- http://www.funimation.com/shows/assassination-classroom/videos/episodes
Since the links can be seen in the 'Outer HTML', I used Selenium and PhantomJS with Python.
Link Example: http://www.funimation.com/shows/assassination-classroom/videos/official/karma-time
However, I can't seem to get my code right. I do have a basic idea of what I want to do. Here's the process :-
1.) Copy the outer HTML of the very first page and save it to a 'source_code.html' file.
2.) Look for links inside this file.
3.) Move to the next page to see rest of the videos and their links.
4.) Repeat the step 2.
This is what my code looks like :
from selenium import webdriver
from selenium import selenium
from bs4 import BeautifulSoup
import time
# ---------------------------------------------------------------------------------------------
driver = webdriver.PhantomJS()
driver.get('http://www.funimation.com/shows/assassination-classroom/videos/episodes')
elem = driver.find_element_by_xpath("//*")
source_code = elem.get_attribute("outerHTML")
f = open('source_code.html', 'w')
f.write(source_code.encode('utf-8'))
f.close()
print 'Links On First Page Are : \n'
soup = BeautifulSoup('source_code.html')
subtitles = soup.find_all('div',{'class':'popup-heading'})
official = 'something'
for official in subtitles:
    x = official.findAll('a')
    for a in x:
        print a['href']
sbtn = driver.find_element_by_link_text(">")
print sbtn
print 'Entering The Loop Now'
for driver.find_element_by_link_text(">"):
    sbtn.click()
    time.sleep(3)
    elem = driver.find_element_by_xpath("//*")
    source_code = elem.get_attribute("outerHTML")
    f = open('source_code1.html', 'w')
    f.write(source_code.encode('utf-8'))
    f.close()
Things I already know :-
soup = BeautifulSoup('source_code.html') won't work, because I need to open the file in Python and feed its contents into BS. That part I can manage.
That official variable isn't really doing anything. Just helping me start a loop.
for driver.find_element_by_link_text(">"):
Now, this is what I need to fix somehow. I'm not sure how to check whether this element is still clickable. If it is, proceed to the next page, get the links, click it again to go to page 3, and repeat the process.
Any help would be appreciated.
You don't need BeautifulSoup here at all. Just grab all the links via Selenium, and proceed to the next page only if the > link is visible. Here is a complete implementation, including gathering the links and the necessary waits. It should work for any page count:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.PhantomJS()
driver.get("http://www.funimation.com/shows/assassination-classroom/videos/episodes")
wait = WebDriverWait(driver, 10)
links = []
while True:
    # wait for the page to load
    wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "a.item-title")))
    # wait until the loading circle becomes invisible
    wait.until(EC.invisibility_of_element_located((By.ID, "loadingCircle")))
    links.extend([link.get_attribute("href") for link in driver.find_elements_by_css_selector("a.item-title")])
    print("Parsing page number #" + driver.find_element_by_css_selector("a.jp-current").text)
    # click next
    next_link = driver.find_element_by_css_selector("a.next")
    if not next_link.is_displayed():
        break
    next_link.click()
    time.sleep(1)  # hardcoded delay
print(len(links))
print(links)
For the URL mentioned in the question, it prints:
Parsing page number #1
Parsing page number #2
93
['http://www.funimation.com/shows/assassination-classroom/videos/official/assassination-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/assassination-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/assassination-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/baseball-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/baseball-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/baseball-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/karma-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/karma-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/karma-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/grown-up-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/grown-up-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/grown-up-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/assembly-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/assembly-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/assembly-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/test-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/test-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/test-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/school-trip-time1st-period', 'http://www.funimation.com/shows/assassination-classroom/videos/official/school-trip-time1st-period', 'http://www.funimation.com/shows/assassination-classroom/videos/official/school-trip-time1st-period', 'http://www.funimation.com/shows/assassination-classroom/videos/official/school-trip-time2nd-period', 'http://www.funimation.com/shows/assassination-classroom/videos/official/school-trip-time2nd-period', 'http://www.funimation.com/shows/assassination-classroom/videos/official/school-trip-time2nd-period', 'http://www.funimation.com/shows/assassination-classroom/videos/official/transfer-student-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/transfer-student-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/transfer-student-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/l-and-r-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/l-and-r-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/l-and-r-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/transfer-student-time2nd-period', 'http://www.funimation.com/shows/assassination-classroom/videos/official/transfer-student-time2nd-period', 'http://www.funimation.com/shows/assassination-classroom/videos/official/transfer-student-time2nd-period', 'http://www.funimation.com/shows/assassination-classroom/videos/official/ball-game-tournament-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/ball-game-tournament-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/ball-game-tournament-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/talent-time', 
'http://www.funimation.com/shows/assassination-classroom/videos/official/talent-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/talent-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/vision-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/vision-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/vision-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/end-of-term-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/end-of-term-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/end-of-term-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/schools-out1st-term', 'http://www.funimation.com/shows/assassination-classroom/videos/official/schools-out1st-term', 'http://www.funimation.com/shows/assassination-classroom/videos/official/schools-out1st-term', 'http://www.funimation.com/shows/assassination-classroom/videos/official/island-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/island-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/island-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/action-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/action-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/action-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/pandemonium-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/pandemonium-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/pandemonium-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/karma-time2nd-period', 'http://www.funimation.com/shows/assassination-classroom/videos/official/karma-time2nd-period', 'http://www.funimation.com/shows/assassination-classroom/videos/official/karma-time2nd-period', 'http://www.funimation.com/shows/deadman-wonderland', 'http://www.funimation.com/shows/deadman-wonderland', 'http://www.funimation.com/shows/riddle-story-of-devil', 'http://www.funimation.com/shows/riddle-story-of-devil', 'http://www.funimation.com/shows/soul-eater', 'http://www.funimation.com/shows/soul-eater', 'http://www.funimation.com/shows/assassination-classroom/videos/official/xx-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/xx-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/xx-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/nagisa-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/nagisa-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/nagisa-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/summer-festival-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/summer-festival-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/summer-festival-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/kaede-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/kaede-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/kaede-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/itona-horibe-time', 
'http://www.funimation.com/shows/assassination-classroom/videos/official/itona-horibe-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/itona-horibe-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/spinning-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/spinning-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/spinning-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/leader-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/leader-time', 'http://www.funimation.com/shows/assassination-classroom/videos/official/leader-time', 'http://www.funimation.com/shows/deadman-wonderland', 'http://www.funimation.com/shows/deadman-wonderland', 'http://www.funimation.com/shows/riddle-story-of-devil', 'http://www.funimation.com/shows/riddle-story-of-devil', 'http://www.funimation.com/shows/soul-eater', 'http://www.funimation.com/shows/soul-eater']
Basically, I use webelement.is_displayed() to check whether it is clickable or not.
isLinkDisplay = driver.find_element_by_link_text(">").is_displayed()
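Note that find_element_by_link_text raises if the element is missing from the DOM entirely, so a guarded sketch of the same check could be:
from selenium.common.exceptions import NoSuchElementException
try:
    isLinkDisplay = driver.find_element_by_link_text(">").is_displayed()
except NoSuchElementException:
    isLinkDisplay = False  # the ">" link is not in the DOM at all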