I'm trying to scrape pricing data for each of numerous vehicles, for example:
http://www.leasingcar.dk/privatleasing/Citro%C3%ABn-C1/VTi-68-Feel
I'm iterating over the selection boxes "leasingPeriod" and then "annualMileage".
My problem is that by the time the request has returned, I've already scraped the data, so I'm retrieving the same price every time. I've tried to use an implicit wait, but it doesn't seem to have any effect. I've also tried to wait for the completion of the AJAX calls, but to no avail.
My code looks like this:
# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.support.select import Select
from selenium.common.exceptions import StaleElementReferenceException, WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import unittest
class DataTest(unittest.TestCase):
    """Print the price shown for every leasing-period / annual-mileage
    combination on each vehicle page linked from the overview page.
    """

    def setUp(self):
        """Start Firefox and open the private-leasing overview page."""
        self.driver = webdriver.Firefox()
        self.driver.get("http://www.leasingcar.dk/privatleasing")

    def _option_values(self, css_selector):
        """Return the ``value`` attribute of every <option> below *css_selector*."""
        select_element = self.driver.find_element(By.CSS_SELECTOR, css_selector)
        return [
            option.get_attribute("value")
            for option in select_element.find_elements(By.TAG_NAME, "option")
        ]

    def testData(self):
        """Visit every vehicle and print the price for each option pair."""
        driver = self.driver
        wait = WebDriverWait(driver, 10)

        # Read the link targets before navigating: the list is built from
        # elements of the overview page, which is left in the loop below.
        vehicle_links = [
            element.get_attribute("data-nice_url")
            for element in driver.find_elements(
                By.CSS_SELECTOR, "div.vehicle[data-nice_url]")
        ]

        for link in vehicle_links:
            # Plain concatenation instead of str(link): under Python 2,
            # str() raises UnicodeEncodeError on non-ASCII unicode text.
            driver.get("http://www.leasingcar.dk" + link)

            period_values = self._option_values("select[id=leasingPeriod]")
            mileage_values = self._option_values("select[id=annualMileage]")

            for period_value in period_values:
                print("starting loop on option %s" % period_value)
                # The <select> is looked up again for every selection, as in
                # the original, rather than holding on to one element.
                Select(driver.find_element(
                    By.CSS_SELECTOR, "select[id=leasingPeriod]")
                ).select_by_value(period_value)

                for mileage_value in mileage_values:
                    Select(driver.find_element(
                        By.CSS_SELECTOR, "select[id=annualMileage]")
                    ).select_by_value(mileage_value)
                    # NOTE(review): this condition only requires a visible
                    # price containing "Kr."; the price shown before the
                    # selection changed satisfies it too, so it does not by
                    # itself prove that the value has been refreshed.
                    price = wait.until(
                        wait_for_visible_element_text_to_contain(
                            (By.CSS_SELECTOR, "span.total-price"), "Kr."))
                    print(price.text)

    def tearDown(self):
        """Shut the browser down."""
        self.driver.quit()
class wait_for_visible_element_text_to_contain(object):
    """Wait condition: a displayed element matching *locator* contains *text*.

    Meant to be passed to ``WebDriverWait.until``. Calling it returns the
    first matching element, or ``False`` when there is none yet.
    """

    def __init__(self, locator, text):
        # locator is a (by, value) pair, e.g. (By.CSS_SELECTOR, "span.x").
        self.locator = locator
        self.text = text

    def __call__(self, driver):
        try:
            # Public lookup API instead of the private EC._find_elements.
            for element in driver.find_elements(*self.locator):
                if self.text in element.text and element.is_displayed():
                    return element
        except StaleElementReferenceException:
            # The page was re-rendered while it was inspected; poll again.
            pass
        return False


def ajax_complete(driver):
    """Return True when jQuery reports no active requests.

    Returns False when requests are pending or the script cannot be run.
    """
    try:
        return 0 == driver.execute_script("return jQuery.active")
    except WebDriverException:
        return False


# The entry point comes last: unittest.main() runs the tests and exits, so
# everything the tests use has to be defined before this point.
if __name__ == '__main__':
    unittest.main()
Is there any way to check whether requests have been completed, or if a value has been refreshed?
Frank,
Use an explicit wait to determine when the "Leasing Period" changes in the data table. Something like:
from selenium.webdriver.support.ui import WebDriverWait
# XPath attribute tests are written with "@" ("#class" is not valid XPath).
xpath = "//div[@class='data-last']/span[@class='period']"
for elm in driver.find_elements_by_xpath(xpath):
    # Skip the hidden element that matches the same XPath.
    if elm.is_displayed():
        # Block until the displayed period reads "48".
        WebDriverWait(driver, 10).until(
            lambda _: elm.text == "48"
        )
NOTE: I had to use find_elements_by_xpath and check if the elements are displayed because there is a hidden element with that same xpath.
Related
How can I make Selenium search for a specific line in the source code of a website?
I am searching for the line shown in the attached photo.
Thanks in advance!
Image Info Website
You can get the inner text of an element by either:
inner_text = element.get_attribute('innerText');
inner_text = element.text;
Therefore, you can scan all those divs with the condition of inner_text == "Fehler".
Since the ids follow a pattern, here's how you can scan the divs and select the element:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
def id_value(index):
    """Build the element id ``cdk-describe-message-<index>``."""
    return "cdk-describe-message-{}".format(index)
def find_innertext(driver, url, text, N):
    """Return the first probed element whose text equals *text*.

    Loads *url*, then looks up the ids produced by ``id_value(i)`` for
    ``i`` in ``0 .. N-1``. Returns ``None`` when no probed element has
    exactly that text.
    """
    driver.get(url)
    for idx in range(N):
        # find_elements returns a (possibly empty) list, so it has to be
        # iterated; an id that is missing from the page is simply skipped.
        for element in driver.find_elements(By.ID, id_value(idx)):
            if element.text == text:
                return element
    return None
def main():
    """Start Firefox and return what ``find_innertext`` finds for "Fehler"."""
    browser = webdriver.Firefox()
    # Arguments: driver, page URL, text to look for, number of ids to probe.
    return find_innertext(browser, "https://....", "Fehler", 40)


if __name__ == "__main__":
    element = main()
Notes:
you have to know the number of divs elements there are in advance;
consider using waits.
This is what I have so far:
def main():
    """Log in to the web UI and click through to the first navigation
    link below the monitoring entry.
    """
    chrome_options = webdriver.ChromeOptions()
    #chrome_options.add_argument("--incognito")
    chrome_options.add_argument('ignore-certificate-errors')
    #chrome_options.add_argument("--headless")
    driver = webdriver.Chrome(options=chrome_options)

    smalogin = "https://172.16.63.100/webui/login"
    driver.get(smalogin)
    # Applies to every element lookup below.
    driver.implicitly_wait(100)

    email = driver.find_element(By.NAME, "username")
    email.send_keys('user')
    password = driver.find_element(By.NAME, "password")
    password.send_keys('pass')

    # XPath attribute tests need "@" ("#id" is not valid XPath).
    submit = driver.find_element(By.XPATH, '//*[@id="login"]/button')
    submit.click()

    monitoring = driver.find_element(By.ID, "ennexos-element-monitoring")
    monitoring.click()

    statusliste = driver.find_element(
        By.XPATH,
        '//*[@id="cdk-accordion-child-0"]/div/div/sma-feature-board-slot[1]/sma-navigation-link')
    statusliste.click()
    time.sleep(10)


if __name__ == '__main__':
    main()
I had posted in Stack Exchange earlier; however, did not get much response from that yet; hence, posting it here.
I am trying to scrape some data using the following code. When I run the code line by line, it works fine. However, when I want to run all code at one go, the dropdown options go blank and as a result, the last line returns error. Your help would be much appreciated. The code is below.
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.select import Select
import os
# Download directory handed to Chrome through its preferences.
path = os.path.join(r"D:\ScrapedData\TN\SocialAudit")
# Raw string: "\S" is not a valid escape sequence in a normal literal.
path_to_chromedriver = r'D:\ScrapedData/chromedriver'
options = webdriver.ChromeOptions()
prefs = {'download.default_directory' : path}
options.add_experimental_option('prefs', prefs)
browser = webdriver.Chrome(chrome_options=options ,executable_path=path_to_chromedriver)

url = "http://mnregaweb4.nic.in/netnrega/SocialAudit/StateList.aspx"
browser.get(url)
browser.set_page_load_timeout(45)
browser.maximize_window()
browser.find_element_by_link_text("BIHAR").click()
# implicitly_wait() does not pause the script; it sets how long element
# lookups keep retrying. Calling it again with the same value changes
# nothing, so each distinct value is set only once.
browser.implicitly_wait(5)

year=['2016-2017', '2017-2018', '2018-2019', '2019-2020']
elem2 = browser.find_element_by_name("ctl00$ContentPlaceHolder1$ddlFin")
elem2.send_keys(year[0])

select_dist = browser.find_element_by_name("ctl00$ContentPlaceHolder1$ddldist")
options = select_dist.find_elements_by_tag_name("option")
dist=[]

# NOTE(review): the indentation of this script was lost; the nesting below
# (district -> block -> panchayat -> request the reports) is reconstructed
# and should be confirmed against the original.
for e in range(len(options)):
    # Every dropdown is looked up again right before it is used.
    select_dist = Select(browser.find_element_by_name("ctl00$ContentPlaceHolder1$ddldist"))
    select_dist.select_by_index(e)
    select_block = Select(browser.find_element_by_name("ctl00$ContentPlaceHolder1$ddlblock"))
    options1 = select_block.options
    for f in range(len(options1)):
        select_block = Select(browser.find_element_by_name("ctl00$ContentPlaceHolder1$ddlblock"))
        select_block.select_by_index(f)
        select_gp = Select(browser.find_element_by_name("ctl00$ContentPlaceHolder1$ddlpanchayat"))
        options2 = select_gp.options
        for g in range(len(options2)):
            select_gp = Select(browser.find_element_by_name("ctl00$ContentPlaceHolder1$ddlpanchayat"))
            select_gp.select_by_index(g)
            browser.find_element_by_css_selector("#ctl00_ContentPlaceHolder1_rbLoginLevel_1").click()
            browser.implicitly_wait(10)
            elem6 = browser.find_element_by_name("ctl00$ContentPlaceHolder1$txtperiodFrom")
            elem6.send_keys('01/04/2016')
            elem7 = browser.find_element_by_name("ctl00$ContentPlaceHolder1$txtperiodTo")
            elem7.send_keys('31/03/2017')
            browser.find_element_by_css_selector("#ctl00_ContentPlaceHolder1_login").click()
            browser.find_element_by_link_text("Download All Reports").click()
Besides the fact that the target page is slower than an aged snail, and those 10-second waits are barely enough for anything, there are two things you missed, and those caused your troubles:
You did not take into account that the first element of each select's options is a "select an option" placeholder. So if you try to cycle through all of them, you must ignore the option at the first index, or else it will look like "nothing is selected".
Wait for the spinner. After the spinner is gone, the page will be refreshed. Do not grab the elements before the page refresh is complete; wait until the spinner is gone.
With these two helper functions it is possible to press the "Get Reports" button without issues:
def is_spinner_gone(arg):
    """Return True when the loading spinner is absent or hidden.

    *arg* is whatever ``WebDriverWait.until`` passes in; the module-level
    ``browser`` is the driver actually queried.
    """
    # find_elements returns an empty list instead of raising when nothing
    # matches, which makes the "no spinner at all" case reachable.
    # XPath attribute tests need "@" ("#class" is not valid XPath).
    spinners = browser.find_elements(By.XPATH, '//div[//div[@class="loader"]]')
    if not spinners:
        return True
    return spinners[0].get_attribute('style') == 'display: none;'
def wait_for_element(xpath):
    """Wait for the spinner to go away, then return the clickable element at *xpath*."""
    # The spinner does not pop up instantly, so give it a moment first.
    time.sleep(1)
    WebDriverWait(browser, 500).until(is_spinner_gone)
    clickable = EC.element_to_be_clickable((By.XPATH, xpath))
    return WebDriverWait(browser, 500).until(clickable)
If you get your elements via the wait_for_element call then you'll be able to interact with them without error. I guess you know that pressing that button is not the end of the road yet, you'll have to choose the report format and who knows what later on.
Adjusted code:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.select import Select
import os
import time
# Download directory handed to Chrome through its preferences.
path = os.path.join(r"D:\ScrapedData\TN\SocialAudit")
# Raw string: "\S" is not a valid escape sequence in a normal literal.
path_to_chromedriver = r'D:\ScrapedData/chromedriver'
options = webdriver.ChromeOptions()
prefs = {'download.default_directory' : path}
options.add_experimental_option('prefs', prefs)
browser = webdriver.Chrome(chrome_options=options ,executable_path=path_to_chromedriver)

# Time how long the state list takes to load.
start = time.time()
url = "http://mnregaweb4.nic.in/netnrega/SocialAudit/StateList.aspx"
browser.get(url)
browser.set_page_load_timeout(45)
browser.maximize_window()
loaded = time.time()
print(f'PAGE LOADED IN {loaded-start} seconds')
browser.find_element_by_link_text("BIHAR").click()
def is_spinner_gone(arg):
    """Return True when the loading spinner is absent or hidden.

    *arg* is whatever ``WebDriverWait.until`` passes in; the module-level
    ``browser`` is the driver actually queried.
    """
    # find_elements returns an empty list instead of raising when nothing
    # matches, which makes the "no spinner at all" case reachable.
    # XPath attribute tests need "@" ("#class" is not valid XPath).
    spinners = browser.find_elements(By.XPATH, '//div[//div[@class="loader"]]')
    if not spinners:
        return True
    return spinners[0].get_attribute('style') == 'display: none;'
def wait_for_element(xpath):
    """Wait for the spinner to go away, then return the clickable element at *xpath*."""
    # The spinner does not pop up instantly, so give it a moment first.
    time.sleep(1)
    WebDriverWait(browser, 500).until(is_spinner_gone)
    clickable = EC.element_to_be_clickable((By.XPATH, xpath))
    return WebDriverWait(browser, 500).until(clickable)
# All XPath attribute tests below use "@" ("#name" is not valid XPath).
year=['2016-2017', '2017-2018', '2018-2019', '2019-2020']
elem2 = wait_for_element('//*[@name="ctl00$ContentPlaceHolder1$ddlFin"]')
selector_page_loaded = time.time()
print(f'WORK AREA LOADED IN {selector_page_loaded-loaded} seconds')
elem2.send_keys(year[0])

select_dist = wait_for_element('//*[@name="ctl00$ContentPlaceHolder1$ddldist"]')
options = select_dist.find_elements_by_tag_name("option")
dist=[]

# NOTE(review): the indentation of this script was lost; the nesting below
# (district -> block -> panchayat -> request the reports) is reconstructed
# and should be confirmed against the original.
# Index 0 of every dropdown is the placeholder entry, so loops start at 1.
for e in range(1, len(options)):
    select_dist = Select(wait_for_element('//*[@name="ctl00$ContentPlaceHolder1$ddldist"]'))
    select_dist.select_by_index(e)
    select_block = Select(wait_for_element('//*[@name="ctl00$ContentPlaceHolder1$ddlblock"]'))
    options1 = select_block.options
    for f in range(1, len(options1)):
        select_block = Select(wait_for_element('//*[@name="ctl00$ContentPlaceHolder1$ddlblock"]'))
        select_block.select_by_index(f)
        select_gp = Select(wait_for_element('//*[@name="ctl00$ContentPlaceHolder1$ddlpanchayat"]'))
        options2 = select_gp.options
        for g in range(1, len(options2)):
            select_gp = Select(wait_for_element('//*[@name="ctl00$ContentPlaceHolder1$ddlpanchayat"]'))
            select_gp.select_by_index(g)
            wait_for_element('//*[@id="ctl00_ContentPlaceHolder1_rbLoginLevel_1"]').click()
            elem6 = wait_for_element('//*[@name="ctl00$ContentPlaceHolder1$txtperiodFrom"]')
            elem6.send_keys('01/04/2016')
            elem7 = wait_for_element('//*[@name="ctl00$ContentPlaceHolder1$txtperiodTo"]')
            elem7.send_keys('31/03/2017')
            wait_for_element('//*[@value="Get Reports"]').click()
            print(f'FIRST RUN IN {time.time()-selector_page_loaded}')
The website I am scraping is:
http://isystems.hpcsa.co.za/iregister/RegisterSearch.aspx
I am getting to page 10 with my code that is looking at the pagination numbers and iterating over them but it is failing when it wants to get past page 10 because there are three dots (...) that, if you click in the browser, it loads page 11 (Same for after page 20, page 30 etc). How can I update my code below so that it can deal with this error without breaking?
The code I am using is:
import re
import string
import urlparse
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import NoSuchElementException
from bs4 import BeautifulSoup
class DoctorScraper(object):
    """Print the profile URL of every practitioner found per province."""

    def __init__(self):
        self.url = "http://isystems.hpcsa.co.za/iregister/RegisterSearch.aspx"
        self.driver = webdriver.PhantomJS()
        self.driver.set_window_size(1120, 550)

    def _province_select(self):
        """Look the province dropdown up again.

        An element found before a search or page change is no longer
        valid afterwards (StaleElementReferenceException), so the
        dropdown is never kept across such steps.
        """
        return Select(self.driver.find_element_by_id('ddlProvince'))

    def _print_profile_links(self, href_pattern):
        """Print the absolute URL of every profile link on the current page."""
        soup = BeautifulSoup(self.driver.page_source)
        for anchor in soup.findAll('a', attrs={'href': href_pattern}):
            print('View Doctor URL:  %s'
                  % urlparse.urljoin(self.driver.current_url, anchor['href']))
            print('')

    def _go_to_page(self, pageno):
        """Click the pager link that leads to *pageno*.

        Returns False when the pager offers no such link.
        """
        numbered = self.driver.find_elements_by_xpath("//a[text()='%d']" % pageno)
        if numbered:
            print("Next page:  %s" % numbered[0])
            numbered[0].click()
            return True
        # Pages 11, 21, 31, ... have no numbered link; they sit behind the
        # "..." entry of the pager.
        # NOTE(review): assumes the last "..." link moves forward - confirm
        # on the final group of pages, where only a backward link may exist.
        if pageno % 10 == 1:
            more = self.driver.find_elements_by_xpath("//a[text()='...']")
            if more:
                print("Next page:  %s" % more[-1])
                more[-1].click()
                return True
        return False

    def scrape(self):
        """Run the search for the first three provinces and print all links."""
        # Compiled once: matches links to a practitioner profile.
        profile_href = re.compile(r'^PractitionerView\.aspx\?FILENO=([A-Z0-9-]+)$')
        try:
            self.driver.get(self.url)
            # Choose to search by region; carry on when the checkbox is
            # missing, as the original did.
            try:
                self.driver.find_element_by_id('SearchChkb_5').click()
            except NoSuchElementException:
                pass

            # Option 0 is skipped; only the first three provinces are used.
            province_count = len(self._province_select().options)
            for index in range(1, min(province_count, 4)):
                self._province_select().select_by_index(index)
                self.driver.find_element_by_id('cmdSearch').click()

                pageno = 2
                while True:
                    self._print_profile_links(profile_href)
                    if not self._go_to_page(pageno):
                        break  # no more pages
                    print('page  %d \n' % pageno)
                    pageno += 1
        finally:
            # Always end the browser session, also when scraping fails.
            self.driver.quit()


if __name__ == '__main__':
    scraper = DoctorScraper()
    scraper.scrape()
I am getting this error:
StaleElementReferenceException: {"errorMessage":"Element does not exist in cache","request":{"headers":{"Accept":"application/json","Accept-Encoding":"identity","Connection":"close","Content-Length":"121","Content-Type":"application/json;charset=UTF-8","Host":"127.0.0.1:63135","User-Agent":"Python http auth"},"httpVersion":"1.1","method":"POST","post":"{\"using\": \"tag name\", \"sessionId\": \"ef6d0590-a2d6-11e7-91fa-5773b3326267\", \"id\": \":wdc:1506442969197\", \"value\": \"option\"}","url":"/elements","urlParsed":{"anchor":"","query":"","file":"elements","directory":"/","path":"/elements","relative":"/elements","port":"","host":"","password":"","user":"","userInfo":"","authority":"","protocol":"","source":"/elements","queryKey":{},"chunks":["elements"]},"urlOriginal":"/session/ef6d0590-a2d6-11e7-91fa-5773b3326267/element/:wdc:1506442969197/elements"}}
The main problem with this site is that the clickable elements frequently move out of view, which throws an "element not clickable" error. However, I've already fixed it. If you have ChromeDriver installed on your machine, just run it and see the magic. It will flawlessly traverse all the pages, no matter how many there are. I've checked it.
from selenium import webdriver ; import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
main_link = 'http://isystems.hpcsa.co.za/iregister/RegisterSearch.aspx'


def get_content(driver,wait,link):
    """Search for 'WESTERN CAPE' and click through every result page.

    driver -- the WebDriver to use
    wait   -- a WebDriverWait bound to *driver*
    link   -- URL of the search page
    """
    scroll_to_bottom = "window.scrollTo(0, document.body.scrollHeight);"

    driver.get(link)
    driver.find_element_by_id('SearchChkb_5').click()
    select = Select(driver.find_element_by_id('ddlProvince'))
    select.select_by_visible_text('WESTERN CAPE')
    # Scroll down so the controls near the bottom can be clicked.
    driver.execute_script(scroll_to_bottom)
    elem = wait.until(EC.visibility_of_element_located((By.ID, 'cmdSearch')))
    elem.click()
    driver.execute_script(scroll_to_bottom)

    page_counter = 2
    while True:
        try:
            if page_counter % 10 != 1:
                driver.find_element_by_link_text(str(page_counter)).click()
            else:
                # Pages 11, 21, ... are reached through the last "..." link.
                more_links = driver.find_elements_by_link_text("...")
                if not more_links:
                    # No "..." link left: stop instead of failing on [-1].
                    break
                more_links[-1].click()
                time.sleep(2)
                driver.execute_script(scroll_to_bottom)
        except NoSuchElementException:
            break
        page_counter += 1


if __name__ == '__main__':
    driver = webdriver.Chrome()
    wait = WebDriverWait(driver, 10)
    try:
        get_content(driver,wait,main_link)
    finally:
        # quit() ends the whole session; close() only closes the window.
        driver.quit()
And using Class:
from selenium import webdriver ; import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
class DoctorScraper(object):
    """Search for 'WESTERN CAPE' and click through every result page."""

    SCROLL_TO_BOTTOM = "window.scrollTo(0, document.body.scrollHeight);"

    def __init__(self):
        self.url = "http://isystems.hpcsa.co.za/iregister/RegisterSearch.aspx"
        self.driver = webdriver.Chrome()
        self.wait = WebDriverWait(self.driver, 10)

    def __del__(self):
        # The driver attribute is missing when __init__ failed early.
        driver = getattr(self, "driver", None)
        if driver is not None:
            # quit() ends the whole session; close() only closes the window.
            driver.quit()

    def controlling_pagination(self):
        """Click page links 2, 3, ... until no further link is found."""
        self.driver.execute_script(self.SCROLL_TO_BOTTOM)
        page_counter = 2
        while True:
            try:
                if page_counter % 10 != 1:
                    self.driver.find_element_by_link_text(str(page_counter)).click()
                else:
                    # Pages 11, 21, ... are reached through the last "..." link.
                    more_links = self.driver.find_elements_by_link_text("...")
                    if not more_links:
                        # No "..." link left: stop instead of failing on [-1].
                        break
                    more_links[-1].click()
                    time.sleep(2)
                    self.driver.execute_script(self.SCROLL_TO_BOTTOM)
            except NoSuchElementException:
                break
            page_counter += 1

    def get_content(self):
        """Open the search page, run the search, then walk the result pages."""
        self.driver.get(self.url)
        self.driver.find_element_by_id('SearchChkb_5').click()
        select = Select(self.driver.find_element_by_id('ddlProvince'))
        select.select_by_visible_text('WESTERN CAPE')
        # Scroll down so the controls near the bottom can be clicked.
        self.driver.execute_script(self.SCROLL_TO_BOTTOM)
        elem = self.wait.until(EC.visibility_of_element_located((By.ID, 'cmdSearch')))
        elem.click()
        self.controlling_pagination()


if __name__ == '__main__':
    scraper = DoctorScraper()
    scraper.get_content()
Btw, take a look at the bottom of the image where you can see the changes of pages:
I am writing a script using Selenium with Python, but there is a problem. I have tried to find a solution, but I could not find one that was helpful to me. Here is the code:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import unittest
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
class sulekhastart(unittest.TestCase):
    """Print every business link page by page until the page number stops changing."""

    def setUp(self):
        self.driver = webdriver.Firefox()

    def test_parse_contact_urls_and_go_to_next_page(self):
        """Walk the result pages and print the href and text of each business link."""
        # Page label seen in the previous iteration.
        pagenumber = 'Page'
        driver = self.driver
        driver.get("http://www.sulekha.com/ac-dealers/bangalore")
        self.assertIn("Sulekha Bangalore", driver.title)

        # Close the lightbox that appears on the first load of the page.
        # XPath attribute tests need "@" ("#class" is not valid XPath).
        driver.find_element_by_xpath('//a[@class="lcf-close"]').click()

        while True:
            pageno = driver.find_element_by_xpath('//li[@id="numberPage"]/strong')
            page_text = pageno.text
            print(page_text)
            print(pagenumber)

            # An unchanged page number means the previous click did not move on.
            if str(page_text) == pagenumber:
                print('This is last page all data is scraped change url and get another data')
                break
            pagenumber = str(page_text)

            for business in driver.find_elements_by_xpath(
                    '//li/div/div[@class="busi-name"]/h3/a'):
                print(business.get_attribute('href'))
                print(business.text)

            driver.find_element_by_xpath('//li[@id="nextPage"]').click()
            # Wait until the svg element has left the DOM. A plain "svg" step
            # does not match the element, so it is matched by name instead.
            WebDriverWait(driver, 10).until_not(
                EC.presence_of_element_located((
                    By.XPATH, "/html/body/div/div/*[name()='svg']")))

    def tearDown(self):
        # quit() ends the whole session; close() only closes the window.
        self.driver.quit()
        print('page not be closed')


if __name__ == "__main__":
    unittest.main()
I want the script to wait, after clicking the next button, until the element at By.XPATH, "/html/body/div/div/svg" is gone from the DOM (the page source), and then wait a further 3 seconds.
as andersson commented
replacing
element = WebDriverWait(driver, 10).until_not(
EC.presence_of_element_located((
By.XPATH, "/html/body/div/div/svg")))
with
element = WebDriverWait(driver, 10).until_not(
EC.presence_of_element_located((
By.XPATH, "/html/body/div/div/*[name()='svg']")))
solves the problem
Q:
I'm using Selenium to get a page with contents. After I click the "more" button, the page outputs more content. How do I get the new page through the webdriver?
some codes like this:
def parase_questions(self):
    """Click the "more" button, then print the page source and the
    number of ``.question_link`` elements found.
    """
    driver = self.login()
    driver.implicitly_wait(2)
    more_btn = driver.find_element_by_css_selector(".zg-btn-white.zg-r3px.zu-button-more")
    more_btn.click()
    # NOTE(review): nothing waits here for the additional links to load,
    # so the output below may still reflect the page before the click.
    print(driver.page_source)
    question_links = driver.find_elements_by_css_selector('.question_link')
    print(len(question_links))
If I understand you correctly, after you click the More button, there are more elements with question_link class loaded. You would need a way to wait for the question links to be loaded.
Here is one idea - a custom Expected Condition that would help you to wait until there are more than N number of elements:
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.support import expected_conditions as EC
class wait_for_more_than_n_elements(object):
    """Wait condition: more than *count* elements match *locator*.

    Meant to be passed to ``WebDriverWait.until``.
    """

    def __init__(self, locator, count):
        # locator is a (by, value) pair, e.g. (By.CSS_SELECTOR, ".x").
        self.locator = locator
        self.count = count

    def __call__(self, driver):
        try:
            # Public lookup API instead of the private EC._find_elements.
            return len(driver.find_elements(*self.locator)) > self.count
        except StaleElementReferenceException:
            return False
Usage:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
driver = self.login()
driver.implicitly_wait(2)

# Count the links before clicking, so the wait has a baseline to beat.
question_links = driver.find_elements_by_css_selector('.question_link')

more_btn = driver.find_element_by_css_selector(".zg-btn-white.zg-r3px.zu-button-more")
more_btn.click()

# wait until more links are present than before the click
wait = WebDriverWait(driver, 10)
wait.until(wait_for_more_than_n_elements((By.CSS_SELECTOR, ".question_link"), len(question_links)))

# now more question links were loaded, get the page source
print(driver.page_source)