The website I am scraping is:
http://isystems.hpcsa.co.za/iregister/RegisterSearch.aspx
My code iterates over the pagination numbers and gets as far as page 10, but it fails when it tries to go past page 10: the pager shows three dots (...) which, when clicked in the browser, load page 11 (and the same happens after page 20, page 30, etc.). How can I update my code below so that it handles this without breaking?
The code I am using is:
import re
import string
import urlparse

from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import NoSuchElementException
from bs4 import BeautifulSoup


class DoctorScraper(object):
    def __init__(self):
        self.url = "http://isystems.hpcsa.co.za/iregister/RegisterSearch.aspx"
        self.driver = webdriver.PhantomJS()
        self.driver.set_window_size(1120, 550)

    def scrape(self):
        self.driver.get(self.url)

        # choose to search using the region
        try:
            self.driver.find_element_by_id('SearchChkb_5').click()
        except NoSuchElementException:
            pass

        # get the provinces that are available
        select = Select(self.driver.find_element_by_id('ddlProvince'))
        option_indexes = range(1, len(select.options))

        # iterate through the provinces
        for index in option_indexes[:3]:
            select.select_by_index(index)

            # click the search button
            self.driver.find_element_by_id('cmdSearch').click()

            pageno = 2
            while True:
                # create a BeautifulSoup object from the page source
                s = BeautifulSoup(self.driver.page_source)

                # get all links that match viewing a practitioner profile
                r1 = re.compile(r'^PractitionerView\.aspx\?FILENO=([A-Z0-9-]+)$')
                # create a dictionary of the attributes
                x = {'href': r1}

                # in the page source, find all links that have the attributes stated in x
                for a in s.findAll('a', attrs=x):
                    print 'View Doctor URL: ', urlparse.urljoin(self.driver.current_url, a['href'])
                    print

                # Pagination
                try:
                    next_page_elem = self.driver.find_element_by_xpath("//a[text()='%d']" % pageno)
                    print "Next page: ", next_page_elem
                except NoSuchElementException:
                    break  # no more pages

                print 'page ', pageno, '\n'
                next_page_elem.click()
                pageno += 1

        self.driver.quit()


if __name__ == '__main__':
    scraper = DoctorScraper()
    scraper.scrape()
I am getting this error:
StaleElementReferenceException: {"errorMessage":"Element does not exist in cache","request":{"headers":{"Accept":"application/json","Accept-Encoding":"identity","Connection":"close","Content-Length":"121","Content-Type":"application/json;charset=UTF-8","Host":"127.0.0.1:63135","User-Agent":"Python http auth"},"httpVersion":"1.1","method":"POST","post":"{\"using\": \"tag name\", \"sessionId\": \"ef6d0590-a2d6-11e7-91fa-5773b3326267\", \"id\": \":wdc:1506442969197\", \"value\": \"option\"}","url":"/elements","urlParsed":{"anchor":"","query":"","file":"elements","directory":"/","path":"/elements","relative":"/elements","port":"","host":"","password":"","user":"","userInfo":"","authority":"","protocol":"","source":"/elements","queryKey":{},"chunks":["elements"]},"urlOriginal":"/session/ef6d0590-a2d6-11e7-91fa-5773b3326267/element/:wdc:1506442969197/elements"}}
The main problem with this site is that the clickable elements frequently go out of view, which throws an "element not clickable" error. However, I've already handled that. If you have ChromeDriver installed on your machine, just run the script below and watch it work. It traverses all the pages, no matter how many there are. I've checked it.
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

main_link = 'http://isystems.hpcsa.co.za/iregister/RegisterSearch.aspx'


def get_content(driver, wait, link):
    driver.get(link)
    driver.find_element_by_id('SearchChkb_5').click()
    select = Select(driver.find_element_by_id('ddlProvince'))
    select.select_by_visible_text('WESTERN CAPE')
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    elem = wait.until(EC.visibility_of_element_located((By.ID, 'cmdSearch')))
    elem.click()
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    page_counter = 2
    while True:
        try:
            if not page_counter % 10 == 1:
                # a numbered link exists for this page, so click it directly
                driver.find_element_by_link_text(str(page_counter)).click()
                page_counter += 1
            else:
                # pages 11, 21, 31, ... are only reachable via the trailing "..." link
                driver.find_elements_by_link_text("...")[-1].click()
                time.sleep(2)
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                page_counter += 1
        except NoSuchElementException:
            break


if __name__ == '__main__':
    driver = webdriver.Chrome()
    wait = WebDriverWait(driver, 10)
    try:
        get_content(driver, wait, main_link)
    finally:
        driver.close()
And the same using a class:
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


class DoctorScraper(object):
    def __init__(self):
        self.url = "http://isystems.hpcsa.co.za/iregister/RegisterSearch.aspx"
        self.driver = webdriver.Chrome()
        self.wait = WebDriverWait(self.driver, 10)

    def __del__(self):
        self.driver.close()

    def controlling_pagination(self):
        self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        page_counter = 2
        while True:
            try:
                if not page_counter % 10 == 1:
                    self.driver.find_element_by_link_text(str(page_counter)).click()
                    page_counter += 1
                else:
                    self.driver.find_elements_by_link_text("...")[-1].click()
                    time.sleep(2)
                    self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                    page_counter += 1
            except NoSuchElementException:
                break

    def get_content(self):
        self.driver.get(self.url)
        self.driver.find_element_by_id('SearchChkb_5').click()
        select = Select(self.driver.find_element_by_id('ddlProvince'))
        select.select_by_visible_text('WESTERN CAPE')
        self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        elem = self.wait.until(EC.visibility_of_element_located((By.ID, 'cmdSearch')))
        elem.click()
        self.controlling_pagination()


if __name__ == '__main__':
    scraper = DoctorScraper()
    scraper.get_content()
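Note that both snippets above only traverse the result pages; they drop the practitioner-link extraction that your original scraper did. As a rough sketch (reusing the PractitionerView.aspx pattern from your code, and assuming BeautifulSoup is available), you could re-parse the page source on every pass through the pagination loop:

import re
from urllib.parse import urljoin  # Python 3; on Python 2 use: from urlparse import urljoin

from bs4 import BeautifulSoup


def collect_doctor_links(driver):
    # parse the current results page and return absolute practitioner profile URLs
    soup = BeautifulSoup(driver.page_source, "html.parser")
    pattern = re.compile(r'^PractitionerView\.aspx\?FILENO=([A-Z0-9-]+)$')
    return [urljoin(driver.current_url, a['href'])
            for a in soup.find_all('a', attrs={'href': pattern})]

Calling collect_doctor_links(self.driver) once right after the search click and again at the top of each iteration of the while loop in controlling_pagination should reproduce the URL output of your original scraper.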
Related
I'm trying to get the tyres' details from this page: https://eurawheels.com/fr/catalogue/BBS
links = driver.find_elements_by_xpath('//div[@class="col-xs-1 col-md-3"]//a')
parent_window = driver.current_window_handle
x = 0
for j in range(len(links)):
    driver.execute_script('window.open(arguments[0]);', links[j])
    # scraping here
    if x == 0:
        driver.close()
        driver.switch_to.window(parent_window)
        x += 1
    else:
        driver.back()
        driver.refresh()  # refresh page
        tyres = WebDriverWait(driver, 25).until(EC.visibility_of_all_elements_located((By.XPATH, '//div[@class="card-body text-center"]//a')))  # redefine links
        time.sleep(4)
It works for 10 links, but then the links go stale. I cannot figure out what needs to be changed. Any help is welcome.
You need to scroll each element into view before executing driver.execute_script('window.open(arguments[0]);', links[j]), since not all the elements are initially loaded on the page.
So your code should look like the following:
from selenium.webdriver.common.action_chains import ActionChains

actions = ActionChains(driver)

links = driver.find_elements_by_xpath('//div[@class="col-xs-1 col-md-3"]//a')
parent_window = driver.current_window_handle
x = 0
for j in range(len(links)):
    actions.move_to_element(links[j]).perform()  # scroll the link into view before opening it
    driver.execute_script('window.open(arguments[0]);', links[j])
    # scraping here
    if x == 0:
        driver.close()
        driver.switch_to.window(parent_window)
        x += 1
    else:
        driver.back()
        driver.refresh()  # refresh page
        tyres = WebDriverWait(driver, 25).until(EC.visibility_of_all_elements_located((By.XPATH, '//div[@class="card-body text-center"]//a')))  # redefine links
        time.sleep(4)
Try this:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

link = 'https://eurawheels.com/fr/catalogue/BBS'

with webdriver.Chrome() as driver:
    wait = WebDriverWait(driver, 15)
    driver.get(link)
    linklist = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".card-body > a")))
    for i, elem in enumerate(linklist):
        linklist[i].click()
        wait.until(EC.invisibility_of_element_located((By.CSS_SELECTOR, ".spinner-border[role='status']")))
        time.sleep(2)  # if you kick out this delay, the script will run very fast but you may end up getting the same results multiple times
        item = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "h3"))).text
        print(item)
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "h1.modal-title + button[class='close'][data-dismiss='modal']"))).click()
        driver.back()
I'm trying to scrape this site; its URL doesn't change when the next page is clicked. So I used Selenium to click through to the next page, but that doesn't help: my driver keeps getting the old page even after the next page has been clicked. Is there any other way to get to the next page and scrape it?
from selenium import webdriver
from selenium.webdriver.common.by import By  # needed for By.XPATH below
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup

driver = webdriver.Safari()
store_pages = []

# 10306 is the total number of pages.
for i in range(10306):
    Starting_url = 'site'
    driver.get(Starting_url)
    html = driver.page_source
    soup = BeautifulSoup(html, "lxml")
    print(store_pages.append(i))

    timeout = 20
    try:
        WebDriverWait(driver, timeout).until(EC.visibility_of_element_located((By.XPATH, "//*[@id='ctl00_SPWebPartManager1_g_d6877ff2_42a8_4804_8802_6d49230dae8a_lblDisclaimerMsg']")))
    except TimeoutException:
        print("Timed out waiting for page to load")
        driver.quit()

    nextpage_url = driver.find_element_by_name("ctl00$SPWebPartManager1$g_d6877ff2_42a8_4804_8802_6d49230dae8a$ctl00$imgbtnNext").click()
    timeout = 20
    wait = WebDriverWait(driver, 10).until(EC.text_to_be_present_in_element_value((By.CSS_SELECTOR, '#ctl00_SPWebPartManager1_g_d6877ff2_42a8_4804_8802_6d49230dae8a > div.act_search_results > div.act_search_header'), "206113 Record(s) | Page [2 of 10306]"))

    NGO_element = driver.find_element_by_class_name("faq-sub-content exempted-result")
    NGO_name = NGO_element.find_elements_by_tag_name("h1")
    NGO_name_pancard = driver.find_elements_by_class_name("pan-id")
    NGO_data = NGO_element.find_elements_by_tag_name("ul")
    NGO_sub_data = NGO_element.find_elements_by_tag_name("li")

    for i, p, t in zip(NGO_name, NGO_name_pancard, NGO_data):
        n_name = i.text.replace(p.text, '')
        n_data = t.text
        n_pan = p.text
        print("Name of NGO:", n_name, "Fields of NGO:", n_data, "Pancard number:", n_pan)

    nextpage_url = driver.find_element_by_name("ctl00$SPWebPartManager1$g_d6877ff2_42a8_4804_8802_6d49230dae8a$ctl00$imgbtnNext").click()
    # timeout = 2
You need to make sure that when you reach the next page, the content of the earlier page has become stale; otherwise, you will get a stale element error or scrape the same content repeatedly. Try the approach below; it should get you there. The rest you can modify yourself.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
wait = WebDriverWait(driver, 10)
driver.get("http://www.incometaxindia.gov.in/Pages/utilities/exempted-institutions.aspx")

while True:
    for elem in wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "[id^='arrowex']"))):
        print(elem.text)
    try:
        wait.until(EC.presence_of_element_located((By.ID, "ctl00_SPWebPartManager1_g_d6877ff2_42a8_4804_8802_6d49230dae8a_ctl00_imgbtnNext"))).click()
        wait.until(EC.staleness_of(elem))
    except:
        break

driver.quit()
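The loop above only prints the name links (the elements whose id starts with arrowex). If you also want the PAN numbers, as in your original code, a hedged sketch, reusing the pan-id class from your snippet and assuming the name headings and PAN elements pair up one-to-one on each page, might look like this inside the while loop:

# inside the while loop, instead of only printing the arrowex links
names = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "[id^='arrowex']")))
pans = driver.find_elements_by_class_name("pan-id")
for name, pan in zip(names, pans):
    # strip the PAN text out of the combined heading, as the original code does
    print("Name of NGO:", name.text.replace(pan.text, ""), "Pancard number:", pan.text)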
I am writing a script using Selenium with Python, but there is a problem. I have tried to find a solution but could not find one that was helpful to me. Here is the code:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import unittest
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


class sulekhastart(unittest.TestCase):
    def setUp(self):
        self.driver = webdriver.Firefox()

    def test_parse_contact_urls_and_go_to_next_page(self):
        pagenumber = 'Page'
        # assign webdriver to a local variable
        driver = self.driver
        # open the website via the url below
        driver.get("http://www.sulekha.com/ac-dealers/bangalore")
        self.assertIn("Sulekha Bangalore", driver.title)
        # close the lightbox that appears on the first load of the page
        startlightbox = driver.find_element_by_xpath('//a[@class="lcf-close"]')
        startlightbox.click()
        while True:
            # get the page number
            pageno = driver.find_element_by_xpath('//li[@id="numberPage"]/strong')
            print pageno.text
            print pagenumber
            # check whether this page is the same as the last page
            if str(pageno.text) != pagenumber:
                pagenumber = str(pageno.text)
                businessname = driver.find_elements_by_xpath('//li/div/div[@class="busi-name"]/h3/a')
                records = len(businessname)
                # print all data that is available on the webpage
                for i in range(0, records):
                    print businessname[i].get_attribute('href')
                    print businessname[i].text
                nextpage = driver.find_element_by_xpath('//li[@id="nextPage"]')
                nextpage.click()
            else:
                print 'This is last page all data is scraped change url and get another data'
                break
            element = WebDriverWait(driver, 10).until_not(EC.presence_of_element_located((By.XPATH, "/html/body/div/div/svg")))

    def tearDown(self):
        self.driver.close()
        print 'page not be closed'


if __name__ == "__main__":
    unittest.main()
I want the script to wait, after clicking the next button, until the element located by By.XPATH, "/html/body/div/div/svg" is gone from the DOM or page source, and then wait a further 3 seconds.
As andersson commented, replacing
element = WebDriverWait(driver, 10).until_not(
    EC.presence_of_element_located((
        By.XPATH, "/html/body/div/div/svg")))
with
element = WebDriverWait(driver, 10).until_not(
    EC.presence_of_element_located((
        By.XPATH, "/html/body/div/div/*[name()='svg']")))
solves the problem.
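For the second part of the question, the additional 3-second pause after the loader disappears, the simplest (if blunt) option is a fixed sleep right after that wait, e.g.:

import time

element = WebDriverWait(driver, 10).until_not(
    EC.presence_of_element_located((
        By.XPATH, "/html/body/div/div/*[name()='svg']")))
time.sleep(3)  # extra fixed pause after the svg loader is gone, as requested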
Q:
I'm using Selenium to get a page with content, and after I click the "more" button the page loads more content. How do I get the updated page through the webdriver?
Some of my code looks like this:
def parase_questions(self):
    driver = self.login()
    driver.implicitly_wait(2)
    more_btn = driver.find_element_by_css_selector(".zg-btn-white.zg-r3px.zu-button-more")
    more_btn.click()
    # should I do something to get the new driver?
    print driver.page_source
    question_links = driver.find_elements_by_css_selector('.question_link')
    print len(question_links)
If I understand you correctly, after you click the "More" button, more elements with the question_link class are loaded. You need a way to wait for those question links to be loaded.
Here is one idea: a custom Expected Condition that helps you wait until there are more than N elements:
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.support import expected_conditions as EC


class wait_for_more_than_n_elements(object):
    def __init__(self, locator, count):
        self.locator = locator
        self.count = count

    def __call__(self, driver):
        try:
            count = len(EC._find_elements(driver, self.locator))
            return count > self.count
        except StaleElementReferenceException:
            return False
Usage:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait

driver = self.login()
driver.implicitly_wait(2)

question_links = driver.find_elements_by_css_selector('.question_link')

more_btn = driver.find_element_by_css_selector(".zg-btn-white.zg-r3px.zu-button-more")
more_btn.click()

# wait
wait = WebDriverWait(driver, 10)
wait.until(wait_for_more_than_n_elements((By.CSS_SELECTOR, ".question_link"), len(question_links)))

# now more question links have been loaded, get the page source
print(driver.page_source)
I'm trying to scrape pricing data for numerous vehicles, for example:
http://www.leasingcar.dk/privatleasing/Citro%C3%ABn-C1/VTi-68-Feel
I'm iterating over the selection boxes "leasingPeriod" and then "annualMileage".
My problem is that by the time the request has returned, I've already scraped the data, so I'm retrieving the same price every time. I've tried to use an implicit wait, but it doesn't seem to have any effect. I've also tried to wait for the completion of the ajax calls, but to no avail.
My code looks like this:
# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.support.select import Select
from selenium.common.exceptions import StaleElementReferenceException, WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import unittest


class DataTest(unittest.TestCase):
    def setUp(self):
        self.driver = webdriver.Firefox()
        self.driver.get("http://www.leasingcar.dk/privatleasing")

    def testData(self):
        driver = self.driver
        vehicleLinksList = []
        vehicleLinks = driver.find_elements_by_css_selector('div.vehicle[data-nice_url]')
        for linkElement in vehicleLinks:
            vehicleLinksList.append(linkElement.get_attribute("data-nice_url"))
        for link in vehicleLinksList:
            fullUrl = ""
            fullUrl = "http://www.leasingcar.dk" + str(link)
            driver.get(fullUrl)
            leasingPeriodElements = driver.find_element_by_css_selector("select[id=leasingPeriod]")  # get the select element
            periodsOptions = leasingPeriodElements.find_elements_by_tag_name("option")  # get all the options into a list
            mileageElements = driver.find_element_by_css_selector("select[id=annualMileage]")  # get the select element
            mileageOptions = mileageElements.find_elements_by_tag_name("option")  # get all the options into a list

            periodOptionsList = []
            mileageOptionList = []
            for option in periodsOptions:
                periodOptionsList.append(option.get_attribute("value"))
            for option in mileageOptions:
                mileageOptionList.append(option.get_attribute("value"))

            for optionValue in periodOptionsList:
                print "starting loop on option %s" % optionValue
                leasingPeriodElement = Select(driver.find_element_by_css_selector("select[id=leasingPeriod]"))
                leasingPeriodElement.select_by_value(optionValue)

                for mileageValue in mileageOptionList:
                    mileageElement = Select(driver.find_element_by_css_selector("select[id=annualMileage]"))
                    mileageElement.select_by_value(mileageValue)
                    #driver.implicitly_wait(10)
                    #WebDriverWait(driver, 10).until(ajax_complete, "Timeout waiting for page to load")
                    wait = WebDriverWait(driver, 10)
                    price = wait.until(wait_for_visible_element_text_to_contain((By.CSS_SELECTOR, "span.total-price"), "Kr."))
                    print price.text
                    #driver.refresh()
                    #driver.implicitly_wait(10)

    def tearDown(self):
        self.driver.quit()


if __name__ == '__main__':
    unittest.main()
class wait_for_visible_element_text_to_contain(object):
    def __init__(self, locator, text):
        self.locator = locator
        self.text = text

    def __call__(self, driver):
        try:
            elements = EC._find_elements(driver, self.locator)
            for element in elements:
                if self.text in element.text and element.is_displayed():
                    return element
        except StaleElementReferenceException:
            return False


def ajax_complete(driver):
    try:
        return 0 == driver.execute_script("return jQuery.active")
    except WebDriverException:
        pass
Is there any way to check whether requests have been completed, or if a value has been refreshed?
Frank,
Use an explicit wait to determine when the "Leasing Period" changes in the data table. Something like:
from selenium.webdriver.support.ui import WebDriverWait

xpath = "//div[@class='data-last']/span[@class='period']"
for elm in driver.find_elements_by_xpath(xpath):
    if elm.is_displayed():
        WebDriverWait(driver, 10).until(
            lambda _: elm.text == "48"
        )
NOTE: I had to use find_elements_by_xpath and check if the elements are displayed because there is a hidden element with that same xpath.
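Another option for detecting that the value has actually refreshed (a sketch, not tested against this site, and assuming the span.total-price node is replaced rather than edited in place) is to grab the price element before changing the select and then wait for it to go stale:

from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

old_price = driver.find_element_by_css_selector("span.total-price")
mileageElement.select_by_value(mileageValue)  # triggers the ajax refresh
WebDriverWait(driver, 10).until(EC.staleness_of(old_price))  # old node has been replaced
print(driver.find_element_by_css_selector("span.total-price").text)  # re-find and read the new price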