I am trying to paginate through and scrape the details of each table row from this site:
https://www.cyprusbar.org/CypriotAdvocateMembersPage.aspx
I need to click each details box, get directed to a new window, repeat for the other records on the page, and then paginate. Here is my Selenium code:
import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options
PATH = 'chromedriver.exe'
options = Options()
options.add_experimental_option('excludeSwitches', ['enable-logging'])
options.add_argument("--lang=en")
driver = webdriver.Chrome(executable_path=PATH, options=options)
driver.maximize_window()
driver.get('https://www.cyprusbar.org/CypriotAdvocateMembersPage.aspx')
driver.find_element_by_xpath('//*[@id="Div1"]/input').click()
def wait(locator, id):
    element = WebDriverWait(driver, 50).until(
        EC.presence_of_all_elements_located((locator, id))
    )
    return element
DATA = []
name = '//*[@id="ctl00_ContentPlaceHolder1_TxtName_I"]'
postal = '//*[@id="ctl00_ContentPlaceHolder1_TxtPostalCode_I"]'
fax = '//*[@id="ctl00_ContentPlaceHolder1_TxtFax_I"]'
province = '//*[@id="ctl00_ContentPlaceHolder1_TxtDistrict_I"]'
email = '//*[@id="ctl00_ContentPlaceHolder1_TxtEmail_I"]'
address = '//*[@id="ctl00_ContentPlaceHolder1_TxtAddress_I"]'
phone = '//*[@id="ctl00_ContentPlaceHolder1_TxtPhone_I"]'
courtroom = '//*[@id="ctl00_ContentPlaceHolder1_TxtCourtBox_I"]'
webpage = '//*[@id="ctl00_ContentPlaceHolder1_TxtUrl_I"]'
details = ['Postal Code', 'Fax', 'Calendar Province', 'Email', 'Address', 'Phone', 'Courtroom', 'Webpage']
def gotopage(page):
    for p in range(page-1):
        next_page = driver.find_element_by_class_name('dxWeb_pNext_Material')
        action = ActionChains(driver)
        action.click(next_page)
        action.perform()
        time.sleep(4)
def each_page(page, new):
    global DATA
    curr = 0
    while curr < 80:
        if page > 1 and new:
            gotopage(page)
        # open the grid's bottom page-size selector and change it with the keyboard
        action = ActionChains(driver)
        action.move_to_element(driver.find_element_by_xpath('//*[@id="ctl00_ContentPlaceHolder1_LawyersGrid_DXPagerBottom_PSI"]')).click()
        action.perform()
        action.send_keys(Keys.ARROW_UP, Keys.RETURN)
        action.perform()
        time.sleep(17)
        data = {}
        action = ActionChains(driver)
        detail_list = wait(By.CLASS_NAME, 'dxb-hbc')
        try:
            action.click(detail_list[curr])
            action.perform()
        except IndexError:
            print(curr)
            driver.back()
            gotopage(page)
        data['Name'] = wait(By.XPATH, name)[0].get_attribute('value')
        for i, d in enumerate([postal, fax, province, email, address, phone, courtroom, webpage]):
            info = driver.find_element_by_xpath(d).get_attribute('value')
            data[details[i]] = info
        DATA.append(data)
        curr += 1
        driver.back()
print('============SCRAPING===============')
page = 1
new = True
while page <= 50:
    try:
        each_page(page, new)
        page += 1
    except Exception as err:
        print(err)
        print(page)
The problem here is that this is incredibly slow, because each time you call
driver.back()
it returns to page 1, and I then have to paginate all the way back to the page it was on.
Is there any way I can achieve this with something like BeautifulSoup?
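One way to avoid the click/back round-trip entirely is to let Selenium handle only the pagination and parse each page's rendered source with BeautifulSoup. This only captures the columns visible in the listing grid, not the fields on the details view, and the row class used below (dxgvDataRow_Material, the usual DevExpress naming) is an assumption that should be checked against the actual page source. A minimal sketch:
from bs4 import BeautifulSoup

def scrape_listing_page(driver):
    # Parse whatever the grid currently shows; no clicking, no driver.back().
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    rows = []
    for tr in soup.select('tr.dxgvDataRow_Material'):  # hypothetical DevExpress row class
        cells = [td.get_text(strip=True) for td in tr.find_all('td')]
        if cells:
            rows.append(cells)
    return rows
The detail-only fields (fax, courtroom box, etc.) would still require opening the details view, but everything already rendered in the grid can be collected this way before moving to the next page.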
Related
I have a dataframe that contains links to Google reviews of two restaurants. I want to load all reviews of the two restaurants (one by one) into the browser and then save them into a new dataframe. I wrote a script that reads and loads all reviews into the browser as follows:
from selenium import webdriver
import pandas as pd
import glob
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
import time
link_df = Link
0 https://www.google.com/search?q=restaurant+in+christchurch&biw=1280&bih=614&hotel_occupancy=2&tbm=lcl&sxsrf=AOaemvI4qlEAr3btedb6PCx9U53RtXkI2Q%3A1635630947742&ei=Y799YaHfLOKZ4-EPoeqjmA4&oq=restaurant+in+christchurch&gs_l=psy-ab.3...0.0.0.614264.0.0.0.0.0.0.0.0..0.0....0...1c..64.psy-ab..0.0.0....0.7jAOI05vCjI#lrd=0x6d318a3aa3041455:0x5f83f4fae76d8656,1,,,&rlfi=hd:;si:6882614014013965910,l,ChpyZXN0YXVyYW50IGluIGNocmlzdGNodXJjaEiglZKhm6qAgAhaKBAAGAAYAiIacmVzdGF1cmFudCBpbiBjaHJpc3RjaHVyY2gqBAgDEACSARJidXJtZXNlX3Jlc3RhdXJhbnSqAQwQASoIIgRmb29kKAA,y,UB2auy7TMYs;mv:[[-43.4870861,172.6509735],[-43.5490232,172.5976049]]
1 https://www.google.com/search?q=restaurant+in+christchurch&biw=1280&bih=614&hotel_occupancy=2&tbm=lcl&sxsrf=AOaemvI4qlEAr3btedb6PCx9U53RtXkI2Q%3A1635630947742&ei=Y799YaHfLOKZ4-EPoeqjmA4&oq=restaurant+in+christchurch&gs_l=psy-ab.3...0.0.0.614264.0.0.0.0.0.0.0.0..0.0....0...1c..64.psy-ab..0.0.0....0.7jAOI05vCjI#lrd=0x6d318bf82139caaf:0xf115cd7fe794cbcc,1,,,&rlfi=hd:;si:17372017086881385420,l,ChpyZXN0YXVyYW50IGluIGNocmlzdGNodXJjaEjh9auu-q6AgAhaKBAAGAAYAiIacmVzdGF1cmFudCBpbiBjaHJpc3RjaHVyY2gqBAgDEACSAQpyZXN0YXVyYW50qgEMEAEqCCIEZm9vZCgA,y,ZeJbBWd7wDg;mv:[[-43.4870861,172.6509735],[-43.5490232,172.5976049]]
i = 0
driver = webdriver.Chrome()
for index, i in link_df.iterrows():
    base_url = i['Link']  # link_df['Link'][i]
    driver.get(base_url)
    WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "//div[./span[text()='Newest']]"))).click()
    print('Restaurant number is ', index)
    title = driver.find_element_by_xpath("//div[@class='P5Bobd']").text
    address = driver.find_element_by_xpath("//div[@class='T6pBCe']").text
    overall_rating = driver.find_element_by_xpath("//div[@class='review-score-container']//span[@class='Aq14fc']").text
    total_reviews_text = driver.find_element_by_xpath("//div[@class='review-score-container']//div//div//span//span[@class='z5jxId']").text
    num_reviews = int(total_reviews_text.split()[0])
    all_reviews = WebDriverWait(driver, 20).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'div.gws-localreviews__google-review')))
    time.sleep(2)
    total_reviews = len(all_reviews)
    while total_reviews < num_reviews:
        driver.execute_script('arguments[0].scrollIntoView(true);', all_reviews[-1])
        WebDriverWait(driver, 5, 0.25).until_not(EC.presence_of_element_located((By.CSS_SELECTOR, 'div[class$="activityIndicator"]')))
        time.sleep(5)
        all_reviews = WebDriverWait(driver, 5).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'div.gws-localreviews__google-review')))
        print(total_reviews)
        total_reviews += 1
    reviews_info = driver.find_elements_by_xpath("//div[@class='jxjCjc']")
    review_information = pd.DataFrame(columns=["Restaurant title", "Restaurant rating", "Total reviews", "Reviewer Name", "Rating", "Review"])
    name = ''
    rating = ''
    text = ''
    for index, review_info in enumerate(reviews_info):
        name = review_info.find_element_by_xpath("./div/div/a").text
        rating = review_info.find_element_by_xpath(".//div[@class='PuaHbe']//g-review-stars//span").get_attribute('aria-label')
        text = review_info.find_element_by_xpath(".//div[@class='Jtu6Td']//span").text
        review_information.at[len(review_information)] = [title, overall_rating, num_reviews, name, rating, text]
    filename = 'Google_reviews' + ' ' + pd.to_datetime("now").strftime("%Y_%m_%d") + '.csv'
    files_present = glob.glob(filename)
    if files_present:
        review_information.to_csv(filename, index=False, mode='a', header=False)
    else:
        review_information.to_csv(filename, index=False)
    driver.get('https://www.google.com')
    time.sleep(3)
The problem is that the script throws an error when it reaches the following line:
driver.execute_script('arguments[0].scrollIntoView(true);', all_reviews[-1])
It throws the following error:
StaleElementReferenceException: Message: stale element reference: element is not attached to the page document
(Session info: chrome=95.0.4638.69)
When I tried the same program without storing the Google links in a dataframe (i.e. no for loop, and base_url set directly to a single Google review link instead of base_url = i['Link']), it works fine.
I am not sure where I am making the mistake. Any suggestion or help to fix the issue would be highly appreciated.
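For context, a general pattern for this class of error is to re-locate the elements and retry when the stale reference is raised, instead of reusing handles captured before the page re-rendered. A minimal sketch, assuming wait = WebDriverWait(driver, 20) and the same imports and CSS selector used above:
from selenium.common.exceptions import StaleElementReferenceException

def scroll_to_last_review(driver, wait, attempts=3):
    # Re-locate the review elements on every attempt so a re-render cannot
    # leave us holding a stale reference when we scroll.
    for _ in range(attempts):
        try:
            reviews = wait.until(EC.presence_of_all_elements_located(
                (By.CSS_SELECTOR, 'div.gws-localreviews__google-review')))
            driver.execute_script('arguments[0].scrollIntoView(true);', reviews[-1])
            return len(reviews)
        except StaleElementReferenceException:
            continue  # the list was re-rendered mid-scroll; try again
    return 0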
EDIT
You put the creation of the driver outside the for loop.
You can't launch the new URL with GPS data while the first review popup is still in front; if you do, it stays in the background. The easier way is to navigate to a plain URL without GPS data (https://www.google.com) and wait about 3 seconds before continuing your loop.
Your count is not right: I have changed your selector, changed how the total is computed, and commented out some lines.
from selenium import webdriver
import pandas as pd
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.firefox.options import Options
import time
link_df = ["https://www.google.com/search?q=restaurant+in+christchurch&biw=1280&bih=614&hotel_occupancy=2&tbm=lcl&sxsrf=AOaemvI4qlEAr3btedb6PCx9U53RtXkI2Q%3A1635630947742&ei=Y799YaHfLOKZ4-EPoeqjmA4&oq=restaurant+in+christchurch&gs_l=psy-ab.3...0.0.0.614264.0.0.0.0.0.0.0.0..0.0....0...1c..64.psy-ab..0.0.0....0.7jAOI05vCjI#lrd=0x6d318a3aa3041455:0x5f83f4fae76d8656,1,,,&rlfi=hd:;si:6882614014013965910,l,ChpyZXN0YXVyYW50IGluIGNocmlzdGNodXJjaEiglZKhm6qAgAhaKBAAGAAYAiIacmVzdGF1cmFudCBpbiBjaHJpc3RjaHVyY2gqBAgDEACSARJidXJtZXNlX3Jlc3RhdXJhbnSqAQwQASoIIgRmb29kKAA,y,UB2auy7TMYs;mv:[[-43.4870861,172.6509735],[-43.5490232,172.5976049]]",
"https://www.google.com/search?q=restaurant+in+christchurch&biw=1280&bih=614&hotel_occupancy=2&tbm=lcl&sxsrf=AOaemvI4qlEAr3btedb6PCx9U53RtXkI2Q%3A1635630947742&ei=Y799YaHfLOKZ4-EPoeqjmA4&oq=restaurant+in+christchurch&gs_l=psy-ab.3...0.0.0.614264.0.0.0.0.0.0.0.0..0.0....0...1c..64.psy-ab..0.0.0....0.7jAOI05vCjI#lrd=0x6d318bf82139caaf:0xf115cd7fe794cbcc,1,,,&rlfi=hd:;si:17372017086881385420,l,ChpyZXN0YXVyYW50IGluIGNocmlzdGNodXJjaEjh9auu-q6AgAhaKBAAGAAYAiIacmVzdGF1cmFudCBpbiBjaHJpc3RjaHVyY2gqBAgDEACSAQpyZXN0YXVyYW50qgEMEAEqCCIEZm9vZCgA,y,ZeJbBWd7wDg;mv:[[-43.4870861,172.6509735],[-43.5490232,172.5976049]]"
]
i = 0
binary = r'C:\Program Files (x86)\Mozilla Firefox\firefox.exe'
cap = DesiredCapabilities().FIREFOX
cap["marionette"] = True
options = Options()
options.binary = binary
driver = webdriver.Firefox(options=options, capabilities=cap, executable_path="E:\\Téléchargement\\geckodriver.exe")
# i have to launch one time to accept the cookies manually
#by setting a breakpoint after, but you dont have that i think
#driver.get(link_df[0])
print ("Headless Firefox Initialized")
print(link_df)
for url in link_df:
    base_url = url  # i['Link'] # link_df['Link'][i]
    print(base_url)
    driver.get(base_url)
    WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "//div[./span[text()='Avis les plus récents']]"))).click()
    title = driver.find_element_by_xpath("//div[@class='P5Bobd']").text
    address = driver.find_element_by_xpath("//div[@class='T6pBCe']").text
    overall_rating = driver.find_element_by_xpath("//div[@class='review-score-container']//span[@class='Aq14fc']").text
    total_reviews_text = driver.find_element_by_xpath(
        "//div[@class='review-score-container']//div//div//span//span[@class='z5jxId']").text
    num_reviews = int(total_reviews_text.split()[0])
    all_reviews = WebDriverWait(driver, 20).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '#reviewSort .gws-localreviews__google-review')))
    # time.sleep(2)
    total_reviews = 0
    while total_reviews < num_reviews:
        driver.execute_script('arguments[0].scrollIntoView(true);', all_reviews[-1])
        WebDriverWait(driver, 5, 0.25).until_not(EC.presence_of_element_located((By.CSS_SELECTOR, 'div[class$="activityIndicator"]')))
        all_reviews = WebDriverWait(driver, 5).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, '#reviewSort .gws-localreviews__google-review')))
        total_reviews = len(all_reviews)
        print(total_reviews, len(all_reviews))
    driver.get('https://www.google.com')  # or driver.close() if no bugs
    time.sleep(3)
driver.close()
driver.quit()
It seems the solution for Chrome needs some fixes:
org.openqa.selenium.StaleElementReferenceException: stale element reference: element is not attached to the page document
Literally, this means the referenced element is out of date and no longer attached to the current page, usually because the page has been refreshed or replaced. The fix is to call findElement or findElements again to re-locate the element.
So it seems Chrome has a refresh problem. I suggest re-loading the list of review elements before scrolling, so you have a fresh copy of the DOM items, and I had to add a 1-second wait at the end of the while loop.
from selenium import webdriver
import pandas as pd
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
#from selenium.webdriver.firefox.options import Options
from selenium.webdriver.chrome.options import Options
import time
link_df = [
"https://www.google.com/search?q=restaurant+in+christchurch&biw=1280&bih=614&hotel_occupancy=2&tbm=lcl&sxsrf=AOaemvI4qlEAr3btedb6PCx9U53RtXkI2Q%3A1635630947742&ei=Y799YaHfLOKZ4-EPoeqjmA4&oq=restaurant+in+christchurch&gs_l=psy-ab.3...0.0.0.614264.0.0.0.0.0.0.0.0..0.0....0...1c..64.psy-ab..0.0.0....0.7jAOI05vCjI#lrd=0x6d318a3aa3041455:0x5f83f4fae76d8656,1,,,&rlfi=hd:;si:6882614014013965910,l,ChpyZXN0YXVyYW50IGluIGNocmlzdGNodXJjaEiglZKhm6qAgAhaKBAAGAAYAiIacmVzdGF1cmFudCBpbiBjaHJpc3RjaHVyY2gqBAgDEACSARJidXJtZXNlX3Jlc3RhdXJhbnSqAQwQASoIIgRmb29kKAA,y,UB2auy7TMYs;mv:[[-43.4870861,172.6509735],[-43.5490232,172.5976049]]",
"https://www.google.com/search?q=restaurant+in+christchurch&biw=1280&bih=614&hotel_occupancy=2&tbm=lcl&sxsrf=AOaemvI4qlEAr3btedb6PCx9U53RtXkI2Q%3A1635630947742&ei=Y799YaHfLOKZ4-EPoeqjmA4&oq=restaurant+in+christchurch&gs_l=psy-ab.3...0.0.0.614264.0.0.0.0.0.0.0.0..0.0....0...1c..64.psy-ab..0.0.0....0.7jAOI05vCjI#lrd=0x6d318bf82139caaf:0xf115cd7fe794cbcc,1,,,&rlfi=hd:;si:17372017086881385420,l,ChpyZXN0YXVyYW50IGluIGNocmlzdGNodXJjaEjh9auu-q6AgAhaKBAAGAAYAiIacmVzdGF1cmFudCBpbiBjaHJpc3RjaHVyY2gqBAgDEACSAQpyZXN0YXVyYW50qgEMEAEqCCIEZm9vZCgA,y,ZeJbBWd7wDg;mv:[[-43.4870861,172.6509735],[-43.5490232,172.5976049]]"
]
i = 0
binaryfirefox = r'C:\Program Files (x86)\Mozilla Firefox\firefox.exe'
binarychrome = r'C:\Program Files (x86)\Google\Chrome\Application\chrome.exe'
options = Options()
#cap = DesiredCapabilities().CHROME
#cap["marionette"] = True
#cap = DesiredCapabilities().FIREFOX
#options.binary = binaryfirefox
#driver = webdriver.Firefox(options=options, capabilities=cap, executable_path="E:\\Téléchargement\\geckodriver.exe")
options.binary_location = binarychrome
driver = webdriver.Chrome(options=options, executable_path="E:\\Téléchargement\\chromedriver.exe" )
# same reason tha Firefox i have to load one time
# an url to accept manually the cookies
#driver.get(link_df[0])
print(link_df)
for url in link_df:
    base_url = url  # i['Link'] # link_df['Link'][i]
    print(base_url)
    driver.get(base_url)
    WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "//div[./span[text()='Newest']]"))).click()
    title = driver.find_element_by_xpath("//div[@class='P5Bobd']").text
    address = driver.find_element_by_xpath("//div[@class='T6pBCe']").text
    overall_rating = driver.find_element_by_xpath("//div[@class='review-score-container']//span[@class='Aq14fc']").text
    total_reviews_text = driver.find_element_by_xpath(
        "//div[@class='review-score-container']//div//div//span//span[@class='z5jxId']").text
    num_reviews = int(total_reviews_text.split()[0])
    all_reviews = WebDriverWait(driver, 20).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '#reviewSort .gws-localreviews__google-review')))
    # time.sleep(2)
    total_reviews = 0
    while total_reviews < num_reviews:
        # reload the elements to avoid the stale-element exception, or trap the scroll with try/except (but that is more expensive)
        all_reviews = WebDriverWait(driver, 20).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, '#reviewSort .gws-localreviews__google-review')))
        driver.execute_script('arguments[0].scrollIntoView(true);', all_reviews[-1])
        total_reviews = len(all_reviews)
        print(total_reviews, len(all_reviews))
        time.sleep(1)
    driver.get('https://www.google.com')  # or driver.close() if no bugs
    time.sleep(3)
driver.close()
driver.quit()
Error : selenium.common.exceptions.StaleElementReferenceException: Message: stale element reference: element is not attached to the page document.
The website I'm scraping is https://www.telekom.de/unterwegs/apple/apple-iphone-13-pro/graphit-512gb
I want to loop through the tariff details in each section; each radio button shows a different price. I want to scrape the price details for each radio button one by one, along with the name of the selected radio button, down to the end of the page. I have tried but couldn't make it work.
Could anyone help with this? It will be helpful for me to learn. I have got as far as entering the "change tariff" link, and I'm now having trouble scraping the details. The change-tariff screens are shown in the links below:
https://i.stack.imgur.com/RRyJa.png
https://i.stack.imgur.com/fNafB.png
https://i.stack.imgur.com/jFnLA.png
https://i.stack.imgur.com/WlyLU.png
"I'm trying to click a radio button and need to scrape a price details for selected radio button."
import xlwt
from selenium import webdriver
import re
import time
from datetime import date
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
class telekommobiles:
def __init__(self):
self.url="https://www.telekom.de/mobilfunk/geraete/smartphone?page=1&pageFilter=promotion"
self.country='DE'
self.currency='GBP'
self.VAT='Included'
self.shipping = 'free shipping within 3-4 weeks'
self.Pre_PromotionPrice ='N/A'
self.color ='N/A'
def telekom(self):
#try:
driver=webdriver.Chrome()
driver.maximize_window()
driver.get(self.url)
today = date.today()
#time.sleep(5)
WebDriverWait(driver, 30).until(EC.presence_of_all_elements_located((By.XPATH,"//*[@id='consentAcceptAll']")))
cookies = driver.find_element_by_css_selector('button.cl-btn.cl-btn--accept-all').click()
print("cookies accepted")
links_prod_check = []
prod_models = []
prod_manufacturer =[]
prod_memorys = []
product_colors =[]
product_price_monthly_payments = []
product_price_one_time_payments =[]
product_links = []
containers = driver.find_elements_by_css_selector('div[class="styles_item__12Aw4"]')
i = 1
for container in containers:
p_links =container.find_element_by_tag_name('a').get_attribute('href')
i = i + 1
product_links.append(p_links)
#print(p_links)
for links in product_links:
driver.get(links)
#time.sleep(5)
#print(driver.current_url)
#links_prod_check.append(driver.current_url)
coloroptions = WebDriverWait(driver, 30).until(EC.presence_of_all_elements_located((By.XPATH,"//li[@data-qa='list_ColorVariant']")))
#print(coloroptions)
for i in range(len(coloroptions)):
coloroption = driver.find_elements_by_xpath("//li[@data-qa='list_ColorVariant']")
coloroption[i].click()
#print(coloroption[i])
time.sleep(3)
memoryoptions = WebDriverWait(driver, 30).until(EC.presence_of_all_elements_located((By.XPATH,"//span[@class='phx-radio__element']")))
for i in range(len(memoryoptions)):
memoryoption = driver.find_elements_by_xpath("//span[@class='phx-radio__element']")
try:
memoryoption[i].click()
except:
pass
time.sleep(3)
change_traiff = driver.find_element_by_css_selector('button[class="phx-link phx-list-of-links__link js-mod tracking-added"]').click()
time.sleep(3)
section_loops = driver.find_elements_by_css_selector('section[class="tariff-catalog--layer"]')
for section_loop in section_loops:
#Headings
heading_1 = section_loop.find_element_by_css_selector('h2[class="page-title page-title--lowercase"]').text
print(heading_1)
looping_for_tariff = WebDriverWait(driver, 30).until(EC.presence_of_all_elements_located((By.XPATH,"//span[@class='phx-radio__element']")))
subcontainers = section_loop.find_elements_by_css_selector('div[class="phx-tariff-box__section"]')
for subcontainer in subcontainers:
radio_buttons_list=subcontainer.find_elements_by_css_selector('div[class="phx-form__row phx-form__row--small phx-form__row--full-width phx-form__row--radio"]')
for radio in radio_buttons_list:
input=radio.find_elements_by_css_selector('span[class="phx-radio__element"]')
if input[0].is_enabled():
try:
ActionChains(driver).move_to_element(subcontainer).perform()
time.sleep(2)
input[0].click()
time.sleep(3)
except:
print('Not clickable')
pass
lable_list=radio.find_elements_by_css_selector('span[class="phx-radio__label"]')
label=""
if lable_list:
label=lable_list[0].text
heading_2 = subcontainer.find_element_by_css_selector('p[class="phx-t6 phx-t--medium"]').text
data_price_list= subcontainer.find_element_by_css_selector('div[class="phx-tariff-box__data-price"]')
volumn_list=data_price_list.find_elements_by_css_selector('div[data-qa="label_Tariff_VolumeSize"]')
volumn=""
if volumn_list:
volumn=volumn_list[0].text
price_list=subcontainer.find_elements_by_css_selector('p[class="phx-price phx-price--size_large phx-price--strong phx-price--color_brand"]')
price=""
nonBreakSpace = u'\xa0'
if price_list:
price=price_list[0].text
print(str(heading_2) + " " + str(label) + " " + str(volumn.replace(' ', '').replace( '\\r\\n','')) + " " + str(price))
#except:
#pass
telekom_de=telekommobiles()
telekom_de.telekom()
After selecting a different option the page gets refreshed, hence the issue. I was not able to find where you were trying to click the buttons in your code, so I tried clicking all the radio buttons with the code below and it worked. Check the code once.
from selenium import webdriver
import time
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
driver = webdriver.Chrome(executable_path="path to chromedriver.exe")
driver.maximize_window()
driver.implicitly_wait(10)
driver.get("https://www.telekom.de/unterwegs/apple/apple-iphone-13-pro/sierrablau-128gb")
wait = WebDriverWait(driver,30)
wait.until(EC.element_to_be_clickable((By.XPATH,"//button[text()='Accept All']"))).click()
radiooptions = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//span[@class='phx-radio__element']")))
for i in range(len(radiooptions)):
    radiooptions = driver.find_elements_by_xpath("//span[@class='phx-radio__element']")
    radiooptions[i].click()
    time.sleep(2)
Please use the li element instead of the span:
//li[@data-qa='list_ColorVariant']
and also add a wait of about 5 seconds once you click on it, then click the next one.
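A minimal sketch of that advice, assuming the same driver and wait objects as in the answer above:
# Iterate the <li> colour variants, re-locating the list on every pass and
# pausing after each click so the refreshed page can settle.
colour_variants = wait.until(EC.presence_of_all_elements_located(
    (By.XPATH, "//li[@data-qa='list_ColorVariant']")))
for i in range(len(colour_variants)):
    colour_variants = driver.find_elements_by_xpath("//li[@data-qa='list_ColorVariant']")
    colour_variants[i].click()
    time.sleep(5)  # wait ~5 seconds before clicking the next variant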
import csv
import time
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium import webdriver
from csv import reader
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import NoSuchElementException
chrome_options = Options()
scroll = 5
chrome_options.add_experimental_option("useAutomationExtension", False)
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
header_added = False
header_added1 = False
url = "url"
driver = webdriver.Chrome(executable_path='C:/chromedriver.exe', options=chrome_options)
driver.maximize_window()
driver.get(url)
time.sleep(3)
search_city = input("Enter the city :")
res_n = input("Enter the Restaurant's name :")
search = driver.find_element_by_xpath('//input[@name="location"]').send_keys(search_city)
time.sleep(2)
driver.find_element_by_xpath('//*[@id="root"]/div[1]/div[1]/div/div[1]/div[1]/div/div[2]/div/div[3]/div[1]/span[2]').click()
time.sleep(3)
driver.find_element_by_xpath('/html/body/div[1]/div[1]/header/div/div/ul/li[5]/div/a/span[1]').click()
time.sleep(1)
search_res = driver.find_element_by_class_name('_2BJMh').send_keys(res_n.lower())
time.sleep(5)
driver.find_element_by_class_name('_2BJMh').send_keys(Keys.RETURN)
time.sleep(5)
try:
driver.find_element_by_class_name('_3FR5S').click()
time.sleep(5)
except:
print("restaurant not open")
driver.quit()
html = driver.find_element_by_tag_name('html')
def get_items():
global header_added
global item_dvs
cats = driver.find_elements_by_class_name('D_TFT')
cats[1].click()
time.sleep(3)
item_dvs = driver.find_elements_by_class_name('_2wg_t')
for div in item_dvs:
name = div.find_element_by_class_name('styles_itemNameText__3bcKX')
print(name.text)
price = div.find_element_by_class_name('rupee')
print(price.text)
if div.find_elements_by_class_name('styles_itemDesc__MTsVd'):
desc = div.find_element_by_class_name('styles_itemDesc__MTsVd').text
else:
desc = None
if div.find_element_by_css_selector('div._1C1Fl._23qjy'):
element = div.find_element_by_css_selector('div._1C1Fl._23qjy')
print("found")
driver.execute_script("arguments[0].scrollIntoView();", element)
add = div.find_element_by_css_selector('._1RPOp')
driver.execute_script("arguments[0].click();", add)
time.sleep(1)
add_ons = driver.find_element_by_class_name('_3UzO2').text
print(add_ons)
driver.find_element_by_css_selector('#modal-placeholder > div:nth-child(3) > div > div._1Kr-y._3EeZR > div > div._1EZLh > div > button').click()
else:
add_ons = None
dict1 = {'Item Name': name.text, "Price": price.text, "Add Ons :": add_ons, "Description": desc}
with open(f'{search_city}_{res_n}.csv', 'a+', encoding='utf-8-sig') as f:
w = csv.DictWriter(f, dict1.keys())
if not header_added:
w.writeheader()
header_added = True
w.writerow(dict1)
get_items()
The is_cust loop keeps running over and over again opening the same element, while the rest of the code moves on to the next divs. What is wrong here?
XPath is bidirectional, and that is probably the cause here.
Try this code using a CSS selector:
for div in item_dvs:
    # Do something
    try:
        is_cust = div.find_element_by_css_selector('._1C1Fl._23qjy')
        print("found")
    except NoSuchElementException:
        continue
    driver.execute_script("arguments[0].scrollIntoView();", is_cust)
    add = div.find_element_by_css_selector('._1RPOp')
    driver.execute_script("arguments[0].click();", add)
    time.sleep(1)
    # Not sure why for this one you had driver instead of div. Suspect div should be used:
    add_ons = div.find_element_by_class_name('_26cJ9').text
    div.find_element_by_css_selector('#modal-placeholder > div:nth-child(3) > div > div._1Kr-y._3EeZR > div > div._1EZLh > div > button').click()
UPDATE
From your updated code, you are using a lot of hardcoded sleeps. I suggest using WebDriverWait with expected_conditions instead.
More info here: Wait from Selenium
Imports needed:
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
Code to be added post driver creation:
wait_time = 5
wait = WebDriverWait(driver, wait_time)
Instead of using sleep like this:
time.sleep(5)
driver.find_element_by_class_name('_2BJMh').send_keys(Keys.RETURN)
time.sleep(5)
Use:
wait.until(EC.presence_of_element_located((By.CLASS_NAME, '_2BJMh'))).send_keys(res_n.lower())
Don't gather the element twice; use find_elements_by_* and then validate the length:
descs = wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, 'styles_itemDesc__MTsVd')))
if len(descs) > 0:
    desc = descs[0].text
else:
    desc = None
I'm scraping an e-commerce website, Lazada, using Selenium and bs4. I manage to scrape the 1st page but I am unable to iterate to the next page. What I'm trying to achieve is to scrape all the pages for the category I've selected.
Here is what I've tried:
# Run the argument with incognito
option = webdriver.ChromeOptions()
option.add_argument('--incognito')
driver = webdriver.Chrome(executable_path='chromedriver', chrome_options=option)
driver.get('https://www.lazada.com.my/')
driver.maximize_window()
# Select category item #
element = driver.find_elements_by_class_name('card-categories-li-content')[0]
webdriver.ActionChains(driver).move_to_element(element).click(element).perform()
t = 10
try:
WebDriverWait(driver,t).until(EC.visibility_of_element_located((By.ID,"a2o4k.searchlistcategory.0.i0.460b6883jV3Y0q")))
except TimeoutException:
print('Page Refresh!')
driver.refresh()
element = driver.find_elements_by_class_name('card-categories-li-content')[0]
webdriver.ActionChains(driver).move_to_element(element).click(element).perform()
print('Page Load!')
#Soup and select element
def getData(np):
soup = bs(driver.page_source, "lxml")
product_containers = soup.findAll("div", class_='c2prKC')
for p in product_containers:
title = (p.find(class_='c16H9d').text)#title
selling_price = (p.find(class_='c13VH6').text)#selling price
try:
original_price=(p.find("del", class_='c13VH6').text)#original price
except:
original_price = "-1"
if p.find("i", class_='ic-dynamic-badge ic-dynamic-badge-freeShipping ic-dynamic-group-2'):
freeShipping = 1
else:
freeShipping = 0
try:
discount = (p.find("span", class_='c1hkC1').text)
except:
discount ="-1"
if p.find(("div", {'class':['c16H9d']})):
url = "https:"+(p.find("a").get("href"))
else:
url = "-1"
nextpage_elements = driver.find_elements_by_class_name('ant-pagination-next')[0]
np=webdriver.ActionChains(driver).move_to_element(nextpage_elements).click(nextpage_elements).perform()
print("- -"*30)
toSave = [title,selling_price,original_price,freeShipping,discount,url]
print(toSave)
writerows(toSave,filename)
getData(np)
The problem might be that the driver is trying to click the button before the element is even loaded correctly.
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Chrome(PATH, chrome_options=option)
# use this code after driver initialization
# this is make the driver wait 5 seconds for the page to load.
driver.implicitly_wait(5)
url = "https://www.lazada.com.ph/catalog/?q=phone&_keyori=ss&from=input&spm=a2o4l.home.search.go.239e359dTYxZXo"
driver.get(url)
next_page_path = "//ul[#class='ant-pagination ']//li[#class=' ant-pagination-next']"
# the following code will wait 5 seconds for
# element to become clickable
# and then try clicking the element.
try:
    next_page = WebDriverWait(driver, 5).until(
        EC.element_to_be_clickable((By.XPATH, next_page_path)))
    next_page.click()
except Exception as e:
    print(e)
EDIT 1
Changed the code to make the driver wait for the element to become clickable. You can put this code inside a while loop to iterate over multiple pages and break the loop when the button is no longer found or clickable, as sketched below.
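A minimal sketch of that loop, reusing next_page_path and driver from the code above (the scraping step is left as a placeholder, and TimeoutException is used as the break condition):
from selenium.common.exceptions import TimeoutException

page = 1
while True:
    # ... scrape the current page here (e.g. with your getData function) ...
    try:
        next_page = WebDriverWait(driver, 5).until(
            EC.element_to_be_clickable((By.XPATH, next_page_path)))
        next_page.click()
        page += 1
    except TimeoutException:
        break  # no clickable "next" button left, so this was the last page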
The website I am scraping is:
http://isystems.hpcsa.co.za/iregister/RegisterSearch.aspx
My code gets to page 10 by looking at the pagination numbers and iterating over them, but it fails when it needs to go past page 10 because there are three dots (...) which, if clicked in the browser, load page 11 (and the same happens after page 20, page 30, etc.). How can I update my code below so that it can deal with this without breaking?
The code I am using is:
import re
import string
import urlparse
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import NoSuchElementException
from bs4 import BeautifulSoup
class DoctorScraper(object):
def __init__(self):
self.url = "http://isystems.hpcsa.co.za/iregister/RegisterSearch.aspx"
self.driver = webdriver.PhantomJS()
self.driver.set_window_size(1120, 550)
def scrape(self):
self.driver.get(self.url)
# choose to search using the region
try:
self.driver.find_element_by_id('SearchChkb_5').click()
except NoSuchElementException:
pass
#get the provinces that are available
select = Select(self.driver.find_element_by_id('ddlProvince'))
option_indexes = range(1, len(select.options))
#iterate through the provinces
for index in option_indexes[:3]:
select.select_by_index(index)
#click the search button
self.driver.find_element_by_id('cmdSearch').click()
pageno = 2
while True:
#create a beautiful soup of the page source code
s = BeautifulSoup(self.driver.page_source)
#get all links that match seeing practitioner profile
r1 = re.compile(r'^PractitionerView\.aspx\?FILENO=([A-Z0-9-]+)$')
#create a dictionary of the attributes
x = {'href': r1}
#so in the page source, find all links that have the attributes stated in x
for a in s.findAll('a', attrs=x):
print 'View Doctor URL: ', urlparse.urljoin(self.driver.current_url, a['href'])
print
# Pagination
try:
next_page_elem = self.driver.find_element_by_xpath("//a[text()='%d']" % pageno)
print "Next page: ", next_page_elem
except NoSuchElementException:
break # no more pages
print 'page ', pageno, '\n'
next_page_elem.click()
pageno += 1
self.driver.quit()
if __name__ == '__main__':
scraper = DoctorScraper()
scraper.scrape()
I am getting this error:
StaleElementReferenceException: {"errorMessage":"Element does not exist in cache","request":{"headers":{"Accept":"application/json","Accept-Encoding":"identity","Connection":"close","Content-Length":"121","Content-Type":"application/json;charset=UTF-8","Host":"127.0.0.1:63135","User-Agent":"Python http auth"},"httpVersion":"1.1","method":"POST","post":"{\"using\": \"tag name\", \"sessionId\": \"ef6d0590-a2d6-11e7-91fa-5773b3326267\", \"id\": \":wdc:1506442969197\", \"value\": \"option\"}","url":"/elements","urlParsed":{"anchor":"","query":"","file":"elements","directory":"/","path":"/elements","relative":"/elements","port":"","host":"","password":"","user":"","userInfo":"","authority":"","protocol":"","source":"/elements","queryKey":{},"chunks":["elements"]},"urlOriginal":"/session/ef6d0590-a2d6-11e7-91fa-5773b3326267/element/:wdc:1506442969197/elements"}}
The main problem with this site is that the clickable elements frequently go out of view, which throws an "element not clickable" error. However, I've already fixed it. If you have ChromeDriver installed on your machine, just run the script and see the magic. It will flawlessly traverse all the pages, no matter how many there are. I've checked it.
from selenium import webdriver ; import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
main_link = 'http://isystems.hpcsa.co.za/iregister/RegisterSearch.aspx'
def get_content(driver, wait, link):
    driver.get(link)
    driver.find_element_by_id('SearchChkb_5').click()
    select = Select(driver.find_element_by_id('ddlProvince'))
    select.select_by_visible_text('WESTERN CAPE')
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    elem = wait.until(EC.visibility_of_element_located((By.ID, 'cmdSearch')))
    elem.click()
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    page_counter = 2
    while True:
        try:
            if not page_counter % 10 == 1:
                driver.find_element_by_link_text(str(page_counter)).click()
                page_counter += 1
            else:
                driver.find_elements_by_link_text("...")[-1].click()
                time.sleep(2)
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                page_counter += 1
        except NoSuchElementException:
            break

if __name__ == '__main__':
    driver = webdriver.Chrome()
    wait = WebDriverWait(driver, 10)
    try:
        get_content(driver, wait, main_link)
    finally:
        driver.close()
And the same thing using a class:
from selenium import webdriver ; import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
class DoctorScraper(object):
    def __init__(self):
        self.url = "http://isystems.hpcsa.co.za/iregister/RegisterSearch.aspx"
        self.driver = webdriver.Chrome()
        self.wait = WebDriverWait(self.driver, 10)

    def __del__(self):
        self.driver.close()

    def controlling_pagination(self):
        self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        page_counter = 2
        while True:
            try:
                if not page_counter % 10 == 1:
                    self.driver.find_element_by_link_text(str(page_counter)).click()
                    page_counter += 1
                else:
                    self.driver.find_elements_by_link_text("...")[-1].click()
                    time.sleep(2)
                    self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                    page_counter += 1
            except NoSuchElementException:
                break

    def get_content(self):
        self.driver.get(self.url)
        self.driver.find_element_by_id('SearchChkb_5').click()
        select = Select(self.driver.find_element_by_id('ddlProvince'))
        select.select_by_visible_text('WESTERN CAPE')
        self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        elem = self.wait.until(EC.visibility_of_element_located((By.ID, 'cmdSearch')))
        elem.click()
        self.controlling_pagination()

if __name__ == '__main__':
    scraper = DoctorScraper()
    scraper.get_content()
By the way, take a look at the bottom of the page while the script runs, where you can see the pages changing.