I'm learning Python and web scraping, and I'm currently trying to scrape the CCI "agent immobilier" search-results page (the URL appears in the code below).
I managed to get some data in my csv file with the below code:
from typing import Text
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.select import Select
import time

# Search-results URL: "agences immobilières" in AUVERGNE-RHONE-ALPES.
URL = ('https://www.cci.fr/agent-immobilier?company_name=agences%20immobili%C3%A8res%20&brand_name=&siren=&numero_carte=&code_region=84&city=&code_postal=&person_name=&state_recherche=1&name_region=AUVERGNE-RHONE-ALPES')

# Write the CSV header once.  (The original reopened the file in 'w' mode
# inside the loop, which truncated the header it had just written.)
with open('scraping_5_pagination.csv', 'w') as file:
    file.write("business_names, attestation, town_pc, region\n")

driver = webdriver.Chrome(ChromeDriverManager().install())  # initialise chrome driver
driver.get(URL)
driver.maximize_window()
time.sleep(1)

agences_recherche = driver.find_element_by_id('edit-company-name')
# agences_recherche.send_keys('agences immobilières')
time.sleep(1)
region = driver.find_element_by_id('edit-code-region')
# region.send_keys('AUVERGNE-RHONE-ALPES')
time.sleep(1)
# XPath attribute tests use '@' — '[#value=...]' is invalid XPath and
# raises InvalidSelectorException instead of finding the button.
search = driver.find_element_by_xpath('//input[@value="Rechercher"]')
time.sleep(1)

# Append data rows; 'a' mode keeps the header written above.
with open('scraping_5_pagination.csv', 'a') as file:
    for _ in range(200):  # pagination loop
        business_names = driver.find_elements_by_xpath('//td[@class="titre_entreprise"]')
        attestation = driver.find_elements_by_xpath('//tr[@class="lien-fiche"]/td/a')
        town_pc = driver.find_elements_by_xpath('//*[@id="main-content"]/div/table/tbody/tr/td[2]')
        region = driver.find_elements_by_xpath('//*[@id="main-content"]/div/table/tbody/tr/td[3]')
        for i in range(len(business_names)):
            file.write(business_names[i].text + ";" + attestation[i].text + ";" + town_pc[i].text + ";" + region[i].text + "\n")
        # The "Suivant" text sits in a <span> inside <a rel="next">; the <a>
        # is the clickable pager link, so target it directly.
        try:
            driver.find_element_by_xpath('//a[@rel="next"]').click()
        except NoSuchElementException:
            break  # no next link: last page reached
        time.sleep(1)

driver.close()
But I don't get why it doesn't click on the next-page button. I don't know whether it's a problem with the XPath expression or with the implementation.
Also, here's the the html code of the "Next button"
<a href="?company_name=agences%20immobili%C3%A8res%20&brand_name=&siren=&numero_carte=&code_region=84&city=&code_postal=&person_name=&state_recherche=1&name_region=AUVERGNE-RHONE-ALPES&page=2" title="Aller à la page suivante" rel="next" class="">
<span class="visually-hidden">Page suivante</span>
<span aria-hidden="true" class="">Suivant</span>
</a>
Thank you for reading.
Related
Error : selenium.common.exceptions.StaleElementReferenceException: Message: stale element reference: element is not attached to the page document.
Website I'm scraping: https://www.telekom.de/unterwegs/apple/apple-iphone-13-pro/graphit-512gb . I want to loop through the tariff details: each section and each radio button shows a different price. I want to scrape the price details for each radio button one by one, together with the name of the checked radio button, down to the end of the page. I have tried, but I couldn't make it work.
Could anyone help with this? It would help me to learn. I got as far as opening the "change tariff" link, but I'm having trouble scraping the details. The change-tariff screens are shown in the links below:
https://i.stack.imgur.com/RRyJa.png
https://i.stack.imgur.com/fNafB.png
https://i.stack.imgur.com/jFnLA.png
https://i.stack.imgur.com/WlyLU.png
"I'm trying to click a radio button and need to scrape a price details for selected radio button."
import xlwt
from selenium import webdriver
import re
import time
from datetime import date
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
class telekommobiles:
    """Scrape smartphone offers from telekom.de.

    Walks the promotion listing, opens every product page, iterates the
    colour and memory options, opens the tariff layer, and prints the
    tariff heading, radio label, data volume and monthly price for every
    selectable radio option.
    """

    def __init__(self):
        # Landing page listing all promoted smartphones.
        self.url = "https://www.telekom.de/mobilfunk/geraete/smartphone?page=1&pageFilter=promotion"
        self.country = 'DE'
        self.currency = 'GBP'
        self.VAT = 'Included'
        self.shipping = 'free shipping within 3-4 weeks'
        self.Pre_PromotionPrice = 'N/A'
        self.color = 'N/A'

    def telekom(self):
        """Run the scrape: collect product links, then walk each product."""
        driver = webdriver.Chrome()
        driver.maximize_window()
        driver.get(self.url)
        today = date.today()
        # NOTE: XPath attribute tests use '@'; the original "[#id=...]" is
        # invalid XPath and raises InvalidSelectorException.
        WebDriverWait(driver, 30).until(EC.presence_of_all_elements_located(
            (By.XPATH, "//*[@id='consentAcceptAll']")))
        driver.find_element_by_css_selector('button.cl-btn.cl-btn--accept-all').click()
        print("cookies accepted")
        product_links = []
        containers = driver.find_elements_by_css_selector('div[class="styles_item__12Aw4"]')
        for container in containers:
            product_links.append(container.find_element_by_tag_name('a').get_attribute('href'))
        for links in product_links:
            driver.get(links)
            coloroptions = WebDriverWait(driver, 30).until(EC.presence_of_all_elements_located(
                (By.XPATH, "//li[@data-qa='list_ColorVariant']")))
            for i in range(len(coloroptions)):
                # Re-find after every click: selecting an option refreshes
                # the page, so previously located elements go stale.
                coloroption = driver.find_elements_by_xpath("//li[@data-qa='list_ColorVariant']")
                coloroption[i].click()
                time.sleep(3)
                memoryoptions = WebDriverWait(driver, 30).until(EC.presence_of_all_elements_located(
                    (By.XPATH, "//span[@class='phx-radio__element']")))
                for j in range(len(memoryoptions)):
                    memoryoption = driver.find_elements_by_xpath("//span[@class='phx-radio__element']")
                    try:
                        memoryoption[j].click()
                    except Exception:
                        pass  # some memory radios are disabled; skip them
                    time.sleep(3)
                    driver.find_element_by_css_selector(
                        'button[class="phx-link phx-list-of-links__link js-mod tracking-added"]').click()
                    time.sleep(3)
                    section_loops = driver.find_elements_by_css_selector('section[class="tariff-catalog--layer"]')
                    for section_loop in section_loops:
                        # Section heading.
                        heading_1 = section_loop.find_element_by_css_selector(
                            'h2[class="page-title page-title--lowercase"]').text
                        print(heading_1)
                        WebDriverWait(driver, 30).until(EC.presence_of_all_elements_located(
                            (By.XPATH, "//span[@class='phx-radio__element']")))
                        subcontainers = section_loop.find_elements_by_css_selector('div[class="phx-tariff-box__section"]')
                        for subcontainer in subcontainers:
                            radio_buttons_list = subcontainer.find_elements_by_css_selector(
                                'div[class="phx-form__row phx-form__row--small phx-form__row--full-width phx-form__row--radio"]')
                            for radio in radio_buttons_list:
                                # Renamed from 'input': don't shadow the builtin.
                                radio_input = radio.find_elements_by_css_selector('span[class="phx-radio__element"]')
                                if radio_input[0].is_enabled():
                                    try:
                                        ActionChains(driver).move_to_element(subcontainer).perform()
                                        time.sleep(2)
                                        radio_input[0].click()
                                        time.sleep(3)
                                    except Exception:
                                        print('Not clickable')
                                lable_list = radio.find_elements_by_css_selector('span[class="phx-radio__label"]')
                                label = lable_list[0].text if lable_list else ""
                                heading_2 = subcontainer.find_element_by_css_selector('p[class="phx-t6 phx-t--medium"]').text
                                data_price_list = subcontainer.find_element_by_css_selector('div[class="phx-tariff-box__data-price"]')
                                volumn_list = data_price_list.find_elements_by_css_selector('div[data-qa="label_Tariff_VolumeSize"]')
                                volumn = volumn_list[0].text if volumn_list else ""
                                price_list = subcontainer.find_elements_by_css_selector(
                                    'p[class="phx-price phx-price--size_large phx-price--strong phx-price--color_brand"]')
                                price = price_list[0].text if price_list else ""
                                print(str(heading_2) + " " + str(label) + " " + str(volumn.replace(' ', '').replace('\\r\\n', '')) + " " + str(price))


telekom_de = telekommobiles()
telekom_de.telekom()
After selecting a different Option the page gets Refreshed, hence the issue. I was not able to find where you were trying to click on the buttons in your code. So tried to click on all the radio buttons with below code and was successful. Check the code once.
from selenium import webdriver
import time
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

driver = webdriver.Chrome(executable_path="path to chromedriver.exe")
driver.maximize_window()
driver.implicitly_wait(10)
driver.get("https://www.telekom.de/unterwegs/apple/apple-iphone-13-pro/sierrablau-128gb")
wait = WebDriverWait(driver, 30)
wait.until(EC.element_to_be_clickable((By.XPATH, "//button[text()='Accept All']"))).click()
# XPath attribute tests need '@' — "[#class=...]" is invalid XPath.
radiooptions = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//span[@class='phx-radio__element']")))
for i in range(len(radiooptions)):
    # The page refreshes after each selection, so re-locate the radio
    # buttons on every iteration to avoid stale element references.
    radiooptions = driver.find_elements_by_xpath("//span[@class='phx-radio__element']")
    radiooptions[i].click()
    time.sleep(2)
Please locate the li element instead of the span:
//li[@data-qa='list_ColorVariant']
Also add a wait of about 5 seconds after you click an option, then click the next one.
I am using Python and Selenium to scrape tripadvisor all the reviews of a particular hotel and I am new to scraping. But currently it's scraping reviews from first 6 pages out of 36 pages. I need to scrape the reviews from all the pages in that hotel and save them into a csv file. Following is the code I'm using.
import csv
import time
import requests
import re
from selenium.common.exceptions import NoSuchElementException
from selenium import webdriver
driver = webdriver.Chrome("./chromedriver")
def check_exists_by_xpath(xpath):
    """Return True when *xpath* matches an element on the current page."""
    try:
        driver.find_element_by_xpath(xpath)
        return True
    except NoSuchElementException:
        return False


time.sleep(2)
def getHotelReviews():
    """Scrape every review on the current page and append rows to the CSV.

    Uses the module-level ``driver`` and ``csvWriter``.  All XPath class
    tests use '@class' — the original '#class' is invalid XPath syntax.
    """
    # Find and click the "More" link (to load/expand all reviews).
    driver.find_element_by_xpath("//span[@class='_33O9dg0j']").click()
    time.sleep(20)
    reviews = driver.find_elements_by_xpath("//div[@data-test-target='reviews-tab']/div")
    reviews_count = len(reviews)
    print(reviews_count)
    # Loop through the reviews found; the first two divs appear to be
    # non-review containers, hence the range starting at 2 — confirm.
    for i in range(2, reviews_count):
        try:
            if check_exists_by_xpath(".//div[contains(@class,'_2f_ruteS _1bona3Pu _2uD5bLZZ')]/div[2]/div/span[1]"):
                # Expand the truncated review text.
                reviews[i].find_element_by_xpath(
                    ".//div[contains(@class,'_2f_ruteS _1bona3Pu _2uD5bLZZ')]/div[2]/div/span[1]").click()
                time.sleep(20)
            if check_exists_by_xpath(".//div[contains(@class,'_2f_ruteS _1bona3Pu')]/div/q/span"):
                review = reviews[i].find_element_by_xpath(
                    ".//div[contains(@class,'_2f_ruteS _1bona3Pu')]/div/q/span").text
                print(review)
                date = reviews[i].find_element_by_xpath(".//span[contains(@class,'_34Xs-BQm')]").text
                print(date)
                title = reviews[i].find_element_by_xpath(".//div[contains(@class,'glasR4aX')]/a/span").text
                print(title)
                # Save to CSV.
                csvWriter.writerow((date, title, review))
        except Exception:
            # Stop at the first review that doesn't match the expected layout.
            break
    driver.close()
    driver.switch_to.window(driver.window_handles[0])
def getHotelPages(url):
    """Open *url* and append the href of every pagination link to URLs."""
    driver.get(url)
    # To maximize the driver window.
    driver.maximize_window()
    # '@class', not '#class' — '#' is invalid inside an XPath attribute test.
    nextPage = driver.find_elements_by_xpath("//a[contains(@class,'pageNum cx_brand_refresh_phase2 ')]")
    noOfPages = len(nextPage)
    print(noOfPages)
    for i in range(noOfPages):
        print(nextPage[i].get_attribute("href"))
        URLs.append(nextPage[i].get_attribute("href"))
# Seed list: first results page; getHotelPages() appends the remaining pages.
URLs = [
    'https://www.tripadvisor.com/Hotel_Review-g304141-d3895228-Reviews-The_Hideout_Sigiriya-Sigiriya_Central_Province.html#REVIEWS']

# Prepare the CSV output file.
csvFile = open("hideoutSigiriyab_reviews1.csv", "w", newline='', encoding="utf-8")
csvWriter = csv.writer(csvFile)
csvWriter.writerow(['Date', 'Title', 'Review'])

try:
    getHotelPages(URLs[0])
except Exception:
    # Narrowed from a bare 'except:' so KeyboardInterrupt/SystemExit
    # are no longer swallowed; the best-effort behaviour is unchanged.
    print("Error!!")
    time.sleep(60)

for url in URLs:
    # Open each page in a fresh tab before scraping it.
    driver.execute_script("window.open('');")
    driver.switch_to.window(driver.window_handles[1])
    driver.get(url)
    getHotelReviews()
    time.sleep(20)

csvFile.close()
driver.close()
Can you help me by suggesting a method or a working code to scrape the reviews from all the pages of a hotel.
Simple way to click pages 1-36.
# The last page-number link in the pager holds the total page count.
size = int(driver.find_element_by_css_selector('div.pageNumbers >a:nth-last-child(1)').text)
# range(2, size) stops at size-1 and skips the final page; size + 1
# makes the loop click every page from 2 through `size` inclusive.
for i in range(2, size + 1):
    pageNums = WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.CSS_SELECTOR, "div.pageNumbers")))
    pageNums.find_element_by_xpath("//a[text()='{}']".format(i)).click()
    time.sleep(5)
Import
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
I'm facing an issue with the Chrome WebDriver: Selenium is not giving me updated content, it keeps showing me the previous content. After clicking the next-page link, the new data does appear in the browser, but when I read it through the driver it still returns the previous page.
the site link is: www.abc.com
My goal is to extract all the job links, but I'm unable to do it — please help me with this.
# NOTE: requires 'import math', 'import time' and the selenium imports
# at the top of the file.
job_links = []
per_page = 9
total_jobs = int(driver.find_element_by_css_selector(".search-results-count.total-jobs").text.split("(")[1].split(")")[0])
total_pages = math.ceil(total_jobs / per_page)
# range(1, total_pages) would skip the last page; iterate 1..total_pages.
for x in range(1, total_pages + 1):
    print("Page number: ", x)
    time.sleep(5)
    # XPath attribute tests use '@class' — '#class' is invalid XPath.
    jobs_on_page = driver.find_elements_by_xpath("//div[@class='module job-card-wrapper col-md-4 col-xs-12 col-sm-6 corporate-regular background-white']")
    for job in jobs_on_page:
        print("job is:", job)
        job_link = job.find_element_by_xpath("./a").get_attribute('href').split("%")[0]
        job_links.append(job_link)
    # Don't click "next" after the final page.
    if x != total_pages:
        print("Hello Page: ", x)
        element = driver.find_element_by_xpath(
            "//div[@class='reinvent-pagination-next']//span[@class='arrow cta-arrow']")
        webdriver.ActionChains(driver).move_to_element(element).click(element).perform()
        time.sleep(10)
It keeps giving me the first page's job links, even though the page changes in the WebDriver.
Use WebDriverWait() with visibility_of_all_elements_located() and the following CSS selector to get all the links.
Use infinite while loop and check for next button available using try..except
Code:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import time  # used by time.sleep below; missing from the snippet's imports

options = Options()
options.add_argument("start-maximized")
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
driver = webdriver.Chrome(options=options)
driver.get("https://www.boom.com")

Alllinks = []
while True:
    # Wait for the job cards of the current page, then harvest their hrefs.
    elements = WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located(
        (By.CSS_SELECTOR, "div.module > a[data-linkcomponentname='jobsearchblock']")))
    for link in elements:
        Alllinks.append(link.get_attribute('href'))
    try:
        next_btn = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, '//a[@class="next-page-btn"]')))
        # JS click sidesteps "element not clickable" interception issues.
        driver.execute_script("arguments[0].click();", next_btn)
    except Exception:
        break  # no next button: last page reached
    time.sleep(1)

print('Total links :' + str(len(Alllinks)))
print(Alllinks)
Output:
Total links :90
['https://www.boom.com/ca-en/careers/jobdetails?id=00728259_en&title=Sales+Capture+Lead+%e2%80%93+Large-Scale+Consulting%2c+Technology+and+Operations+Sales', 'https://www.boom.com/ca-en/careers/jobdetails?id=00778020_en&title=Business+Operations+Manager', 'https://www.boom.com/ca-en/careers/jobdetails?id=00804572_en&title=Test+Automation+Engineer', 'https://www.boom.com/ca-en/careers/jobdetails?id=00780841_en&title=Consulting+Senior+Manager%2c+Automotive', 'https://www.boom.com/ca-en/careers/jobdetails?id=00788609_en&title=Senior+Integration+Architect', 'https://www.boom.com/ca-en/careers/jobdetails?id=00788884_en&title=E2E+Senior+Tester', 'https://www.boom.com/ca-en/careers/jobdetails?id=00739145_en&title=Oracle+Project+Portfolio+Management+Cloud+Consultant', 'https://www.boom.com/ca-en/careers/jobdetails?id=00777973_en&title=Executive+Assistant', 'https://www.boom.com/ca-en/careers/jobdetails?id=00756315_en&title=Azure+Consultant', 'https://www.boom.com/ca-en/careers/jobdetails?id=00798395_en&title=Technology+Delivery+Lead+Manager', 'https://www.boom.com/ca-en/careers/jobdetails?id=00783770_en&title=SAP+Customer+Manager', 'https://www.boom.com/ca-en/careers/jobdetails?id=00780180_en&title=Oracle+Cloud+integration+Architect', 'https://www.boom.com/ca-en/careers/jobdetails?id=00740026_en&title=Smart+Spend+Manager', 'https://www.boom.com/ca-en/careers/jobdetails?id=00765639_en&title=Hybris+Architect%2fDevelopment+Lead', 'https://www.boom.com/ca-en/careers/jobdetails?id=00765637_en&title=Hybris+Senior+Developer', 'https://www.boom.com/ca-en/careers/jobdetails?id=00801716_en&title=Senior+Cloud+Native+Architect', 'https://www.boom.com/ca-en/careers/jobdetails?id=00762181_en&title=Smart+Spend+Consultant', 'https://www.boom.com/ca-en/careers/jobdetails?id=00752420_en&title=Senior+Cloud+Architect', 'https://www.boom.com/ca-en/careers/jobdetails?id=00785832_en&title=Digital+Technology+Consulting+Senior+Manager', 
'https://www.boom.com/ca-en/careers/jobdetails?id=00736712_en&title=Azure+Data+Architect+Manager', 'https://www.boom.com/ca-en/careers/jobdetails?id=00742724_en&title=Client+Financial+Management+Analyst', 'https://www.boom.com/ca-en/careers/jobdetails?id=00789817_en&title=SAP+Sourcing%2fProcurement+Manager', 'https://www.boom.com/ca-en/careers/jobdetails?id=00791760_en&title=SAP+HCM+Manager+-+H%26PS', 'https://www.boom.com/ca-en/careers/jobdetails?id=00782632_en&title=Workday+Integration+Senior+Analyst', 'https://www.boom.com/ca-en/careers/jobdetails?id=00775896_en&title=SAP+SCM+Manager', 'https://www.boom.com/ca-en/careers/jobdetails?id=00752413_en&title=Red+Hat+OpenShift+Cloud+Architect', 'https://www.boom.com/ca-en/careers/jobdetails?id=00759225_en&title=Cloud+Application+Architect', 'https://www.boom.com/ca-en/careers/jobdetails?id=00797835_en&title=SAP+S%2f4+HANA+EAM+Manager', 'https://www.boom.com/ca-en/careers/jobdetails?id=00778099_en&title=Front+Desk+Assistant%2fReception', 'https://www.boom.com/ca-en/careers/jobdetails?id=00734569_en&title=SAP+Payroll+Consultant', 'https://www.boom.com/ca-en/careers/jobdetails?id=00747056_en&title=SAP+Ariba+Delivery+Manager', 'https://www.boom.com/ca-en/careers/jobdetails?id=00684615_en&title=Solutions+Architect%2fManager', 'https://www.boom.com/ca-en/careers/jobdetails?id=00740979_en&title=SAP+IBP+Manager', 'https://www.boom.com/ca-en/careers/jobdetails?id=00594586_en&title=Sales+Capture%2c+Senior+Manager+(Application+Services)', 'https://www.boom.com/ca-en/careers/jobdetails?id=00752409_en&title=Sr+Implementation+Specialist', 'https://www.boom.com/ca-en/careers/jobdetails?id=00784403_en&title=Senior+Technical+Consultant', 'https://www.boom.com/ca-en/careers/jobdetails?id=00778080_en&title=Marketing+Campaign+Manager', 'https://www.boom.com/ca-en/careers/jobdetails?id=00786043_en&title=Microservices%2fJava+Spring+Boot+Developer', 
'https://www.boom.com/ca-en/careers/jobdetails?id=00774712_en&title=SAP+S%2f4+Finance+Consultant+-+SAP+Technology', 'https://www.boom.com/ca-en/careers/jobdetails?id=00756729_en&title=SAP+Delivery+Lead+-+SAP+Technology', 'https://www.boom.com/ca-en/careers/jobdetails?id=00758527_en&title=Management+Consulting+Manager+%e2%80%93+Utilities+T%26D+(Toronto)', 'https://www.boom.com/ca-en/careers/jobdetails?id=00789288_en&title=SAP+Finance+Manager+-+Health+and+Public+Services+Sector', 'https://www.boom.com/ca-en/careers/jobdetails?id=00789286_en&title=SAP+Finance+Consultant+-+Health+and+Public+Services+Sector', 'https://www.boom.com/ca-en/careers/jobdetails?id=00752355_en&title=Oracle+Cloud+SCM+Consutant', 'https://www.boom.com/ca-en/careers/jobdetails?id=00733096_en&title=Oracle+Cloud+-+Order+To+Cash+Functional+Consultant', 'https://www.boom.com/ca-en/careers/jobdetails?id=00782656_en&title=Sr+Oracle+Projects+Lead', 'https://www.boom.com/ca-en/careers/jobdetails?id=00756751_en&title=Data+Governance+Senior+Manager', 'https://www.boom.com/ca-en/careers/jobdetails?id=00789201_en&title=Technical+Consultant', 'https://www.boom.com/ca-en/careers/jobdetails?id=00768916_en&title=CI+Functional+Designer+-+Technology+Consultant', 'https://www.boom.com/ca-en/careers/jobdetails?id=00747893_en&title=SAP+S4+HANA+Finance+%e2%80%93+Senior+Manager+(IPT)', 'https://www.boom.com/ca-en/careers/jobdetails?id=00768965_en&title=Data+Engineering+Manager', 'https://www.boom.com/ca-en/careers/jobdetails?id=00721462_en&title=AEM+Architect', 'https://www.boom.com/ca-en/careers/jobdetails?id=00754980_en&title=Sales+Capture+Senior+Manager+-+Financial+Services', 'https://www.boom.com/ca-en/careers/jobdetails?id=00791449_en&title=Azure+Cloud+Operations+Lead', 'https://www.boom.com/ca-en/careers/jobdetails?id=00779191_en&title=Workday+Data+Consultant', 'https://www.boom.com/ca-en/careers/jobdetails?id=00785754_en&title=Organization+Change+Senior+Manager', 
'https://www.boom.com/ca-en/careers/jobdetails?id=00752384_en&title=Full+Stack+Developer', 'https://www.boom.com/ca-en/careers/jobdetails?id=00766888_en&title=Oracle+Cloud+ERP+-+Business+Lead', 'https://www.boom.com/ca-en/careers/jobdetails?id=00770105_en&title=SAP+Finance+Manager', 'https://www.boom.com/ca-en/careers/jobdetails?id=00788292_en&title=Systems+Engineer+(Azure%2c+Cloud+%26+O365)', 'https://www.boom.com/ca-en/careers/jobdetails?id=00755903_en&title=Cloud+Engineer+Consulting+Manager', 'https://www.boom.com/ca-en/careers/jobdetails?id=00749401_en&title=Azure+Cloud+Architect', 'https://www.boom.com/ca-en/careers/jobdetails?id=00768544_en&title=Cloud+Native+Developer', 'https://www.boom.com/ca-en/careers/jobdetails?id=00773267_en&title=Global+Category+Management+Associate+Manager+(Canada)', 'https://www.boom.com/ca-en/careers/jobdetails?id=00752415_en&title=Pivotal+Cloud+Foundry+Developer', 'https://www.boom.com/ca-en/careers/jobdetails?id=00763409_en&title=Mulesoft+Architect', 'https://www.boom.com/ca-en/careers/jobdetails?id=00775495_en&title=Consulting+Manager+-+Contact+Center+Strategy+(Retail+Banking)', 'https://www.boom.com/ca-en/careers/jobdetails?id=00780965_en&title=SAP+Finance+Transformation+Senior+Manager', 'https://www.boom.com/ca-en/careers/jobdetails?id=00760167_en&title=SAP+Fieldglass+Consultant', 'https://www.boom.com/ca-en/careers/jobdetails?id=00780860_en&title=Oracle+Cloud+SCM+-+Manager', 'https://www.boom.com/ca-en/careers/jobdetails?id=00780864_en&title=Oracle+Cloud+Finance+-+Manager', 'https://www.boom.com/ca-en/careers/jobdetails?id=00751969_en&title=Innovation+and+Best+Practices+(F%26A)+Associate+Director', 'https://www.boom.com/ca-en/careers/jobdetails?id=00781338_en&title=SAP+SuccessFactors+Employee+Central+Consultant', 'https://www.boom.com/ca-en/careers/jobdetails?id=00779384_en&title=Vlocity+%2f+Salesforce+Developer', 
'https://www.boom.com/ca-en/careers/jobdetails?id=00744256_en&title=SAP+S%2f4+HANA+Finance+Senior+Manager-+SAP+Technology', 'https://www.boom.com/ca-en/careers/jobdetails?id=00774716_en&title=SAP+Technical+Architect+-+Senior+Leader', 'https://www.boom.com/ca-en/careers/jobdetails?id=00756760_en&title=AWS+Cloud+Architect+Specialist', 'https://www.boom.com/ca-en/careers/jobdetails?id=00769005_en&title=SAP+SuccessFactors+Recruiting+%26+Onboarding+Consultant', 'https://www.boom.com/ca-en/careers/jobdetails?id=00735660_en&title=SAP+SuccessFactors+LMS+Consultant', 'https://www.boom.com/ca-en/careers/jobdetails?id=00736215_en&title=SAP+SuccessFactors+Technical+Solution+Architect', 'https://www.boom.com/ca-en/careers/jobdetails?id=00747061_en&title=SAP+S4+HANA+Supply+Chain+(SCM)+-+Senior+Manager', 'https://www.boom.com/ca-en/careers/jobdetails?id=00747058_en&title=SAP+S4+HANA+Central+Finance+Senior+Manager+-+SAP+Technologies', 'https://www.boom.com/ca-en/careers/jobdetails?id=00776370_en&title=ERP+(SAP%2c+Oracle)+Security+Senior+Manager', 'https://www.boom.com/ca-en/careers/jobdetails?id=00773097_en&title=Organization+Change+Manager', 'https://www.boom.com/ca-en/careers/jobdetails?id=00773095_en&title=Organization+Change+Consultant', 'https://www.boom.com/ca-en/careers/jobdetails?id=00773099_en&title=Organization+Change+Manager', 'https://www.boom.com/ca-en/careers/jobdetails?id=00768546_en&title=Cloud+Native+Senior+Application+Developer', 'https://www.boom.com/ca-en/careers/jobdetails?id=00766506_en&title=Mulesoft+Developer', 'https://www.boom.com/ca-en/careers/jobdetails?id=00748946_en&title=Senior+Software+Engineer%2fTeam+Lead', 'https://www.boom.com/ca-en/careers/jobdetails?id=00334756_en&title=Military+Service+Members+and+Veterans+-+Canada+%2b%2b']
I am trying to make a scraping application for Hants.gov.uk, and right now I am working on just clicking through the pages rather than scraping them. When it gets to the last row of page 1 it simply stops, so what I did was make it click the "Next Page" button — but first it has to go back to the original URL. It clicks to page 2, but after page 2 is scraped it doesn't go to page 3; it just restarts page 2.
Can somebody help me fix this issue?
Code:
import time
import config # Don't worry about this. This is an external file to make a DB
import urllib.request
from bs4 import BeautifulSoup
from selenium import webdriver
# Recent-decisions search results on the Hampshire planning portal.
url = "https://planning.hants.gov.uk/SearchResults.aspx?RecentDecisions=True"
# Local chromedriver binary; adjust the path for your machine.
driver = webdriver.Chrome(executable_path=r"C:\Users\Goten\Desktop\chromedriver.exe")
driver.get(url)
# Dismiss the site's terms-acceptance interstitial before scraping.
driver.find_element_by_id("mainContentPlaceHolder_btnAccept").click()
def start():
    # Collect the result links on the current page; revisit links already seen.
    elements = driver.find_elements_by_css_selector(".searchResult a")
    links = [link.get_attribute("href") for link in elements]
    # NOTE(review): 'result' is re-created on every call, so 'link not in
    # result' is true for every fresh link and the else branch below never
    # runs — presumably 'result' was meant to persist across calls (e.g. at
    # module level, as the answer below does); confirm the intent.
    result = []
    for link in links:
        if link not in result:
            result.append(link)
        else:
            driver.get(link)
            goUrl = urllib.request.urlopen(link)
            soup = BeautifulSoup(goUrl.read(), "html.parser")
            #table = soup.find_element_by_id("table", {"class": "applicationDetails"})
            for i in range(20):
                pass # Don't worry about all this commented code, it isn't relevant right now
                #table = soup.find_element_by_id("table", {"class": "applicationDetails"})
                #print(table.text)
                # div = soup.select("div.applicationDetails")
                # getDiv = div[i].split(":")[1].get_text()
                # log = open("log.txt", "a")
                # log.write(getDiv + "\n")
                #log.write("\n")
start()
driver.get(url)
for i in range(5):
    driver.find_element_by_id("ctl00_mainContentPlaceHolder_lvResults_bottomPager_ctl02_NextButton").click()
    # Pagination on this site is a JavaScript postback: the browser URL never
    # changes. The original re-read driver.current_url and called
    # driver.get(url) here, which reloaded the same static URL and threw the
    # pager back to the start — that is why it kept repeating page 2.
    start()
driver.close()
try this:
import time
# import config # Don't worry about this. This is an external file to make a DB
import urllib.request
from bs4 import BeautifulSoup
from selenium import webdriver
# Landing page: recent planning decisions.
url = "https://planning.hants.gov.uk/SearchResults.aspx?RecentDecisions=True"
driver = webdriver.Chrome()
driver.get(url)
# Accept the site's terms before anything else is clickable.
driver.find_element_by_id("mainContentPlaceHolder_btnAccept").click()
# Collected result links; filled by start(), consumed by start2().
result = []
def start():
    """Append the href of every search-result anchor on the current page
    to the module-level ``result`` list."""
    anchors = driver.find_elements_by_css_selector(".searchResult a")
    result.extend(anchor.get_attribute("href") for anchor in anchors)
def start2():
    # Visit every collected link and parse its page with BeautifulSoup.
    for link in result:
        # if link not in result:
        #     result.append(link)
        # else:
        driver.get(link)
        goUrl = urllib.request.urlopen(link)
        soup = BeautifulSoup(goUrl.read(), "html.parser")
        #table = soup.find_element_by_id("table", {"class": "applicationDetails"})
        for i in range(20):
            pass # Don't worry about all this commented code, it isn't relevant right now
            #table = soup.find_element_by_id("table", {"class": "applicationDetails"})
            #print(table.text)
            # div = soup.select("div.applicationDetails")
            # getDiv = div[i].split(":")[1].get_text()
            # log = open("log.txt", "a")
            # log.write(getDiv + "\n")
            #log.write("\n")
# Walk every result page: scrape it, then advance via the pager's "next" arrow.
while True:
    start()
    next_arrow = driver.find_element_by_class_name('rdpPageNext')
    try:
        # The site disables the arrow on the last page by setting its
        # onclick handler to "return false;".
        if next_arrow.get_attribute('onclick') == "return false;":
            break
        next_arrow.click()
    except:
        break

print(result)
start2()
driver.get(url)
As per the url https://planning.hants.gov.uk/SearchResults.aspx?RecentDecisions=True to click through all the pages you can use the following solution:
Code Block:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
options = Options()
options.add_argument("start-maximized")
options.add_argument("disable-infobars")
options.add_argument("--disable-extensions")
driver = webdriver.Chrome(chrome_options=options, executable_path=r'C:\Utility\BrowserDrivers\chromedriver.exe')
driver.get('https://planning.hants.gov.uk/SearchResults.aspx?RecentDecisions=True')
# Accept the disclaimer, then count the pager links to know how many pages to walk.
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.ID, "mainContentPlaceHolder_btnAccept"))).click()
numLinks = len(WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "div#ctl00_mainContentPlaceHolder_lvResults_topPager div.rdpWrap.rdpNumPart>a"))))
print(numLinks)
for i in range(numLinks):
    print("Perform your scrapping here on page {}".format(str(i+1)))
    # XPath attribute tests need '@' — "[#id=...]" is CSS syntax and is
    # invalid XPath, so the original selector could never match.
    WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//div[@id='ctl00_mainContentPlaceHolder_lvResults_topPager']//div[@class='rdpWrap rdpNumPart']//a[@class='rdpCurrentPage']/span//following::span[1]"))).click()
driver.quit()
Console Output:
8
Perform your scrapping here on page 1
Perform your scrapping here on page 2
Perform your scrapping here on page 3
Perform your scrapping here on page 4
Perform your scrapping here on page 5
Perform your scrapping here on page 6
Perform your scrapping here on page 7
Perform your scrapping here on page 8
Hi @Feitan Portor, you have written the code almost perfectly. The only reason you are redirected back to the first page is that you set url = driver.current_url in the last for loop: the URL stays static, and only JavaScript triggers the next-click event. So just remove url = driver.current_url and driver.get(url) and you are good to go — I have tested it myself.
also to get the current page that your scraper is in just add this part in the for loop so you will get to know where your scraper is :
ss = driver.find_element_by_class_name('rdpCurrentPage').text
print(ss)
Hope this solves your confusion
I have the code below to scrape a site and it works with no problems. Then I wanted to use only Selenium, so I changed the code as follows, and now I get errors. I don't know why — can anyone help me?
webdriver.PhantomJS() Errors
Exception: Message: {"errorMessage":"Element does not exist in cache"
webdriver.Chrome() Error:
Exception: Message: stale element reference: element is not attached to the page document
Selenium only code
driver = webdriver.Chrome() # or webdriver.PhantomJS()
# NOTE(review): find_elements_by_css_selector (plural) returns a LIST of
# elements, so the .get_attribute(...) calls below are made on lists and
# fail — use find_element_* (singular) or index/iterate the lists.
a = driver.find_elements_by_css_selector(findTag + "." + findValue + " a")
# NOTE(review): no space before "img", so this selects tags literally named
# e.g. "div#value" + "img" rather than <img> descendants — presumably a typo.
img = driver.find_elements_by_css_selector(findTag + "#" + findValue + "img")
href = a.get_attribute('href')
src = img.get_attribute("src")
Selenium + BeautifulSoup code:
driver = webdriver.Chrome() # or webdriver.PhantomJS()
# Render with Selenium, then parse the static HTML snapshot with BeautifulSoup.
soup = bs4.BeautifulSoup(driver.page_source, "html.parser")
# NOTE(review): find_all() returns a ResultSet (list-like), so 'a' and 'img'
# are collections here; calling .get(...) on them below does not read a tag
# attribute — iterate the results or use find() for a single element.
a = soup.find(findTag, class_=findValue).find_all("a")
img = soup.find(findTag, id=findValue).find_all("img")
href = a.get("href")
src = img.get("src")
Have you tried to implement waits? It would go as follow:
# from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.common.by import By
# from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Chrome() # or webdriver.PhantomJS()

# Here check that your image is in the page's document.
# WebDriverWait is a standalone class (selenium.webdriver.support.ui), not
# an attribute of the driver — 'driver.WebDriverWait' raises AttributeError.
wait = WebDriverWait(driver, 30).until(EC.visibility_of_element_located((By.ID, "YourImgId")))

# Now try to find the elements in the DOM.  find_element_* (singular) is
# used because get_attribute() must be called on a single element, not on
# the list that find_elements_* returns.
img = driver.find_element_by_css_selector(findTag + "#" + findValue + "img")
a = driver.find_element_by_css_selector(findTag + "." + findValue + " a")
href = a.get_attribute('href')
src = img.get_attribute("src")
Hope this helps :)
About waits: http://selenium-python.readthedocs.io/waits.html
Edit: not a wait issue
Just navigate to the page with selenium, enter your credential and then use beautifulsoup to scrape the page. It should then be fine :)
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
ex_path = r"C:\chromedriver_win32\chromedriver.exe"

# Going to the link.
driver = webdriver.Chrome(executable_path=ex_path)
driver.get("http://ipcamera-viewer.com/view/?camera_code=199619")

# Enter the password and submit.
code = driver.find_element_by_name("pass")
code.send_keys("5042")
code.send_keys(Keys.ENTER)

# Now get the soup from the rendered page.
soup = BeautifulSoup(driver.page_source, "html.parser")
element_ = soup.find("ul", id="grid")
images_links = []
for img in element_.find_all("img"):
    images_links.append(img.get("src"))
# print(...) as a function works on Python 2 and 3; the original bare
# 'print images_links[0:2]' statement is a SyntaxError on Python 3.
print(images_links[0:2])
Output:
>>> [u'http://ipcamera-viewer.com/image/?p=199619_20170301_201334_5668.jpg', u'http://ipcamera-viewer.com/image/?p=199619_20170301_201329_5611.jpg']