Python - Selenium next page

I am trying to make a scraping application to scrape Hants.gov.uk, and right now I am working on just clicking through the pages instead of scraping. When it gets to the last row on page 1 it just stops, so what I did was make it click the "Next Page" button, but first it has to go back to the original URL. It clicks page 2, but after page 2 is scraped it doesn't go to page 3; it just restarts page 2.
Can somebody help me fix this issue?
Code:
import time
import config  # Don't worry about this. This is an external file to make a DB
import urllib.request
from bs4 import BeautifulSoup
from selenium import webdriver

url = "https://planning.hants.gov.uk/SearchResults.aspx?RecentDecisions=True"

driver = webdriver.Chrome(executable_path=r"C:\Users\Goten\Desktop\chromedriver.exe")
driver.get(url)
driver.find_element_by_id("mainContentPlaceHolder_btnAccept").click()

def start():
    elements = driver.find_elements_by_css_selector(".searchResult a")
    links = [link.get_attribute("href") for link in elements]
    result = []
    for link in links:
        if link not in result:
            result.append(link)
        else:
            driver.get(link)
            goUrl = urllib.request.urlopen(link)
            soup = BeautifulSoup(goUrl.read(), "html.parser")
            #table = soup.find_element_by_id("table", {"class": "applicationDetails"})
            for i in range(20):
                pass  # Don't worry about all this commented code, it isn't relevant right now
                #table = soup.find_element_by_id("table", {"class": "applicationDetails"})
                #print(table.text)
                # div = soup.select("div.applicationDetails")
                # getDiv = div[i].split(":")[1].get_text()
                # log = open("log.txt", "a")
                # log.write(getDiv + "\n")
                #log.write("\n")

start()
driver.get(url)
for i in range(5):
    driver.find_element_by_id("ctl00_mainContentPlaceHolder_lvResults_bottomPager_ctl02_NextButton").click()
    url = driver.current_url
    start()
    driver.get(url)
driver.close()

Try this:
import time
# import config # Don't worry about this. This is an external file to make a DB
import urllib.request
from bs4 import BeautifulSoup
from selenium import webdriver

url = "https://planning.hants.gov.uk/SearchResults.aspx?RecentDecisions=True"

driver = webdriver.Chrome()
driver.get(url)
driver.find_element_by_id("mainContentPlaceHolder_btnAccept").click()

result = []

def start():
    elements = driver.find_elements_by_css_selector(".searchResult a")
    links = [link.get_attribute("href") for link in elements]
    result.extend(links)

def start2():
    for link in result:
        # if link not in result:
        #     result.append(link)
        # else:
        driver.get(link)
        goUrl = urllib.request.urlopen(link)
        soup = BeautifulSoup(goUrl.read(), "html.parser")
        #table = soup.find_element_by_id("table", {"class": "applicationDetails"})
        for i in range(20):
            pass  # Don't worry about all this commented code, it isn't relevant right now
            #table = soup.find_element_by_id("table", {"class": "applicationDetails"})
            #print(table.text)
            # div = soup.select("div.applicationDetails")
            # getDiv = div[i].split(":")[1].get_text()
            # log = open("log.txt", "a")
            # log.write(getDiv + "\n")
            #log.write("\n")

while True:
    start()
    element = driver.find_element_by_class_name('rdpPageNext')
    try:
        check = element.get_attribute('onclick')
        if check != "return false;":
            element.click()
        else:
            break
    except:
        break

print(result)
start2()
driver.get(url)

As per the URL https://planning.hants.gov.uk/SearchResults.aspx?RecentDecisions=True, to click through all the pages you can use the following solution:
Code Block:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

options = Options()
options.add_argument("start-maximized")
options.add_argument("disable-infobars")
options.add_argument("--disable-extensions")
driver = webdriver.Chrome(chrome_options=options, executable_path=r'C:\Utility\BrowserDrivers\chromedriver.exe')
driver.get('https://planning.hants.gov.uk/SearchResults.aspx?RecentDecisions=True')
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.ID, "mainContentPlaceHolder_btnAccept"))).click()
numLinks = len(WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "div#ctl00_mainContentPlaceHolder_lvResults_topPager div.rdpWrap.rdpNumPart>a"))))
print(numLinks)
for i in range(numLinks):
    print("Perform your scraping here on page {}".format(str(i + 1)))
    WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//div[@id='ctl00_mainContentPlaceHolder_lvResults_topPager']//div[@class='rdpWrap rdpNumPart']//a[@class='rdpCurrentPage']/span//following::span[1]"))).click()
driver.quit()
Console Output:
8
Perform your scraping here on page 1
Perform your scraping here on page 2
Perform your scraping here on page 3
Perform your scraping here on page 4
Perform your scraping here on page 5
Perform your scraping here on page 6
Perform your scraping here on page 7
Perform your scraping here on page 8

Hi @Feitan Portor, you have written the code absolutely right. The only reason you are redirected back to the first page is the url = driver.current_url in the last for loop: the URL stays static and only JavaScript instigates the next-click event. Just remove url = driver.current_url and driver.get(url) and you are good to go; I have tested it myself.
Also, to know which page your scraper is currently on, add this inside the for loop:
ss = driver.find_element_by_class_name('rdpCurrentPage').text
print(ss)
Hope this solves your confusion.
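For reference, a minimal sketch of the corrected loop after those two lines are removed (it assumes the same element IDs and the start() helper from the question; not tested against the live site):
start()  # scrape page 1
for i in range(5):
    # the Next button fires a JavaScript postback, so no driver.get() is needed
    driver.find_element_by_id("ctl00_mainContentPlaceHolder_lvResults_bottomPager_ctl02_NextButton").click()
    print(driver.find_element_by_class_name('rdpCurrentPage').text)  # current page number
    start()  # scrape the page that has just loaded
driver.close()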

Related

navigate to next page and get href link

How do I navigate to the last page and get all href links from a page whose URL does not change?
Here is my code:
url = 'https://hoaxornot.detik.com/paging#'
options = webdriver.ChromeOptions()
pathToChromeDriver = "C:/Program Files/Google/Chrome/Application/chromedriver.exe"
browser = webdriver.Chrome(executable_path=pathToChromeDriver,
                           options=options)
try:
    browser.get(url)
    browser.implicitly_wait(10)
    html = browser.page_source
    page = 1
    while page <= 2:
        paging = browser.find_elements_by_xpath('//*[@id="number_filters"]/a[{}]'.format(page)).click()
        for p in paging:
            articles = p.find_elements_by_xpath('//*[@id="results-search-hoax-paging"]/div/div/article/a')
            for article in articles:
                print(article.get_attribute("href"))
        page += 1
finally:
    browser.quit()
wait = WebDriverWait(browser, 60)
browser.get("https://hoaxornot.detik.com/paging#")
page = 1
articles = []
while True:
    try:
        time.sleep(1)
        pagearticles = wait.until(EC.visibility_of_all_elements_located((By.XPATH, '//*[@id="results-search-hoax-paging"]/div/div/article/a')))
        for article in pagearticles:
            articles.append(article.get_attribute("href"))
        page += 1
        wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="number_filters"]/a[{}]'.format(page)))).click()
    except:
        break
print(articles)
Here's a simple way to loop through the pages, waiting for the elements to become visible so you get their values instead of an empty list.
Imports:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
Outputs:
['https://news.detik.com/berita/d-5900248/video-jembatan-ambruk-disebut-di-samarinda-faktanya-bukan-di-indonesia', 'https://news.detik.com/berita/d-5898607/kantor-walkot-jakbar-diviralkan-rusak-akibat-gempa-ini-faktanya', 'https://news.detik.com/berita/d-5896931/polisi-di-singkawang-diviralkan-berbahasa-china-di-publik-begini-faktanya', 'https://news.detik.com/berita-jawa-timur/d-5895069/video-viral-hutan-baluran-banjir-dipastikan-hoax-polisi-itu-video-lama', 'https://news.detik.com/internasional/d-5873027/beredar-video-ledakan-parah-di-dubai-ternyata-3-insiden-lama-beda-negara', 'https://news.detik.com/berita/d-5865905/awas-ikut-tertipu-sejumlah-warga-ke-kantor-pln-bali-gegara-hoax-rekrutmen', 'https://news.detik.com/berita/d-5863802/beredar-pesan-gambar-kpk-pantau-muktamar-nu-di-lampung-ini-faktanya', 'https://news.detik.com/berita/d-5842083/viral-video-ayah-pukuli-anak-pakai-balok-kayu-begini-faktanya', 'https://news.detik.com/berita/d-5798562/video-mobil-ngebut-190-kmjam-dikaitkan-vanessa-angel-dipastikan-hoax', 'https://news.detik.com/berita/d-5755035/muncul-isu-liar-jokowi-joget-tanpa-masker-di-papua-ini-faktanya', 'https://news.detik.com/berita/d-5729500/beredar-edaran-penerima-bantuan-pesantren-kemenag-pastikan-hoax', 'https://news.detik.com/berita-jawa-timur/d-5715146/5-bersaudara-di-surabaya-butuh-diadopsi-karena-papa-mama-meninggal-covid-19-hoaks', 'https://news.detik.com/berita/d-5714873/minta-maaf-ustaz-royan-jelaskan-viral-5-polisi-angkat-poster-demo-jokowi', 'https://health.detik.com/berita-detikhealth/d-5714239/viral-bawang-putih-tarik-cairan-dari-paru-paru-pasien-corona-ini-faktanya', 'https://health.detik.com/berita-detikhealth/d-5699731/awas-hoax-viral-info-vaksin-palsu-beredar-di-indonesia-ini-faktanya', 'https://finance.detik.com/berita-ekonomi-bisnis/d-5688266/hoax-pesan-bantuan-subsidi-gaji-rp-35-juta-jangan-dibuka', 'https://news.detik.com/berita-jawa-timur/d-5658878/2-sekolah-ditolak-warga-bondowoso-jadi-tempat-isolasi-satgas-tak-patah-arang', 'https://news.detik.com/berita/d-5655368/viral-video-demo-rusuh-di-jl-gajah-mada-polisi-pastikan-hoax', 'https://news.detik.com/berita/d-5755035/muncul-isu-liar-jokowi-joget-tanpa-masker-di-papua-ini-faktanya', 'https://news.detik.com/berita/d-5729500/beredar-edaran-penerima-bantuan-pesantren-kemenag-pastikan-hoax', 'https://news.detik.com/berita-jawa-timur/d-5715146/5-bersaudara-di-surabaya-butuh-diadopsi-karena-papa-mama-meninggal-covid-19-hoaks', 'https://news.detik.com/berita/d-5714873/minta-maaf-ustaz-royan-jelaskan-viral-5-polisi-angkat-poster-demo-jokowi', 'https://health.detik.com/berita-detikhealth/d-5714239/viral-bawang-putih-tarik-cairan-dari-paru-paru-pasien-corona-ini-faktanya', 'https://health.detik.com/berita-detikhealth/d-5699731/awas-hoax-viral-info-vaksin-palsu-beredar-di-indonesia-ini-faktanya', 'https://finance.detik.com/berita-ekonomi-bisnis/d-5688266/hoax-pesan-bantuan-subsidi-gaji-rp-35-juta-jangan-dibuka', 'https://news.detik.com/berita-jawa-timur/d-5658878/2-sekolah-ditolak-warga-bondowoso-jadi-tempat-isolasi-satgas-tak-patah-arang', 'https://news.detik.com/berita/d-5655368/viral-video-demo-rusuh-di-jl-gajah-mada-polisi-pastikan-hoax', 'https://news.detik.com/berita-jawa-tengah/d-5645668/heboh-ajakan-tolak-ppkm-darurat-di-pekalongan-ini-kata-polisi', 'https://news.detik.com/berita/d-5643373/heboh-tim-covid-buru-warga-tanjungpinang-langgar-ppkm-darurat-ini-faktanya', 'https://news.detik.com/berita/d-5638774/viral-rusa-keliaran-di-jalanan-denpasar-saat-ppkm-darurat-ini-faktanya', 
'https://health.detik.com/berita-detikhealth/d-5635282/deretan-hoax-air-kelapa-netralkan-vaksin-hingga-obati-covid-19', 'https://news.detik.com/berita-jawa-tengah/d-5633158/beredar-pesan-ada-pasien-corona-kabur-di-kudus-ternyata', 'https://news.detik.com/berita-jawa-tengah/d-5622194/viral-tim-sar-klaten-kewalahan-jasad-covid-belum-dimakamkan-ini-faktanya', 'https://news.detik.com/berita/d-5607406/beredar-isu-sutiyoso-meninggal-keluarga-tidak-benar', 'https://news.detik.com/berita-jawa-tengah/d-5603576/waspada-ada-akun-wa-catut-bupati-klaten-minta-sumbangan', 'https://news.detik.com/berita-jawa-tengah/d-5603472/heboh-pesan-berantai-soal-varian-baru-corona-di-kudus-ini-faktanya', 'https://news.detik.com/berita/d-5591931/beredar-poster-konvensi-capres-nu-2024-pbnu-pastikan-hoax', 'https://health.detik.com/berita-detikhealth/d-5591504/viral-hoax-makan-bawang-3-kali-sehari-sembuhkan-corona-ini-faktanya', 'https://news.detik.com/berita/d-5590632/viral-tes-antigen-pakai-air-keran-hasilnya-positif-satgas-kepri-menepis', 'https://news.detik.com/internasional/d-5586179/fakta-di-balik-aksi-penyiar-malaysia-tutup-1-mata-untuk-palestina', 'https://inet.detik.com/cyberlife/d-5585732/waspada-6-hoax-vaksin-bermagnet-hingga-china-siapkan-senjata-biologis', 'https://health.detik.com/berita-detikhealth/d-5533468/viral-jadi-sulit-ereksi-karena-vaksin-sinovac-ini-penjelasan-dokter', 'https://health.detik.com/berita-detikhealth/d-5527149/viral-cacing-di-masker-impor-dari-china-ini-fakta-di-baliknya', 'https://finance.detik.com/energi/d-5526617/viral-gaji-petugas-kebersihan-pertamina-rp-13-juta-manajemen-hoax', 'https://news.detik.com/berita-jawa-tengah/d-5519314/fakta-fakta-gibran-disebut-duduk-di-meja-menteri-pupr-duduk-di-kursi', 'https://finance.detik.com/energi/d-5511928/awas-hoax-bbm-langka-imbas-kilang-kebakaran-pertamina-stok-luber', 'https://news.detik.com/berita-jawa-tengah/d-5511550/viral-gibran-duduk-di-atas-meja-depan-menteri-basuki-begini-faktanya', 'https://news.detik.com/berita/d-5507088/geger-kaca-bus-transmetro-deli-medan-diduga-ditembak-begini-faktanya', 'https://health.detik.com/berita-detikhealth/d-5487986/viral-lansia-non-dki-bisa-vaksin-corona-di-senayan-dipastikan-hoax', 'https://finance.detik.com/berita-ekonomi-bisnis/d-5487983/awas-hoax-pesan-berantai-soal-vaksinasi-lansia-di-istora-senayan', 'https://health.detik.com/berita-detikhealth/d-5480124/hoax-tak-ada-larangan-minum-obat-jantung-sebelum-vaksin-covid-19', 'https://health.detik.com/berita-detikhealth/d-5473657/hoax-kemenkes-bantah-puluhan-wartawan-terkapar-setelah-vaksinasi-covid-19', 'https://health.detik.com/berita-detikhealth/d-5368305/minum-air-putih-bisa-atasi-kekentalan-darah-pasien-covid-19-ini-faktanya', 'https://health.detik.com/berita-detikhealth/d-5360703/viral-info-penemu-vaksin-covid-19-sinovac-meninggal-ini-faktanya', 'https://health.detik.com/berita-detikhealth/d-5357602/pasien-jalan-ngangkang-seperti-penguin-disebut-karena-anal-swab-ini-faktanya', 'https://finance.detik.com/moneter/d-5351004/kabar-bi-di-lockdown-bank-internasional-swiss-dipastikan-hoax', 'https://finance.detik.com/berita-ekonomi-bisnis/d-5350942/hoax-jangan-percaya-pesan-berantai-dana-bagi-bagi-uang-tunai', 'https://health.detik.com/berita-detikhealth/d-5340874/sederet-hoax-vaksin-jokowi-disebut-salah-suntik-hingga-tak-sampai-habis', 'https://health.detik.com/berita-detikhealth/d-5338133/hoax-viral-kasdim-0817-gresik-wafat-usai-vaksin-covid-19-ini-faktanya', 
'https://health.detik.com/berita-detikhealth/d-5337075/viral-urutan-mandi-agar-tak-kena-stroke-ini-faktanya', 'https://news.detik.com/berita/d-5328895/foto-bayi-selamat-dari-sriwijaya-air-sj182-dipastikan-hoax', 'https://health.detik.com/berita-detikhealth/d-5324630/viral-vaksin-covid-19-memperbesar-penis-bpom-hoax-lah', 'https://news.detik.com/berita-jawa-timur/d-5321500/wawali-surabaya-terpilih-armuji-dikabarkan-meninggal-ketua-dprd-hoaks', 'https://news.detik.com/berita/d-5287986/beredar-chat-kapolda-metro-soal-sikat-laskar-hrs-dipastikan-hoax', 'https://news.detik.com/berita/d-5286913/video-ambulans-fpi-masuk-rs-saat-ricuh-diviralkan-ini-faktanya', 'https://news.detik.com/berita-jawa-tengah/d-5280091/viral-bendung-gerak-serayu-jebol-kepala-upt-itu-kapal-ponton-hanyut', 'https://news.detik.com/berita-jawa-tengah/d-5279872/viral-asrama-isolasi-mandiri-ugm-penuh-ternyata-begini-faktanya', 'https://news.detik.com/berita/d-5275107/kpu-makassar-bantah-keluarkan-flyer-hasil-survei-paslon-pilwalkot-berlogo-kpu', 'https://news.detik.com/berita-jawa-tengah/d-5264429/beredar-voice-note-binatang-buas-gunung-merapi-turun-ke-selo-kades-hoax', 'https://news.detik.com/berita-jawa-tengah/d-5262931/viral-peta-bahaya-gunung-merapi-sejauh-10-km-bpptkg-itu-peta-2010', 'https://health.detik.com/berita-detikhealth/d-5254580/viral-tips-sembuhkan-covid-19-dalam-waktu-5-menit-dokter-paru-pastikan-hoax', 'https://news.detik.com/berita-jawa-timur/d-5253524/video-jenazah-covid-19-diviralkan-bola-mata-hilang-keluarga-sebut-hoaks', 'https://news.detik.com/berita/d-5287986/beredar-chat-kapolda-metro-soal-sikat-laskar-hrs-dipastikan-hoax', 'https://news.detik.com/berita/d-5286913/video-ambulans-fpi-masuk-rs-saat-ricuh-diviralkan-ini-faktanya', 'https://news.detik.com/berita-jawa-tengah/d-5280091/viral-bendung-gerak-serayu-jebol-kepala-upt-itu-kapal-ponton-hanyut', 'https://news.detik.com/berita-jawa-tengah/d-5279872/viral-asrama-isolasi-mandiri-ugm-penuh-ternyata-begini-faktanya', 'https://news.detik.com/berita/d-5275107/kpu-makassar-bantah-keluarkan-flyer-hasil-survei-paslon-pilwalkot-berlogo-kpu', 'https://news.detik.com/berita-jawa-tengah/d-5264429/beredar-voice-note-binatang-buas-gunung-merapi-turun-ke-selo-kades-hoax', 'https://news.detik.com/berita-jawa-tengah/d-5262931/viral-peta-bahaya-gunung-merapi-sejauh-10-km-bpptkg-itu-peta-2010', 'https://health.detik.com/berita-detikhealth/d-5254580/viral-tips-sembuhkan-covid-19-dalam-waktu-5-menit-dokter-paru-pastikan-hoax', 'https://news.detik.com/berita-jawa-timur/d-5253524/video-jenazah-covid-19-diviralkan-bola-mata-hilang-keluarga-sebut-hoaks', 'https://news.detik.com/berita/d-3124615/benarkah-sesuap-lele-mengandung-3000-sel-kanker', 'https://news.detik.com/berita/d-3124915/loket-tiket-konser-bon-jovi-di-gbk-dibakar-hoax']

Next Page Iteration in Selenium/BeautifulSoup for Scraping E-Commerce Website

I'm scraping an e-commerce website, Lazada, using Selenium and bs4. I manage to scrape the 1st page, but I am unable to iterate to the next page. What I'm trying to achieve is to scrape all the pages of the category I've selected.
Here is what I've tried:
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

# Run the argument with incognito
option = webdriver.ChromeOptions()
option.add_argument('--incognito')
driver = webdriver.Chrome(executable_path='chromedriver', chrome_options=option)
driver.get('https://www.lazada.com.my/')
driver.maximize_window()

# Select category item #
element = driver.find_elements_by_class_name('card-categories-li-content')[0]
webdriver.ActionChains(driver).move_to_element(element).click(element).perform()

t = 10
try:
    WebDriverWait(driver, t).until(EC.visibility_of_element_located((By.ID, "a2o4k.searchlistcategory.0.i0.460b6883jV3Y0q")))
except TimeoutException:
    print('Page Refresh!')
    driver.refresh()
    element = driver.find_elements_by_class_name('card-categories-li-content')[0]
    webdriver.ActionChains(driver).move_to_element(element).click(element).perform()
    print('Page Load!')

# Soup and select element
def getData(np):
    soup = bs(driver.page_source, "lxml")
    product_containers = soup.findAll("div", class_='c2prKC')
    for p in product_containers:
        title = p.find(class_='c16H9d').text  # title
        selling_price = p.find(class_='c13VH6').text  # selling price
        try:
            original_price = p.find("del", class_='c13VH6').text  # original price
        except:
            original_price = "-1"
        if p.find("i", class_='ic-dynamic-badge ic-dynamic-badge-freeShipping ic-dynamic-group-2'):
            freeShipping = 1
        else:
            freeShipping = 0
        try:
            discount = p.find("span", class_='c1hkC1').text
        except:
            discount = "-1"
        if p.find(("div", {'class': ['c16H9d']})):
            url = "https:" + p.find("a").get("href")
        else:
            url = "-1"
        nextpage_elements = driver.find_elements_by_class_name('ant-pagination-next')[0]
        np = webdriver.ActionChains(driver).move_to_element(nextpage_elements).click(nextpage_elements).perform()
        print("- -" * 30)
        toSave = [title, selling_price, original_price, freeShipping, discount, url]
        print(toSave)
        writerows(toSave, filename)

getData(np)
The problem might be that the driver is trying to click the button before the element is even loaded correctly.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome(PATH, chrome_options=option)

# use this code after driver initialization:
# it makes the driver wait up to 5 seconds for the page to load
driver.implicitly_wait(5)

url = "https://www.lazada.com.ph/catalog/?q=phone&_keyori=ss&from=input&spm=a2o4l.home.search.go.239e359dTYxZXo"
driver.get(url)

next_page_path = "//ul[@class='ant-pagination ']//li[@class=' ant-pagination-next']"

# the following code waits up to 5 seconds for the
# element to become clickable and then tries clicking it
try:
    next_page = WebDriverWait(driver, 5).until(
        EC.element_to_be_clickable((By.XPATH, next_page_path)))
    next_page.click()
except Exception as e:
    print(e)
EDIT 1
Changed the code to make the driver wait for the element to become clickable. You can put this code inside a while loop to iterate over multiple pages and break out of the loop when the button is no longer found or clickable, as sketched below.
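A minimal sketch of that loop, assuming next_page_path from the snippet above is still in scope; scrape_current_page() is a hypothetical placeholder for your own per-page scraping code:
while True:
    scrape_current_page()  # hypothetical placeholder for your scraping logic
    try:
        next_page = WebDriverWait(driver, 5).until(
            EC.element_to_be_clickable((By.XPATH, next_page_path)))
        next_page.click()
    except Exception:
        # button not found or not clickable: assume this was the last page
        break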

Python Selenium - Extract all URLs in a table and iterate until the next button disappears

I am trying to extract all URLs and keep clicking the next button until there isn't a next button any more. I would then like to open each URL, if that is possible. Could I be pointed in the right direction for this, please?
The website where you need to press the search button is here
Link to the table of URLs that need to be extracted
from selenium import webdriver
from selenium.webdriver.common.by import By
driver=webdriver.Chrome(executable_path=r"C:\Users\matt_\Documents\Python Scripts\Selenium\chromedriver.exe")
driver.get("https://publicaccess.aberdeencity.gov.uk/online-applications/search.do?action=monthlyList")
driver.find_element_by_xpath("/html/body/div/div/div[3]/div[3]/div/form/fieldset/div[5]/input[2]").click()
test = driver.find_elements(By.TAG_NAME,"a")
print(test)
Here is an example of what you are looking for:
from bs4 import BeautifulSoup as Soup
from selenium import webdriver
import pandas as pd
import time

driver = webdriver.Chrome()
driver.get("https://monerobenchmarks.info/")
final_list = []

def parsh_table():
    # re-parse the current page source so each page's rows are captured
    page = Soup(driver.page_source, features='html.parser')
    table = page.find('table')
    table_rows = table.find_all('tr')
    for tr in table_rows:
        td = tr.find_all('td')
        row = [i.text for i in td]
        final_list.extend(row)

def next_bu():
    next_button = driver.find_element_by_xpath('//*[@id="cpu_next"]')
    next_button.click()

# put range of pages here
for _ in range(1, 2):
    parsh_table()
    time.sleep(2)
    next_bu()

print(final_list)
You can check whether the element exists or not with simple logic like this:
if len(driver.find_elements_by_css_selector('.next')) > 0:
Try the below code:
driver.get('https://publicaccess.aberdeencity.gov.uk/online-applications/search.do?action=monthlyList')
search_btn = WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.CSS_SELECTOR, '.button.primary')))
search_btn.click()
condition = True
while condition:
    links = WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, 'li.searchresult a')))
    for link in links:
        print(link.get_attribute('href'))
    if len(driver.find_elements_by_css_selector('.next')) > 0:
        driver.find_element_by_css_selector('.next').click()
    else:
        condition = False
driver.quit()
The following imports are required:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
Here you go
from selenium import webdriver

driver = webdriver.Chrome(executable_path=r"C:\Users\matt_\Documents\Python Scripts\Selenium\chromedriver.exe")
driver.get("https://publicaccess.aberdeencity.gov.uk/online-applications/search.do?action=monthlyList")
driver.find_element_by_css_selector("input[value='Search']").click()

def parse():
    links = driver.find_elements_by_xpath('//*[@id="searchresults"]/li/a')
    for link in links:
        print(link.text, link.get_attribute("href"))
    try:
        driver.find_element_by_class_name('next').click()
        parse()
    except:
        print('complete')

parse()

Selenium: no next page

I am scraping one page, but the problem I came up against today was that the page didn't have a next page, and it gave me the previous page without any error from which I could determine that the page was the last one.
For example: https://example/page-7
When I try to go to https://example/page-8, which doesn't exist, it gives me the last page: https://example/page-7
How can I determine that https://example/page-7 was the last page, using Python 3?
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import urllib.request

page = "https://www.supermama.lt/forumas/topic/214375-vilma/page-1"
driver = webdriver.Firefox()
driver.get(page)
dd = driver.page_source

for j in range(100):
    soup = BeautifulSoup(dd, "html.parser")
    my_text = list(soup.findAll("div", class_="post-content"))
    for i in my_text:
        # collect some data
        pass
    page = "https://www.supermama.lt/forumas/topic/214375-vilma/page-{0}".format(j + 2)
    driver.get(page)
    dd = driver.page_source
At first I was thinking about checking for duplicates in the collected data, but that is too slow because I have 30,000 links from which I have to collect data. Maybe there is an easier solution?
Found the answer to my own question.
To find the current page URL, just use driver.current_url:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import urllib.request

page = "https://www.supermama.lt/forumas/topic/214375-vilma/"
driver = webdriver.Firefox()
driver.get(page)
dd = driver.page_source

current_pages = []
for j in range(100):
    page_url = driver.current_url
    if page_url not in current_pages:
        current_pages.append(page_url)
        soup = BeautifulSoup(dd, "html.parser")
        my_text = list(soup.findAll("div", class_="post-content"))
        for i in my_text:
            # collect some data
            pass
        page = "https://www.supermama.lt/forumas/topic/214375-vilma/page-{0}".format(j + 2)
        driver.get(page)
        dd = driver.page_source
    else:
        print(current_pages)
        driver.quit()
        break

Scraping: scraped links - now unable to scrape and dump html files into a folder

Using Python, Selenium, Sublime and Firefox: I am scraping the links off of this website and would like to save the scraped pages (as HTML files) into a folder. However, I have been working for days on trying to get the body of these HTML files to dump into a Dropbox folder. The problems are 1) saving the HTML files and 2) saving them to a Dropbox folder (or any folder).
I have successfully written code that will perform a search and then scrape the links off of a series of webpages. The following code works well for that.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import re
import csv
import pickle
import signal
import time

def handler(signum, frame):
    raise Exception('Last Resort!')

signal.signal(signal.SIGALRM, handler)

def isReady(browser):
    return browser.execute_script("return document.readyState") == "complete"

def waitUntilReady(browser):
    if not isReady(browser):
        waitUntilReady(browser)

def waitUntilReadyBreak(browser_b, url, counter):
    try:
        signal.alarm(counter)
        waitUntilReady(browser_b)
        signal.alarm(0)
    except Exception, e:
        print e
        signal.alarm(0)
        browser_b.close()
        browser_b = webdriver.Firefox()
        browser_b.get(url)
        waitUntilReadyBreak(browser_b, url, counter)
    return browser_b

browser = webdriver.Firefox()
thisurl = 'http://www.usprwire.com/cgi-bin/news/search.cgi'
browser.get(thisurl)
waitUntilReady(browser)
numarticles = 0
elem = WebDriverWait(browser, 60).until(EC.presence_of_element_located((By.NAME, "query")))
elem = browser.find_element_by_name("query")
elem.send_keys('"test"')
form = browser.find_element_by_xpath("/html/body/center/table/tbody/tr/td/table/tbody/tr[3]/td/table/tbody/tr[3]/td[2]/table/tbody/tr[3]/td/table/tbody/tr[1]/td/font/input[2]").click()
nextpage = False
all_newproduct_links = []
npages = 200
for page in range(1, npages + 1):
    if page == 1:
        elems = browser.find_elements_by_tag_name('a')
        article_url = [elems.get_attribute("href")
                       for elems in browser.find_elements_by_class_name('category_links')]
        print page
        print article_url
        print "END_A_PAGE"
        elem = browser.find_element_by_link_text('[>>]').click()
        waitUntilReady(browser)
    if page >= 2 <= 200:
        # click the dots
        print page
        print page
        print "B4 LastLoop"
        elems = WebDriverWait(browser, 60).until(EC.presence_of_element_located((By.CLASS_NAME, "category_links")))
        elems = browser.find_elements_by_tag_name('a')
        article_url = [elems.get_attribute("href")
                       for elems in browser.find_elements_by_class_name('category_links')]
        print page
        print article_url
        print "END_C_PAGE"
    # This is the part that will not work :(
    for e in elems:
        numarticles = numarticles + 1
        numpages = 0
        numpages = numpages + 1000
        article_url = e.get_attribute('href')
        print 'waiting'
        bodyelem.send_keys(Keys.COMMAND + "2")
        browser.get(article_url)
        waitUntilReady(browser)
        fw = open('/Users/My/Dropbox/MainFile/articlesdata/' + str(page) + str(numpages) + str(numarticles) + '.html', 'w')
        fw.write(browser.page_source.encode('utf-8'))
        fw.close()
        bodyelem2 = browser.find_elements_by_xpath("//body")[0]
        bodyelem2.send_keys(Keys.COMMAND + "1")
The above (for e in elems:) is meant to click on the page and create an html file containing the body of the scraped page. I seem to be missing something fundamental.
Any guidance at all would be most appreciated.
I think you are overcomplicating it.
There is at least one problem in this block:
elems = browser.find_elements_by_tag_name('a')
article_url = [elems.get_attribute("href")
               for elems in browser.find_elements_by_class_name('category_links')]
elems would contain the list of elements found by find_elements_by_tag_name(), but then you are reusing the same elems variable in the list comprehension. As a result, when you iterate over elems later, you get an error, since elems now refers to a single element and not a list.
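A minimal illustration of that fix, keeping the original calls but giving the comprehension its own variable (elm is just an illustrative name):
elems = browser.find_elements_by_tag_name('a')
# a separate name inside the comprehension leaves elems untouched
article_url = [elm.get_attribute("href")
               for elm in browser.find_elements_by_class_name('category_links')]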
Anyway, here is the approach I would take:
gather all the article urls first
iterate over the urls one by one and save the HTML source using the page url name as a filename. E.g. _Iran_Shipping_Report_Q4_2014_is_now_available_at_Fast_Market_Research_326303.shtml would be the article filename
The code:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

def isReady(browser):
    return browser.execute_script("return document.readyState") == "complete"

def waitUntilReady(browser):
    if not isReady(browser):
        waitUntilReady(browser)

browser = webdriver.Firefox()
browser.get('http://www.usprwire.com/cgi-bin/news/search.cgi')

# make a search
query = WebDriverWait(browser, 60).until(EC.presence_of_element_located((By.NAME, "query")))
query.send_keys('"test"')
submit = browser.find_element_by_xpath("//input[@value='Search']")
submit.click()

# grab article urls
npages = 4
article_urls = []
for page in range(1, npages + 1):
    article_urls += [elm.get_attribute("href") for elm in browser.find_elements_by_class_name('category_links')]
    browser.find_element_by_link_text('[>>]').click()

# iterate over urls and save the HTML source
for url in article_urls:
    browser.get(url)
    waitUntilReady(browser)
    title = browser.current_url.split("/")[-1]
    with open('/Users/My/Dropbox/MainFile/articlesdata/' + title, 'w') as fw:
        fw.write(browser.page_source.encode('utf-8'))
