navigate to next page and get href link - python

How do I navigate through to the last page and collect all the href links when the page URL does not change between pages?
Here is my code:
# Question code: the asker's logic is reproduced as posted. Only the block
# structure and the XPath attribute selectors (``@id``) have been restored.
url = 'https://hoaxornot.detik.com/paging#'
options = webdriver.ChromeOptions()
pathToChromeDriver = "C:/Program Files/Google/Chrome/Application/chromedriver.exe"
browser = webdriver.Chrome(executable_path=pathToChromeDriver,
                           options=options)
try:
    browser.get(url)
    browser.implicitly_wait(10)
    html = browser.page_source
    page = 1
    while page <= 2:
        # NOTE(review): find_elements_by_xpath returns a list, so calling
        # .click() on it fails; this is the problem the question is about.
        paging = browser.find_elements_by_xpath('//*[@id="number_filters"]/a[{}]'.format(page)).click()
        for p in paging:
            articles = p.find_elements_by_xpath('//*[@id="results-search-hoax-paging"]/div/div/article/a')
            for article in articles:
                print(article.get_attribute("href"))
        page += 1
finally:
    # Always release the browser, even when the scrape fails.
    browser.quit()

from selenium.common.exceptions import WebDriverException

# Walk the numbered pager, collecting every article link on each page.
wait = WebDriverWait(browser, 60)
browser.get("https://hoaxornot.detik.com/paging#")
page = 1
articles = []
while True:
    try:
        # Short pause so the freshly clicked page can start rendering.
        time.sleep(1)
        pagearticles = wait.until(EC.visibility_of_all_elements_located(
            (By.XPATH, '//*[@id="results-search-hoax-paging"]/div/div/article/a')))
        articles.extend(article.get_attribute("href") for article in pagearticles)
        page += 1
        # Click the pager link for the next page number.
        wait.until(EC.element_to_be_clickable(
            (By.XPATH, '//*[@id="number_filters"]/a[{}]'.format(page)))).click()
    except WebDriverException:
        # Covers the wait timing out when no further pager link exists, without
        # also swallowing KeyboardInterrupt or programming errors.
        break
print(articles)
Here's a simple way to loop through the pages and wait for the element's visibility to come up so you can obtain their values instead of an empty list.
Import:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
Outputs:
['https://news.detik.com/berita/d-5900248/video-jembatan-ambruk-disebut-di-samarinda-faktanya-bukan-di-indonesia', 'https://news.detik.com/berita/d-5898607/kantor-walkot-jakbar-diviralkan-rusak-akibat-gempa-ini-faktanya', 'https://news.detik.com/berita/d-5896931/polisi-di-singkawang-diviralkan-berbahasa-china-di-publik-begini-faktanya', 'https://news.detik.com/berita-jawa-timur/d-5895069/video-viral-hutan-baluran-banjir-dipastikan-hoax-polisi-itu-video-lama', 'https://news.detik.com/internasional/d-5873027/beredar-video-ledakan-parah-di-dubai-ternyata-3-insiden-lama-beda-negara', 'https://news.detik.com/berita/d-5865905/awas-ikut-tertipu-sejumlah-warga-ke-kantor-pln-bali-gegara-hoax-rekrutmen', 'https://news.detik.com/berita/d-5863802/beredar-pesan-gambar-kpk-pantau-muktamar-nu-di-lampung-ini-faktanya', 'https://news.detik.com/berita/d-5842083/viral-video-ayah-pukuli-anak-pakai-balok-kayu-begini-faktanya', 'https://news.detik.com/berita/d-5798562/video-mobil-ngebut-190-kmjam-dikaitkan-vanessa-angel-dipastikan-hoax', 'https://news.detik.com/berita/d-5755035/muncul-isu-liar-jokowi-joget-tanpa-masker-di-papua-ini-faktanya', 'https://news.detik.com/berita/d-5729500/beredar-edaran-penerima-bantuan-pesantren-kemenag-pastikan-hoax', 'https://news.detik.com/berita-jawa-timur/d-5715146/5-bersaudara-di-surabaya-butuh-diadopsi-karena-papa-mama-meninggal-covid-19-hoaks', 'https://news.detik.com/berita/d-5714873/minta-maaf-ustaz-royan-jelaskan-viral-5-polisi-angkat-poster-demo-jokowi', 'https://health.detik.com/berita-detikhealth/d-5714239/viral-bawang-putih-tarik-cairan-dari-paru-paru-pasien-corona-ini-faktanya', 'https://health.detik.com/berita-detikhealth/d-5699731/awas-hoax-viral-info-vaksin-palsu-beredar-di-indonesia-ini-faktanya', 'https://finance.detik.com/berita-ekonomi-bisnis/d-5688266/hoax-pesan-bantuan-subsidi-gaji-rp-35-juta-jangan-dibuka', 
'https://news.detik.com/berita-jawa-timur/d-5658878/2-sekolah-ditolak-warga-bondowoso-jadi-tempat-isolasi-satgas-tak-patah-arang', 'https://news.detik.com/berita/d-5655368/viral-video-demo-rusuh-di-jl-gajah-mada-polisi-pastikan-hoax', 'https://news.detik.com/berita/d-5755035/muncul-isu-liar-jokowi-joget-tanpa-masker-di-papua-ini-faktanya', 'https://news.detik.com/berita/d-5729500/beredar-edaran-penerima-bantuan-pesantren-kemenag-pastikan-hoax', 'https://news.detik.com/berita-jawa-timur/d-5715146/5-bersaudara-di-surabaya-butuh-diadopsi-karena-papa-mama-meninggal-covid-19-hoaks', 'https://news.detik.com/berita/d-5714873/minta-maaf-ustaz-royan-jelaskan-viral-5-polisi-angkat-poster-demo-jokowi', 'https://health.detik.com/berita-detikhealth/d-5714239/viral-bawang-putih-tarik-cairan-dari-paru-paru-pasien-corona-ini-faktanya', 'https://health.detik.com/berita-detikhealth/d-5699731/awas-hoax-viral-info-vaksin-palsu-beredar-di-indonesia-ini-faktanya', 'https://finance.detik.com/berita-ekonomi-bisnis/d-5688266/hoax-pesan-bantuan-subsidi-gaji-rp-35-juta-jangan-dibuka', 'https://news.detik.com/berita-jawa-timur/d-5658878/2-sekolah-ditolak-warga-bondowoso-jadi-tempat-isolasi-satgas-tak-patah-arang', 'https://news.detik.com/berita/d-5655368/viral-video-demo-rusuh-di-jl-gajah-mada-polisi-pastikan-hoax', 'https://news.detik.com/berita-jawa-tengah/d-5645668/heboh-ajakan-tolak-ppkm-darurat-di-pekalongan-ini-kata-polisi', 'https://news.detik.com/berita/d-5643373/heboh-tim-covid-buru-warga-tanjungpinang-langgar-ppkm-darurat-ini-faktanya', 'https://news.detik.com/berita/d-5638774/viral-rusa-keliaran-di-jalanan-denpasar-saat-ppkm-darurat-ini-faktanya', 'https://health.detik.com/berita-detikhealth/d-5635282/deretan-hoax-air-kelapa-netralkan-vaksin-hingga-obati-covid-19', 'https://news.detik.com/berita-jawa-tengah/d-5633158/beredar-pesan-ada-pasien-corona-kabur-di-kudus-ternyata', 
'https://news.detik.com/berita-jawa-tengah/d-5622194/viral-tim-sar-klaten-kewalahan-jasad-covid-belum-dimakamkan-ini-faktanya', 'https://news.detik.com/berita/d-5607406/beredar-isu-sutiyoso-meninggal-keluarga-tidak-benar', 'https://news.detik.com/berita-jawa-tengah/d-5603576/waspada-ada-akun-wa-catut-bupati-klaten-minta-sumbangan', 'https://news.detik.com/berita-jawa-tengah/d-5603472/heboh-pesan-berantai-soal-varian-baru-corona-di-kudus-ini-faktanya', 'https://news.detik.com/berita/d-5591931/beredar-poster-konvensi-capres-nu-2024-pbnu-pastikan-hoax', 'https://health.detik.com/berita-detikhealth/d-5591504/viral-hoax-makan-bawang-3-kali-sehari-sembuhkan-corona-ini-faktanya', 'https://news.detik.com/berita/d-5590632/viral-tes-antigen-pakai-air-keran-hasilnya-positif-satgas-kepri-menepis', 'https://news.detik.com/internasional/d-5586179/fakta-di-balik-aksi-penyiar-malaysia-tutup-1-mata-untuk-palestina', 'https://inet.detik.com/cyberlife/d-5585732/waspada-6-hoax-vaksin-bermagnet-hingga-china-siapkan-senjata-biologis', 'https://health.detik.com/berita-detikhealth/d-5533468/viral-jadi-sulit-ereksi-karena-vaksin-sinovac-ini-penjelasan-dokter', 'https://health.detik.com/berita-detikhealth/d-5527149/viral-cacing-di-masker-impor-dari-china-ini-fakta-di-baliknya', 'https://finance.detik.com/energi/d-5526617/viral-gaji-petugas-kebersihan-pertamina-rp-13-juta-manajemen-hoax', 'https://news.detik.com/berita-jawa-tengah/d-5519314/fakta-fakta-gibran-disebut-duduk-di-meja-menteri-pupr-duduk-di-kursi', 'https://finance.detik.com/energi/d-5511928/awas-hoax-bbm-langka-imbas-kilang-kebakaran-pertamina-stok-luber', 'https://news.detik.com/berita-jawa-tengah/d-5511550/viral-gibran-duduk-di-atas-meja-depan-menteri-basuki-begini-faktanya', 'https://news.detik.com/berita/d-5507088/geger-kaca-bus-transmetro-deli-medan-diduga-ditembak-begini-faktanya', 'https://health.detik.com/berita-detikhealth/d-5487986/viral-lansia-non-dki-bisa-vaksin-corona-di-senayan-dipastikan-hoax', 
'https://finance.detik.com/berita-ekonomi-bisnis/d-5487983/awas-hoax-pesan-berantai-soal-vaksinasi-lansia-di-istora-senayan', 'https://health.detik.com/berita-detikhealth/d-5480124/hoax-tak-ada-larangan-minum-obat-jantung-sebelum-vaksin-covid-19', 'https://health.detik.com/berita-detikhealth/d-5473657/hoax-kemenkes-bantah-puluhan-wartawan-terkapar-setelah-vaksinasi-covid-19', 'https://health.detik.com/berita-detikhealth/d-5368305/minum-air-putih-bisa-atasi-kekentalan-darah-pasien-covid-19-ini-faktanya', 'https://health.detik.com/berita-detikhealth/d-5360703/viral-info-penemu-vaksin-covid-19-sinovac-meninggal-ini-faktanya', 'https://health.detik.com/berita-detikhealth/d-5357602/pasien-jalan-ngangkang-seperti-penguin-disebut-karena-anal-swab-ini-faktanya', 'https://finance.detik.com/moneter/d-5351004/kabar-bi-di-lockdown-bank-internasional-swiss-dipastikan-hoax', 'https://finance.detik.com/berita-ekonomi-bisnis/d-5350942/hoax-jangan-percaya-pesan-berantai-dana-bagi-bagi-uang-tunai', 'https://health.detik.com/berita-detikhealth/d-5340874/sederet-hoax-vaksin-jokowi-disebut-salah-suntik-hingga-tak-sampai-habis', 'https://health.detik.com/berita-detikhealth/d-5338133/hoax-viral-kasdim-0817-gresik-wafat-usai-vaksin-covid-19-ini-faktanya', 'https://health.detik.com/berita-detikhealth/d-5337075/viral-urutan-mandi-agar-tak-kena-stroke-ini-faktanya', 'https://news.detik.com/berita/d-5328895/foto-bayi-selamat-dari-sriwijaya-air-sj182-dipastikan-hoax', 'https://health.detik.com/berita-detikhealth/d-5324630/viral-vaksin-covid-19-memperbesar-penis-bpom-hoax-lah', 'https://news.detik.com/berita-jawa-timur/d-5321500/wawali-surabaya-terpilih-armuji-dikabarkan-meninggal-ketua-dprd-hoaks', 'https://news.detik.com/berita/d-5287986/beredar-chat-kapolda-metro-soal-sikat-laskar-hrs-dipastikan-hoax', 'https://news.detik.com/berita/d-5286913/video-ambulans-fpi-masuk-rs-saat-ricuh-diviralkan-ini-faktanya', 
'https://news.detik.com/berita-jawa-tengah/d-5280091/viral-bendung-gerak-serayu-jebol-kepala-upt-itu-kapal-ponton-hanyut', 'https://news.detik.com/berita-jawa-tengah/d-5279872/viral-asrama-isolasi-mandiri-ugm-penuh-ternyata-begini-faktanya', 'https://news.detik.com/berita/d-5275107/kpu-makassar-bantah-keluarkan-flyer-hasil-survei-paslon-pilwalkot-berlogo-kpu', 'https://news.detik.com/berita-jawa-tengah/d-5264429/beredar-voice-note-binatang-buas-gunung-merapi-turun-ke-selo-kades-hoax', 'https://news.detik.com/berita-jawa-tengah/d-5262931/viral-peta-bahaya-gunung-merapi-sejauh-10-km-bpptkg-itu-peta-2010', 'https://health.detik.com/berita-detikhealth/d-5254580/viral-tips-sembuhkan-covid-19-dalam-waktu-5-menit-dokter-paru-pastikan-hoax', 'https://news.detik.com/berita-jawa-timur/d-5253524/video-jenazah-covid-19-diviralkan-bola-mata-hilang-keluarga-sebut-hoaks', 'https://news.detik.com/berita/d-5287986/beredar-chat-kapolda-metro-soal-sikat-laskar-hrs-dipastikan-hoax', 'https://news.detik.com/berita/d-5286913/video-ambulans-fpi-masuk-rs-saat-ricuh-diviralkan-ini-faktanya', 'https://news.detik.com/berita-jawa-tengah/d-5280091/viral-bendung-gerak-serayu-jebol-kepala-upt-itu-kapal-ponton-hanyut', 'https://news.detik.com/berita-jawa-tengah/d-5279872/viral-asrama-isolasi-mandiri-ugm-penuh-ternyata-begini-faktanya', 'https://news.detik.com/berita/d-5275107/kpu-makassar-bantah-keluarkan-flyer-hasil-survei-paslon-pilwalkot-berlogo-kpu', 'https://news.detik.com/berita-jawa-tengah/d-5264429/beredar-voice-note-binatang-buas-gunung-merapi-turun-ke-selo-kades-hoax', 'https://news.detik.com/berita-jawa-tengah/d-5262931/viral-peta-bahaya-gunung-merapi-sejauh-10-km-bpptkg-itu-peta-2010', 'https://health.detik.com/berita-detikhealth/d-5254580/viral-tips-sembuhkan-covid-19-dalam-waktu-5-menit-dokter-paru-pastikan-hoax', 'https://news.detik.com/berita-jawa-timur/d-5253524/video-jenazah-covid-19-diviralkan-bola-mata-hilang-keluarga-sebut-hoaks', 
'https://news.detik.com/berita/d-3124615/benarkah-sesuap-lele-mengandung-3000-sel-kanker', 'https://news.detik.com/berita/d-3124915/loket-tiket-konser-bon-jovi-di-gbk-dibakar-hoax']

Related

Python Selenium - Extract all URL's In Table and iterate until next button disappears

I am trying to extract all URL's and iterate where the next button is pressed until there isn't a next button. I would then like to open each URL if that is possible. Could I be pointed in the right direction for this please.
The website where you need to press the search button is here
Link to Table of URL's that need to be extracted
from selenium import webdriver
from selenium.webdriver.common.by import By
# Start Chrome through a chromedriver binary at a fixed local path.
driver=webdriver.Chrome(executable_path=r"C:\Users\matt_\Documents\Python Scripts\Selenium\chromedriver.exe")
# Open the monthly-list search page.
driver.get("https://publicaccess.aberdeencity.gov.uk/online-applications/search.do?action=monthlyList")
# Click an <input> located by an absolute XPath (presumably the search button mentioned above).
driver.find_element_by_xpath("/html/body/div/div/div[3]/div[3]/div/form/fieldset/div[5]/input[2]").click()
# Collect every <a> element on the page that is now loaded.
test = driver.find_elements(By.TAG_NAME,"a")
# Prints the list of element objects themselves, not their href values.
print(test)
Here is an example of what you are looking for:
from bs4 import BeautifulSoup as Soup
from selenium import webdriver
import pandas as pd
import time
driver = webdriver.Chrome()
driver.get("https://monerobenchmarks.info/")
final_list = []


def parsh_table():
    """Append the text of every table cell on the current page to ``final_list``."""
    # Re-parse on every call: a snapshot taken once at start-up would keep
    # yielding the first page's rows after next_bu() has moved on.
    page = Soup(driver.page_source, features='html.parser')
    table = page.find('table')
    if table is None:
        # Nothing to collect if the page has no table.
        return
    for tr in table.find_all('tr'):
        final_list.extend(td.text for td in tr.find_all('td'))


def next_bu():
    """Click the element whose id is ``cpu_next``."""
    driver.find_element_by_xpath('//*[@id="cpu_next"]').click()


# put range of pages
for _ in range(1, 2):
    parsh_table()
    time.sleep(2)
    next_bu()
print(final_list)
You can check the element exists or not with simple logic like this:
if len(driver.find_elements_by_css_selector('.next')) > 0:
Try the below code:
# Run the search, then print every result link page by page until the
# ".next" control is no longer present.
driver.get('https://publicaccess.aberdeencity.gov.uk/online-applications/search.do?action=monthlyList')
search_btn = WebDriverWait(driver, 20).until(
    EC.element_to_be_clickable((By.CSS_SELECTOR, '.button.primary')))
search_btn.click()
while True:
    links = WebDriverWait(driver, 20).until(
        EC.visibility_of_all_elements_located((By.CSS_SELECTOR, 'li.searchresult a')))
    for link in links:
        print(link.get_attribute('href'))
    next_buttons = driver.find_elements_by_css_selector('.next')
    if not next_buttons:
        # No ".next" element means this was the last page.
        break
    next_buttons[0].click()
driver.quit()
Following import:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
Here you go
from selenium import webdriver
driver = webdriver.Chrome(executable_path=r"C:\Users\matt_\Documents\Python Scripts\Selenium\chromedriver.exe")
driver.get("https://publicaccess.aberdeencity.gov.uk/online-applications/search.do?action=monthlyList")
driver.find_element_by_css_selector("input[value='Search']").click()


def parse():
    """Print the text and href of every result link, following "next" to the end.

    Iterates instead of recursing so that a long result set cannot exhaust
    the call stack, and stops only when the ``next`` element is missing.
    """
    from selenium.common.exceptions import NoSuchElementException

    while True:
        links = driver.find_elements_by_xpath('//*[@id="searchresults"]/li/a')
        for link in links:
            print(link.text, link.get_attribute("href"))
        try:
            driver.find_element_by_class_name('next').click()
        except NoSuchElementException:
            # No element with class "next": the last page has been printed.
            print('complete')
            return


parse()

Python - Selenium next page

I am trying to make a scraping application to scrape Hants.gov.uk and right now I am working on it just clicking the pages instead of scraping. When it gets to the last row on page 1 it just stopped, so what I did was make it click button "Next Page" but first it has to go back to the original URL. It clicks page 2, but after page 2 is scraped it doesn't go to page 3, it just restarts page 2.
Can somebody help me fix this issue?
Code:
import time
import config # Don't worry about this. This is an external file to make a DB
import urllib.request
from bs4 import BeautifulSoup
from selenium import webdriver
# Question code: the asker's logic is reproduced as posted; only the block
# structure lost in extraction has been restored.
url = "https://planning.hants.gov.uk/SearchResults.aspx?RecentDecisions=True"
driver = webdriver.Chrome(executable_path=r"C:\Users\Goten\Desktop\chromedriver.exe")
driver.get(url)
driver.find_element_by_id("mainContentPlaceHolder_btnAccept").click()


def start():
    """Collect the result links on the current page.

    A link is opened and parsed only when it is already in ``result``,
    i.e. when the same href appears a second time on the page.
    """
    elements = driver.find_elements_by_css_selector(".searchResult a")
    links = [link.get_attribute("href") for link in elements]
    result = []
    for link in links:
        if link not in result:
            result.append(link)
        else:
            driver.get(link)
            goUrl = urllib.request.urlopen(link)
            soup = BeautifulSoup(goUrl.read(), "html.parser")
            #table = soup.find_element_by_id("table", {"class": "applicationDetails"})
            for i in range(20):
                pass # Don't worry about all this commented code, it isn't relevant right now
                #table = soup.find_element_by_id("table", {"class": "applicationDetails"})
                #print(table.text)
                # div = soup.select("div.applicationDetails")
                # getDiv = div[i].split(":")[1].get_text()
                # log = open("log.txt", "a")
                # log.write(getDiv + "\n")
            #log.write("\n")


start()
driver.get(url)
for i in range(5):
    driver.find_element_by_id("ctl00_mainContentPlaceHolder_lvResults_bottomPager_ctl02_NextButton").click()
    # NOTE(review): these two url lines are what the question is about; the
    # last answer on this thread explains why they reset the pager.
    url = driver.current_url
    start()
    driver.get(url)
driver.close()
try this:
import time
# import config # Don't worry about this. This is an external file to make a DB
import urllib.request
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
url = "https://planning.hants.gov.uk/SearchResults.aspx?RecentDecisions=True"
driver = webdriver.Chrome()
driver.get(url)
driver.find_element_by_id("mainContentPlaceHolder_btnAccept").click()
result = []


def start():
    """Append the href of every search-result link on the current page to ``result``."""
    elements = driver.find_elements_by_css_selector(".searchResult a")
    result.extend(link.get_attribute("href") for link in elements)


def start2():
    """Open every collected link and parse its HTML."""
    for link in result:
        # if link not in result:
        # result.append(link)
        # else:
        driver.get(link)
        # Close the HTTP response as soon as its body has been read.
        with urllib.request.urlopen(link) as goUrl:
            soup = BeautifulSoup(goUrl.read(), "html.parser")
        #table = soup.find_element_by_id("table", {"class": "applicationDetails"})
        for i in range(20):
            pass # Don't worry about all this commented code, it isn't relevant right now
            #table = soup.find_element_by_id("table", {"class": "applicationDetails"})
            #print(table.text)
            # div = soup.select("div.applicationDetails")
            # getDiv = div[i].split(":")[1].get_text()
            # log = open("log.txt", "a")
            # log.write(getDiv + "\n")
        #log.write("\n")


# First pass: page through the results collecting links. Second pass: visit them.
while True:
    start()
    try:
        # Looked up inside the try so a missing pager ends the loop cleanly.
        element = driver.find_element_by_class_name('rdpPageNext')
        # An onclick of "return false;" presumably marks the pager as
        # disabled on the last page; stop there.
        if element.get_attribute('onclick') == "return false;":
            break
        element.click()
    except WebDriverException:
        break
print(result)
start2()
driver.get(url)
As per the url https://planning.hants.gov.uk/SearchResults.aspx?RecentDecisions=True to click through all the pages you can use the following solution:
Code Block:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
options = Options()
options.add_argument("start-maximized")
options.add_argument("disable-infobars")
options.add_argument("--disable-extensions")
driver = webdriver.Chrome(chrome_options=options, executable_path=r'C:\Utility\BrowserDrivers\chromedriver.exe')
driver.get('https://planning.hants.gov.uk/SearchResults.aspx?RecentDecisions=True')
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.ID, "mainContentPlaceHolder_btnAccept"))).click()
# Count the numbered links in the top pager to know how many pages to visit.
numLinks = len(WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located(
    (By.CSS_SELECTOR, "div#ctl00_mainContentPlaceHolder_lvResults_topPager div.rdpWrap.rdpNumPart>a"))))
print(numLinks)
for i in range(numLinks):
    print("Perform your scrapping here on page {}".format(str(i+1)))
    # Click the first <span> that follows the current page's <span> in the pager.
    WebDriverWait(driver, 20).until(EC.element_to_be_clickable(
        (By.XPATH, "//div[@id='ctl00_mainContentPlaceHolder_lvResults_topPager']//div[@class='rdpWrap rdpNumPart']//a[@class='rdpCurrentPage']/span//following::span[1]"))).click()
driver.quit()
Console Output:
8
Perform your scrapping here on page 1
Perform your scrapping here on page 2
Perform your scrapping here on page 3
Perform your scrapping here on page 4
Perform your scrapping here on page 5
Perform your scrapping here on page 6
Perform your scrapping here on page 7
Perform your scrapping here on page 8
Hi @Feitan Portor, your code is almost right. The only reason you keep being sent back to the same page is the `url = driver.current_url` assignment in the last for loop: the URL stays static, and only JavaScript triggers the next-page click event. So just remove `url = driver.current_url` and `driver.get(url)`
and you are good to go; I have tested this myself.
Also, to see which page your scraper is currently on, add this inside the for loop:
ss = driver.find_element_by_class_name('rdpCurrentPage').text
print(ss)
Hope this solves your confusion

Parsing a site where URL doesn't change with Selenium Python

I'm trying to scrape [this][1] site; its URL doesn't change when the next page is clicked. So I used Selenium to click on the next page, but that doesn't help: my driver keeps getting the old page even after the next page is clicked. Is there any other way to get to the next page and scrape it?
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
# Question code: the asker's logic is reproduced as posted. Only the XPath
# attribute selector (``@id``) and the block structure have been restored.
# NOTE(review): nesting is reconstructed from flattened text -- confirm.
driver = webdriver.Safari()
store_pages = []
#10306 is total number of pages.
for i in range (10306):
    Starting_url = 'site'
    # The start URL is reloaded on every iteration.
    driver.get(Starting_url)
    html = driver.page_source
    soup = BeautifulSoup(html, "lxml")
    print (store_pages.append(i))
    timeout = 20
    try:
        # NOTE(review): ``By`` is not among the imports shown with this snippet.
        WebDriverWait(driver, timeout).until(EC.visibility_of_element_located((By.XPATH, "//*[@id='ctl00_SPWebPartManager1_g_d6877ff2_42a8_4804_8802_6d49230dae8a_lblDisclaimerMsg']")))
    except TimeoutException:
        print("Timed out waiting for page to load")
        driver.quit()
    nextpage_url = driver.find_element_by_name("ctl00$SPWebPartManager1$g_d6877ff2_42a8_4804_8802_6d49230dae8a$ctl00$imgbtnNext").click()
    timeout = 20
    wait = WebDriverWait(driver, 10).until(EC.text_to_be_present_in_element_value((By.CSS_SELECTOR, '#ctl00_SPWebPartManager1_g_d6877ff2_42a8_4804_8802_6d49230dae8a > div.act_search_results > div.act_search_header'), "206113 Record(s) | Page [2 of 10306]"))
    NGO_element = driver.find_element_by_class_name("faq-sub-content exempted-result")
    NGO_name = NGO_element.find_elements_by_tag_name("h1")
    NGO_name_pancard = driver.find_elements_by_class_name("pan-id")
    NGO_data = NGO_element.find_elements_by_tag_name("ul")
    NGO_sub_data = NGO_element.find_elements_by_tag_name("li")
    for i, p, t in zip(NGO_name, NGO_name_pancard, NGO_data):
        n_name = i.text.replace(p.text, '')
        n_data = t.text
        n_pan = p.text
        print ("Name of NGO:", n_name, "Fields of NGO:", n_data, "Pancard number:", n_pan)
    nextpage_url = driver.find_element_by_name("ctl00$SPWebPartManager1$g_d6877ff2_42a8_4804_8802_6d49230dae8a$ctl00$imgbtnNext").click()
    #timeout = 2
You need to make sure that, when you reach the next page, the content of the earlier page has become stale; otherwise you will get a stale-element error or read the same content repeatedly. Try the approach below; it should get you there. The rest you can modify yourself.
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Chrome()
wait = WebDriverWait(driver, 10)
driver.get("http://www.incometaxindia.gov.in/Pages/utilities/exempted-institutions.aspx")
while True:
    elems = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "[id^='arrowex']")))
    for elem in elems:
        print(elem.text)
    try:
        wait.until(EC.presence_of_element_located((By.ID, "ctl00_SPWebPartManager1_g_d6877ff2_42a8_4804_8802_6d49230dae8a_ctl00_imgbtnNext"))).click()
        # Block until the old page's last element is detached, so the next
        # iteration cannot re-read the page that was just printed.
        wait.until(EC.staleness_of(elems[-1]))
    except WebDriverException:
        # Missing "next" button or content that never refreshes: stop paging.
        break
driver.quit()

Web scraping using selenium

My intention is to get the name, location, time of posting, title of the review and the whole review content from the web page (http://www.mouthshut.com/mobile-operators/Reliance-Jio-reviews-925812061).
My code :
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
# Question code: the asker's logic is reproduced as posted; only the block
# structure lost in extraction has been restored.
firefox_capabilities = DesiredCapabilities.FIREFOX
firefox_capabilities['marionette'] = True
firefox_capabilities['binary'] = '/etc/firefox'
# Pass 1: reviewer name and location from the ".profile" blocks.
driver = webdriver.Firefox(capabilities=firefox_capabilities)
driver.get('http://www.mouthshut.com/mobile-operators/Reliance-Jio-reviews-925812061')
soup = BeautifulSoup(driver.page_source,"lxml")
for link in soup.select(".profile"):
    try:
        profile = link.select("p:nth-of-type(1) a")[0]
        profile1 = link.select("p:nth-of-type(2)")[0]
    except:pass
    print(profile.text,profile1.text)
# Pass 2: a second browser reloads the same page for the review header fields.
driver = webdriver.Firefox(capabilities=firefox_capabilities)
driver.get('http://www.mouthshut.com/mobile-operators/Reliance-Jio-reviews-925812061')
soup1 = BeautifulSoup(driver.page_source,"lxml")
for link in soup1.select(".col-10.review"):
    try:
        profile2 = link.select("small:nth-of-type(1)")[0]
        profile3 = link.select("span:nth-of-type(3)")[0]
        profile4 = link.select("a:nth-of-type(1)")[0]
    except:pass
    print(profile2.text,profile3.text,profile4.text)
# Pass 3: a third browser reloads the page again for the review body.
driver = webdriver.Firefox(capabilities=firefox_capabilities)
driver.get('http://www.mouthshut.com/mobile-operators/Reliance-Jio-reviews-925812061')
soup2 = BeautifulSoup(driver.page_source,"lxml")
for link in soup2.select(".more.review"):
    try:
        # NOTE(review): ``page_soup`` is not defined anywhere in this snippet.
        containers=page_soup.findAll("div",{"class":"more reviewdata"})
        count=len(containers)
        for index in range(count):
            count1=len(containers[index].p)
            for i in range(count1):
                profile5 = link.select("p:nth-of-type(i)")[0]
    except:pass
    print(profile5.text)
driver.quit()
I am getting the output for the name, location, time and title of the review, but I am unable to get the full review text of a user. I would be grateful if anyone could help me get that output and also optimize my code, i.e. I want my code to extract the required data by loading the web page only once. It would also be very helpful if someone could help me extract all the customer reviews of Jio from all the pages of the website.
You can achieve the same with a few lines of code and less pain. I've defined three main fields here (name, review_title and review_data); the rest of the fields you can add very easily by tweaking the selectors.
This is how you can do alternatively:
from selenium import webdriver;import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Expand each review in the live page, then read its fields through Selenium.
driver = webdriver.Chrome()
driver.get("http://www.mouthshut.com/mobile-operators/Reliance-Jio-reviews-925812061")
wait = WebDriverWait(driver, 10)
review_cards = wait.until(
    EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".review-article")))
for item in review_cards:
    # Click the link inside the review body, then give the page time to update.
    link = item.find_element_by_css_selector(".reviewdata a")
    link.click()
    time.sleep(2)
    name = item.find_element_by_css_selector("p a").text
    review_title = item.find_element_by_css_selector("strong a[id^=ctl00_ctl00_ContentPlaceHolderFooter_ContentPlaceHolderBody_rptreviews]").text
    # Collapse every run of whitespace to a single space, block by block.
    chunks = []
    for block in item.find_elements_by_css_selector(".reviewdata"):
        chunks.append(' '.join(block.text.split()))
    review_data = ' '.join(chunks)
    print("Name: {}\nReview_Title: {}\nReview_Data: {}\n".format(name, review_title, review_data))
driver.quit()
Or to do the same combinedly (selenium + bs4):
from bs4 import BeautifulSoup
from selenium import webdriver;import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Step 1 (Selenium): click the link inside every review body.
driver = webdriver.Chrome()
driver.get("http://www.mouthshut.com/mobile-operators/Reliance-Jio-reviews-925812061")
wait = WebDriverWait(driver, 10)
review_cards = wait.until(
    EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".review-article")))
for items in review_cards:
    link = items.find_element_by_css_selector(".reviewdata a")
    link.click()
    time.sleep(2)
# Step 2 (BeautifulSoup): parse the page source once and read each review.
soup = BeautifulSoup(driver.page_source,"lxml")
for item in soup.select(".review-article"):
    name = item.select("p a")[0].text
    review_title = item.select("strong a[id^=ctl00_ctl00_ContentPlaceHolderFooter_ContentPlaceHolderBody_rptreviews]")[0].text
    # Collapse every run of whitespace to a single space, block by block.
    chunks = []
    for block in item.select(".reviewdata"):
        chunks.append(' '.join(block.text.split()))
    review_data = ' '.join(chunks)
    print("Name: {}\nReview_Title: {}\nReview_Data: {}\n".format(name, review_title, review_data))
driver.quit()

How to wait until element is available in selenium python

I am writing a script using Selenium with Python, but I have run into a problem and could not find a solution that helped me. Here is the code:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import unittest
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
class sulekhastart(unittest.TestCase):
    """Page through the Sulekha AC-dealer listing, printing each business link.

    Question code: the asker's logic is kept as posted. Only the block
    structure and the XPath attribute selectors (``@class`` / ``@id``) are
    restored, and ``print`` is written so it is valid on Python 2 and 3.
    """

    def setUp(self):
        self.driver = webdriver.Firefox()

    def test_parse_contact_urls_and_go_to_next_page(self):
        pagenumber = 'Page'
        # assign WEBDRIVER to local webdriver
        driver = self.driver
        # Website open by below url
        driver.get("http://www.sulekha.com/ac-dealers/bangalore")
        self.assertIn("Sulekha Bangalore", driver.title)
        # close the lightbox that appears at the first load of the page
        startlightbox = driver.find_element_by_xpath('//a[@class="lcf-close"]')
        startlightbox.click()
        while True:
            # get the page number
            pageno = driver.find_element_by_xpath('//li[@id="numberPage"]/strong')
            print(pageno.text)
            print(pagenumber)
            # check if page same as last page or not
            if str(pageno.text) != pagenumber:
                pagenumber = str(pageno.text)
                businessname = driver.find_elements_by_xpath('//li/div/div[@class="busi-name"]/h3/a')
                records = len(businessname)
                # print all data that are available on the webpage
                for i in range(0,records):
                    print(businessname[i].get_attribute('href'))
                    print(businessname[i].text)
                nextpage = driver.find_element_by_xpath('//li[@id="nextPage"]')
                nextpage.click()
            else:
                print('This is last page all data is scraped change url and get another data')
                break
            # Wait until the element at this XPath is no longer present before
            # reading the next page.
            element = WebDriverWait(driver, 10).until_not(EC.presence_of_element_located((By.XPATH, "/html/body/div/div/svg")))

    def tearDown(self):
        self.driver.close()
        print('page not be closed')


if __name__ == "__main__":
    unittest.main()
After clicking the next button, I want the script to wait until the element at By.XPATH, "/html/body/div/div/svg" is gone from the DOM (page source), and then wait a further 3 seconds.
as andersson commented
replacing
element = WebDriverWait(driver, 10).until_not(
EC.presence_of_element_located((
By.XPATH, "/html/body/div/div/svg")))
with
element = WebDriverWait(driver, 10).until_not(
EC.presence_of_element_located((
By.XPATH, "/html/body/div/div/*[name()='svg']")))
solves the problem

Categories