I've been trying to scrape the contents of a table with Selenium and Beautiful Soup, but I can't find a good way to loop through the table's pages given how the HTML is written: there is no "next" button, and the currently selected page button just gets the active class.
This is the code I have so far:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup as bs
import pandas as pd
path_driver = "C:/Users/CS330584/Documents/Documentos de Defesa da Concorrência/Automatização de Processos/chromedriver.exe"
website = "https://sat.sef.sc.gov.br/tax.NET/Sat.Dva.Web/ConsultaPublicaDevedores.aspx"
value_search = "300"
driver = webdriver.Chrome(path_driver)
driver.get(website)
search_max = driver.find_element_by_id("Body_Main_Main_ctl00_txtTotalDevedores")
search_max.send_keys(value_search)
btn_consult = driver.find_element_by_id("Body_Main_Main_ctl00_btnBuscar")
btn_consult.click()
driver.implicitly_wait(10)
i = 1
while True:
    try:
        # some wait
        driver.find_element_by_xpath("//*[@id='Body_Main_Main_grpDevedores_gridView']/tbody/tr[51]/td/ul/li' and .='{}']".format(str(i + 1))).click()
    except:
        break
How can I effectively (or even not so effectively) loop through these table pages in order to scrape the data?
The buttons for the next pages run JavaScript code
javascript:GridView_ScrollToTop('Body_Main_Main_grpDevedores_gridView');__doPostBack('ctl00$ctl00$ctl00$Body$Main$Main$grpDevedores$gridView','Page$1')
and you can execute the same script yourself to change pages.
You only have to update the number in Page$1, e.g. using an f-string:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
import math
path_driver = "C:/Users/CS330584/Documents/Documentos de Defesa da Concorrência/Automatização de Processos/chromedriver.exe"
website = "https://sat.sef.sc.gov.br/tax.NET/Sat.Dva.Web/ConsultaPublicaDevedores.aspx"
value_search = 300
#driver = webdriver.Chrome(path_driver)
driver = webdriver.Firefox()
driver.get(website)
search_max = driver.find_element_by_id("Body_Main_Main_ctl00_txtTotalDevedores")
search_max.send_keys(value_search)
btn_consult = driver.find_element_by_id("Body_Main_Main_ctl00_btnBuscar")
btn_consult.click()
driver.implicitly_wait(10)
pages = math.ceil(value_search/50)
print('pages:', pages)
for i in range(2, pages+1):
    try:
        time.sleep(2)
        driver.execute_script(f"javascript:GridView_ScrollToTop('Body_Main_Main_grpDevedores_gridView');__doPostBack('ctl00$ctl00$ctl00$Body$Main$Main$grpDevedores$gridView','Page${i}')")
    except Exception as ex:
        print(ex)
        break
You can also get all the page links and use i as an index - you have to add 1 to skip the << link:
for i in range(2, pages+1):
    try:
        time.sleep(2)
        all_links = driver.find_elements_by_xpath('//tr[@class="sat-gv-pagination-row"]//li//a')
        all_links[i+1].click()
    except Exception as ex:
        print(ex)
        break
Or you can use an f-string to create an XPath with li[{i+1}]:
for i in range(2, pages+1):
    try:
        time.sleep(2)
        next_link = driver.find_element_by_xpath(f'//tr[@class="sat-gv-pagination-row"]//li[{i+1}]//a')
        next_link.click()
    except Exception as ex:
        print(ex)
        break
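Whichever variant you use, clicking only changes the page; you still have to read the grid after each page change. Below is a minimal sketch of how that could be combined with the first variant, assuming pandas is installed and reusing the grid id from the question (scrape_current_page is a hypothetical helper; the exact columns depend on what the site returns, and the pagination row may need to be dropped from the parsed table):
import time
import pandas as pd
from bs4 import BeautifulSoup

all_pages = []

def scrape_current_page():
    # Parse the rendered HTML and append the grid's rows as a DataFrame.
    soup = BeautifulSoup(driver.page_source, "html.parser")
    table = soup.find("table", id="Body_Main_Main_grpDevedores_gridView")
    if table is not None:
        # read_html returns a list of DataFrames; the grid is the first one.
        all_pages.append(pd.read_html(str(table))[0])

scrape_current_page()  # page 1 is already loaded
for i in range(2, pages + 1):
    driver.execute_script(f"javascript:GridView_ScrollToTop('Body_Main_Main_grpDevedores_gridView');__doPostBack('ctl00$ctl00$ctl00$Body$Main$Main$grpDevedores$gridView','Page${i}')")
    time.sleep(2)  # crude wait for the postback to finish; an explicit wait would be better
    scrape_current_page()

df = pd.concat(all_pages, ignore_index=True)
print(df.shape)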
Related
I am using Python and Selenium to scrape all the reviews of a particular hotel on TripAdvisor, and I am new to scraping. Currently it only scrapes the reviews from the first 6 pages out of 36. I need to scrape the reviews from all the pages of that hotel and save them into a CSV file. The following is the code I'm using.
import csv
import time
import requests
import re
from selenium.common.exceptions import NoSuchElementException
from selenium import webdriver
driver = webdriver.Chrome("./chromedriver")
def check_exists_by_xpath(xpath):
    try:
        driver.find_element_by_xpath(xpath)
    except NoSuchElementException:
        return False
    return True
time.sleep(2)
def getHotelReviews():
    # Find and click the More link (to load all reviews)
    driver.find_element_by_xpath("//span[@class='_33O9dg0j']").click()
    time.sleep(20)
    reviews = driver.find_elements_by_xpath("//div[@data-test-target='reviews-tab']/div")
    reviews_count = len(reviews)
    print(reviews_count)
    # Loop through the reviews found
    for i in range(2, reviews_count):
        try:
            if check_exists_by_xpath(".//div[contains(@class,'_2f_ruteS _1bona3Pu _2uD5bLZZ')]/div[2]/div/span[1]"):
                moreBtn = reviews[i].find_element_by_xpath(
                    ".//div[contains(@class,'_2f_ruteS _1bona3Pu _2uD5bLZZ')]/div[2]/div/span[1]").click()
                time.sleep(20)
            if check_exists_by_xpath(".//div[contains(@class,'_2f_ruteS _1bona3Pu')]/div/q/span"):
                review = reviews[i].find_element_by_xpath(
                    ".//div[contains(@class,'_2f_ruteS _1bona3Pu')]/div/q/span").text
                print(review)
            date = reviews[i].find_element_by_xpath(".//span[contains(@class,'_34Xs-BQm')]").text
            print(date)
            title = reviews[i].find_element_by_xpath(".//div[contains(@class,'glasR4aX')]/a/span").text
            print(title)
            # Save to CSV
            csvWriter.writerow((date, title, review))
        except:
            break
    driver.close()
    driver.switch_to.window(driver.window_handles[0])
def getHotelPages(url):
    driver.get(url)
    # to maximize the driver
    driver.maximize_window()
    nextPage = driver.find_elements_by_xpath("//a[contains(@class,'pageNum cx_brand_refresh_phase2 ')]")
    noOfPages = len(nextPage)
    print(noOfPages)
    for i in range(noOfPages):
        print(nextPage[i].get_attribute("href"))
        URLs.append(nextPage[i].get_attribute("href"))
URLs = [
'https://www.tripadvisor.com/Hotel_Review-g304141-d3895228-Reviews-The_Hideout_Sigiriya-Sigiriya_Central_Province.html#REVIEWS']
# Prepare CSV file
csvFile = open("hideoutSigiriyab_reviews1.csv", "w", newline='', encoding="utf-8")
csvWriter = csv.writer(csvFile)
csvWriter.writerow(['Date', 'Title', 'Review'])
try:
    getHotelPages(URLs[0])
except:
    print("Error!!")
    time.sleep(60)
for url in URLs:
    driver.execute_script("window.open('');")
    driver.switch_to.window(driver.window_handles[1])
    driver.get(url)
    getHotelReviews()
    time.sleep(20)
csvFile.close()
driver.close()
Can you help me by suggesting a method or working code to scrape the reviews from all the pages of a hotel?
A simple way to click through pages 1-36:
size=int(driver.find_element_by_css_selector('div.pageNumbers >a:nth-last-child(1)').text)
for i in range(2,size):
    pageNums = WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.CSS_SELECTOR, "div.pageNumbers")))
    pageNums.find_element_by_xpath("//a[text()='{}']".format(i)).click()
    time.sleep(5)
Imports:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
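To also collect the reviews while paging, one option (a rough sketch, assuming getHotelReviews() is refactored so that it only scrapes the currently loaded page and no longer calls driver.close() at the end) is to call it once per iteration, using size + 1 so the last page is included:
getHotelReviews()  # reviews on page 1
size = int(driver.find_element_by_css_selector('div.pageNumbers > a:nth-last-child(1)').text)
for i in range(2, size + 1):
    pageNums = WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.CSS_SELECTOR, "div.pageNumbers")))
    pageNums.find_element_by_xpath("//a[text()='{}']".format(i)).click()
    time.sleep(5)
    getHotelReviews()  # reviews on page i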
I'm scraping an e-commerce website, Lazada, using Selenium and bs4. I managed to scrape the first page, but I am unable to iterate to the next page. What I'm trying to achieve is to scrape all the pages for the categories I've selected.
Here is what I've tried:
# Run the argument with incognito
option = webdriver.ChromeOptions()
option.add_argument('--incognito')
driver = webdriver.Chrome(executable_path='chromedriver', chrome_options=option)
driver.get('https://www.lazada.com.my/')
driver.maximize_window()
# Select category item #
element = driver.find_elements_by_class_name('card-categories-li-content')[0]
webdriver.ActionChains(driver).move_to_element(element).click(element).perform()
t = 10
try:
    WebDriverWait(driver, t).until(EC.visibility_of_element_located((By.ID, "a2o4k.searchlistcategory.0.i0.460b6883jV3Y0q")))
except TimeoutException:
    print('Page Refresh!')
    driver.refresh()
    element = driver.find_elements_by_class_name('card-categories-li-content')[0]
    webdriver.ActionChains(driver).move_to_element(element).click(element).perform()
    print('Page Load!')
#Soup and select element
def getData(np):
    soup = bs(driver.page_source, "lxml")
    product_containers = soup.findAll("div", class_='c2prKC')
    for p in product_containers:
        title = (p.find(class_='c16H9d').text)  # title
        selling_price = (p.find(class_='c13VH6').text)  # selling price
        try:
            original_price = (p.find("del", class_='c13VH6').text)  # original price
        except:
            original_price = "-1"
        if p.find("i", class_='ic-dynamic-badge ic-dynamic-badge-freeShipping ic-dynamic-group-2'):
            freeShipping = 1
        else:
            freeShipping = 0
        try:
            discount = (p.find("span", class_='c1hkC1').text)
        except:
            discount = "-1"
        if p.find(("div", {'class': ['c16H9d']})):
            url = "https:" + (p.find("a").get("href"))
        else:
            url = "-1"
        nextpage_elements = driver.find_elements_by_class_name('ant-pagination-next')[0]
        np = webdriver.ActionChains(driver).move_to_element(nextpage_elements).click(nextpage_elements).perform()
        print("- -"*30)
        toSave = [title, selling_price, original_price, freeShipping, discount, url]
        print(toSave)
        writerows(toSave, filename)
getData(np)
The problem might be that the driver is trying to click the button before the element is even loaded correctly.
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Chrome(PATH, chrome_options=option)
# use this code after driver initialization
# this makes the driver wait up to 5 seconds when looking up elements.
driver.implicitly_wait(5)
url = "https://www.lazada.com.ph/catalog/?q=phone&_keyori=ss&from=input&spm=a2o4l.home.search.go.239e359dTYxZXo"
driver.get(url)
next_page_path = "//ul[@class='ant-pagination ']//li[@class=' ant-pagination-next']"
# the following code will wait 5 seconds for
# element to become clickable
# and then try clicking the element.
try:
    next_page = WebDriverWait(driver, 5).until(
        EC.element_to_be_clickable((By.XPATH, next_page_path)))
    next_page.click()
except Exception as e:
    print(e)
EDIT 1
Changed the code to make the driver wait for the element to become clickable. You can put this code inside a while loop to iterate over multiple pages, and break out of the loop when the button is not found or not clickable, as sketched below.
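For illustration, the loop could look roughly like this (scrape_current_page is a hypothetical stand-in for your own parsing code, e.g. the bs4 logic from getData):
while True:
    scrape_current_page()  # hypothetical: parse driver.page_source with bs4 here
    try:
        next_page = WebDriverWait(driver, 5).until(
            EC.element_to_be_clickable((By.XPATH, next_page_path)))
        next_page.click()
    except Exception:
        # "Next" is missing or not clickable, so this was the last page.
        break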
I am using Selenium with Python and trying to move the cursor and click on a specific element. This works for the first link, and the structure of the HTML is the same for the next link, but I get a StaleElementReferenceException for the second link when accessing it through the same webdriver. Why does this happen, and how do I fix it? Below is the code I am running. Thank you so much!
def getZest(url):
    zestlist = []
    yearlist = []
    driver.get(url)
    time.sleep(5)
    result = False
    attempts = 0
    while attempts < 5:
        try:
            Home_Value = wait.until(EC.presence_of_element_located((By.XPATH, "//a[text()='Home value']")))
            action.move_to_element(Home_Value).click().perform()
            zestimate = driver.find_element_by_xpath('//*[@id="ds-home-values"]/div/div[3]/button')
            action.move_to_element(zestimate).perform()
            result = True
            break
        except exceptions.StaleElementReferenceException as e:
            print(e)
            attempts = attempts + 1
fivenums = ["https://www.zillow.com/homedetails/212-Haddrell-St-Mount-Pleasant-SC-29464/10922911_zpid/", "https://www.zillow.com/homedetails/20-Grove-St-Hicksville-NY-11801/31127407_zpid/"]
for num in fivenums:
    getZest(num)
I was able to get information about the first and second links with the following code, without any error:
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
import selenium.webdriver.support.expected_conditions as EC
from selenium.webdriver.common.by import By
import time
from selenium.common import exceptions
u = 'https://www.oddsportal.com/moving-margins/'
driver = webdriver.Chrome(executable_path=r"chromedriver.exe")
driver.maximize_window()
def getZest(url):
    zestlist = []
    yearlist = []
    driver.get(url)
    time.sleep(5)
    result = False
    attempts = 0
    action = webdriver.ActionChains(driver)
    wait = WebDriverWait(driver, 300)
    while attempts < 5:
        try:
            Home_Value = wait.until(EC.presence_of_element_located((By.XPATH, "//a[text()='Home value']")))
            action.move_to_element(Home_Value).click().perform()
            zestimate = driver.find_element_by_xpath('//*[@id="ds-home-values"]/div/div[3]/button')
            action.move_to_element(zestimate).perform()
            result = True
            break
        except exceptions.StaleElementReferenceException as e:
            print(e)
            attempts = attempts + 1
fivenums = ["https://www.zillow.com/homedetails/212-Haddrell-St-Mount-Pleasant-SC-29464/10922911_zpid/", "https://www.zillow.com/homedetails/20-Grove-St-Hicksville-NY-11801/31127407_zpid/"]
for num in fivenums:
    getZest(num)
In your code it is not shown where some variables (such as action and wait) are instantiated, so maybe this is where the problem is located.
However, when opening the first link, the website showed me the Google Captcha protection, so I suppose you have some kind of authorization to scrape the information with the permission of the owner.
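For background: a StaleElementReferenceException is raised when a WebElement you located earlier gets detached from the DOM, for example after a navigation or an AJAX re-render. The usual fix is exactly what the retry loop above does: locate the element again after the page changes instead of reusing the old reference. A generic sketch of that pattern (reusing the wait object created in getZest):
for attempt in range(5):
    try:
        # Re-locate the element on every attempt instead of reusing a stale reference.
        home_value = wait.until(EC.presence_of_element_located((By.XPATH, "//a[text()='Home value']")))
        home_value.click()
        break
    except exceptions.StaleElementReferenceException:
        time.sleep(1)  # the page re-rendered; look the element up again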
I am trying to make a scraping application for Hants.gov.uk, and right now I am only clicking through the pages rather than scraping. When it gets to the last row on page 1 it just stops, so what I did was make it click the "Next Page" button, but first it has to go back to the original URL. It clicks page 2, but after page 2 is scraped it doesn't go to page 3; it just restarts page 2.
Can somebody help me fix this issue?
Code:
import time
import config # Don't worry about this. This is an external file to make a DB
import urllib.request
from bs4 import BeautifulSoup
from selenium import webdriver
url = "https://planning.hants.gov.uk/SearchResults.aspx?RecentDecisions=True"
driver = webdriver.Chrome(executable_path=r"C:\Users\Goten\Desktop\chromedriver.exe")
driver.get(url)
driver.find_element_by_id("mainContentPlaceHolder_btnAccept").click()
def start():
    elements = driver.find_elements_by_css_selector(".searchResult a")
    links = [link.get_attribute("href") for link in elements]
    result = []
    for link in links:
        if link not in result:
            result.append(link)
        else:
            driver.get(link)
            goUrl = urllib.request.urlopen(link)
            soup = BeautifulSoup(goUrl.read(), "html.parser")
            #table = soup.find_element_by_id("table", {"class": "applicationDetails"})
            for i in range(20):
                pass  # Don't worry about all this commented code, it isn't relevant right now
                #table = soup.find_element_by_id("table", {"class": "applicationDetails"})
                #print(table.text)
                # div = soup.select("div.applicationDetails")
                # getDiv = div[i].split(":")[1].get_text()
                # log = open("log.txt", "a")
                # log.write(getDiv + "\n")
            #log.write("\n")
start()
driver.get(url)
for i in range(5):
    driver.find_element_by_id("ctl00_mainContentPlaceHolder_lvResults_bottomPager_ctl02_NextButton").click()
    url = driver.current_url
    start()
    driver.get(url)
driver.close()
try this:
import time
# import config # Don't worry about this. This is an external file to make a DB
import urllib.request
from bs4 import BeautifulSoup
from selenium import webdriver
url = "https://planning.hants.gov.uk/SearchResults.aspx?RecentDecisions=True"
driver = webdriver.Chrome()
driver.get(url)
driver.find_element_by_id("mainContentPlaceHolder_btnAccept").click()
result = []
def start():
    elements = driver.find_elements_by_css_selector(".searchResult a")
    links = [link.get_attribute("href") for link in elements]
    result.extend(links)
def start2():
    for link in result:
        # if link not in result:
        #     result.append(link)
        # else:
        driver.get(link)
        goUrl = urllib.request.urlopen(link)
        soup = BeautifulSoup(goUrl.read(), "html.parser")
        #table = soup.find_element_by_id("table", {"class": "applicationDetails"})
        for i in range(20):
            pass  # Don't worry about all this commented code, it isn't relevant right now
            #table = soup.find_element_by_id("table", {"class": "applicationDetails"})
            #print(table.text)
            # div = soup.select("div.applicationDetails")
            # getDiv = div[i].split(":")[1].get_text()
            # log = open("log.txt", "a")
            # log.write(getDiv + "\n")
        #log.write("\n")
while True:
    start()
    element = driver.find_element_by_class_name('rdpPageNext')
    try:
        check = element.get_attribute('onclick')
        if check != "return false;":
            element.click()
        else:
            break
    except:
        break
print(result)
start2()
driver.get(url)
As per the url https://planning.hants.gov.uk/SearchResults.aspx?RecentDecisions=True, to click through all the pages you can use the following solution:
Code Block:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
options = Options()
options.add_argument("start-maximized")
options.add_argument("disable-infobars")
options.add_argument("--disable-extensions")
driver = webdriver.Chrome(chrome_options=options, executable_path=r'C:\Utility\BrowserDrivers\chromedriver.exe')
driver.get('https://planning.hants.gov.uk/SearchResults.aspx?RecentDecisions=True')
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.ID, "mainContentPlaceHolder_btnAccept"))).click()
numLinks = len(WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "div#ctl00_mainContentPlaceHolder_lvResults_topPager div.rdpWrap.rdpNumPart>a"))))
print(numLinks)
for i in range(numLinks):
print("Perform your scrapping here on page {}".format(str(i+1)))
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//div[#id='ctl00_mainContentPlaceHolder_lvResults_topPager']//div[#class='rdpWrap rdpNumPart']//a[#class='rdpCurrentPage']/span//following::span[1]"))).click()
driver.quit()
Console Output:
8
Perform your scrapping here on page 1
Perform your scrapping here on page 2
Perform your scrapping here on page 3
Perform your scrapping here on page 4
Perform your scrapping here on page 5
Perform your scrapping here on page 6
Perform your scrapping here on page 7
Perform your scrapping here on page 8
Hi @Feitan Portor, you have written the code absolutely perfectly. The only reason you are redirected back to the first page is that you have url = driver.current_url in the last for loop: the URL remains static and it is only the JavaScript that triggers the next-page click event, so just remove url = driver.current_url and driver.get(url) and you are good to go; I have tested it myself.
Also, to know which page your scraper is currently on, just add this part inside the for loop:
ss = driver.find_element_by_class_name('rdpCurrentPage').text
print(ss)
Hope this clears up your confusion.
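Putting it together, the tail of the original script would then look roughly like this (a sketch of the change described above, with the page-number print added inside the loop):
start()
driver.get(url)
for i in range(5):
    driver.find_element_by_id("ctl00_mainContentPlaceHolder_lvResults_bottomPager_ctl02_NextButton").click()
    ss = driver.find_element_by_class_name('rdpCurrentPage').text
    print(ss)  # shows which page the scraper is currently on
    start()
driver.close()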
Using Python, Selenium, Sublime and Firefox: I am scraping the links from this website and would like to save the scraped pages (as HTML files) into a folder. However, I have been working for days on trying to get the body of these HTML files to dump into a Dropbox folder. The problems are 1) saving the HTML files and 2) saving them to a Dropbox folder (or any folder).
I have successfully written code that will perform a search, then scrape the links off of a series of webpages. The following code works well for that.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import re
import csv
import pickle
import signal
import time
def handler(signum, frame):
    raise Exception('Last Resort!')
signal.signal(signal.SIGALRM, handler)
def isReady(browser):
    return browser.execute_script("return document.readyState") == "complete"
def waitUntilReady(browser):
    if not isReady(browser):
        waitUntilReady(browser)
def waitUntilReadyBreak(browser_b, url, counter):
    try:
        signal.alarm(counter)
        waitUntilReady(browser_b)
        signal.alarm(0)
    except Exception, e:
        print e
        signal.alarm(0)
        browser_b.close()
        browser_b = webdriver.Firefox()
        browser_b.get(url)
        waitUntilReadyBreak(browser_b, url, counter)
    return browser_b
browser = webdriver.Firefox()
thisurl = 'http://www.usprwire.com/cgi-bin/news/search.cgi'
browser.get(thisurl)
waitUntilReady(browser)
numarticles = 0
elem = WebDriverWait(browser, 60).until(EC.presence_of_element_located((By.NAME, "query")))
elem = browser.find_element_by_name("query")
elem.send_keys('"test"')
form = browser.find_element_by_xpath("/html/body/center/table/tbody/tr/td/table/tbody/tr[3]/td/table/tbody/tr[3]/td[2]/table/tbody/tr[3]/td/table/tbody/tr[1]/td/font/input[2]").click()
nextpage = False
all_newproduct_links = []
npages = 200
for page in range(1,npages+1):
    if page == 1:
        elems = browser.find_elements_by_tag_name('a')
        article_url = [elems.get_attribute("href")
                       for elems in browser.find_elements_by_class_name('category_links')]
        print page
        print article_url
        print "END_A_PAGE"
        elem = browser.find_element_by_link_text('[>>]').click()
        waitUntilReady(browser)
    if page >= 2 <= 200:
        # click the dots
        print page
        print page
        print "B4 LastLoop"
        elems = WebDriverWait(browser, 60).until(EC.presence_of_element_located((By.CLASS_NAME, "category_links")))
        elems = browser.find_elements_by_tag_name('a')
        article_url = [elems.get_attribute("href")
                       for elems in browser.find_elements_by_class_name('category_links')]
        print page
        print article_url
        print "END_C_PAGE"
    # This is the part that will not work :(
    for e in elems:
        numarticles = numarticles + 1
        numpages = 0
        numpages = numpages + 1000
        article_url = e.get_attribute('href')
        print 'waiting'
        bodyelem.send_keys(Keys.COMMAND + "2")
        browser.get(article_url)
        waitUntilReady(browser)
        fw = open('/Users/My/Dropbox/MainFile/articlesdata/'+str(page)+str(numpages)+str(numarticles)+'.html', 'w')
        fw.write(browser.page_source.encode('utf-8'))
        fw.close()
        bodyelem2 = browser.find_elements_by_xpath("//body")[0]
        bodyelem2.send_keys(Keys.COMMAND + "1")
The above (for e in elems:) is meant to click on the page and create an html file containing the body of the scraped page. I seem to be missing something fundamental.
Any guidance at all would be most appreciated.
I think you are overcomplicating it.
There is at least one problem in this block:
elems = browser.find_elements_by_tag_name('a')
article_url = [elems.get_attribute("href")
               for elems in browser.find_elements_by_class_name('category_links')]
elems would contain a list of elements found by find_elements_by_tag_name(), but then you are using the same elems variable in the list comprehension. As a result, when you are iterating over elems later, you are getting an error, since elems now refers to a single element and not a list.
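The shadowing is easy to avoid by giving the comprehension its own variable name, for example:
elems = browser.find_elements_by_tag_name('a')
article_url = [elm.get_attribute("href")
               for elm in browser.find_elements_by_class_name('category_links')]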
Anyway, here is the approach I would take:
gather all the article urls first
iterate over the urls one by one and save the HTML source using the page url name as a filename. E.g. _Iran_Shipping_Report_Q4_2014_is_now_available_at_Fast_Market_Research_326303.shtml would be the article filename
The code:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
def isReady(browser):
    return browser.execute_script("return document.readyState") == "complete"
def waitUntilReady(browser):
    if not isReady(browser):
        waitUntilReady(browser)
browser = webdriver.Firefox()
browser.get('http://www.usprwire.com/cgi-bin/news/search.cgi')
# make a search
query = WebDriverWait(browser, 60).until(EC.presence_of_element_located((By.NAME, "query")))
query.send_keys('"test"')
submit = browser.find_element_by_xpath("//input[@value='Search']")
submit.click()
# grab article urls
npages = 4
article_urls = []
for page in range(1, npages + 1):
    article_urls += [elm.get_attribute("href") for elm in browser.find_elements_by_class_name('category_links')]
    browser.find_element_by_link_text('[>>]').click()
# iterate over urls and save the HTML source
for url in article_urls:
    browser.get(url)
    waitUntilReady(browser)
    title = browser.current_url.split("/")[-1]
    with open('/Users/My/Dropbox/MainFile/articlesdata/' + title, 'w') as fw:
        fw.write(browser.page_source.encode('utf-8'))
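Note that the write above assumes Python 2, where page_source has to be encoded before writing. On Python 3, page_source is already a str, so you would open the file in text mode with an encoding and write it directly:
with open('/Users/My/Dropbox/MainFile/articlesdata/' + title, 'w', encoding='utf-8') as fw:
    fw.write(browser.page_source)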