Hello I'm trying to scrape some questions from a web forum
I am able to scrape questions with a
find_elements_by_xpath
it's something like this :
questions = driver.find_elements_by_xpath('//div[#class="autu-generated"]//div[#class="corpus"]//div[#class="body-bd"]//p')
I made a diagram so you can understand my situation :
my problem is if I didn't specify the auto-generated class in the XPath it's gonna return all the values from the other divs (which I don't want )
and writing the auto-generated class manually like I did to test isn't a valid idea because I'm scraping multiple questions with multiple classes
do you guys have any ideas on how to resolve this problem ??
here is the web forum
thank you
my code :
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
import time
from fastparquet.parquet_thrift.parquet.ttypes import TimeUnit
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import pandas as pd
driver = webdriver.Chrome('/Users/ossama/Downloads/chromedriver_win32/chromedriver')
page = 1
#looping in pages
while page <= 10:
driver.get('https://forum.bouyguestelecom.fr/questions/browse?flow_state=published&order=created_at.desc&page='+str(page)+'&utf8=✓&search=&with_category%5B%5D=2483')
# checking to click the pop-up cookies interfaces
if page == 1:
#waiting 10s for the pop-up to show up before accepting it
time.sleep(10)
driver.find_element_by_id('popin_tc_privacy_button_3').click()
# store all the links in a list
#question_links = driver.find_elements_by_xpath('//div[#class="corpus"]//a[#class="content_permalink"]')
links = driver.find_elements_by_xpath('//div[#class="corpus"]//a[#class="content_permalink"]')
forum_links= []
for link in links:
value = link.get_attribute("href")
print(value)
forum_links.append(value)
else:
links = driver.find_elements_by_xpath('//div[#class="corpus"]//a[#class="content_permalink"]')
for link in links:
value = link.get_attribute("href")
print(value)
forum_links.append(value)
q_df = pd.DataFrame(forum_links)
q_df.to_csv('forum_links.csv')
page = page + 1
for link in forum_links:
driver.get(link)
#time.sleep(5)
#driver.find_element_by_id('popin_tc_privacy_button_3').click()
questions = driver.find_elements_by_xpath('//div[#class="corpus"]//div[#class="body-bd"]//p')
authors = driver.find_elements_by_xpath('//div[#class="corpus"]//div[#class="metadata"]//dl[#class="author-name"]//dd//a')
dates = driver.find_elements_by_xpath('//div[#class="corpus"]//div[#class="metadata"]//dl[#class="date"]//dd')
questions_list = []
for question in questions:
for author in authors:
for date in dates:
questions_list.append([question.text, author.text, date.text])
print(question.text)
print(author.text)
print(date.text)
q_df = pd.DataFrame(questions_list)
q_df.to_csv('colrow.csv')
Improved XPATH, and removed second loop.
page = 1
while page <= 10:
driver.get(
'https://forum.bouyguestelecom.fr/questions/browse?flow_state=published&order=created_at.desc&page=' + str(
page) + '&utf8=✓&search=&with_category%5B%5D=2483')
driver.maximize_window()
print("Page url: " + driver.current_url)
time.sleep(1)
if page == 1:
AcceptButton = driver.find_element(By.ID, 'popin_tc_privacy_button_3')
AcceptButton.click()
questions = driver.find_elements(By.XPATH, '//div[#class="corpus"]//a[#class="content_permalink"]')
for count, item in enumerate(questions, start=1):
print(str(count) + ": question detail:")
questionfount = driver.find_element(By.XPATH,
"(//div[#class='corpus']//a[#class='content_permalink'])[" + str(
count) + "]")
questionfount.click()
questionInPage = WebDriverWait(driver, 20).until(EC.visibility_of_element_located(
(By.XPATH, "(//p[#class='old-h1']//following::div[contains(#__uid__, "
"'dim')]//div[#class='corpus']//a["
"#class='content_permalink'])[1]")))
author = WebDriverWait(driver, 20).until(EC.visibility_of_element_located(
(By.XPATH, "(//p[#class='old-h1']//following::div[contains(#__uid__, 'dim')]//div["
"#class='corpus']//div[contains(#class, 'metadata')]//dl["
"#class='author-name']//a)[1]")))
date = WebDriverWait(driver, 20).until(EC.visibility_of_element_located(
(By.XPATH, "(//p[#class='old-h1']//following::div[contains(#__uid__, 'dim')]//div["
"#class='corpus']//div[contains(#class, 'metadata')]//dl[#class='date']//dd)[1]")))
print(questionInPage.text)
print(author.text)
print(date.text)
print(
"-----------------------------------------------------------------------------------------------------------")
driver.back()
driver.refresh()
page = page + 1
driver.quit()
Output (in Console):
Page url: https://forum.bouyguestelecom.fr/questions/browse?flow_state=published&order=created_at.desc&page=1&utf8=%E2%9C%93&search=&with_category%5B%5D=2483
1: question detail:
Comment annuler ma commande bbox
ELHADJI
17 novembre 2021
-----------------------------------------------------------------------------------------------------------
2: question detail:
BBOX adsl : Interruption Service Internet ?
GABRIELA
17 novembre 2021
-----------------------------------------------------------------------------------------------------------
to overcome this issue i found that the div with auto-generated class had a uid
so here is what the xpath looks like now :
questions = driver.find_elements_by_xpath('//div[#__uid__="dim2"]//div[#class="corpus"]//div[#class="body-bd"]//p')
sometimes we just gotta focus right !
Related
I am trying to extract the reviews with their respective titles of a particular hotel in Trip Advisor, using Web Scraping techniques with Python and Selenium but I have only been able to extract the reviews of a single page and I need to extract all or most of the reviews, but the pagination is not working. What I do is click on the Next button, iterate over a range of pages and extract the information.
The web scraping is from the home page https://www.tripadvisor.com/Hotel_Review-g562644-d1490165-Reviews-Parador_de_Alcala_de_Henares-Alcala_De_Henares.html
notice that when the page changes this is added: -or20- depending if it is the fourth -or30- or fifth -or40- always shows 10 reviews
for example this is the third page: https://www.tripadvisor.com/Hotel_Review-g562644-d1490165-Reviews-or20-Parador_de_Alcala_de_Henares-Alcala_De_Henares.html
Basically this is what I do: read a csv, open the page, (change to all laguanges, this is optional), expand reviews, read reviews, click on Next button, write to csv, iterate in a range of pages.
Any help, thanks in advance !!
Images:
Trip Advisor HTML reviews
Trip Advisor HTML Next button
This is my code so far:
# Web scraping
# Writing to csv
with open('reviews_hotel_9.csv', 'w', encoding="utf-8") as file:
file.write('titles, reviews \n')
driver = webdriver.Chrome(ChromeDriverManager().install())
driver.get("https://www.tripadvisor.com/Hotel_Review-g562644-d1490165-Reviews-Parador_de_Alcala_de_Henares-Alcala_De_Henares.html")
sleep(3)
cookie = driver.find_element_by_xpath('//*[#id="onetrust-accept-btn-handler"]')# cookies accept
try:
cookie.click()
except:
pass
print('ok')
for k in range(10): #range pagination
#container = driver.find_elements_by_xpath("//div[#data-reviewid]")
try:
# radio button all languages (optional)
#driver.execute_script("arguments[0].click();", WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, '//*[#id="component_14"]/div/div[3]/div[1]/div[1]/div[4]/ul/li[1]/label/span[1]'))))
# read more expand reviews
driver.execute_script("arguments[0].click();", WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, '//span[#class="Ignyf _S Z"]'))))
# review titles
titles = driver.find_elements_by_xpath('//div[#class="KgQgP MC _S b S6 H5 _a"]/a/span') #
sleep(1)
# reviews
reviews = driver.find_elements_by_xpath('//q[#class="QewHA H4 _a"]/span')#
sleep(1)
except TimeoutException:
pass
with open('reviews_hotel_9.csv', 'a', encoding="utf-8") as file:
for i in range(len(titles)):
file.write(titles[i].text + ";" + reviews[i].text + "\n")
try:
#driver.execute_script("arguments[0].click();", WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, '//a[#class="ui_button nav next primary "]')))) #
# click on Next button
driver.execute_script("arguments[0].click();", WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.CLASS_NAME, 'ui_button nav next primary '))))
except TimeoutException:
pass
file.close()
driver.quit()
You can scrape data from each review page by clicking on the Next button on the review page. In the code below we go on an infinite loop and keep clicking the Next button. We stop going ahead when we encounter the exception ElementClickInterceptedException. At the end we are saving the data to the file MyData.csv
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.common.exceptions import ElementClickInterceptedException
import pandas as pd
import time
chrome_path = r"C:\Users\hpoddar\Desktop\Tools\chromedriver_win32\chromedriver.exe"
s = Service(chrome_path)
url = 'https://www.tripadvisor.com/Hotel_Review-g562644-d1490165-Reviews-Parador_de_Alcala_de_Henares-Alcala_De_Henares.html'
driver = webdriver.Chrome(service=s)
driver.get(url)
df = pd.DataFrame(columns = ['title', 'review'])
while True:
time.sleep(2)
reviews = driver.find_elements(by=By.CSS_SELECTOR, value='.WAllg._T')
for review in reviews:
title = review.find_element(by=By.CSS_SELECTOR, value='.KgQgP.MC._S.b.S6.H5._a').text
review = review.find_element(by=By.CSS_SELECTOR, value='.fIrGe._T').text
df.loc[len(df)] = [title, review]
try:
driver.find_element(by=By.CSS_SELECTOR, value='.ui_button.nav.next.primary').click()
except ElementClickInterceptedException:
break
df.to_csv("MyData.csv")
This gives us the output :
Error : selenium.common.exceptions.StaleElementReferenceException: Message: stale element reference: element is not attached to the page document.
website I'm scraping https://www.telekom.de/unterwegs/apple/apple-iphone-13-pro/graphit-512gb I wanted to loop this tariff details with each section and each radio button shows different prices. I wanted to scrape, price details for each radio buttons one by one and checked radio button name along with price till end of the page. I have tried but I couldn't make success.
could anyone help on this. I will be helpful for me to learn. I have tried till get entered in to change tariff link and I'm facing issue to scrape a details. change tariff links given below links,
https://i.stack.imgur.com/RRyJa.png
https://i.stack.imgur.com/fNafB.png
https://i.stack.imgur.com/jFnLA.png
https://i.stack.imgur.com/WlyLU.png
"I'm trying to click a radio button and need to scrape a price details for selected radio button."
import xlwt
from selenium import webdriver
import re
import time
from datetime import date
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
class telekommobiles:
def __init__(self):
self.url="https://www.telekom.de/mobilfunk/geraete/smartphone?page=1&pageFilter=promotion"
self.country='DE'
self.currency='GBP'
self.VAT='Included'
self.shipping = 'free shipping within 3-4 weeks'
self.Pre_PromotionPrice ='N/A'
self.color ='N/A'
def telekom(self):
#try:
driver=webdriver.Chrome()
driver.maximize_window()
driver.get(self.url)
today = date.today()
#time.sleep(5)
WebDriverWait(driver, 30).until(EC.presence_of_all_elements_located((By.XPATH,"//*[#id='consentAcceptAll']")))
cookies = driver.find_element_by_css_selector('button.cl-btn.cl-btn--accept-all').click()
print("cookies accepted")
links_prod_check = []
prod_models = []
prod_manufacturer =[]
prod_memorys = []
product_colors =[]
product_price_monthly_payments = []
product_price_one_time_payments =[]
product_links = []
containers = driver.find_elements_by_css_selector('div[class="styles_item__12Aw4"]')
i = 1
for container in containers:
p_links =container.find_element_by_tag_name('a').get_attribute('href')
i = i + 1
product_links.append(p_links)
#print(p_links)
for links in product_links:
driver.get(links)
#time.sleep(5)
#print(driver.current_url)
#links_prod_check.append(driver.current_url)
coloroptions = WebDriverWait(driver, 30).until(EC.presence_of_all_elements_located((By.XPATH,"//li[#data-qa='list_ColorVariant']")))
#print(coloroptions)
for i in range(len(coloroptions)):
coloroption = driver.find_elements_by_xpath("//li[#data-qa='list_ColorVariant']")
coloroption[i].click()
#print(coloroption[i])
time.sleep(3)
memoryoptions = WebDriverWait(driver, 30).until(EC.presence_of_all_elements_located((By.XPATH,"//span[#class='phx-radio__element']")))
for i in range(len(memoryoptions)):
memoryoption = driver.find_elements_by_xpath("//span[#class='phx-radio__element']")
try:
memoryoption[i].click()
except:
pass
time.sleep(3)
change_traiff = driver.find_element_by_css_selector('button[class="phx-link phx-list-of-links__link js-mod tracking-added"]').click()
time.sleep(3)
section_loops = driver.find_elements_by_css_selector('section[class="tariff-catalog--layer"]')
for section_loop in section_loops:
#Headings
heading_1 = section_loop.find_element_by_css_selector('h2[class="page-title page-title--lowercase"]').text
print(heading_1)
looping_for_tariff = WebDriverWait(driver, 30).until(EC.presence_of_all_elements_located((By.XPATH,"//span[#class='phx-radio__element']")))
subcontainers = section_loop.find_elements_by_css_selector('div[class="phx-tariff-box__section"]')
for subcontainer in subcontainers:
radio_buttons_list=subcontainer.find_elements_by_css_selector('div[class="phx-form__row phx-form__row--small phx-form__row--full-width phx-form__row--radio"]')
for radio in radio_buttons_list:
input=radio.find_elements_by_css_selector('span[class="phx-radio__element"]')
if input[0].is_enabled():
try:
ActionChains(driver).move_to_element(subcontainer).perform()
time.sleep(2)
input[0].click()
time.sleep(3)
except:
print('Not clickable')
pass
lable_list=radio.find_elements_by_css_selector('span[class="phx-radio__label"]')
label=""
if lable_list:
label=lable_list[0].text
heading_2 = subcontainer.find_element_by_css_selector('p[class="phx-t6 phx-t--medium"]').text
data_price_list= subcontainer.find_element_by_css_selector('div[class="phx-tariff-box__data-price"]')
volumn_list=data_price_list.find_elements_by_css_selector('div[data-qa="label_Tariff_VolumeSize"]')
volumn=""
if volumn_list:
volumn=volumn_list[0].text
price_list=subcontainer.find_elements_by_css_selector('p[class="phx-price phx-price--size_large phx-price--strong phx-price--color_brand"]')
price=""
nonBreakSpace = u'\xa0'
if price_list:
price=price_list[0].text
print(str(heading_2) + " " + str(label) + " " + str(volumn.replace(' ', '').replace( '\\r\\n','')) + " " + str(price))
#except:
#pass
telekom_de=telekommobiles()
telekom_de.telekom()
After selecting a different Option the page gets Refreshed, hence the issue. I was not able to find where you were trying to click on the buttons in your code. So tried to click on all the radio buttons with below code and was successful. Check the code once.
from selenium import webdriver
import time
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
driver = webdriver.Chrome(executable_path="path to chromedriver.exe")
driver.maximize_window()
driver.implicitly_wait(10)
driver.get("https://www.telekom.de/unterwegs/apple/apple-iphone-13-pro/sierrablau-128gb")
wait = WebDriverWait(driver,30)
wait.until(EC.element_to_be_clickable((By.XPATH,"//button[text()='Accept All']"))).click()
radiooptions = wait.until(EC.presence_of_all_elements_located((By.XPATH,"//span[#class='phx-radio__element']")))
for i in range(len(radiooptions)):
radiooptions = driver.find_elements_by_xpath("//span[#class='phx-radio__element']")
radiooptions[i].click()
time.sleep(2)
please li element instead of span
//li[#data-qa='list_ColorVariant']
and also add wait once you click on it. 5secs. then click the next one
So I am trying to scrape usernames and comments from multiple posts. Using this code below.
from selenium.webdriver.common.by import By
from selenium import webdriver
import time
import sys
import pandas as pd
from pandas import ExcelWriter
import os.path
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
url=['https://www.instagram.com/p/CRLe53_hmMH','https://www.instagram.com/p/CRX7VL1sL54/?utm_medium=share_sheet',
'https://www.instagram.com/p/CRVB7ykM7-R/?utm_medium=share_sheet', 'https://www.instagram.com/p/CRQ9Bq5M6ce/?utm_medium=share_sheet',
'https://www.instagram.com/p/CRQT1BJMmSi/?utm_medium=share_sheet', 'https://www.instagram.com/p/CM8T3HgMQG0/?utm_medium=copy_link'
'https://www.instagram.com/p/COrn5fYs78O/?utm_medium=share_sheet']
user_names = []
user_comments = []
driver = driver = webdriver.Chrome('E:/chromedriver')
driver.get(url[0])
time.sleep(3)
username = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "input[name='username']")))
password = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "input[name='password']")))
username.clear()
username.send_keys('myuname')
password.clear()
password.send_keys('mypassword')
Login_button = WebDriverWait(driver, 2).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "button[type='submit']"))).click()
time.sleep(4)
not_now = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, '//button[contains(text(), "Not Now")]'))).click()
not_now2 = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, '//button[contains(text(), "Not Now")]'))).click()
for n in url:
try:
driver.get(n)
time.sleep(3)
load_more_comment = driver.find_element_by_class_name('glyphsSpriteCircle_add__outline__24__grey_9')
print("Found {}".format(str(load_more_comment)))
i = 0
while load_more_comment.is_displayed() and i < 10:
load_more_comment.click()
time.sleep(1.5)
load_more_comment = driver.find_element_by_class_name('glyphsSpriteCircle_add__outline__24__grey_9')
print("Found {}".format(str(load_more_comment)))
i += 1
user_names.pop(0)
user_comments.pop(0)
except Exception as e:
print(e)
pass
comment = driver.find_elements_by_class_name('gElp9 ')
for c in comment:
container = c.find_element_by_class_name('C4VMK')
name = container.find_element_by_class_name('_6lAjh ').text
content = container.find_element_by_tag_name('span').text
content = content.replace('\n', ' ').strip().rstrip()
user_names.append(name)
user_comments.append(content)
print(content)
user_names.pop(0)
user_comments.pop(0)
#export(user_names, user_comments)
driver.close()
df = pd.DataFrame(list(zip(user_names, user_comments)),
columns =['Name', 'Comments'])
#df.to_excel('ujicoba_gabung_IG_6.xlsx')
print(df)
But somehow instead of returning username and comment, both user_names and user_comments return usernames. Where did I make a mistake?
Here Are My outputs
I think my problem is on the for loop where I declare the container as C4VMK. But I inspected the element on Instagram it is already the same
There are two span in C4VMK class. First in h3 -> first div -> span and second is that one you want.
For getting the second span that is the comment, replace your code with below and get the second element.
content = container.find_elements_by_tag_name('span')[1].text
Your container is correct. However, when you search for a span by tag name like this:
content = container.find_element_by_tag_name('span').text
Selenium will find the first span that is under the content. Which in this case is the username span with the class 'Jv7Aj mArmR MqpiF '.
What you are looking for is the other span that I highlighted in the image, which is a direct child of the container with an empty class.
You can select it like this:
content = container.find_element_by_xpath("/span[#class='']")
I'm scraping an E-Commerce website, Lazada using Selenium and bs4, I manage to scrape on the 1st page but I unable to iterate to the next page. What I'm tyring to achieve is to scrape the whole pages based on the categories I've selected.
Here what I've tried :
# Run the argument with incognito
option = webdriver.ChromeOptions()
option.add_argument(' — incognito')
driver = webdriver.Chrome(executable_path='chromedriver', chrome_options=option)
driver.get('https://www.lazada.com.my/')
driver.maximize_window()
# Select category item #
element = driver.find_elements_by_class_name('card-categories-li-content')[0]
webdriver.ActionChains(driver).move_to_element(element).click(element).perform()
t = 10
try:
WebDriverWait(driver,t).until(EC.visibility_of_element_located((By.ID,"a2o4k.searchlistcategory.0.i0.460b6883jV3Y0q")))
except TimeoutException:
print('Page Refresh!')
driver.refresh()
element = driver.find_elements_by_class_name('card-categories-li-content')[0]
webdriver.ActionChains(driver).move_to_element(element).click(element).perform()
print('Page Load!')
#Soup and select element
def getData(np):
soup = bs(driver.page_source, "lxml")
product_containers = soup.findAll("div", class_='c2prKC')
for p in product_containers:
title = (p.find(class_='c16H9d').text)#title
selling_price = (p.find(class_='c13VH6').text)#selling price
try:
original_price=(p.find("del", class_='c13VH6').text)#original price
except:
original_price = "-1"
if p.find("i", class_='ic-dynamic-badge ic-dynamic-badge-freeShipping ic-dynamic-group-2'):
freeShipping = 1
else:
freeShipping = 0
try:
discount = (p.find("span", class_='c1hkC1').text)
except:
discount ="-1"
if p.find(("div", {'class':['c16H9d']})):
url = "https:"+(p.find("a").get("href"))
else:
url = "-1"
nextpage_elements = driver.find_elements_by_class_name('ant-pagination-next')[0]
np=webdriver.ActionChains(driver).move_to_element(nextpage_elements).click(nextpage_elements).perform()
print("- -"*30)
toSave = [title,selling_price,original_price,freeShipping,discount,url]
print(toSave)
writerows(toSave,filename)
getData(np)
The problem might be that the driver is trying to click the button before the element is even loaded correctly.
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Chrome(PATH, chrome_options=option)
# use this code after driver initialization
# this is make the driver wait 5 seconds for the page to load.
driver.implicitly_wait(5)
url = "https://www.lazada.com.ph/catalog/?q=phone&_keyori=ss&from=input&spm=a2o4l.home.search.go.239e359dTYxZXo"
driver.get(url)
next_page_path = "//ul[#class='ant-pagination ']//li[#class=' ant-pagination-next']"
# the following code will wait 5 seconds for
# element to become clickable
# and then try clicking the element.
try:
next_page = WebDriverWait(driver, 5).until(
EC.element_to_be_clickable((By.XPATH, next_page_path)))
next_page.click()
except Exception as e:
print(e)
EDIT 1
Changed the code to make the driver wait for the element to become clickable. You can add this code inside a while loop for iterating multiple times and break the loop if the button is not found and is not clickable.
I am trying to make a scraping application to scrape Hants.gov.uk and right now I am working on it just clicking the pages instead of scraping. When it gets to the last row on page 1 it just stopped, so what I did was make it click button "Next Page" but first it has to go back to the original URL. It clicks page 2, but after page 2 is scraped it doesn't go to page 3, it just restarts page 2.
Can somebody help me fix this issue?
Code:
import time
import config # Don't worry about this. This is an external file to make a DB
import urllib.request
from bs4 import BeautifulSoup
from selenium import webdriver
url = "https://planning.hants.gov.uk/SearchResults.aspx?RecentDecisions=True"
driver = webdriver.Chrome(executable_path=r"C:\Users\Goten\Desktop\chromedriver.exe")
driver.get(url)
driver.find_element_by_id("mainContentPlaceHolder_btnAccept").click()
def start():
elements = driver.find_elements_by_css_selector(".searchResult a")
links = [link.get_attribute("href") for link in elements]
result = []
for link in links:
if link not in result:
result.append(link)
else:
driver.get(link)
goUrl = urllib.request.urlopen(link)
soup = BeautifulSoup(goUrl.read(), "html.parser")
#table = soup.find_element_by_id("table", {"class": "applicationDetails"})
for i in range(20):
pass # Don't worry about all this commented code, it isn't relevant right now
#table = soup.find_element_by_id("table", {"class": "applicationDetails"})
#print(table.text)
# div = soup.select("div.applicationDetails")
# getDiv = div[i].split(":")[1].get_text()
# log = open("log.txt", "a")
# log.write(getDiv + "\n")
#log.write("\n")
start()
driver.get(url)
for i in range(5):
driver.find_element_by_id("ctl00_mainContentPlaceHolder_lvResults_bottomPager_ctl02_NextButton").click()
url = driver.current_url
start()
driver.get(url)
driver.close()
try this:
import time
# import config # Don't worry about this. This is an external file to make a DB
import urllib.request
from bs4 import BeautifulSoup
from selenium import webdriver
url = "https://planning.hants.gov.uk/SearchResults.aspx?RecentDecisions=True"
driver = webdriver.Chrome()
driver.get(url)
driver.find_element_by_id("mainContentPlaceHolder_btnAccept").click()
result = []
def start():
elements = driver.find_elements_by_css_selector(".searchResult a")
links = [link.get_attribute("href") for link in elements]
result.extend(links)
def start2():
for link in result:
# if link not in result:
# result.append(link)
# else:
driver.get(link)
goUrl = urllib.request.urlopen(link)
soup = BeautifulSoup(goUrl.read(), "html.parser")
#table = soup.find_element_by_id("table", {"class": "applicationDetails"})
for i in range(20):
pass # Don't worry about all this commented code, it isn't relevant right now
#table = soup.find_element_by_id("table", {"class": "applicationDetails"})
#print(table.text)
# div = soup.select("div.applicationDetails")
# getDiv = div[i].split(":")[1].get_text()
# log = open("log.txt", "a")
# log.write(getDiv + "\n")
#log.write("\n")
while True:
start()
element = driver.find_element_by_class_name('rdpPageNext')
try:
check = element.get_attribute('onclick')
if check != "return false;":
element.click()
else:
break
except:
break
print(result)
start2()
driver.get(url)
As per the url https://planning.hants.gov.uk/SearchResults.aspx?RecentDecisions=True to click through all the pages you can use the following solution:
Code Block:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
options = Options()
options.add_argument("start-maximized")
options.add_argument("disable-infobars")
options.add_argument("--disable-extensions")
driver = webdriver.Chrome(chrome_options=options, executable_path=r'C:\Utility\BrowserDrivers\chromedriver.exe')
driver.get('https://planning.hants.gov.uk/SearchResults.aspx?RecentDecisions=True')
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.ID, "mainContentPlaceHolder_btnAccept"))).click()
numLinks = len(WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "div#ctl00_mainContentPlaceHolder_lvResults_topPager div.rdpWrap.rdpNumPart>a"))))
print(numLinks)
for i in range(numLinks):
print("Perform your scrapping here on page {}".format(str(i+1)))
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//div[#id='ctl00_mainContentPlaceHolder_lvResults_topPager']//div[#class='rdpWrap rdpNumPart']//a[#class='rdpCurrentPage']/span//following::span[1]"))).click()
driver.quit()
Console Output:
8
Perform your scrapping here on page 1
Perform your scrapping here on page 2
Perform your scrapping here on page 3
Perform your scrapping here on page 4
Perform your scrapping here on page 5
Perform your scrapping here on page 6
Perform your scrapping here on page 7
Perform your scrapping here on page 8
hi #Feitan Portor you have written the code absolutely perfect the only reason that you are redirected back to the first page is because you have given url = driver.current_url in the last for loop where it is the url that remains static and only the java script that instigates the next click event so just remove url = driver.current_url and driver.get(url)
and you are good to go i have tested my self
also to get the current page that your scraper is in just add this part in the for loop so you will get to know where your scraper is :
ss = driver.find_element_by_class_name('rdpCurrentPage').text
print(ss)
Hope this solves your confusion