from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
driver = webdriver.Chrome()
driver.get('https://www.flaconi.de/haare/kerastase/chronologiste/kerastase-chronologiste-bain-regenerant-haarshampoo.html?yoReviewsPage=2')
soup = BeautifulSoup(driver.page_source, 'lxml')
soup.find_all('div',class_='content-review')
# this always returns an empty list
# I want to scrape all of the review contents, e.g. "<div class="content-review" id="325243269"> Super Shampoo, meine Haare glänzt und sind sehr weich. </div>"
I have tried multiple ways, but it always returns an empty list.
What should I do to solve this problem?
You need to wait until the page has completely loaded:
driver.get(url)
timeout = 5
try:
    element_present = EC.presence_of_element_located((By.CLASS_NAME, 'content-review'))
    WebDriverWait(driver, timeout).until(element_present)
except TimeoutException:
    print("Timed out waiting for page to load")

soup = BeautifulSoup(driver.page_source, 'lxml')
for review in soup.find_all('div', class_='content-review'):
    print(review.getText().strip())
Add the necessary imports:
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
OUTPUT:
Super Shampoo, meine Haare glänzt und sind sehr weich.
Ich verwende dieses Produkt seit kurzem und ich bin begeistert, so ein pflegendes Shampoo habe ich noch nie gehabt. Er gibt meinen Haar Glanz, Geschmeidigkeit und Fülle. Ich kann es nur empfehlen.
Zufrieden
Tolles Shampoo
Sehr gut
Second option: find the request that returns the reviews and fetch the data directly:
url = "https://staticw2.yotpo.com/batch/1eunvtBQrA7MbZslPu3gAznkZCUjvEeL5tp0uybR/80053469-250"
payload='methods=%5B%7B%22method%22%3A%22main_widget%22%2C%22params%22%3A%7B%22pid%22%3A%2280053469-250%22%2C%22page%22%3A2%2C%22order_metadata_fields%22%3A%7B%7D%2C%22widget_product_id%22%3A%2280053469-250%22%7D%7D%5D&app_key=1eunvtBQrA7MbZslPu3gAznkZCUjvEeL5tp0uybR'
response = requests.request("POST", url, data=payload)
soup = BeautifulSoup(response.json()[0]['result'], 'lxml')
for review in soup.find_all('div', class_='content-review'):
    print(review.getText().strip())
This produces the same output.
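The page number is embedded in the URL-encoded methods parameter, so other review pages can be fetched by rebuilding the payload per page. A minimal sketch, assuming the endpoint accepts the same structure for every page:
import json
import requests
from urllib.parse import urlencode
from bs4 import BeautifulSoup

app_key = '1eunvtBQrA7MbZslPu3gAznkZCUjvEeL5tp0uybR'
product_id = '80053469-250'
url = f'https://staticw2.yotpo.com/batch/{app_key}/{product_id}'
for page in range(1, 4):  # first three pages; adjust as needed
    # rebuild the decoded form of the original payload with a varying page number
    methods = [{'method': 'main_widget',
                'params': {'pid': product_id, 'page': page,
                           'order_metadata_fields': {},
                           'widget_product_id': product_id}}]
    payload = urlencode({'methods': json.dumps(methods), 'app_key': app_key})
    response = requests.post(url, data=payload,
                             headers={'Content-Type': 'application/x-www-form-urlencoded'})
    soup = BeautifulSoup(response.json()[0]['result'], 'lxml')
    for review in soup.find_all('div', class_='content-review'):
        print(review.getText().strip())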
The main issue here is that you need to close the 'accept cookies' popup, which is located in a shadow DOM.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import ElementNotInteractableException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
driver = webdriver.Chrome()
# OR: driver = webdriver.Chrome(executable_path='D:\Downloads\chromedriver\chromedriver.exe')
url = 'https://www.flaconi.de/haare/kerastase/chronologiste/kerastase-chronologiste-bain-regenerant-haarshampoo.html?yoReviewsPage=2'
driver.get(url)
webdriverWaiter = WebDriverWait(driver, 20)
webdriverWaiter.until(EC.text_to_be_present_in_element_attribute((By.CSS_SELECTOR, "body"), "class" ,"overflowHidden"))
shadow_host = driver.find_element(By.CSS_SELECTOR, '#usercentrics-root')
shadow_root = shadow_host.shadow_root
accept_cookies_button_css = "button[data-testid='uc-accept-all-button']"
# wait for accept cookies button to appear
accept_cookies_button = None
while not accept_cookies_button:
    try:
        accept_cookies_button = shadow_root.find_element(By.CSS_SELECTOR, accept_cookies_button_css)
    except NoSuchElementException:
        time.sleep(1)
# click accept cookies button
clicked = False
while not clicked:
    try:
        accept_cookies_button.click()
        clicked = True
    except ElementNotInteractableException:
        time.sleep(1)
content_review_css = ".content-review"
webdriverWaiter.until(EC.visibility_of_element_located((By.CSS_SELECTOR, content_review_css)))
reviews = driver.find_elements(By.CSS_SELECTOR, content_review_css)
for rev in reviews:
    print(rev.text)
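As a side note, the two polling loops above can be collapsed into a single WebDriverWait that retries the lookup and click until it succeeds. A minimal sketch, assuming Selenium 4+ for the shadow_root property:
def accept_cookies(d):
    # re-resolve the shadow root on every poll, then try to click the button;
    # the ignored exceptions below make the wait retry instead of failing
    root = d.find_element(By.CSS_SELECTOR, '#usercentrics-root').shadow_root
    root.find_element(By.CSS_SELECTOR, accept_cookies_button_css).click()
    return True

WebDriverWait(driver, 20, poll_frequency=1,
              ignored_exceptions=(NoSuchElementException,
                                  ElementNotInteractableException)).until(accept_cookies)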
Hi!
I made a script to generate a table after setting some criteria in the dropdowns. However, there is no date drop-down list; when I try to set a date, it always returns the same value, regardless of the date I set for it in the script. I would like it to return data from the date 01/01/2020 until today.
What is wrong in the script?
My script:
import time
from selenium import webdriver
from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.webdriver.support.select import Select
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
import shutil
import os
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import matplotlib
import csv
import xlrd
import openpyxl
driver = webdriver.Chrome(ChromeDriverManager().install())
driver.get('http://estatisticas.cetip.com.br/astec/series_v05/paginas/lum_web_v05_series_introducao.asp?str_Modulo=Ativo&int_Idioma=1&int_Titulo=6&int_NivelBD=2/')
driver.find_element_by_xpath('//*[@id="divContainerIframeBmf"]/div/dl/dd[2]/a').click()
time.sleep(3)
driver.switch_to.frame(driver.find_element(By.XPATH, '//iframe[@name="dados_corpo"]'))
driver.switch_to.frame(driver.find_element(By.XPATH, '//frame[@name="ativo"]'))
find_dp1 = driver.find_element(By.XPATH, '//select[@name="ativo"]')
select_find_dp1 = Select(find_dp1)
select_find_dp1.select_by_visible_text("CBIO - Crédito de descarbonização")
time.sleep(3)
driver.switch_to.default_content()
driver.switch_to.frame(driver.find_element(By.ID, 'dados_corpo'))
driver.switch_to.frame(driver.find_element(By.TAG_NAME, 'frameset').find_elements(By.TAG_NAME, 'frame')[1])
time.sleep(1)
informacoes = Select(driver.find_element(By.NAME, 'selectopcoes'))
informacoes.select_by_visible_text('Aposentadoria')
driver.switch_to.default_content()
driver.switch_to.frame(driver.find_element(By.ID, 'dados_corpo'))
driver.switch_to.frame(driver.find_element(By.TAG_NAME, 'frameset').find_elements(By.TAG_NAME, 'frame')[2])
time.sleep(2)
# Data Inicial
driver.find_element(By.NAME, 'DT_DIA_DE').send_keys('01')
driver.find_element(By.NAME, 'DT_MES_DE').send_keys('01')
driver.find_element(By.NAME, 'DT_ANO_DE').send_keys('2020')
# Data Final
driver.find_element(By.NAME, 'DT_DIA_ATE').send_keys('31')
driver.find_element(By.NAME, 'DT_MES_ATE').send_keys('12')
driver.find_element(By.NAME, 'DT_ANO_ATE').send_keys('2022')
driver.find_elements(By.CLASS_NAME, 'button')[1].click()
driver.switch_to.default_content()
driver.switch_to.frame(driver.find_element(By.TAG_NAME, 'iframe'))
time.sleep(1)
driver.find_element(By.CLASS_NAME, 'primary-text').find_element(By.TAG_NAME,'a').click()
time.sleep(4)
origem = 'C:\\Users\\prmatteo\\Downloads\\'
destino = os.path.join(origem, 'C:\\Users\\prmatteo\\OneDrive - COPERSUCAR S.A\\Ărea de Trabalho\\Arquivos Python\\renovabioaposentadoria.xls')
extensao = '.xls'
for file in os.listdir(origem):
    if file.endswith(extensao):
        shutil.move(os.path.join(origem, file), destino)
I have a dataframe that contains links to Google reviews of two restaurants. I wanted to load all reviews of the two restaurants (one by one) into the browser and then save them into a new dataframe. I wrote a script that reads and loads all reviews into the browser as follows:
from selenium import webdriver
import pandas as pd
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
import time
import glob  # needed for glob.glob(filename) below
link_df is a dataframe with a single column, Link:
0 https://www.google.com/search?q=restaurant+in+christchurch&biw=1280&bih=614&hotel_occupancy=2&tbm=lcl&sxsrf=AOaemvI4qlEAr3btedb6PCx9U53RtXkI2Q%3A1635630947742&ei=Y799YaHfLOKZ4-EPoeqjmA4&oq=restaurant+in+christchurch&gs_l=psy-ab.3...0.0.0.614264.0.0.0.0.0.0.0.0..0.0....0...1c..64.psy-ab..0.0.0....0.7jAOI05vCjI#lrd=0x6d318a3aa3041455:0x5f83f4fae76d8656,1,,,&rlfi=hd:;si:6882614014013965910,l,ChpyZXN0YXVyYW50IGluIGNocmlzdGNodXJjaEiglZKhm6qAgAhaKBAAGAAYAiIacmVzdGF1cmFudCBpbiBjaHJpc3RjaHVyY2gqBAgDEACSARJidXJtZXNlX3Jlc3RhdXJhbnSqAQwQASoIIgRmb29kKAA,y,UB2auy7TMYs;mv:[[-43.4870861,172.6509735],[-43.5490232,172.5976049]]
1 https://www.google.com/search?q=restaurant+in+christchurch&biw=1280&bih=614&hotel_occupancy=2&tbm=lcl&sxsrf=AOaemvI4qlEAr3btedb6PCx9U53RtXkI2Q%3A1635630947742&ei=Y799YaHfLOKZ4-EPoeqjmA4&oq=restaurant+in+christchurch&gs_l=psy-ab.3...0.0.0.614264.0.0.0.0.0.0.0.0..0.0....0...1c..64.psy-ab..0.0.0....0.7jAOI05vCjI#lrd=0x6d318bf82139caaf:0xf115cd7fe794cbcc,1,,,&rlfi=hd:;si:17372017086881385420,l,ChpyZXN0YXVyYW50IGluIGNocmlzdGNodXJjaEjh9auu-q6AgAhaKBAAGAAYAiIacmVzdGF1cmFudCBpbiBjaHJpc3RjaHVyY2gqBAgDEACSAQpyZXN0YXVyYW50qgEMEAEqCCIEZm9vZCgA,y,ZeJbBWd7wDg;mv:[[-43.4870861,172.6509735],[-43.5490232,172.5976049]]
i = 0
driver = webdriver.Chrome()
for index, i in link_df.iterrows():
    base_url = i['Link']  # link_df['Link'][i]
    driver.get(base_url)
    WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "//div[./span[text()='Newest']]"))).click()
    print('Restaurant number is ', index)
    title = driver.find_element_by_xpath("//div[@class='P5Bobd']").text
    address = driver.find_element_by_xpath("//div[@class='T6pBCe']").text
    overall_rating = driver.find_element_by_xpath("//div[@class='review-score-container']//span[@class='Aq14fc']").text
    total_reviews_text = driver.find_element_by_xpath("//div[@class='review-score-container']//div//div//span//span[@class='z5jxId']").text
    num_reviews = int(total_reviews_text.split()[0])
    all_reviews = WebDriverWait(driver, 20).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'div.gws-localreviews__google-review')))
    time.sleep(2)
    total_reviews = len(all_reviews)
    while total_reviews < num_reviews:
        driver.execute_script('arguments[0].scrollIntoView(true);', all_reviews[-1])
        WebDriverWait(driver, 5, 0.25).until_not(EC.presence_of_element_located((By.CSS_SELECTOR, 'div[class$="activityIndicator"]')))
        time.sleep(5)
        all_reviews = WebDriverWait(driver, 5).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'div.gws-localreviews__google-review')))
        print(total_reviews)
        total_reviews += 1
    reviews_info = driver.find_elements_by_xpath("//div[@class='jxjCjc']")
    review_information = pd.DataFrame(columns=["Restaurant title", "Restaurant rating", "Total reviews", "Reviewer Name", "Rating", "Review"])
    name = ''
    rating = ''
    text = ''
    for index, review_info in enumerate(reviews_info):
        name = review_info.find_element_by_xpath("./div/div/a").text
        rating = review_info.find_element_by_xpath(".//div[@class='PuaHbe']//g-review-stars//span").get_attribute('aria-label')
        text = review_info.find_element_by_xpath(".//div[@class='Jtu6Td']//span").text
        review_information.at[len(review_information)] = [title, overall_rating, num_reviews, name, rating, text]
    filename = 'Google_reviews' + ' ' + pd.to_datetime("now").strftime("%Y_%m_%d") + '.csv'
    files_present = glob.glob(filename)
    if files_present:
        review_information.to_csv(filename, index=False, mode='a', header=False)
    else:
        review_information.to_csv(filename, index=False)
    driver.get('https://www.google.com')
    time.sleep(3)
The problem is that the script throws an error when it reaches the following line:
driver.execute_script('arguments[0].scrollIntoView(true);', all_reviews[-1])
It throws following error:
StaleElementReferenceException: Message: stale element reference: element is not attached to the page document
(Session info: chrome=95.0.4638.69)
When I tried the same program without storing the Google links in a dataframe (i.e., no for loop, and with base_url set directly to a Google review link instead of base_url = i['Link']), it works fine.
I am not sure where I am making the mistake. Any suggestion or help to fix the issue would be highly appreciated.
EDIT
you should put the creation of the driver outside the for loop
you can't launch a new URL with GPS data while the first popup is still in front; if you do, the new page stays in the background. The easier way is to launch a new URL without GPS data (https://www.google.com) and wait 3 seconds before continuing your loop
your count was not right; I changed your selector, changed how the total is computed, and commented out some lines
from selenium import webdriver
import pandas as pd
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.firefox.options import Options
import time
link_df = ["https://www.google.com/search?q=restaurant+in+christchurch&biw=1280&bih=614&hotel_occupancy=2&tbm=lcl&sxsrf=AOaemvI4qlEAr3btedb6PCx9U53RtXkI2Q%3A1635630947742&ei=Y799YaHfLOKZ4-EPoeqjmA4&oq=restaurant+in+christchurch&gs_l=psy-ab.3...0.0.0.614264.0.0.0.0.0.0.0.0..0.0....0...1c..64.psy-ab..0.0.0....0.7jAOI05vCjI#lrd=0x6d318a3aa3041455:0x5f83f4fae76d8656,1,,,&rlfi=hd:;si:6882614014013965910,l,ChpyZXN0YXVyYW50IGluIGNocmlzdGNodXJjaEiglZKhm6qAgAhaKBAAGAAYAiIacmVzdGF1cmFudCBpbiBjaHJpc3RjaHVyY2gqBAgDEACSARJidXJtZXNlX3Jlc3RhdXJhbnSqAQwQASoIIgRmb29kKAA,y,UB2auy7TMYs;mv:[[-43.4870861,172.6509735],[-43.5490232,172.5976049]]",
"https://www.google.com/search?q=restaurant+in+christchurch&biw=1280&bih=614&hotel_occupancy=2&tbm=lcl&sxsrf=AOaemvI4qlEAr3btedb6PCx9U53RtXkI2Q%3A1635630947742&ei=Y799YaHfLOKZ4-EPoeqjmA4&oq=restaurant+in+christchurch&gs_l=psy-ab.3...0.0.0.614264.0.0.0.0.0.0.0.0..0.0....0...1c..64.psy-ab..0.0.0....0.7jAOI05vCjI#lrd=0x6d318bf82139caaf:0xf115cd7fe794cbcc,1,,,&rlfi=hd:;si:17372017086881385420,l,ChpyZXN0YXVyYW50IGluIGNocmlzdGNodXJjaEjh9auu-q6AgAhaKBAAGAAYAiIacmVzdGF1cmFudCBpbiBjaHJpc3RjaHVyY2gqBAgDEACSAQpyZXN0YXVyYW50qgEMEAEqCCIEZm9vZCgA,y,ZeJbBWd7wDg;mv:[[-43.4870861,172.6509735],[-43.5490232,172.5976049]]"
]
i = 0
binary = r'C:\Program Files (x86)\Mozilla Firefox\firefox.exe'
cap = DesiredCapabilities().FIREFOX
cap["marionette"] = True
options = Options()
options.binary = binary
driver = webdriver.Firefox(options=options, capabilities=cap, executable_path="E:\\Téléchargement\\geckodriver.exe")
# I had to load the page once and accept the cookies manually
# (by setting a breakpoint after this line), but you may not have that popup
#driver.get(link_df[0])
print ("Headless Firefox Initialized")
print(link_df)
for url in link_df:
    base_url = url  # i['Link']  # link_df['Link'][i]
    print(base_url)
    driver.get(base_url)
    # 'Avis les plus récents' is 'Newest' in the French UI
    WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "//div[./span[text()='Avis les plus récents']]"))).click()
    title = driver.find_element_by_xpath("//div[@class='P5Bobd']").text
    address = driver.find_element_by_xpath("//div[@class='T6pBCe']").text
    overall_rating = driver.find_element_by_xpath("//div[@class='review-score-container']//span[@class='Aq14fc']").text
    total_reviews_text = driver.find_element_by_xpath(
        "//div[@class='review-score-container']//div//div//span//span[@class='z5jxId']").text
    num_reviews = int(total_reviews_text.split()[0])
    all_reviews = WebDriverWait(driver, 20).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '#reviewSort .gws-localreviews__google-review')))
    # time.sleep(2)
    total_reviews = 0
    while total_reviews < num_reviews:
        driver.execute_script('arguments[0].scrollIntoView(true);', all_reviews[-1])
        WebDriverWait(driver, 5, 0.25).until_not(EC.presence_of_element_located((By.CSS_SELECTOR, 'div[class$="activityIndicator"]')))
        all_reviews = WebDriverWait(driver, 5).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, '#reviewSort .gws-localreviews__google-review')))
        total_reviews = len(all_reviews)
        print(total_reviews, len(all_reviews))
    driver.get('https://www.google.com')  # or driver.close() if no bugs
    time.sleep(3)

driver.close()
driver.quit()
It seems the solution for Chrome needs some fixes:
org.openqa.selenium.StaleElementReferenceException: stale element reference: element is not attached to the page document
This means the referenced element is out of date: it is no longer attached to the current page, usually because the page has been refreshed or replaced. The fix is to locate the element again with findElement or findElements.
So for Chrome there is a refresh problem. I suggest reloading the list of elements before scrolling, so you have a fresh copy of the DOM items, and I had to add a 1-second wait at the end of the while loop.
from selenium import webdriver
import pandas as pd
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
#from selenium.webdriver.firefox.options import Options
from selenium.webdriver.chrome.options import Options
import time
link_df = [
"https://www.google.com/search?q=restaurant+in+christchurch&biw=1280&bih=614&hotel_occupancy=2&tbm=lcl&sxsrf=AOaemvI4qlEAr3btedb6PCx9U53RtXkI2Q%3A1635630947742&ei=Y799YaHfLOKZ4-EPoeqjmA4&oq=restaurant+in+christchurch&gs_l=psy-ab.3...0.0.0.614264.0.0.0.0.0.0.0.0..0.0....0...1c..64.psy-ab..0.0.0....0.7jAOI05vCjI#lrd=0x6d318a3aa3041455:0x5f83f4fae76d8656,1,,,&rlfi=hd:;si:6882614014013965910,l,ChpyZXN0YXVyYW50IGluIGNocmlzdGNodXJjaEiglZKhm6qAgAhaKBAAGAAYAiIacmVzdGF1cmFudCBpbiBjaHJpc3RjaHVyY2gqBAgDEACSARJidXJtZXNlX3Jlc3RhdXJhbnSqAQwQASoIIgRmb29kKAA,y,UB2auy7TMYs;mv:[[-43.4870861,172.6509735],[-43.5490232,172.5976049]]",
"https://www.google.com/search?q=restaurant+in+christchurch&biw=1280&bih=614&hotel_occupancy=2&tbm=lcl&sxsrf=AOaemvI4qlEAr3btedb6PCx9U53RtXkI2Q%3A1635630947742&ei=Y799YaHfLOKZ4-EPoeqjmA4&oq=restaurant+in+christchurch&gs_l=psy-ab.3...0.0.0.614264.0.0.0.0.0.0.0.0..0.0....0...1c..64.psy-ab..0.0.0....0.7jAOI05vCjI#lrd=0x6d318bf82139caaf:0xf115cd7fe794cbcc,1,,,&rlfi=hd:;si:17372017086881385420,l,ChpyZXN0YXVyYW50IGluIGNocmlzdGNodXJjaEjh9auu-q6AgAhaKBAAGAAYAiIacmVzdGF1cmFudCBpbiBjaHJpc3RjaHVyY2gqBAgDEACSAQpyZXN0YXVyYW50qgEMEAEqCCIEZm9vZCgA,y,ZeJbBWd7wDg;mv:[[-43.4870861,172.6509735],[-43.5490232,172.5976049]]"
]
i = 0
binaryfirefox = r'C:\Program Files (x86)\Mozilla Firefox\firefox.exe'
binarychrome = r'C:\Program Files (x86)\Google\Chrome\Application\chrome.exe'
options = Options()
#cap = DesiredCapabilities().CHROME
#cap["marionette"] = True
#cap = DesiredCapabilities().FIREFOX
#options.binary = binaryfirefox
#driver = webdriver.Firefox(options=options, capabilities=cap, executable_path="E:\\Téléchargement\\geckodriver.exe")
options.binary_location = binarychrome
driver = webdriver.Chrome(options=options, executable_path="E:\\Téléchargement\\chromedriver.exe" )
# same reason as Firefox: I had to load a URL once
# and accept the cookies manually
#driver.get(link_df[0])
print(link_df)
for url in link_df:
    base_url = url  # i['Link']  # link_df['Link'][i]
    print(base_url)
    driver.get(base_url)
    WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "//div[./span[text()='Newest']]"))).click()
    title = driver.find_element_by_xpath("//div[@class='P5Bobd']").text
    address = driver.find_element_by_xpath("//div[@class='T6pBCe']").text
    overall_rating = driver.find_element_by_xpath("//div[@class='review-score-container']//span[@class='Aq14fc']").text
    total_reviews_text = driver.find_element_by_xpath(
        "//div[@class='review-score-container']//div//div//span//span[@class='z5jxId']").text
    num_reviews = int(total_reviews_text.split()[0])
    all_reviews = WebDriverWait(driver, 20).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '#reviewSort .gws-localreviews__google-review')))
    # time.sleep(2)
    total_reviews = 0
    while total_reviews < num_reviews:
        # reload to avoid the stale-element exception (trapping the scroll in
        # try/except would work too, but is more expensive)
        all_reviews = WebDriverWait(driver, 20).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, '#reviewSort .gws-localreviews__google-review')))
        driver.execute_script('arguments[0].scrollIntoView(true);', all_reviews[-1])
        total_reviews = len(all_reviews)
        print(total_reviews, len(all_reviews))
        time.sleep(1)
    driver.get('https://www.google.com')  # or driver.close() if no bugs
    time.sleep(3)

driver.close()
driver.quit()
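One further hedged refinement, not part of the answer above: if the review count in the header is ever higher than what Google will actually load, the while loop never terminates. Capping the number of scroll rounds that load nothing new avoids that; max_stalls below is a hypothetical tuning knob:
max_stalls = 5  # hypothetical: stop after 5 scroll rounds that load nothing new
stalls = 0
total_reviews = 0
while total_reviews < num_reviews and stalls < max_stalls:
    all_reviews = driver.find_elements(By.CSS_SELECTOR, '#reviewSort .gws-localreviews__google-review')
    if not all_reviews:
        break  # nothing rendered yet, or the selector changed
    if len(all_reviews) == total_reviews:
        stalls += 1  # no new reviews since the last round
    else:
        stalls = 0
        total_reviews = len(all_reviews)
    driver.execute_script('arguments[0].scrollIntoView(true);', all_reviews[-1])
    time.sleep(1)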
So I got this page (https://www.ssn.gob.ar/storage/registros/productores/productoresactivosfiltro.asp) from which I want to extract data.
You can get a person's data just by putting numbers in the "Matricula" field; that part is easy. But when the new page is generated and I want to get the data from a specific div, it gives me None, and checking the HTML it uses to display the data, it's the same as the page where I enter the numbers.
import os
import time
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
def clear(): return os.system("cls")
options = webdriver.ChromeOptions()
options.add_argument('--start-maximized')
options.add_argument('--disable-extensions')
driver_path = 'C:\\Users\\Menem Lo Hizo\\Downloads\\chromedriver_win32\\chromedriver.exe'
driver = webdriver.Chrome(driver_path, chrome_options=options)
driver.get('https://www.ssn.gob.ar/storage/registros/productores/productoresactivosfiltro.asp')
matricula = driver.find_element_by_id("matricula")
matricula.send_keys("2")
matricula.send_keys(Keys.RETURN)
try:
    div = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, "col-md-8 col-md-offset-2"))
    )
except:
    driver.quit()
clear()
print(div)
This is my code.
Logging one's network traffic when submitting the form reveals an HTTP POST request being made to productoresactivos.asp, the response of which is HTML. Simply imitate that request:
def get_columns():
    import requests
    from bs4 import BeautifulSoup as Soup

    url = "https://www.ssn.gob.ar/storage/registros/productores/productoresactivos.asp"
    payload = {
        "socpro": "PAS",
        "matricula": "2",
        "apellidorazonsocial": "",
        "docNro": "",
        "Submit": "Buscar"
    }
    response = requests.post(url, data=payload)
    response.raise_for_status()
    soup = Soup(response.content, "html.parser")
    for column in soup.select("div[class^=\"col-md-\"]"):
        yield " ".join(column.get_text().strip().split())

def main():
    for text in get_columns():
        print(text)
    return 0

if __name__ == "__main__":
    import sys
    sys.exit(main())
Output:
Página 1 de 1
Matrícula: 2
Nombre: CABELLO DE GADANO, MARIA CRISTINA
Documento: DNI - 5263977
CUIT: 27-05263977-3
Ramo: PATRIMONIALES Y VIDA
Domicilio: AV. CORDOBA 669 12º B
Localidad: CIUDAD AUTONOMA BS.AS.
Provincia CIUDAD AUTONOMA
Cod. Postal: 1054
Teléfonos: 4311-5860
E-mail:
Nro. de Resolución 17053
Fº de Resolución 06/01/1983
Nro. de Libro: 01
Nro. de Rubrica: 20395
Fº. de Rubrica: 21/08/1992
Nro. de Libro: 1
Fº. de Rubrica: 20396
Fº. de Rubrica: 21/08/1992
A few things:
You need explicit waits.
When you hit Enter on the first page, a new tab opens; you need to switch to that window.
Code :
driver.get("https://www.ssn.gob.ar/storage/registros/productores/productoresactivosfiltro.asp")
wait = WebDriverWait(driver, 10)
org_handles = driver.window_handles
wait.until(EC.element_to_be_clickable((By.ID, "matricula"))).send_keys("2" + Keys.RETURN)
new_handles = driver.window_handles
driver.switch_to.window(new_handles[1])
div = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, ".col-md-8.col-md-offset-2")))
print(div.text)
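A small hardening note: if the new tab is slow to open, window_handles may still contain only one handle right after the RETURN key is sent. Waiting on the handle count first is safer; EC.number_of_windows_to_be is a standard expected condition:
wait.until(EC.number_of_windows_to_be(2))
new_handle = [h for h in driver.window_handles if h not in org_handles][0]
driver.switch_to.window(new_handle)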
I would like to find a movie and get its ratings on Rotten Tomatoes, but I'm stuck because I don't know how to click on it in the search results section. I tried almost every XPath and class name, but every time I got an error message saying it couldn't find the element. I'm using Python Selenium.
My code:
import sys
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
def element(driver, by_x, html_element):
    try:
        element = WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((by_x, html_element))
        )
        return element
    except:
        print("Can not locate this element")

class rottenTomatoes:
    def __init__(self, film):
        options = webdriver.ChromeOptions()
        options.add_experimental_option("excludeSwitches", ["enable-logging"])
        self.driver = webdriver.Chrome(executable_path="./drivers/chromedriver", options=options)
        self.film = film
        self.driver.get("https://www.rottentomatoes.com/")

    def search(self):
        # search for a film
        search_bar = self.driver.find_element_by_class_name("search-text")
        search_bar.click()
        search_bar.send_keys(self.film, Keys.RETURN)
        # filter movies only
        element(self.driver, By.XPATH, "//*[@id='main-page-content']/div/section[1]/search-result-container/nav/ul/li[3]/span").click()
        # accept cookies
        self.driver.find_element_by_id("truste-consent-button").click()
        # click on film (THE PROBLEM)
        element(self.driver, By.CLASS_NAME, "media-col thumbnail-group").click()

rottentomatoes = rottenTomatoes("Shawshank")
rottentomatoes.search()
EDIT
Error message:
Can not locate this element
Traceback (most recent call last):
  File "d:\Programovanie\selenium\movie_info\rotten_tomat.py", line 38, in <module>
    rottentomatoes.search()
  File "d:\Programovanie\selenium\movie_info\rotten_tomat.py", line 35, in search
    element(self.driver, By.CLASS_NAME, "media-col thumbnail-group").click()
AttributeError: 'NoneType' object has no attribute 'click'
import sys
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
def element(driver, by_x, html_element):
    try:
        element = WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((by_x, html_element))
        )
        return element
    except:
        print("Can not locate this element")

class rottenTomatoes:
    def __init__(self, film):
        options = webdriver.ChromeOptions()
        options.add_experimental_option("excludeSwitches", ["enable-logging"])
        self.driver = webdriver.Chrome(options=options)
        self.film = film
        self.driver.get("https://www.rottentomatoes.com/")

    def search(self):
        # search for a film
        search_bar = self.driver.find_element_by_class_name("search-text")
        search_bar.click()
        search_bar.send_keys(self.film, Keys.RETURN)
        # filter movies only
        element(self.driver, By.XPATH, "//*[@id='main-page-content']/div/section[1]/search-result-container/nav/ul/li[3]/span").click()
        # accept cookies
        time.sleep(5)
        try:
            self.driver.find_element_by_id("truste-consent-button").click()
        except:
            pass
        # the result tile lives inside nested shadow roots, so use JavaScript
        ele = self.driver.execute_script(
            "return document.querySelector('search-result-container').shadowRoot.querySelector('[type=\"movie\"]').shadowRoot.querySelector('media-row').shadowRoot.querySelector('[class=\"media-row center\"]')")
        # click on film (THE PROBLEM)
        ele.click()
        time.sleep(10)

rottentomatoes = rottenTomatoes("Shawshank")
rottentomatoes.search()
I added the full code. The element was inside a shadowRoot, so you have to use JavaScript.
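For reference, the same traversal can be written without raw JavaScript on Selenium 4+, which exposes a shadow_root property on elements. A sketch under that assumption, mirroring the selector chain above (using driver for the WebDriver instance):
container = driver.find_element(By.CSS_SELECTOR, "search-result-container").shadow_root
movie_slot = container.find_element(By.CSS_SELECTOR, '[type="movie"]').shadow_root
row = movie_slot.find_element(By.CSS_SELECTOR, "media-row").shadow_root
row.find_element(By.CSS_SELECTOR, ".media-row.center").click()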
You can use requests:
import requests

search = "Shawshank"
choose_type = "movie"
url = f"https://www.rottentomatoes.com/napi/search/all?type={choose_type}&searchQuery={search}"
r = requests.get(url)
r_json = r.json()
print(r_json)
I'm trying to scrape [this][1] site; its URL doesn't change when the next page is clicked. So I used Selenium to click on the next page, but that doesn't help, as my driver keeps getting the old page even after the next page is clicked. Is there any other way to get to the next page and scrape it?
from selenium import webdriver
from selenium.webdriver.common.by import By  # needed for By.XPATH below
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
driver = webdriver.Safari()
store_pages = []
#10306 is the total number of pages.
for i in range(10306):
    Starting_url = 'site'
    driver.get(Starting_url)
    html = driver.page_source
    soup = BeautifulSoup(html, "lxml")
    print(store_pages.append(i))
    timeout = 20
    try:
        WebDriverWait(driver, timeout).until(EC.visibility_of_element_located((By.XPATH, "//*[@id='ctl00_SPWebPartManager1_g_d6877ff2_42a8_4804_8802_6d49230dae8a_lblDisclaimerMsg']")))
    except TimeoutException:
        print("Timed out waiting for page to load")
        driver.quit()
    nextpage_url = driver.find_element_by_name("ctl00$SPWebPartManager1$g_d6877ff2_42a8_4804_8802_6d49230dae8a$ctl00$imgbtnNext").click()
    timeout = 20
    wait = WebDriverWait(driver, 10).until(EC.text_to_be_present_in_element_value((By.CSS_SELECTOR, '#ctl00_SPWebPartManager1_g_d6877ff2_42a8_4804_8802_6d49230dae8a > div.act_search_results > div.act_search_header'), "206113 Record(s) | Page [2 of 10306]"))
    NGO_element = driver.find_element_by_class_name("faq-sub-content exempted-result")
    NGO_name = NGO_element.find_elements_by_tag_name("h1")
    NGO_name_pancard = driver.find_elements_by_class_name("pan-id")
    NGO_data = NGO_element.find_elements_by_tag_name("ul")
    NGO_sub_data = NGO_element.find_elements_by_tag_name("li")
    for i, p, t in zip(NGO_name, NGO_name_pancard, NGO_data):
        n_name = i.text.replace(p.text, '')
        n_data = t.text
        n_pan = p.text
        print("Name of NGO:", n_name, "Fields of NGO:", n_data, "Pancard number:", n_pan)
    nextpage_url = driver.find_element_by_name("ctl00$SPWebPartManager1$g_d6877ff2_42a8_4804_8802_6d49230dae8a$ctl00$imgbtnNext").click()
    #timeout = 2
You need to make sure that when you reach the next page, the content of the earlier page has become stale; otherwise you will get stale element errors or scrape the same content repeatedly. Try the approach below; it should get you there. The rest you can modify yourself.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Chrome()
wait = WebDriverWait(driver, 10)
driver.get("http://www.incometaxindia.gov.in/Pages/utilities/exempted-institutions.aspx")
while True:
    for elem in wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "[id^='arrowex']"))):
        print(elem.text)
    try:
        wait.until(EC.presence_of_element_located((By.ID, "ctl00_SPWebPartManager1_g_d6877ff2_42a8_4804_8802_6d49230dae8a_ctl00_imgbtnNext"))).click()
        wait.until(EC.staleness_of(elem))
    except:
        break
driver.quit()