Erro no web_scraping com dropdowns - Python - python

Hi!
I made a script to be able to generate a table, after meeting some
criteria within the dropdowns. However, there is no data drop-down
list, when you try to set a data, it will always return the same
value, regardless of the data I set for it in the script. I would like
it to return today from the data: 01/01/2020, until.
What is wrong in the script?
My script:
import time
from selenium import webdriver
from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.webdriver.support.select import Select
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
import shutil
import os
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import matplotlib
import csv
import xlrd
import openpyxl
driver = webdriver.Chrome(ChromeDriverManager().install())
driver = webdriver.Chrome()
driver.get('http://estatisticas.cetip.com.br/astec/series_v05/paginas/lum_web_v05_series_introducao.asp?str_Modulo=Ativo&int_Idioma=1&int_Titulo=6&int_NivelBD=2/')
driver.find_element_by_xpath('//*[#id="divContainerIframeBmf"]/div/dl/dd[2]/a').click()
time.sleep(3)
driver.switch_to.frame(driver.find_element(By.XPATH, '//iframe[#name="dados_corpo"]'))
driver.switch_to.frame(driver.find_element(By.XPATH, '//frame[#name="ativo"]'))
find_dp1 = driver.find_element(By.XPATH, '//select[#name="ativo"]')
select_find_dp1 = Select(find_dp1)
select_find_dp1.select_by_visible_text("CBIO - Crédito de descarbonização")
time.sleep(3)
driver.switch_to.default_content()
driver.switch_to.frame(driver.find_element(By.ID, 'dados_corpo'))
driver.switch_to.frame(driver.find_element(By.TAG_NAME, 'frameset').find_elements(By.TAG_NAME, 'frame')[1])
time.sleep(1)
informacoes = Select(driver.find_element(By.NAME, 'selectopcoes'))
informacoes.select_by_visible_text('Aposentadoria')
driver.switch_to.default_content()
driver.switch_to.frame(driver.find_element(By.ID, 'dados_corpo'))
driver.switch_to.frame(driver.find_element(By.TAG_NAME, 'frameset').find_elements(By.TAG_NAME, 'frame')[2])
time.sleep(2)
# Data Inicial
driver.find_element(By.NAME, 'DT_DIA_DE').send_keys('01')
driver.find_element(By.NAME, 'DT_MES_DE').send_keys('01')
driver.find_element(By.NAME, 'DT_ANO_DE').send_keys('2020')
# Data Final
driver.find_element(By.NAME, 'DT_DIA_ATE').send_keys('31')
driver.find_element(By.NAME, 'DT_MES_ATE').send_keys('12')
driver.find_element(By.NAME, 'DT_ANO_ATE').send_keys('2022')
driver.find_elements(By.CLASS_NAME, 'button')[1].click()
driver.switch_to.default_content()
driver.switch_to.frame(driver.find_element(By.TAG_NAME, 'iframe'))
time.sleep(1)
driver.find_element(By.CLASS_NAME, 'primary-text').find_element(By.TAG_NAME,'a').click()
time.sleep(4)
origem = 'C:\\Users\\prmatteo\\Downloads\\'
destino = os.path.join(origem, 'C:\\Users\\prmatteo\\OneDrive - COPERSUCAR S.A\\Área de Trabalho\\Arquivos Python\\renovabioaposentadoria.xls')
extensao = '.xls'
for file in os.listdir(origem):
if file.endswith(extensao):
shutil.move(os.path.join(origem,file), destino)

Related

webscrapping with returning empty list

from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
driver = webdriver.Chrome()
driver.get('https://www.flaconi.de/haare/kerastase/chronologiste/kerastase-chronologiste-bain-regenerant-haarshampoo.html?yoReviewsPage=2')
soup = BeautifulSoup(driver.page_source, 'lxml')
soup.find_all('div',class_='content-review')
# it always return empty list
# I want to scrap all of review contents from e.g "<div class="content-review" id="325243269"> Super Shampoo, meine Haare glänzt und sind sehr weich. 😍 </div>"
I try multiple ways but it always return empty list.
How should I do in order to solve this problem?
Yo need to wait until page will completely loaded:
driver.get(url)
timeout = 5
try:
element_present = EC.presence_of_element_located((By.CLASS_NAME, 'content-review'))
WebDriverWait(driver, timeout).until(element_present)
except TimeoutException:
print("Timed out waiting for page to load")
soup = BeautifulSoup(driver.page_source, 'lxml')
for review in soup.find_all('div', class_='content-review'):
print(review.getText().strip())
Add necessary libs:
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
OUTPUT:
Super Shampoo, meine Haare glänzt und sind sehr weich. 😍
Ich verwende dieses Produkt seit kurzem und ich bin begeistert, so ein pflegendes Shampoo habe ich noch nie gehabt. Er gibt meinen Haar Glanz, Geschmeidigkeit und Fülle. Ich kann es nur empfehlen.
Zufrieden
Tolles Shampoo
Sehr gut
Second option - find request with reviews and get data:
url = "https://staticw2.yotpo.com/batch/1eunvtBQrA7MbZslPu3gAznkZCUjvEeL5tp0uybR/80053469-250"
payload='methods=%5B%7B%22method%22%3A%22main_widget%22%2C%22params%22%3A%7B%22pid%22%3A%2280053469-250%22%2C%22page%22%3A2%2C%22order_metadata_fields%22%3A%7B%7D%2C%22widget_product_id%22%3A%2280053469-250%22%7D%7D%5D&app_key=1eunvtBQrA7MbZslPu3gAznkZCUjvEeL5tp0uybR'
response = requests.request("POST", url, data=payload)
soup = BeautifulSoup(response.json()[0]['result'], 'lxml')
for review in soup.find_all('div', class_='content-review'):
print(review.getText().strip())
With same output
Main issue here is that you need to close 'accept cookies' popup which is located in shadow DOM.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import ElementNotInteractableException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
driver = webdriver.Chrome()
# OR: driver = webdriver.Chrome(executable_path='D:\Downloads\chromedriver\chromedriver.exe')
url = 'https://www.flaconi.de/haare/kerastase/chronologiste/kerastase-chronologiste-bain-regenerant-haarshampoo.html?yoReviewsPage=2'
driver.get(url)
webdriverWaiter = WebDriverWait(driver, 20)
webdriverWaiter.until(EC.text_to_be_present_in_element_attribute((By.CSS_SELECTOR, "body"), "class" ,"overflowHidden"))
shadow_host = driver.find_element(By.CSS_SELECTOR, '#usercentrics-root')
shadow_root = shadow_host.shadow_root
accept_cookies_button_css = "button[data-testid='uc-accept-all-button']"
# wait for accept cookies button to appear
accept_cookies_button = None
while not accept_cookies_button:
try:
accept_cookies_button = shadow_root.find_element(By.CSS_SELECTOR, accept_cookies_button_css)
except NoSuchElementException:
time.sleep(1)
# click accept cookies button
clicked = False
while not clicked:
try:
accept_cookies_button.click()
clicked = True
except ElementNotInteractableException:
time.sleep(1)
content_review_css = ".content-review"
webdriverWaiter.until(EC.visibility_of_element_located((By.CSS_SELECTOR, content_review_css)))
reviews = driver.find_elements(By.CSS_SELECTOR, content_review_css)
for rev in reviews:
print(rev.text)
Popup image:

loop through two dropdowns and downloading files

I would like to download .csv reports for all states and all compliance periods from this web page.
In other words, the selenium script would select a state (for example, "DC") a reporting period (for example, "Jan 2021 - Dec 2021"), and then click "submit." THEN the script would export the results to a spreadsheet by clicking the image that says "CSV".
Ideally, the spreadsheet would do this for all states and all reporting periods. So at the end, my downloads folder would be full of spreadsheets.
I cannot, for the life of me, figure out how to get this to work!
This is what I have so far. There are no loops like I think there should be.
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import chromedriver_autoinstaller
import time
import glob
import os
chromedriver_autoinstaller.install()
chromeOptions = webdriver.ChromeOptions()
driver = webdriver.Chrome()
wait = WebDriverWait(driver, 20)
url = "https://gats.pjm-eis.com/GATS2/PublicReports/RPSRetiredCertificatesReportingYear"
driver.get(url)
driver.find_element(By.CSS_SELECTOR, "table:nth-child(4)").click()
driver.find_element(By.ID, "SelectedState0_B-1").click()
driver.find_element(By.ID, "SelectedState0_DDD_L_LBI5T0").click()
driver.find_element(By.ID, "ReportingYear0_B-1").click()
driver.find_element(By.ID, "ReportingYear0_DDD_L_LBI0T0").click()
driver.find_element(By.CSS_SELECTOR, ".dx-vam:nth-child(2)").click()
driver.find_element(By.ID, "CSV0Img").click()
Thank you very much for your help! I truly appreciate it.
Here is the Solution!
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import time
import glob
import os
chromeOptions = webdriver.ChromeOptions()
driver = webdriver.Chrome()
wait = WebDriverWait(driver, 20)
url = "https://gats.pjm-eis.com/GATS2/PublicReports/RPSRetiredCertificatesReportingYear"
state = 'DC' # Enter State Name Here
compliance_period = 'Jan 2020 - Dec 2020' # Enter Compliance Period Here
driver.get(url)
wait.until(EC.element_to_be_clickable((By.XPATH, '(//*[#class="dxEditors_edtDropDown_GATS2"])[1]'))).click() # Clicking on Dropdown Arrow Down Icon
wait.until(EC.element_to_be_clickable((By.XPATH, '//tr[#class="dxeListBoxItemRow_GATS2"]//td[text()="' + state + '"]'))).click()
wait.until(EC.element_to_be_clickable((By.XPATH, '(//*[#class="dxEditors_edtDropDown_GATS2"])[2]'))).click() # Clicking on Dropdown Arrow Down Icon
wait.until(EC.element_to_be_clickable((By.XPATH, '//tr[#class="dxeListBoxItemRow_GATS2"]//td[text()="' + compliance_period + '"]'))).click()
driver.find_element(By.XPATH, '//*[text()="Submit"]').click()
wait.until(EC.element_to_be_clickable((By.XPATH, '//*[#id="CSV0Img"]'))).click()
print("Successfully Downloaded!")
time.sleep(10)
driver.quit()
* Updated another Solution below as per the case mentioned in the comments where we've to make it loop through all the states and through all the compliance periods.
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import time
chromeOptions = webdriver.ChromeOptions()
driver = webdriver.Chrome()
wait = WebDriverWait(driver, 20)
url = "https://gats.pjm-eis.com/GATS2/PublicReports/RPSRetiredCertificatesReportingYear"
driver.get(url)
count_state = len(driver.find_elements(By.XPATH, '//table[#id="SelectedState0_DDD_L_LBT"]//tr'))
for i in range(1, count_state + 1):
wait.until(EC.element_to_be_clickable((By.XPATH, '(//*[#class="dxEditors_edtDropDown_GATS2"])[1]'))).click() # Clicking on Dropdown Arrow Down Icon
wait.until(EC.element_to_be_clickable((By.XPATH, '(//table[#id="SelectedState0_DDD_L_LBT"]//tr)[' + str(i) + ']'))).click()
state_name = driver.find_element(By.XPATH, '(//table[#id="SelectedState0_DDD_L_LBT"]//tr/td)[' + str(i) + ']').get_attribute("textContent")
count_period = len(driver.find_elements(By.XPATH, '//table[#id="ReportingYear0_DDD_L_LBT"]//tr'))
for j in range(1, count_period + 1):
wait.until(EC.element_to_be_clickable((By.XPATH, '(//*[#class="dxEditors_edtDropDown_GATS2"])[2]'))).click() # Clicking on Dropdown Arrow Down Icon
wait.until(EC.element_to_be_clickable((By.XPATH, '(//table[#id="ReportingYear0_DDD_L_LBT"]//tr)[' + str(j) + ']'))).click()
driver.find_element(By.XPATH, '//*[text()="Submit"]').click()
wait.until(EC.element_to_be_clickable((By.XPATH, '//*[#id="CSV0Img"]'))).click()
compliance_period_name = driver.find_element(By.XPATH, '(//table[#id="ReportingYear0_DDD_L_LBT"]//tr/td)[' + str(j) + ']').get_attribute("textContent")
print("Successfully Downloaded for State:", state_name, " and Compliance Period: ", str(compliance_period_name))
print("\n")
time.sleep(10)
driver.quit()

How to find 'Text' after node?

driver = webdriver.Chrome()
URL= ['https://makemyhomevn.com/collections/ghe-an-cafe/products/ghe-go-tron']
driver.get(URL)
sleep(1)
des = driver.find_element_by_xpath('//div[#class="product-item-description"]//strong/following sibling::text()[1]')
print(des)
I expect my result as 'Gỗ tự nhiên', I have tried many ways but couldn't get the text after 'Chất liệu:'.
You can take the entire span text using .get_attribute('innerText') and then use the split function from Python like below:
driver.maximize_window()
wait = WebDriverWait(driver, 20)
driver.get("https://makemyhomevn.com/collections/ghe-an-cafe/products/ghe-go-tron")
time.sleep(1)
entire_span = wait.until(EC.visibility_of_element_located((By.XPATH, "//strong[text()='Chất liệu:']/..")))
entire_span_splitted = entire_span.get_attribute('innerText').split(":")
#print(entire_span_splitted[0])
print(entire_span_splitted[1])
Imports:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
Output:
Gỗ tự nhiên.

Convert web_scraping table to csv - Python

**Hi!
With this script, I can generate a table on the site, after meeting
some criteria in the dropdowns.
I would like to return only this table in .csv format, so that power
bi can recognize and return this script as a table**
import time
from selenium import webdriver
from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.webdriver.support.select import Select
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
import shutil
import os
from webdriver_manager.chrome import ChromeDriverManager
driver = webdriver.Chrome(ChromeDriverManager().install())
driver = webdriver.Chrome()
driver.get('http://estatisticas.cetip.com.br/astec/series_v05/paginas/lum_web_v05_series_introducao.asp?str_Modulo=Ativo&int_Idioma=1&int_Titulo=6&int_NivelBD=2/')
driver.find_element_by_xpath('//*[#id="divContainerIframeBmf"]/div/dl/dd[2]/a').click()
time.sleep(3)
driver.switch_to.frame(driver.find_element(By.XPATH, '//iframe[#name="dados_corpo"]'))
driver.switch_to.frame(driver.find_element(By.XPATH, '//frame[#name="ativo"]'))
find_dp1 = driver.find_element(By.XPATH, '//select[#name="ativo"]')
select_find_dp1 = Select(find_dp1)
select_find_dp1.select_by_visible_text("CBIO - Crédito de descarbonização")
time.sleep(3)
driver.switch_to.default_content()
driver.switch_to.frame(driver.find_element(By.ID, 'dados_corpo'))
driver.switch_to.frame(driver.find_element(By.TAG_NAME, 'frameset').find_elements(By.TAG_NAME, 'frame')[1])
time.sleep(1)
informacoes = Select(driver.find_element(By.NAME, 'selectopcoes'))
informacoes.select_by_visible_text('Estoque')
driver.switch_to.default_content()
driver.switch_to.frame(driver.find_element(By.ID, 'dados_corpo'))
driver.switch_to.frame(driver.find_element(By.TAG_NAME, 'frameset').find_elements(By.TAG_NAME, 'frame')[2])
time.sleep(1)
# Data Inicial
driver.find_element(By.NAME, 'DT_DIA_DE').send_keys('10')
driver.find_element(By.NAME, 'DT_MES_DE').send_keys('10')
driver.find_element(By.NAME, 'DT_ANO_DE').send_keys('2021')
# Data Final
driver.find_element(By.NAME, 'DT_DIA_ATE').send_keys('30')
driver.find_element(By.NAME, 'DT_MES_ATE').send_keys('12')
driver.find_element(By.NAME, 'DT_ANO_ATE').send_keys('2022')
driver.find_elements(By.CLASS_NAME, 'button')[1].click()

Save an .xls file in a specific folder

I have a web_scraping script, which generates a download, resulting in
a .xls file
I can't save this .xls file in the folder I want.
I would like to save it to a local folder
I tried like this:
import time
from selenium import webdriver
from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.webdriver.support.select import Select
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
driver = webdriver.Chrome(ChromeDriverManager().install())
driver = webdriver.Chrome()
options = Options()
options.use_chromium = True
options.add_argument("--window-size=1920,1080")
options.add_argument("disable-gpu")
options.add_experimental_option('prefs', {
'download.default_directory': r'C:\Users\prmatteo\OneDrive - xxxS.A\Área de Trabalho\arquivos python'
})
driver.get('http://estatisticas.cetip.com.br/astec/series_v05/paginas/lum_web_v05_series_introducao.asp?str_Modulo=Ativo&int_Idioma=1&int_Titulo=6&int_NivelBD=2/')
driver.find_element_by_xpath('//*[#id="divContainerIframeBmf"]/div/dl/dd[2]/a').click()
time.sleep(3)
driver.switch_to.frame(driver.find_element(By.XPATH, '//iframe[#name="dados_corpo"]'))
driver.switch_to.frame(driver.find_element(By.XPATH, '//frame[#name="ativo"]'))
find_dp1 = driver.find_element(By.XPATH, '//select[#name="ativo"]')
select_find_dp1 = Select(find_dp1)
select_find_dp1.select_by_visible_text("CBIO - Crédito de descarbonização")
time.sleep(3)
driver.switch_to.default_content()
driver.switch_to.frame(driver.find_element(By.ID, 'dados_corpo'))
driver.switch_to.frame(driver.find_element(By.TAG_NAME, 'frameset').find_elements(By.TAG_NAME, 'frame')[1])
time.sleep(1)
informacoes = Select(driver.find_element(By.NAME, 'selectopcoes'))
informacoes.select_by_visible_text('Estoque')
driver.switch_to.default_content()
driver.switch_to.frame(driver.find_element(By.ID, 'dados_corpo'))
driver.switch_to.frame(driver.find_element(By.TAG_NAME, 'frameset').find_elements(By.TAG_NAME, 'frame')[2])
time.sleep(1)
# Data Inicial
driver.find_element(By.NAME, 'DT_DIA_DE').send_keys('10')
driver.find_element(By.NAME, 'DT_MES_DE').send_keys('10')
driver.find_element(By.NAME, 'DT_ANO_DE').send_keys('2021')
# Data Final
driver.find_element(By.NAME, 'DT_DIA_ATE').send_keys('30')
driver.find_element(By.NAME, 'DT_MES_ATE').send_keys('12')
driver.find_element(By.NAME, 'DT_ANO_ATE').send_keys('2022')
driver.find_elements(By.CLASS_NAME, 'button')[1].click()
driver.switch_to.default_content()
driver.switch_to.frame(driver.find_element(By.TAG_NAME, 'iframe'))
time.sleep(1)
driver.find_element(By.CLASS_NAME, 'primary-text').find_element(By.TAG_NAME,'a').click()
Utilizei este código para tentar direcioná-lo para uma pasta:
options.add_experimental_option('prefs', {
'download.default_directory': r'C:\Users\prmatteo\OneDrive - xxxS.A\Área de Trabalho\arquivos python'
})
Porém, não funcionou.
Copy or move a file from one directory to another:
import shutil
# source path
source = "/old_docs/file.xls"
# destination path
destination = "/new_docs/file.xls"
# copy file
shutil.copy(source, destination)
# move file
shutil.move(source, destination)
Move or copy file with filename as variable:
filename = "file.xls"
# source path
source = "/old_docs/"
# destination path
destination = "/new_docs/"
# copy file
shutil.copy(source + filename, destination + filename)
# move file
shutil.move(source + filename, destination + filename)

Categories