Convert web_scraping table to csv - Python - python

**Hi!
With this script, I can generate a table on the site after selecting
some criteria in the dropdowns.
I would like to return only this table in .csv format, so that Power
BI can recognize this script's output as a table.**
import time
from selenium import webdriver
from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.webdriver.support.select import Select
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
import shutil
import os
from webdriver_manager.chrome import ChromeDriverManager
# Drive the CETIP statistics site: pick the "CBIO" asset and the
# "Estoque" report, fill in a start/end date, and submit so the site
# renders the result table.
# Build the browser exactly once: the original created two Chrome
# instances and leaked the first. Selenium 4 resolves the chromedriver
# binary itself, so webdriver_manager is not required here.
driver = webdriver.Chrome()
driver.get('http://estatisticas.cetip.com.br/astec/series_v05/paginas/lum_web_v05_series_introducao.asp?str_Modulo=Ativo&int_Idioma=1&int_Titulo=6&int_NivelBD=2/')
# XPath attribute tests use '@' (not '#'), and find_element_by_xpath
# was removed in Selenium 4 in favour of find_element(By.XPATH, ...).
driver.find_element(By.XPATH, '//*[@id="divContainerIframeBmf"]/div/dl/dd[2]/a').click()
time.sleep(3)
# The report lives inside an iframe ("dados_corpo") that contains a frameset.
driver.switch_to.frame(driver.find_element(By.XPATH, '//iframe[@name="dados_corpo"]'))
driver.switch_to.frame(driver.find_element(By.XPATH, '//frame[@name="ativo"]'))
find_dp1 = driver.find_element(By.XPATH, '//select[@name="ativo"]')
select_find_dp1 = Select(find_dp1)
select_find_dp1.select_by_visible_text("CBIO - Crédito de descarbonização")
time.sleep(3)
# Choosing an asset reloads the frames, so re-enter the frameset from the top.
driver.switch_to.default_content()
driver.switch_to.frame(driver.find_element(By.ID, 'dados_corpo'))
driver.switch_to.frame(driver.find_element(By.TAG_NAME, 'frameset').find_elements(By.TAG_NAME, 'frame')[1])
time.sleep(1)
informacoes = Select(driver.find_element(By.NAME, 'selectopcoes'))
informacoes.select_by_visible_text('Estoque')
# Third frame holds the date filters and the submit button.
driver.switch_to.default_content()
driver.switch_to.frame(driver.find_element(By.ID, 'dados_corpo'))
driver.switch_to.frame(driver.find_element(By.TAG_NAME, 'frameset').find_elements(By.TAG_NAME, 'frame')[2])
time.sleep(1)
# Start date (day/month/year split across three inputs).
driver.find_element(By.NAME, 'DT_DIA_DE').send_keys('10')
driver.find_element(By.NAME, 'DT_MES_DE').send_keys('10')
driver.find_element(By.NAME, 'DT_ANO_DE').send_keys('2021')
# End date.
driver.find_element(By.NAME, 'DT_DIA_ATE').send_keys('30')
driver.find_element(By.NAME, 'DT_MES_ATE').send_keys('12')
driver.find_element(By.NAME, 'DT_ANO_ATE').send_keys('2022')
driver.find_elements(By.CLASS_NAME, 'button')[1].click()

Related

loop through two dropdowns and downloading files

I would like to download .csv reports for all states and all compliance periods from this web page.
In other words, the selenium script would select a state (for example, "DC") a reporting period (for example, "Jan 2021 - Dec 2021"), and then click "submit." THEN the script would export the results to a spreadsheet by clicking the image that says "CSV".
Ideally, the spreadsheet would do this for all states and all reporting periods. So at the end, my downloads folder would be full of spreadsheets.
I cannot, for the life of me, figure out how to get this to work!
This is what I have so far. There are no loops like I think there should be.
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import chromedriver_autoinstaller
import time
import glob
import os
chromedriver_autoinstaller.install()  # install a matching chromedriver and put it on PATH
chromeOptions = webdriver.ChromeOptions()  # NOTE(review): created but never passed to Chrome()
driver = webdriver.Chrome()
wait = WebDriverWait(driver, 20)  # NOTE(review): defined but never used below
url = "https://gats.pjm-eis.com/GATS2/PublicReports/RPSRetiredCertificatesReportingYear"
driver.get(url)
driver.find_element(By.CSS_SELECTOR, "table:nth-child(4)").click()
# The element ids below (e.g. SelectedState0_DDD_L_LBI5T0) encode one fixed
# state/period choice, which is why this version cannot loop over options.
driver.find_element(By.ID, "SelectedState0_B-1").click()  # open the state dropdown
driver.find_element(By.ID, "SelectedState0_DDD_L_LBI5T0").click()  # presumably a specific state entry -- verify index
driver.find_element(By.ID, "ReportingYear0_B-1").click()  # open the reporting-period dropdown
driver.find_element(By.ID, "ReportingYear0_DDD_L_LBI0T0").click()  # first period entry
driver.find_element(By.CSS_SELECTOR, ".dx-vam:nth-child(2)").click()  # NOTE(review): looks like the Submit control -- confirm
driver.find_element(By.ID, "CSV0Img").click()  # click the CSV export image
Thank you very much for your help! I truly appreciate it.
Here is the Solution!
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import time
import glob
import os
# Download the retired-certificates CSV for one state and one compliance
# period, then quit. Edit `state` / `compliance_period` to change the run.
chromeOptions = webdriver.ChromeOptions()
driver = webdriver.Chrome()
wait = WebDriverWait(driver, 20)
url = "https://gats.pjm-eis.com/GATS2/PublicReports/RPSRetiredCertificatesReportingYear"
state = 'DC' # Enter State Name Here
compliance_period = 'Jan 2020 - Dec 2020' # Enter Compliance Period Here
driver.get(url)
# XPath attribute tests require '@'; '[#class=...]' is an invalid selector
# and makes Selenium raise InvalidSelectorException.
wait.until(EC.element_to_be_clickable((By.XPATH, '(//*[@class="dxEditors_edtDropDown_GATS2"])[1]'))).click() # Clicking on Dropdown Arrow Down Icon
wait.until(EC.element_to_be_clickable((By.XPATH, '//tr[@class="dxeListBoxItemRow_GATS2"]//td[text()="' + state + '"]'))).click()
wait.until(EC.element_to_be_clickable((By.XPATH, '(//*[@class="dxEditors_edtDropDown_GATS2"])[2]'))).click() # Clicking on Dropdown Arrow Down Icon
wait.until(EC.element_to_be_clickable((By.XPATH, '//tr[@class="dxeListBoxItemRow_GATS2"]//td[text()="' + compliance_period + '"]'))).click()
driver.find_element(By.XPATH, '//*[text()="Submit"]').click()
wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="CSV0Img"]'))).click()
print("Successfully Downloaded!")
time.sleep(10)  # give the browser time to finish writing the download
driver.quit()
* Updated another Solution below as per the case mentioned in the comments where we've to make it loop through all the states and through all the compliance periods.
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import time
# Loop over every state and every compliance period, downloading the CSV
# for each combination. (The scraped original had lost its loop
# indentation and used the invalid '#id'/'#class' XPath syntax.)
chromeOptions = webdriver.ChromeOptions()
driver = webdriver.Chrome()
wait = WebDriverWait(driver, 20)
url = "https://gats.pjm-eis.com/GATS2/PublicReports/RPSRetiredCertificatesReportingYear"
driver.get(url)
count_state = len(driver.find_elements(By.XPATH, '//table[@id="SelectedState0_DDD_L_LBT"]//tr'))
for i in range(1, count_state + 1):
    # Open the state dropdown and pick the i-th row (XPath is 1-indexed).
    wait.until(EC.element_to_be_clickable((By.XPATH, '(//*[@class="dxEditors_edtDropDown_GATS2"])[1]'))).click() # Clicking on Dropdown Arrow Down Icon
    wait.until(EC.element_to_be_clickable((By.XPATH, '(//table[@id="SelectedState0_DDD_L_LBT"]//tr)[' + str(i) + ']'))).click()
    state_name = driver.find_element(By.XPATH, '(//table[@id="SelectedState0_DDD_L_LBT"]//tr/td)[' + str(i) + ']').get_attribute("textContent")
    count_period = len(driver.find_elements(By.XPATH, '//table[@id="ReportingYear0_DDD_L_LBT"]//tr'))
    for j in range(1, count_period + 1):
        # Open the period dropdown, pick the j-th row, submit, export CSV.
        wait.until(EC.element_to_be_clickable((By.XPATH, '(//*[@class="dxEditors_edtDropDown_GATS2"])[2]'))).click() # Clicking on Dropdown Arrow Down Icon
        wait.until(EC.element_to_be_clickable((By.XPATH, '(//table[@id="ReportingYear0_DDD_L_LBT"]//tr)[' + str(j) + ']'))).click()
        driver.find_element(By.XPATH, '//*[text()="Submit"]').click()
        wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="CSV0Img"]'))).click()
        compliance_period_name = driver.find_element(By.XPATH, '(//table[@id="ReportingYear0_DDD_L_LBT"]//tr/td)[' + str(j) + ']').get_attribute("textContent")
        print("Successfully Downloaded for State:", state_name, " and Compliance Period: ", str(compliance_period_name))
    print("\n")
time.sleep(10)  # let the last download finish before closing the browser
driver.quit()

How to find 'Text' after node?

driver = webdriver.Chrome()
# NOTE(review): URL is a one-element list; driver.get() expects a string
# (should be URL[0], or just make URL a plain string).
URL= ['https://makemyhomevn.com/collections/ghe-an-cafe/products/ghe-go-tron']
driver.get(URL)
# NOTE(review): `sleep` is not imported in this snippet
# (needs `from time import sleep` or `time.sleep`).
sleep(1)
# NOTE(review): three problems with this locator:
#   1. '#class' is not valid XPath -- attribute tests use '@class'.
#   2. 'following sibling' is missing its hyphen ('following-sibling').
#   3. Even when fixed, Selenium's find_element can only return element
#      nodes, never bare text() nodes, so this XPath cannot work; grab the
#      parent element's innerText and split it instead (see answer below).
# Also: find_element_by_xpath was removed in Selenium 4.
des = driver.find_element_by_xpath('//div[#class="product-item-description"]//strong/following sibling::text()[1]')
print(des)
I expect my result as 'Gỗ tự nhiên', I have tried many ways but couldn't get the text after 'Chất liệu:'.
You can take the entire span text using .get_attribute('innerText') and then use the split function from Python like below:
# Locate the <strong>Chất liệu:</strong> label, step up to the parent span
# that holds the whole "label: value" text, and split off the value.
driver.maximize_window()
wait = WebDriverWait(driver, 20)
driver.get("https://makemyhomevn.com/collections/ghe-an-cafe/products/ghe-go-tron")
time.sleep(1)
# Selenium cannot return bare text nodes, so '/..' selects the element
# wrapping both the label and the trailing text we want.
entire_span = wait.until(EC.visibility_of_element_located((By.XPATH, "//strong[text()='Chất liệu:']/..")))
# Split on the first ':' only (maxsplit=1) so a colon inside the value
# cannot truncate it, and strip the leading space after the label.
material = entire_span.get_attribute('innerText').split(":", 1)[1].strip()
print(material)
Imports:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
Output:
Gỗ tự nhiên.

Erro no web_scraping com dropdowns - Python

Hi!
I made a script to generate a table after selecting some criteria in
the dropdowns. However, the date fields are not drop-down lists: when I
try to set a date, the page always returns the same value, regardless
of the date I set in the script. I would like it to return data from
01/01/2020 until today.
What is wrong in the script?
My script:
import time
from selenium import webdriver
from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.webdriver.support.select import Select
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
import shutil
import os
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import matplotlib
import csv
import xlrd
import openpyxl
# Drive the CETIP site: choose the CBIO asset and the "Aposentadoria"
# report for 01/01/2020-31/12/2022, submit, download the .xls result and
# move it from the Downloads folder to the working directory.
# One browser only -- the original created a second Chrome() and leaked
# the first; Selenium 4 manages the chromedriver binary itself.
driver = webdriver.Chrome()
driver.get('http://estatisticas.cetip.com.br/astec/series_v05/paginas/lum_web_v05_series_introducao.asp?str_Modulo=Ativo&int_Idioma=1&int_Titulo=6&int_NivelBD=2/')
# XPath attribute tests use '@' (not '#'); find_element_by_xpath was
# removed in Selenium 4.
driver.find_element(By.XPATH, '//*[@id="divContainerIframeBmf"]/div/dl/dd[2]/a').click()
time.sleep(3)
# The report lives inside an iframe ("dados_corpo") containing a frameset.
driver.switch_to.frame(driver.find_element(By.XPATH, '//iframe[@name="dados_corpo"]'))
driver.switch_to.frame(driver.find_element(By.XPATH, '//frame[@name="ativo"]'))
find_dp1 = driver.find_element(By.XPATH, '//select[@name="ativo"]')
select_find_dp1 = Select(find_dp1)
select_find_dp1.select_by_visible_text("CBIO - Crédito de descarbonização")
time.sleep(3)
# Selecting the asset reloads the frames; navigate back in from the top.
driver.switch_to.default_content()
driver.switch_to.frame(driver.find_element(By.ID, 'dados_corpo'))
driver.switch_to.frame(driver.find_element(By.TAG_NAME, 'frameset').find_elements(By.TAG_NAME, 'frame')[1])
time.sleep(1)
informacoes = Select(driver.find_element(By.NAME, 'selectopcoes'))
informacoes.select_by_visible_text('Aposentadoria')
# Third frame holds the date inputs and the submit button.
driver.switch_to.default_content()
driver.switch_to.frame(driver.find_element(By.ID, 'dados_corpo'))
driver.switch_to.frame(driver.find_element(By.TAG_NAME, 'frameset').find_elements(By.TAG_NAME, 'frame')[2])
time.sleep(2)
# Start date (day/month/year in three separate inputs).
driver.find_element(By.NAME, 'DT_DIA_DE').send_keys('01')
driver.find_element(By.NAME, 'DT_MES_DE').send_keys('01')
driver.find_element(By.NAME, 'DT_ANO_DE').send_keys('2020')
# End date.
driver.find_element(By.NAME, 'DT_DIA_ATE').send_keys('31')
driver.find_element(By.NAME, 'DT_MES_ATE').send_keys('12')
driver.find_element(By.NAME, 'DT_ANO_ATE').send_keys('2022')
driver.find_elements(By.CLASS_NAME, 'button')[1].click()
# The result opens in a new iframe; click its download link.
driver.switch_to.default_content()
driver.switch_to.frame(driver.find_element(By.TAG_NAME, 'iframe'))
time.sleep(1)
driver.find_element(By.CLASS_NAME, 'primary-text').find_element(By.TAG_NAME, 'a').click()
time.sleep(4)  # wait for the .xls download to land in the Downloads folder
origem = 'C:\\Users\\prmatteo\\Downloads\\'
# NOTE: os.path.join returns the second argument unchanged when it is
# absolute, so `destino` is just the target path -- intentional here.
destino = os.path.join(origem, 'C:\\Users\\prmatteo\\OneDrive - COPERSUCAR S.A\\Área de Trabalho\\Arquivos Python\\renovabioaposentadoria.xls')
extensao = '.xls'
# Move every freshly downloaded .xls into the destination file.
for file in os.listdir(origem):
    if file.endswith(extensao):
        shutil.move(os.path.join(origem, file), destino)

scraping sports data using requests or selenium

I am trying to scrape data from this page:
https://www.sofascore.com/betting-tips-today
I created this code, but it doesn't work:
import requests
url = "https://www.sofascore.com/betting-tips-today"
# NOTE(review): this URL serves an HTML page whose content is rendered
# client-side, not a JSON document, so Response.json() raises a JSON
# decode error here; the site may also reject the default requests
# User-Agent -- confirm with r.status_code / r.headers["Content-Type"].
r = requests.get(url).json()
print(r)
I tried Selenium, but it doesn't work either:
from bs4 import BeautifulSoup
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Load the betting-tips page in Chrome, wait for the client-side render,
# then parse the resulting DOM with BeautifulSoup.
options = Options()
# options.add_argument("--headless") #headless
options.add_argument('--no-sandbox')
options.add_argument('--ignore-certificate-errors')
options.add_argument('--incognito')
# Selenium 4 removed the executable_path keyword; Selenium Manager now
# resolves the chromedriver binary automatically.
driver = webdriver.Chrome(options=options)
u = "https://www.sofascore.com/betting-tips-today"
driver.get(u)
WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.CSS_SELECTOR, "div[class^='Content__PageContainer-sc-']")))
time.sleep(20)  # extra settle time for the JS-rendered odds tables
# find_element_by_xpath was removed in Selenium 4.
elem = driver.find_element(By.XPATH, "//*")
source_code = elem.get_attribute("innerHTML")
soup = BeautifulSoup(driver.page_source, 'html.parser')
# print(len(soup.find_all('h2')))
# print(len(soup.select('.ivqpwB')))
# bs4 deprecated the text= keyword in favour of string=.
parent_soup = soup.find('h2', string=("Odds")).parent.parent.select('div:nth-of-type(2) > div')
print(len(parent_soup))
for i in parent_soup:
    print(i)
Any idea how I can scrape data inside this page ?
You could try like this:
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Headless-Chrome scrape of the betting-tips table: wait for the page,
# grab the table element, then pull per-row cell texts with BeautifulSoup.
options = Options()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--ignore-certificate-errors")
options.add_argument("--incognito")
# Selenium 4 removed the executable_path keyword; Selenium Manager now
# locates chromedriver itself.
driver = webdriver.Chrome(options=options)
u = "https://www.sofascore.com/betting-tips-today"
driver.get(u)
# Get the page
WebDriverWait(driver, 20).until(
    EC.visibility_of_element_located(
        (By.CSS_SELECTOR, "div[class^='Content__PageContainer-sc-']")
    )
)
time.sleep(20)  # allow the JS-rendered odds table to finish loading
# Get the table (find_element_by_xpath was removed in Selenium 4)
elem = driver.find_element(
    By.XPATH,
    '//*[@id="__next"]/main/div/div[2]/div/div[1]/div[2]/table'
)
source_code = elem.get_attribute("innerHTML")
# Parse the html
soup = BeautifulSoup(driver.page_source, "html.parser")
# Get the interesting data for each row (first 4 rows are headers/filters)
data = []
for row in soup.find_all("tr")[4:]:
    infos = []
    for item in row.find_all("td"):
        for label in item.find_all("div"):
            infos.append(label.text)
        infos.append(item.text)
    # Keep only the informative columns: team names, odds, result stats.
    data.append(infos[3:5] + infos[13:14] + infos[16:17] + infos[20:])
print(data)
# Outputs
[['La Guaira', 'América Cali', '3.25', '3.20', '13.25X3.2022.25',
'1', '', '1', '31%', 'wins 57%'], ['Hapoel Holon', 'Burgos', '2.75',
'1.40', '1', '36%', 'wins 60%'] ...]
You now have a list (data) of lists (one list per row). You can make a dataframe of it with Pandas and do some more work.

How can i click specific category element of list in selenium python

from selenium import webdriver
import time
from selenium.common.exceptions import NoSuchElementException
# Count the product-name spans on Myntra's men's t-shirts listing.
# Local import: this snippet's header did not bring By into scope.
from selenium.webdriver.common.by import By

# Selenium 4 removed the positional driver-path argument and the
# find_elements_by_xpath helper; Selenium Manager resolves the driver.
driver = webdriver.Chrome()
driver.maximize_window()
driver.implicitly_wait(10)
driver.get("https://www.myntra.com/men-tshirts")
chali = driver.find_elements(By.XPATH, '//li//a[1]//div[2]//div[1]//span')
dak = driver.find_elements(By.XPATH, '//li//a[1]//div[2]//div[1]//span[1]')
sub = len(chali)
da = len(dak)
print(da)
print(sub)  # original was missing the closing parenthesis (SyntaxError)
Updated solution:
# Wait for the product spans, then print and click each one.
# (The scraped original used Python 2 `print x` statements -- a
# SyntaxError under Python 3 -- and had lost its loop indentation.)
driver.get('https://www.myntra.com/men-tshirts')
wait = WebDriverWait(driver, 10)
chali = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//li[*]//a[1]//div[2]//div[1]//span[1]")))
print(len(chali))
for element in chali:
    print(element.text)
    # NOTE(review): clicking may navigate and make the remaining elements
    # stale -- confirm against the page's behaviour.
    element.click()
dak = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//li//a[1]//div[2]//div[1]//span")))
print(len(dak))
for element0 in dak:
    print(element0.text)
    element0.click()
Note : please add below imports to your solution
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
Updated Solution:
//li[*]//a[1]//div[2]//div[1]//span[not(#class='product-strike')][not(#class='product-discountedPrice')][not(#class='product-discountPercentage')][last()]//span[1]
and
//li[*]//a[1]//div[2]//div[1]//span
Output:

Categories