Save an .xls file in a specific folder - python

I have a web_scraping script, which generates a download, resulting in
a .xls file
I can't save this .xls file in the folder I want.
I would like to save it to a local folder
I tried like this:
import time
from selenium import webdriver
from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.webdriver.support.select import Select
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
driver = webdriver.Chrome(ChromeDriverManager().install())
driver = webdriver.Chrome()
options = Options()
options.use_chromium = True
options.add_argument("--window-size=1920,1080")
options.add_argument("disable-gpu")
options.add_experimental_option('prefs', {
'download.default_directory': r'C:\Users\prmatteo\OneDrive - xxxS.A\Área de Trabalho\arquivos python'
})
driver.get('http://estatisticas.cetip.com.br/astec/series_v05/paginas/lum_web_v05_series_introducao.asp?str_Modulo=Ativo&int_Idioma=1&int_Titulo=6&int_NivelBD=2/')
driver.find_element_by_xpath('//*[#id="divContainerIframeBmf"]/div/dl/dd[2]/a').click()
time.sleep(3)
driver.switch_to.frame(driver.find_element(By.XPATH, '//iframe[#name="dados_corpo"]'))
driver.switch_to.frame(driver.find_element(By.XPATH, '//frame[#name="ativo"]'))
find_dp1 = driver.find_element(By.XPATH, '//select[#name="ativo"]')
select_find_dp1 = Select(find_dp1)
select_find_dp1.select_by_visible_text("CBIO - Crédito de descarbonização")
time.sleep(3)
driver.switch_to.default_content()
driver.switch_to.frame(driver.find_element(By.ID, 'dados_corpo'))
driver.switch_to.frame(driver.find_element(By.TAG_NAME, 'frameset').find_elements(By.TAG_NAME, 'frame')[1])
time.sleep(1)
informacoes = Select(driver.find_element(By.NAME, 'selectopcoes'))
informacoes.select_by_visible_text('Estoque')
driver.switch_to.default_content()
driver.switch_to.frame(driver.find_element(By.ID, 'dados_corpo'))
driver.switch_to.frame(driver.find_element(By.TAG_NAME, 'frameset').find_elements(By.TAG_NAME, 'frame')[2])
time.sleep(1)
# Data Inicial
driver.find_element(By.NAME, 'DT_DIA_DE').send_keys('10')
driver.find_element(By.NAME, 'DT_MES_DE').send_keys('10')
driver.find_element(By.NAME, 'DT_ANO_DE').send_keys('2021')
# Data Final
driver.find_element(By.NAME, 'DT_DIA_ATE').send_keys('30')
driver.find_element(By.NAME, 'DT_MES_ATE').send_keys('12')
driver.find_element(By.NAME, 'DT_ANO_ATE').send_keys('2022')
driver.find_elements(By.CLASS_NAME, 'button')[1].click()
driver.switch_to.default_content()
driver.switch_to.frame(driver.find_element(By.TAG_NAME, 'iframe'))
time.sleep(1)
driver.find_element(By.CLASS_NAME, 'primary-text').find_element(By.TAG_NAME,'a').click()
Utilizei este código para tentar direcioná-lo para uma pasta:
options.add_experimental_option('prefs', {
'download.default_directory': r'C:\Users\prmatteo\OneDrive - xxxS.A\Área de Trabalho\arquivos python'
})
Porém, não funcionou.

Copy or move a file from one directory to another:
import shutil
# source path
source = "/old_docs/file.xls"
# destination path
destination = "/new_docs/file.xls"
# copy file
shutil.copy(source, destination)
# move file
shutil.move(source, destination)
Move or copy file with filename as variable:
filename = "file.xls"
# source path
source = "/old_docs/"
# destination path
destination = "/new_docs/"
# copy file
shutil.copy(source + filename, destination + filename)
# move file
shutil.move(source + filename, destination + filename)

Related

Go to site use direct link could not get element

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import chromedriver_autoinstaller
from selenium.webdriver.common.keys import Keys
import subprocess
import time
from selenium.webdriver.common.by import By
import functionmodules
### PATH
CHROME_DRIVER = 'C:\\Users\SANGHYUN\Downloads\chromedriver_win32\chromedriver.exe'
url = 'https://cafe.naver.com/reply14/1'
#url = 'https://cafe.naver.com/reply14'
CHROME_PATH = 'C:\\Program Files\Google\Chrome\Application\chrome.exe'
searchpath = url
subprocess.Popen(r'C:\\Program Files\Google\Chrome\Application\chrome.exe --remote-debugging-port=9222 --user-data-dir="C:\chrometemp"')
option = Options()
option.add_experimental_option("debuggerAddress", "127.0.0.1:9222")
chrome_ver = chromedriver_autoinstaller.get_chrome_version().split('.')[0]
try:
driver = webdriver.Chrome(f'./{chrome_ver}/chromedriver.exe', options=option)
except:
chromedriver_autoinstaller.install(True)
driver = webdriver.Chrome(f'./{chrome_ver}/chromedriver.exe', options=option)
driver.get(searchpath)
def CallGoToArticleStep():
# go to main
driver.switch_to.parent_frame()
driver.find_element(By.XPATH, '//*[#id="menuLink1"]').click()
driver.switch_to.frame('cafe_main')
# click article3
time.sleep(2)
firstarticle = '//*[#id="main-area"]/div[4]/table/tbody/tr[2]/td[1]/div[3]/div/a[1]'
element3 = driver.find_element(By.XPATH, firstarticle)
element3.send_keys('\n')
#CallGoToArticleStep()
# write reply, send reply
for i in range (1):
time.sleep(4)
print (i)
replyString = '//*[#id="app"]/div/div/div[2]/div[2]/div[4]/div[2]/div[1]/textarea'
replyElement = driver.find_element(By.XPATH, replyString)
replyElement.send_keys('whisky life')
replyClickString = '//*[#id="app"]/div/div/div[2]/div[2]/div[4]/div[2]/div[2]/div[2]/a'
replyClickElement = driver.find_element(By.XPATH, replyClickString)
replyClickElement.click()
time.sleep(1000)`
In this source call CallGoToArticleStep() can get replyElement, not call then can't get replyElement but, browser element equal.
is there way to not call CallGoToArticleStep function and get replyElement?

Convert web_scraping table to csv - Python

**Hi!
With this script, I can generate a table on the site, after meeting
some criteria in the dropdowns.
I would like to return only this table in .csv format, so that power
bi can recognize and return this script as a table**
import time
from selenium import webdriver
from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.webdriver.support.select import Select
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
import shutil
import os
from webdriver_manager.chrome import ChromeDriverManager
driver = webdriver.Chrome(ChromeDriverManager().install())
driver = webdriver.Chrome()
driver.get('http://estatisticas.cetip.com.br/astec/series_v05/paginas/lum_web_v05_series_introducao.asp?str_Modulo=Ativo&int_Idioma=1&int_Titulo=6&int_NivelBD=2/')
driver.find_element_by_xpath('//*[#id="divContainerIframeBmf"]/div/dl/dd[2]/a').click()
time.sleep(3)
driver.switch_to.frame(driver.find_element(By.XPATH, '//iframe[#name="dados_corpo"]'))
driver.switch_to.frame(driver.find_element(By.XPATH, '//frame[#name="ativo"]'))
find_dp1 = driver.find_element(By.XPATH, '//select[#name="ativo"]')
select_find_dp1 = Select(find_dp1)
select_find_dp1.select_by_visible_text("CBIO - Crédito de descarbonização")
time.sleep(3)
driver.switch_to.default_content()
driver.switch_to.frame(driver.find_element(By.ID, 'dados_corpo'))
driver.switch_to.frame(driver.find_element(By.TAG_NAME, 'frameset').find_elements(By.TAG_NAME, 'frame')[1])
time.sleep(1)
informacoes = Select(driver.find_element(By.NAME, 'selectopcoes'))
informacoes.select_by_visible_text('Estoque')
driver.switch_to.default_content()
driver.switch_to.frame(driver.find_element(By.ID, 'dados_corpo'))
driver.switch_to.frame(driver.find_element(By.TAG_NAME, 'frameset').find_elements(By.TAG_NAME, 'frame')[2])
time.sleep(1)
# Data Inicial
driver.find_element(By.NAME, 'DT_DIA_DE').send_keys('10')
driver.find_element(By.NAME, 'DT_MES_DE').send_keys('10')
driver.find_element(By.NAME, 'DT_ANO_DE').send_keys('2021')
# Data Final
driver.find_element(By.NAME, 'DT_DIA_ATE').send_keys('30')
driver.find_element(By.NAME, 'DT_MES_ATE').send_keys('12')
driver.find_element(By.NAME, 'DT_ANO_ATE').send_keys('2022')
driver.find_elements(By.CLASS_NAME, 'button')[1].click()

Erro no web_scraping com dropdowns - Python

Hi!
I made a script to be able to generate a table, after meeting some
criteria within the dropdowns. However, there is no data drop-down
list, when you try to set a data, it will always return the same
value, regardless of the data I set for it in the script. I would like
it to return today from the data: 01/01/2020, until.
What is wrong in the script?
My script:
import time
from selenium import webdriver
from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.webdriver.support.select import Select
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
import shutil
import os
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import matplotlib
import csv
import xlrd
import openpyxl
driver = webdriver.Chrome(ChromeDriverManager().install())
driver = webdriver.Chrome()
driver.get('http://estatisticas.cetip.com.br/astec/series_v05/paginas/lum_web_v05_series_introducao.asp?str_Modulo=Ativo&int_Idioma=1&int_Titulo=6&int_NivelBD=2/')
driver.find_element_by_xpath('//*[#id="divContainerIframeBmf"]/div/dl/dd[2]/a').click()
time.sleep(3)
driver.switch_to.frame(driver.find_element(By.XPATH, '//iframe[#name="dados_corpo"]'))
driver.switch_to.frame(driver.find_element(By.XPATH, '//frame[#name="ativo"]'))
find_dp1 = driver.find_element(By.XPATH, '//select[#name="ativo"]')
select_find_dp1 = Select(find_dp1)
select_find_dp1.select_by_visible_text("CBIO - Crédito de descarbonização")
time.sleep(3)
driver.switch_to.default_content()
driver.switch_to.frame(driver.find_element(By.ID, 'dados_corpo'))
driver.switch_to.frame(driver.find_element(By.TAG_NAME, 'frameset').find_elements(By.TAG_NAME, 'frame')[1])
time.sleep(1)
informacoes = Select(driver.find_element(By.NAME, 'selectopcoes'))
informacoes.select_by_visible_text('Aposentadoria')
driver.switch_to.default_content()
driver.switch_to.frame(driver.find_element(By.ID, 'dados_corpo'))
driver.switch_to.frame(driver.find_element(By.TAG_NAME, 'frameset').find_elements(By.TAG_NAME, 'frame')[2])
time.sleep(2)
# Data Inicial
driver.find_element(By.NAME, 'DT_DIA_DE').send_keys('01')
driver.find_element(By.NAME, 'DT_MES_DE').send_keys('01')
driver.find_element(By.NAME, 'DT_ANO_DE').send_keys('2020')
# Data Final
driver.find_element(By.NAME, 'DT_DIA_ATE').send_keys('31')
driver.find_element(By.NAME, 'DT_MES_ATE').send_keys('12')
driver.find_element(By.NAME, 'DT_ANO_ATE').send_keys('2022')
driver.find_elements(By.CLASS_NAME, 'button')[1].click()
driver.switch_to.default_content()
driver.switch_to.frame(driver.find_element(By.TAG_NAME, 'iframe'))
time.sleep(1)
driver.find_element(By.CLASS_NAME, 'primary-text').find_element(By.TAG_NAME,'a').click()
time.sleep(4)
origem = 'C:\\Users\\prmatteo\\Downloads\\'
destino = os.path.join(origem, 'C:\\Users\\prmatteo\\OneDrive - COPERSUCAR S.A\\Área de Trabalho\\Arquivos Python\\renovabioaposentadoria.xls')
extensao = '.xls'
for file in os.listdir(origem):
if file.endswith(extensao):
shutil.move(os.path.join(origem,file), destino)

Reading weblink from dataframe throws "stale element reference: element is not attached to the page document" error

I got a dataframe that contains links to google reviews of two restaurants. I wanted to load all reviews of two restaurants (one by one) into the browser and then save them into a new data frame. I wrote a script that reads and load all reviews into the browser as follow:
from selenium import webdriver
import pandas as pd
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
import time
link_df = Link
0 https://www.google.com/search?q=restaurant+in+christchurch&biw=1280&bih=614&hotel_occupancy=2&tbm=lcl&sxsrf=AOaemvI4qlEAr3btedb6PCx9U53RtXkI2Q%3A1635630947742&ei=Y799YaHfLOKZ4-EPoeqjmA4&oq=restaurant+in+christchurch&gs_l=psy-ab.3...0.0.0.614264.0.0.0.0.0.0.0.0..0.0....0...1c..64.psy-ab..0.0.0....0.7jAOI05vCjI#lrd=0x6d318a3aa3041455:0x5f83f4fae76d8656,1,,,&rlfi=hd:;si:6882614014013965910,l,ChpyZXN0YXVyYW50IGluIGNocmlzdGNodXJjaEiglZKhm6qAgAhaKBAAGAAYAiIacmVzdGF1cmFudCBpbiBjaHJpc3RjaHVyY2gqBAgDEACSARJidXJtZXNlX3Jlc3RhdXJhbnSqAQwQASoIIgRmb29kKAA,y,UB2auy7TMYs;mv:[[-43.4870861,172.6509735],[-43.5490232,172.5976049]]
1 https://www.google.com/search?q=restaurant+in+christchurch&biw=1280&bih=614&hotel_occupancy=2&tbm=lcl&sxsrf=AOaemvI4qlEAr3btedb6PCx9U53RtXkI2Q%3A1635630947742&ei=Y799YaHfLOKZ4-EPoeqjmA4&oq=restaurant+in+christchurch&gs_l=psy-ab.3...0.0.0.614264.0.0.0.0.0.0.0.0..0.0....0...1c..64.psy-ab..0.0.0....0.7jAOI05vCjI#lrd=0x6d318bf82139caaf:0xf115cd7fe794cbcc,1,,,&rlfi=hd:;si:17372017086881385420,l,ChpyZXN0YXVyYW50IGluIGNocmlzdGNodXJjaEjh9auu-q6AgAhaKBAAGAAYAiIacmVzdGF1cmFudCBpbiBjaHJpc3RjaHVyY2gqBAgDEACSAQpyZXN0YXVyYW50qgEMEAEqCCIEZm9vZCgA,y,ZeJbBWd7wDg;mv:[[-43.4870861,172.6509735],[-43.5490232,172.5976049]]
i = 0
driver = webdriver.Chrome()
for index, i in link_df.iterrows():
base_url = i['Link'] #link_df['Link'][i]
driver.get(base_url)
WebDriverWait(driver,10).until(EC.element_to_be_clickable((By.XPATH,"//div[./span[text()='Newest']]"))).click()
print('Restaurant number is ',index)
title = driver.find_element_by_xpath("//div[#class='P5Bobd']").text
address = driver.find_element_by_xpath("//div[#class='T6pBCe']").text
overall_rating = driver.find_element_by_xpath("//div[#class='review-score-container']//span[#class='Aq14fc']").text
total_reviews_text =driver.find_element_by_xpath("//div[#class='review-score-container']//div//div//span//span[#class='z5jxId']").text
num_reviews = int (total_reviews_text.split()[0])
all_reviews = WebDriverWait(driver, 20).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'div.gws-localreviews__google-review')))
time.sleep(2)
total_reviews = len(all_reviews)
while total_reviews < num_reviews:
driver.execute_script('arguments[0].scrollIntoView(true);', all_reviews[-1])
WebDriverWait(driver, 5, 0.25).until_not(EC.presence_of_element_located((By.CSS_SELECTOR, 'div[class$="activityIndicator"]')))
time.sleep(5)
all_reviews = WebDriverWait(driver, 5).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'div.gws-localreviews__google-review')))
print(total_reviews)
total_reviews +=1
reviews_info = driver.find_elements_by_xpath("//div[#class='jxjCjc']")
review_information = pd.DataFrame(columns=["Restaurant title","Restaurant rating","Total reviews","Reviewer Name","Rating", "Review"])
name= ''
rating = ''
text = ''
for index,review_info in enumerate(reviews_info):
name = review_info.find_element_by_xpath("./div/div/a").text
rating = review_info.find_element_by_xpath(".//div[#class='PuaHbe']//g-review-stars//span").get_attribute('aria-label')
text = review_info.find_element_by_xpath(".//div[#class='Jtu6Td']//span").text
review_information.at[len(review_information)] = [title,overall_rating,num_reviews,name,rating,text]
filename = 'Google_reviews' + ' ' +pd.to_datetime("now").strftime("%Y_%m_%d")+'.csv'
files_present = glob.glob(filename)
if files_present:
review_information.to_csv(filename,index=False,mode='a',header=False)
else:
review_information.to_csv(filename,index=False)
driver.get('https:ww.google.com')
time.sleep(3)
The problem is that script throws an error when it reaches the following line.
driver.execute_script('arguments[0].scrollIntoView(true);', all_reviews[-1])
It throws following error:
StaleElementReferenceException: Message: stale element reference: element is not attached to the page document
(Session info: chrome=95.0.4638.69)
When I tried the same program without storing google links in dataframe (i.e. no for loop and instead of base_url = i['Link'], I wrote base_url = google review link) it works fine.
I am not sure where I am making the mistake. Any suggestion or help to fix the issue would be highly appreciated?
EDIT
you put the creation of driver outside the for loop
you cant launch the new url with gps data when the first popup is always in front, if you launch it, it stays in backdoor, the easier way is to launch a new url without gps data -> https:ww.google.com and wait 3 dec before to follow your loop:
your count is not good, i have changed your selector and change the total and set some lines in comment
from selenium import webdriver
import pandas as pd
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.firefox.options import Options
import time
link_df = ["https://www.google.com/search?q=restaurant+in+christchurch&biw=1280&bih=614&hotel_occupancy=2&tbm=lcl&sxsrf=AOaemvI4qlEAr3btedb6PCx9U53RtXkI2Q%3A1635630947742&ei=Y799YaHfLOKZ4-EPoeqjmA4&oq=restaurant+in+christchurch&gs_l=psy-ab.3...0.0.0.614264.0.0.0.0.0.0.0.0..0.0....0...1c..64.psy-ab..0.0.0....0.7jAOI05vCjI#lrd=0x6d318a3aa3041455:0x5f83f4fae76d8656,1,,,&rlfi=hd:;si:6882614014013965910,l,ChpyZXN0YXVyYW50IGluIGNocmlzdGNodXJjaEiglZKhm6qAgAhaKBAAGAAYAiIacmVzdGF1cmFudCBpbiBjaHJpc3RjaHVyY2gqBAgDEACSARJidXJtZXNlX3Jlc3RhdXJhbnSqAQwQASoIIgRmb29kKAA,y,UB2auy7TMYs;mv:[[-43.4870861,172.6509735],[-43.5490232,172.5976049]]",
"https://www.google.com/search?q=restaurant+in+christchurch&biw=1280&bih=614&hotel_occupancy=2&tbm=lcl&sxsrf=AOaemvI4qlEAr3btedb6PCx9U53RtXkI2Q%3A1635630947742&ei=Y799YaHfLOKZ4-EPoeqjmA4&oq=restaurant+in+christchurch&gs_l=psy-ab.3...0.0.0.614264.0.0.0.0.0.0.0.0..0.0....0...1c..64.psy-ab..0.0.0....0.7jAOI05vCjI#lrd=0x6d318bf82139caaf:0xf115cd7fe794cbcc,1,,,&rlfi=hd:;si:17372017086881385420,l,ChpyZXN0YXVyYW50IGluIGNocmlzdGNodXJjaEjh9auu-q6AgAhaKBAAGAAYAiIacmVzdGF1cmFudCBpbiBjaHJpc3RjaHVyY2gqBAgDEACSAQpyZXN0YXVyYW50qgEMEAEqCCIEZm9vZCgA,y,ZeJbBWd7wDg;mv:[[-43.4870861,172.6509735],[-43.5490232,172.5976049]]"
]
i = 0
binary = r'C:\Program Files (x86)\Mozilla Firefox\firefox.exe'
cap = DesiredCapabilities().FIREFOX
cap["marionette"] = True
options = Options()
options.binary = binary
driver = webdriver.Firefox(options=options, capabilities=cap, executable_path="E:\\Téléchargement\\geckodriver.exe")
# i have to launch one time to accept the cookies manually
#by setting a breakpoint after, but you dont have that i think
#driver.get(link_df[0])
print ("Headless Firefox Initialized")
print(link_df)
for url in link_df:
base_url = url # i['Link'] # link_df['Link'][i]
print(base_url)
driver.get(base_url)
WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "//div[./span[text()='Avis les plus récents']]"))).click()
title = driver.find_element_by_xpath("//div[#class='P5Bobd']").text
address = driver.find_element_by_xpath("//div[#class='T6pBCe']").text
overall_rating = driver.find_element_by_xpath("//div[#class='review-score-container']//span[#class='Aq14fc']").text
total_reviews_text = driver.find_element_by_xpath(
"//div[#class='review-score-container']//div//div//span//span[#class='z5jxId']").text
num_reviews = int(total_reviews_text.split()[0])
all_reviews = WebDriverWait(driver, 20).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '#reviewSort .gws-localreviews__google-review')))
# time.sleep(2)
total_reviews = 0
while total_reviews < num_reviews:
driver.execute_script('arguments[0].scrollIntoView(true);', all_reviews[-1])
WebDriverWait(driver, 5, 0.25).until_not(EC.presence_of_element_located((By.CSS_SELECTOR, 'div[class$="activityIndicator"]')))
all_reviews = WebDriverWait(driver, 5).until(
EC.presence_of_all_elements_located((By.CSS_SELECTOR, '#reviewSort .gws-localreviews__google-review')))
total_reviews = len(all_reviews)
print(total_reviews, len(all_reviews))
driver.get('https:ww.google.com') # or driver.close() if no bugs
time.sleep(3)
driver.close()
driver.quit()
it seems the solution for chrome needs some fixes:
org.openqa.selenium.StaleElementReferenceException: stale element reference: element is not attached to the page document
The literal meaning is about , The referenced element is out of date , No longer attached to the current page . Usually , This is because the page has been refreshed or skipped , The solution is , Reuse findElement or findElements Method to locate the element .
so its seems for chrome there is a problem of refreshing, so i suggest to load the number of record before to scroll, to have a fresh copy of DOM items, and i have to add a wait 1sec at the end of while loop
from selenium import webdriver
import pandas as pd
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
#from selenium.webdriver.firefox.options import Options
from selenium.webdriver.chrome.options import Options
import time
link_df = [
"https://www.google.com/search?q=restaurant+in+christchurch&biw=1280&bih=614&hotel_occupancy=2&tbm=lcl&sxsrf=AOaemvI4qlEAr3btedb6PCx9U53RtXkI2Q%3A1635630947742&ei=Y799YaHfLOKZ4-EPoeqjmA4&oq=restaurant+in+christchurch&gs_l=psy-ab.3...0.0.0.614264.0.0.0.0.0.0.0.0..0.0....0...1c..64.psy-ab..0.0.0....0.7jAOI05vCjI#lrd=0x6d318a3aa3041455:0x5f83f4fae76d8656,1,,,&rlfi=hd:;si:6882614014013965910,l,ChpyZXN0YXVyYW50IGluIGNocmlzdGNodXJjaEiglZKhm6qAgAhaKBAAGAAYAiIacmVzdGF1cmFudCBpbiBjaHJpc3RjaHVyY2gqBAgDEACSARJidXJtZXNlX3Jlc3RhdXJhbnSqAQwQASoIIgRmb29kKAA,y,UB2auy7TMYs;mv:[[-43.4870861,172.6509735],[-43.5490232,172.5976049]]",
"https://www.google.com/search?q=restaurant+in+christchurch&biw=1280&bih=614&hotel_occupancy=2&tbm=lcl&sxsrf=AOaemvI4qlEAr3btedb6PCx9U53RtXkI2Q%3A1635630947742&ei=Y799YaHfLOKZ4-EPoeqjmA4&oq=restaurant+in+christchurch&gs_l=psy-ab.3...0.0.0.614264.0.0.0.0.0.0.0.0..0.0....0...1c..64.psy-ab..0.0.0....0.7jAOI05vCjI#lrd=0x6d318bf82139caaf:0xf115cd7fe794cbcc,1,,,&rlfi=hd:;si:17372017086881385420,l,ChpyZXN0YXVyYW50IGluIGNocmlzdGNodXJjaEjh9auu-q6AgAhaKBAAGAAYAiIacmVzdGF1cmFudCBpbiBjaHJpc3RjaHVyY2gqBAgDEACSAQpyZXN0YXVyYW50qgEMEAEqCCIEZm9vZCgA,y,ZeJbBWd7wDg;mv:[[-43.4870861,172.6509735],[-43.5490232,172.5976049]]"
]
i = 0
binaryfirefox = r'C:\Program Files (x86)\Mozilla Firefox\firefox.exe'
binarychrome = r'C:\Program Files (x86)\Google\Chrome\Application\chrome.exe'
options = Options()
#cap = DesiredCapabilities().CHROME
#cap["marionette"] = True
#cap = DesiredCapabilities().FIREFOX
#options.binary = binaryfirefox
#driver = webdriver.Firefox(options=options, capabilities=cap, executable_path="E:\\Téléchargement\\geckodriver.exe")
options.binary_location = binarychrome
driver = webdriver.Chrome(options=options, executable_path="E:\\Téléchargement\\chromedriver.exe" )
# same reason tha Firefox i have to load one time
# an url to accept manually the cookies
#driver.get(link_df[0])
print(link_df)
for url in link_df:
base_url = url # i['Link'] # link_df['Link'][i]
print(base_url)
driver.get(base_url)
WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "//div[./span[text()='Newest']]"))).click()
title = driver.find_element_by_xpath("//div[#class='P5Bobd']").text
address = driver.find_element_by_xpath("//div[#class='T6pBCe']").text
overall_rating = driver.find_element_by_xpath("//div[#class='review-score-container']//span[#class='Aq14fc']").text
total_reviews_text = driver.find_element_by_xpath(
"//div[#class='review-score-container']//div//div//span//span[#class='z5jxId']").text
num_reviews = int(total_reviews_text.split()[0])
all_reviews = WebDriverWait(driver, 20).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '#reviewSort .gws-localreviews__google-review')))
# time.sleep(2)
total_reviews = 0
while total_reviews < num_reviews:
#reload to avoid exception, or trap scroll with try/except but more expznsive
all_reviews = WebDriverWait(driver, 20).until(
EC.presence_of_all_elements_located((By.CSS_SELECTOR, '#reviewSort .gws-localreviews__google-review')))
driver.execute_script('arguments[0].scrollIntoView(true);', all_reviews[-1])
total_reviews = len(all_reviews)
print(total_reviews, len(all_reviews))
time.sleep(1)
driver.get('https:ww.google.com') # or driver.close() if no bugs
time.sleep(3)
driver.close()
driver.quit()

How can I get data from a online pdf file using python?

I need to get the data value of the amount of money in a online pdf file. I need the number R$ 20.707,00 which is the total of "serviço"
I am using selenium to reach this PDF:
import os
import time
import xlrd # importando a biblioteca
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
import zipfile
inicio = time.time()
datainicial = "01042019"
datafinal = '30042019'
cnpj = '13177807000146'
senha = 'qualita#2018'
driver = webdriver.Chrome()
# driver.maximize_window()
driver.get("https://directa.natal.rn.gov.br/")
# Logando
driver.switch_to.frame(driver.find_element_by_name("mainsystem"))
WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, "usuario"))).send_keys(cnpj)
WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, "senha"))).send_keys(str(senha))
time.sleep(2)
WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, "button.btn#acessar"))).click()
# Nota natalense
WebDriverWait(driver, 10).until(EC.frame_to_be_available_and_switch_to_it((By.NAME, "mainsystem")))
WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, 'limenu9'))).click()
time.sleep(1)
# Consulta
WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, "#limenu9 > div > a:nth-child(1)"))).click()
# Consulta NFe
WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#formsmenu12 > li:nth-child(4) > a'))).click()
# Trocando frame
time.sleep(1)
driver.switch_to.frame(0)
driver.switch_to.frame(0)
# Selecionando empresa
WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, '//*\[#id="lay"\]/div\[2\]/div\[2\]/div\[7\]/div\[2\]/div/div/table/tbody/tr/td'))).click()
WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, '//*\[#id="lookupInput"\]/option\[2\]'))).click()
#Marcando serviços prestados
WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, '//*\[#id="HTMLGroupBox628126"\]/table/tbody/tr\[2\]/td/table/tbody/tr/td\[1\]/a/img'))).click()
#Marcando retenção de ISS
WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, '//*\[#id="HTMLGroupBox628123"\]/table/tbody/tr\[1\]/td/table/tbody/tr/td\[1\]/a/img'))).click()
# selecionando a data inicial
driver.find_element_by_name('WFRInput628128').send_keys(datainicial)
# selecionando a data final
driver.find_element_by_name('WFRInput628127').send_keys(datafinal)
#Consultar
WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, '//*\[#id="lay"\]/div\[2\]/div\[2\]/div\[21\]/div/table/tbody/tr/td'))).click()
#Imprimir relatório
WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, '//*\[#id="lay"\]/div\[2\]/div\[3\]/div\[5\]/div/table/tbody/tr/td'))).click()
fim = time.time()
duracao = fim - inicio
print('O programa rodou em: {} e foram baixadas {} empresas'.format(duracao, i))
What can I use to get only this data from this PDF file?
I would go easier route:
Download pdf file using any library like requests, then read pdf file text using tika, then search for my word using regex or any other method
from tika import parser
raw = parser.from_file('sample.pdf')
print(raw['content'])

Categories