Getting value after button click with BeautifulSoup Python

I'm trying to get a value that the website generates after a button is clicked.
Here is the website: https://www.4devs.com.br/gerador_de_cpf
You can see that there is a button called "Gerar CPF"; clicking it produces a number that appears on the page.
My current script opens the browser and gets the value, but it reads the page from before the click, so the value is empty. I would like to know whether it is possible to get the value after the button is clicked.
from selenium import webdriver
from bs4 import BeautifulSoup
from requests import get

url = "https://www.4devs.com.br/gerador_de_cpf"

def open_browser():
    driver = webdriver.Chrome("/home/felipe/Downloads/chromedriver")
    driver.get(url)
    driver.find_element_by_id('bt_gerar_cpf').click()

def get_cpf():
    response = get(url)
    page_with_cpf = BeautifulSoup(response.text, 'html.parser')
    cpf = page_with_cpf.find("div", {"id": "texto_cpf"}).text
    print("The value is: " + cpf)

open_browser()
get_cpf()

open_browser and get_cpf are absolutely not related to each other: open_browser clicks the button in a real browser session, while get_cpf fetches the page again with a plain HTTP request that never sees that click, so the div is still empty.
Actually you don't need get_cpf at all. Just wait for the text after clicking the button:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait as wait

url = "https://www.4devs.com.br/gerador_de_cpf"

def open_browser():
    driver = webdriver.Chrome("/home/felipe/Downloads/chromedriver")
    driver.get(url)
    driver.find_element_by_id('bt_gerar_cpf').click()
    text_field = driver.find_element_by_id('texto_cpf')
    # wait until the field stops showing the "Gerando..." placeholder and has some text
    text = wait(driver, 10).until(lambda driver: text_field.text != 'Gerando...' and text_field.text)
    return text

print(open_browser())
Update
The same with requests:
import requests
url = 'https://www.4devs.com.br/ferramentas_online.php'
data = {'acao': 'gerar_cpf', 'pontuacao': 'S'}
response = requests.post(url, data=data)
print(response.text)
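If the endpoint still returns the generated number as plain text, a small helper could wrap this. This is only a sketch: the gerar_cpf function name is mine, and the assumption that pontuacao='S' asks for a punctuated CPF comes from the parameters above, not from official documentation.

import requests

def gerar_cpf(pontuacao='S'):
    # POST to the generator endpoint used by the page; 'S' presumably requests a punctuated CPF
    response = requests.post(
        'https://www.4devs.com.br/ferramentas_online.php',
        data={'acao': 'gerar_cpf', 'pontuacao': pontuacao},
    )
    response.raise_for_status()
    return response.text.strip()

print(gerar_cpf())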

You don't need to use requests and BeautifulSoup.
from selenium import webdriver
from time import sleep

url = "https://www.4devs.com.br/gerador_de_cpf"

def get_cpf():
    driver = webdriver.Chrome("/home/felipe/Downloads/chromedriver")
    driver.get(url)
    driver.find_element_by_id('bt_gerar_cpf').click()
    sleep(10)
    text = driver.find_element_by_id('texto_cpf').text
    print(text)

get_cpf()

Can you use a while loop until the text changes?
from selenium import webdriver

url = "https://www.4devs.com.br/gerador_de_cpf"

def get_value():
    driver = webdriver.Chrome()
    driver.get(url)
    driver.find_element_by_id('bt_gerar_cpf').click()
    # busy-wait until the placeholder text changes
    while driver.find_element_by_id('texto_cpf').text == 'Gerando...':
        continue
    val = driver.find_element_by_id('texto_cpf').text
    driver.quit()
    return val

print(get_value())

I recommend this website that does exactly the same thing.
https://4devs.net.br/gerador-cpf
But to trigger the "gerar cpf" action with Selenium, you can inspect the HTML in your browser, right-click the element, and choose "Copy XPath for this element".
That is much simpler than manually searching the page source for the element, as sketched below.
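A rough sketch of that workflow (the sleep is a crude stand-in for a proper wait, the XPath is just what "Copy XPath" would give you for the button, and the bt_gerar_cpf and texto_cpf ids come from the question):

from selenium import webdriver
from time import sleep

driver = webdriver.Chrome()
driver.get("https://www.4devs.com.br/gerador_de_cpf")
# paste the XPath copied from the browser's inspector here
driver.find_element_by_xpath('//*[@id="bt_gerar_cpf"]').click()
sleep(2)  # crude wait for the number to be generated
print(driver.find_element_by_id('texto_cpf').text)
driver.quit()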

Related

t.xpath not getting results from second page?

I am trying to iterate over multiple pages of a website, however the code I am using below is only returning the results from the first page, even though I am using Selenium to click to the next page. I am at a loss for what could be causing this. Any explanation would be much appreciated!
The website in question:
https://www.cruiseplum.com/search#{%22numPax%22:2,%22geo%22:%22US%22,%22portsMatchAll%22:true,%22numOptionsShown%22:100,%22ppdIncludesTaxTips%22:true,%22uiVersion%22:%22split%22,%22sortTableByField%22:%22dd%22,%22sortTableOrderDesc%22:false,%22filter%22:null}
from selenium import webdriver
import time
import xlsxwriter
from lxml import html

u = 'https://www.cruiseplum.com/search#{%22numPax%22:2,%22geo%22:%22US%22,%22portsMatchAll%22:true,%22numOptionsShown%22:100,%22ppdIncludesTaxTips%22:true,%22uiVersion%22:%22split%22,%22sortTableByField%22:%22dd%22,%22sortTableOrderDesc%22:false,%22filter%22:null}'
driver = webdriver.Chrome()
driver.get(u)
driver.maximize_window()
time.sleep(.3)
driver.find_element_by_id('restoreSettingsYesEncl').click()  # select 'yes' on the webpage to restore settings
time.sleep(7)  # wait until the website downloads data so we get a return value

elem = driver.find_element_by_xpath("//*")
source_code = elem.get_attribute("innerHTML")
t = html.fromstring(source_code)

for i in range(5):
    for i in t.xpath('.//td[@class="dc-table-column _0"]/text()'):
        print(i.strip())
    driver.find_element_by_xpath('//*[@id="listings-table-split"]/div[5]/div/span[4]').click()  # click to next page
    time.sleep(.05)

driver.quit()
In the code above, t is assigned outside the loop:

elem = driver.find_element_by_xpath("//*")
source_code = elem.get_attribute("innerHTML")
t = html.fromstring(source_code)
for i in range(5):

so it is built only once, from the first page, and the loop keeps repeating the same elements. To fix this, move those lines inside the loop, as in the code below:
from selenium import webdriver
import time
import xlsxwriter
from lxml import html

u = 'https://www.cruiseplum.com/search#{%22numPax%22:2,%22geo%22:%22US%22,%22portsMatchAll%22:true,%22numOptionsShown%22:100,%22ppdIncludesTaxTips%22:true,%22uiVersion%22:%22split%22,%22sortTableByField%22:%22dd%22,%22sortTableOrderDesc%22:false,%22filter%22:null}'
driver = webdriver.Chrome()
driver.get(u)
driver.maximize_window()
time.sleep(.3)
driver.find_element_by_id('restoreSettingsYesEncl').click()  # select 'yes' on the webpage to restore settings
time.sleep(7)  # wait until the website downloads data so we get a return value

for i in range(5):
    # re-read the page source on every iteration so each page's rows are parsed
    elem = driver.find_element_by_xpath("//*")
    source_code = elem.get_attribute("innerHTML")
    t = html.fromstring(source_code)
    for i in t.xpath('.//td[@class="dc-table-column _0"]/text()'):
        print(i.strip())
    driver.find_element_by_xpath('//*[@id="listings-table-split"]/div[5]/div/span[4]').click()  # click to next page
    time.sleep(.05)

driver.quit()

Pulling all text (multiple p tags) with BeautifulSoup and Selenium returns []

I am trying to pull the p tag comments within a review card, eventually looping through a search on vivino.com through this link using BeautifulSoup and Selenium. I was able to open the first link but pulling the p text in the review boxes returns [].
url = "https://www.vivino.com/explore?e=eJwNyTEOgCAQBdHbbA2F5e-8gbE2uKyERBYCaOT20swrJlVYSlFhjaHkPixTHtg34pmVyvzhwutqlO5uyid8bJwf7UeRyqKdMrw0pgYdPwIzGwQ="
driver = webdriver.Chrome('/Users/myname/Downloads/chromedriver')
driver.implicitly_wait(30)
driver.get(url)
python_button = driver.find_element_by_class_name('anchor__anchor--2QZvA')
python_button.click()
soup = BeautifulSoup(driver.page_source, 'lxml')
print(soup.find_all('p'))
table = soup.findAll('div',attrs={"class":"reviewCard__reviewContainer--1kMJM"})
print(table)
driver.quit()
Could anybody advise on the correct way to pull the comments? Since there is more than one comment per page, would I need to loop?
I also tried this with 'html.parser' instead of 'lxml'. Which is the correct one to use?
Thank you so much for your help.
Here is what you need to do:
import atexit
from pprint import pprint

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.expected_conditions import visibility_of_all_elements_located
from selenium.webdriver.support.wait import WebDriverWait

def start_driver():
    driver = webdriver.Chrome()
    atexit.register(driver.quit)
    driver.maximize_window()
    return driver

def find_elements(driver, locator):
    return WebDriverWait(driver, 10, 2).until(visibility_of_all_elements_located(locator))

URL = "https://www.vivino.com/explore?e=eJwNyTEOgCAQBdHbbA2F5e-8gbE2uKyERBYCaOT20swrJlVYSlFhjaHkPixTHtg34pmVyvzhwutqlO5uyid8bJwf7UeRyqKdMrw0pgYdPwIzGwQ="
RESULTS = By.CSS_SELECTOR, "div[class*='vintageTitle'] > a"

def main():
    driver = start_driver()
    driver.get(URL)
    # note the results
    wines = []
    for element in find_elements(driver, RESULTS):
        link = element.get_attribute("href")
        name = element.find_element_by_css_selector("span[class*='vintageTitle__wine']").text
        wines.append((name, link))
    pprint(wines)
    # go extract details from each result's page
    for name, link in wines:
        print("getting comments for wine: ", name)
        driver.get(link)
        # you can do the rest ;)

if __name__ == '__main__':
    main()
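One way to fill in the "# you can do the rest ;)" part might be to hand each wine page's rendered source to BeautifulSoup and pull the p tags inside each review card. This is only a sketch, assuming the reviewCard__reviewContainer--1kMJM class from the question still wraps each review:

from bs4 import BeautifulSoup

def extract_comments(page_source):
    # collect the text of every <p> inside each review container
    soup = BeautifulSoup(page_source, 'lxml')
    cards = soup.find_all('div', class_='reviewCard__reviewContainer--1kMJM')
    return [p.get_text(strip=True) for card in cards for p in card.find_all('p')]

# inside main(), after driver.get(link):
#     print(extract_comments(driver.page_source))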

How to get iframe source from page_source

Hello, I am trying to extract the link from page_source, and my code is:
from bs4 import BeautifulSoup
from selenium import webdriver
import time
import html5lib

driver_path = r"C:\Users\666\Desktop\New folder (8)\chromedriver.exe"
driver = webdriver.Chrome(driver_path)
driver.implicitly_wait(10)
driver.get("https://www.milversite.club/milver/outsiders-1x01-video_060893d7a.html")
try:
    time.sleep(4)
    iframe = driver.find_elements_by_tag_name('iframe')
    for i in range(0, len(iframe)):
        f = driver.find_elements_by_tag_name('iframe')[i]
        driver.switch_to.frame(i)
        # your work to extract link
        text = driver.find_element_by_tag_name('body').text
        print(text)
        driver.switch_to.default_content()
    output = driver.page_source
    print(output)
finally:
    driver.quit()
And now I want to scrape just this link
Try the script below to get the link you want to parse. You don't need to switch to the iframe to get the link. A hardcoded delay is always the worst choice for parsing dynamic content: what if the link appears after 5 seconds? I used an Explicit Wait in the script below to make it robust.
from selenium import webdriver
from selenium.webdriver.support import ui
driver = webdriver.Chrome()
wait = ui.WebDriverWait(driver, 10)
driver.get("https://www.milversite.club/milver/outsiders-1x01-video_060893d7a.html")
elem = wait.until(lambda driver: driver.find_element_by_id("iframevideo"))
print(elem.get_attribute("src"))
driver.quit()
Output:
https://openload.co/embed/8wVwFQEP1Sw
Try with
element = driver.find_element_by_id('iframevideo')
link = element.get_attribute('src')

WebElement has no Attribute w3c

I am trying to tab through an Instagram page given a user input. I am able to get to the page. The page loads, then the class is found, then the code breaks. Here is my code:
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import urllib.request, urllib.parse, urllib.error
import time

def get_posts(tag, user_count):
    '''Getting html of page to be scraped for
    hrefs that will get me the user names'''
    print("getting page")
    url = "https://www.instagram.com/explore/tags/" + tag + "/"
    try:
        driver = webdriver.Chrome()
        driver.get(url)
        print("successfully requested site")
    except:
        print("Unable to reach site")
        quit()
    browser = driver.find_element_by_class_name('_si7dy')
    actions = ActionChains(browser)
    for i in range(user_count):
        actions = actions.send_keys(Keys.TAB)
        time.sleep(0.5)
    actions.perform()
    soup = BeautifulSoup(driver.page_source, 'lxml')
    try:
        posts = soup.find_all("div", class_=["_mck9w", "_gvoze", "_f2mse"])
    except:
        print("No links found")
        quit()
    print("Length of posts: ", (len(posts)))
    print(len(posts))
    print(type(posts))
    print("All Done")
    driver.close()
    return posts
I keep getting this error:
packages\selenium\webdriver\common\action_chains.py", line 69, in __init__
if self._driver.w3c:
AttributeError: 'WebElement' object has no attribute 'w3c'
I have searched around but have not found any info on w3c. I have never tabbed down a page before and so am using the answer found here: Send multiple tab key presses with selenium.
ActionChains seems like the best way to tab multiple times down the page, but if anyone has a better method I am open to trying that.
ActionChains should receive a WebDriver, but you are passing a WebElement instead:
actions = ActionChains(driver)
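Applied to the question's code, the change looks roughly like this (a sketch; the "python" tag and the tab count of 10 are just example values):

from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys

driver = webdriver.Chrome()
driver.get("https://www.instagram.com/explore/tags/python/")

# build the chain from the driver, not from the element found by class name
actions = ActionChains(driver)
for _ in range(10):
    actions = actions.send_keys(Keys.TAB)
actions.perform()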

How to scrape multiple pages with an unchanging URL

I'm trying to scrape this website: http://data.eastmoney.com/xg/xg/
So far I've used Selenium to execute the JavaScript and get the table scraped. However, my code right now only gets me the first page. I was wondering if there's a way to access the other 17 pages, because when I click on the next page the URL does not change, so I cannot just iterate over a different URL each time.
Below is my code so far:
from selenium import webdriver
import lxml
from bs4 import BeautifulSoup
import time

def scrape():
    url = 'http://data.eastmoney.com/xg/xg/'
    d = {}
    f = open('east.txt', 'a')
    driver = webdriver.PhantomJS()
    driver.get(url)
    lst = [x for x in range(0, 25)]
    htmlsource = driver.page_source
    bs = BeautifulSoup(htmlsource)
    heading = bs.find_all('thead')[0]
    hlist = []
    for header in heading.find_all('tr'):
        head = header.find_all('th')
        for i in lst:
            if i != 2:
                hlist.append(head[i].get_text().strip())
    h = '|'.join(hlist)
    print h
    table = bs.find_all('tbody')[0]
    for row in table.find_all('tr'):
        cells = row.find_all('td')
        d[cells[0].get_text()] = [y.get_text() for y in cells]
    for key in d:
        ret = []
        for i in lst:
            if i != 2:
                ret.append(d.get(key)[i])
        s = '|'.join(ret)
        print s

if __name__ == "__main__":
    scrape()
Or is it possible for me to click "next" through the browser if I use webdriver.Chrome() instead of PhantomJS, and then have Python run on the new page after each click?
This is not a trivial page to interact with and would require the use of Explicit Waits to wait for invisibility of "loading" indicators.
Here is the complete and working implementation that you may use as a starting point:
# -*- coding: utf-8 -*-
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver
import time

url = "http://data.eastmoney.com/xg/xg/"
driver = webdriver.PhantomJS()
driver.get(url)

def get_table_results(driver):
    for row in driver.find_elements_by_css_selector("table#dt_1 tr[class]"):
        print [cell.text for cell in row.find_elements_by_tag_name("td")]

# initial wait for results
WebDriverWait(driver, 10).until(EC.invisibility_of_element_located((By.XPATH, u"//th[. = '加载中......']")))

while True:
    # print current page number
    page_number = driver.find_element_by_id("gopage").get_attribute("value")
    print "Page #" + page_number

    get_table_results(driver)

    next_link = driver.find_element_by_link_text("下一页")
    if "nolink" in next_link.get_attribute("class"):
        break

    next_link.click()
    time.sleep(2)  # TODO: fix?

    # wait for results to load
    WebDriverWait(driver, 10).until(EC.invisibility_of_element_located((By.XPATH, u"//img[contains(@src, 'loading')]")))
    print "------"
The idea is to have an endless loop that we exit only when the "Next Page" link becomes disabled (no more pages available). On every iteration, get the table results (printed to the console for the sake of the example), click the next link, and wait for the invisibility of the "loading" spinner that appears on top of the grid.
I found another way to do this in C# using ChromeDriver and Selenium. All you have to do is add the Selenium references to the project and reference chromedriver.exe.
In your code you can navigate to the URL using:
using (var driver = new ChromeDriver())
{
    driver.Navigate().GoToUrl(pathofurl);
    // find your element using FindElementByXPath
    // var element = driver.FindElementByXPath(--XPath--).Text;
}
Finding an XPath is easy: download a scraper or XPath extension from the Chrome Web Store. Once you get the hang of XPaths for elements, you can find the XPath of the next button and use it in your code to navigate through the pages in a loop; a rough Python equivalent is sketched below. Hope this helps.
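For reference, in the Python used elsewhere in this thread, that click-the-next-button loop might look roughly like this (a sketch that reuses the 下一页 "next page" link text from the answer above and hardcoded waits for brevity):

from selenium import webdriver
import time

driver = webdriver.Chrome()
driver.get('http://data.eastmoney.com/xg/xg/')
time.sleep(7)  # crude wait for the first page of data to load

for _ in range(17):  # the question mentions 17 more pages
    # ... scrape the current page here ...
    driver.find_element_by_link_text(u"下一页").click()  # click the "next page" link
    time.sleep(2)  # crude wait for the next page of data

driver.quit()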
