Parse BeautifulSoup element into Selenium - python

I want to get the source code of a website using selenium; find a particular element using BeautifulSoup; and then parse it back into selenium as a selenium.webdriver.remote.webelement object.
Like so:
driver.get("www.google.com")
soup = BeautifulSoup(driver.source)
element = soup.find(title="Search")
element = Selenium.webelement(element)
element.click()
How can I achieve this?

A general solution that worked for me is to compute the xpath of the bs4 element, then use that to find the element in selenium,
xpath = xpath_soup(soup_element)
selenium_element = driver.find_element_by_xpath(xpath)
...
import itertools
def xpath_soup(element):
"""
Generate xpath of soup element
:param element: bs4 text or node
:return: xpath as string
"""
components = []
child = element if element.name else element.parent
for parent in child.parents:
"""
#type parent: bs4.element.Tag
"""
previous = itertools.islice(parent.children, 0, parent.contents.index(child))
xpath_tag = child.name
xpath_index = sum(1 for i in previous if i.name == xpath_tag) + 1
components.append(xpath_tag if xpath_index == 1 else '%s[%d]' % (xpath_tag, xpath_index))
child = parent
components.reverse()
return '/%s' % '/'.join(components)

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
driver = webdriver.Chrome()
driver.get("http://www.google.com")
soup = BeautifulSoup(driver.page_source, 'html.parser')
search_soup_element = soup.find(title="Search")
input_element = soup.select('input.gsfi.lst-d-f')[0]
search_box = driver.find_element(by='name', value=input_element.attrs['name'])
search_box.send_keys('Hello World!')
search_box.send_keys(Keys.RETURN)
This pretty much works. I can see reason for working with both webdriver and BeautifulSoup but not necessarily for this example.

I don't know of any way to pass from bs4 to selenium but you can just use selenium to find the element:
driver.find_element_by_xpath('//input[#title="Search"]').click()
Or to find using just the title text like your bs4 find:
driver.find_element_by_xpath('//*[#title="Search"]').click()

Related

Get all url in class by Selenium Python

I want to get all the links in the class tag like the image below.
enter image description here
driver.find_elements_by_xpath('/html/body/div[2]/div[2]/div[2]/div/div[2]/div[2]/div/div[1]/div[1]/div/div'):
url_video = a.get_property('href')
print(url_video)
i get the result is none
I use the 'a' tag to get all the links. I just want to get the links in the specified class. Please help me
This my code:
from selenium import webdriver
import time
browser=webdriver.Chrome()
time.sleep(6)
elements = browser.find_elements_by_xpath('/html/body/div[2]/div[2]/div[2]/div/div[2]/div[2]/div/div[1]/div[1]/div/div')
for element in elements:
videoUrl = element.get_attribute('href')
print(videoUrl)
----> The result is none
.find_elements_by_xpath() return a list, you should use element['href'] on each element of the list.
elements = driver.find_elements_by_xpath('...')
for element in elements:
videoUrl = element['href']
print(videoUrl)

How print as text an Selenium html element?

I'm making an Selenium project for fun. I want see all football score on my terminal. I use Selenium for scraping. But I cannot print the scraped element. How can I fix that?
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
import os
from bs4 import BeautifulSoup
import lxml
team_both = []
team_one = []
team_two = []
team_id = []
driver = webdriver.Chrome(ChromeDriverManager().install())
time.sleep(1)
#os.system('clear')
time.sleep(1)
driver.get('https://www.bet365.de/#/IP/B1')
time.sleep(1)
try:
driver.find_element_by_class_name('iip-IntroductoryPopup_Cross').click()
except:
pass
time.sleep(1)
# Scroll to bottom
element = driver.find_element_by_class_name("ovm-OverviewScroller-enabled")
actions = ActionChains(driver)
actions.move_to_element(element).click_and_hold().perform()
soup = BeautifulSoup(driver.page_source, 'lxml')
time.sleep(5)
os.system('clear')
#ovm-FixtureDetailsTwoWay_TeamName
teams = soup.find_all('div' , {"class" : "ovm-FixtureDetailsTwoWay_TeamName "})
for i in teams:
print(i.text)
The simplest way to extract a text from web element is by applying the .text method, but since you are using find_all method, the team_both is not a single web element rather a list of web elements.
In order to get texts from the web elements in a list you have to iterate over the elements in the list and extract text from each element, as following:
team_both = soup.find_all('div', {"class" : "ovm-FixtureDetailsTwoWay_TeamName"})
for team in team_both:
print(team.text)

Get element text with a partial string match using Selenium (Python)

I am trying to extract the text from within a <strong> tag that is deeply nested in the HTML content of this webpage: https://www.marinetraffic.com/en/ais/details/ships/imo:9854612
For example:
The strong tag is the only one on the webpage that will contain the string 'cubic meters'.
My objective is to extract the entire text, i.e., "138124 cubic meters Liquid Gas". When I try the following, I get an error:
url = "https://www.marinetraffic.com/en/ais/details/ships/imo:9854612"
driver.get(url)
time.sleep(3)
element = driver.find_element_by_link_text("//strong[contains(text(),'cubic meters')]").text
print(element)
Error:
NoSuchElementException: Message: no such element: Unable to locate element: {"method":"link text","selector":"//strong[contains(text(),'cubic meters')]"}
What am I doing wrong here?
The following also throws an error:
element = driver.find_element_by_xpath("//strong[contains(text(),'cubic')]").text
Your code works on Firefox(), but not on Chrome().
The page uses lazy loading, so you have to scroll to Summary and then it loads the text with the expected strong.
I used a little slower method - I search all
elements with class='lazyload-wrapper, and in the loop scroll to the item and check if there is strong. If there isn't any strong, then I scroll to the next class='lazyload-wrapper.
from selenium import webdriver
import time
#driver = webdriver.Firefox()
driver = webdriver.Chrome()
url = "https://www.marinetraffic.com/en/ais/details/ships/imo:9854612"
driver.get(url)
time.sleep(3)
from selenium.webdriver.common.action_chains import ActionChains
actions = ActionChains(driver)
elements = driver.find_elements_by_xpath("//span[#class='lazyload-wrapper']")
for number, item in enumerate(elements):
print('--- item', number, '---')
#print('--- before ---')
#print(item.text)
actions.move_to_element(item).perform()
time.sleep(0.1)
#print('--- after ---')
#print(item.text)
try:
strong = item.find_element_by_xpath("//strong[contains(text(), 'cubic')]")
print(strong.text)
break
except Exception as ex:
#print(ex)
pass
Result:
--- item 0 ---
--- item 1 ---
--- item 2 ---
173400 cubic meters Liquid Gas
The result shows that I could use elements[2] to skip two elements, but I wasn't sure if this text will be always in the third element.
Before I created my version I tested other versions and here is the full working code:
from selenium import webdriver
import time
#driver = webdriver.Firefox()
driver = webdriver.Chrome()
url = "https://www.marinetraffic.com/en/ais/details/ships/imo:9854612"
driver.get(url)
time.sleep(3)
def test0():
elements = driver.find_elements_by_xpath("//strong")
for item in elements:
print(item.text)
print('---')
item = driver.find_element_by_xpath("//strong[contains(text(), 'cubic')]")
print(item.text)
def test1a():
from selenium.webdriver.common.action_chains import ActionChains
actions = ActionChains(driver)
element = driver.find_element_by_xpath("//div[contains(#class,'MuiTypography-body1')][last()]//div")
actions.move_to_element(element).build().perform()
text = element.text
print(text)
def test1b():
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(0.5)
text = driver.find_element_by_xpath("//div[contains(#class,'MuiTypography-body1')][last()]//strong").text
print(text)
def test2():
from bs4 import BeautifulSoup
import re
soup = BeautifulSoup(driver.page_source, "html.parser")
soup.find_all(string=re.compile(r"\d+ cubic meters"))
def test3():
from selenium.webdriver.common.action_chains import ActionChains
actions = ActionChains(driver)
elements = driver.find_elements_by_xpath("//span[#class='lazyload-wrapper']")
for number, item in enumerate(elements, 1):
print('--- number', number, '---')
#print('--- before ---')
#print(item.text)
actions.move_to_element(item).perform()
time.sleep(0.1)
#print('--- after ---')
#print(item.text)
try:
strong = item.find_element_by_xpath("//strong[contains(text(), 'cubic')]")
print(strong.text)
break
except Exception as ex:
#print(ex)
pass
#test0()
#test1a()
#test1b()
#test2()
test3()
You can use Beautiful Soup for this, and more precisely the string argument; from the documentation, "you can search for strings instead of tags".
As an argument, you can also pass a regex pattern.
>>> from bs4 import BeautifulSoup
>>> import re
>>> soup = BeautifulSoup(driver.page_source, "html.parser")
>>> soup.find_all(string=re.compile(r"\d+ cubic meters"))
['173400 cubic meters Liquid Gas']
If you're sure there is only one result, or you need just the first, you can also use find instead of find_all.
Your XPath expression is correct and works in Chrome. You get NoSuchElementException, because the element is not loaded within the 3 seconds you wait and does not exist.
To wait for the element, use the WebDriverWait class. It waits explicitly for a specific condition of the element, and in your case presents is enough.
In the code below, Selenium will wait for the element to be presented in the HTML for 10 seconds, polling every 500 milliseconds. You can read about WebDriverWait and conditions here.
Some useful information:
Not visible elements return an empty string. In such a case you need to wait for the visibility of the element, or if the element requires a scroll to scroll to it (example added).
You can also get the text from a not-visible element using JavaScript.
from selenium.webdriver.common.by import By
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec
from selenium import webdriver
url = "https://www.marinetraffic.com/en/ais/details/ships/imo:9854612"
locator = "//strong[contains(text(),'cubic meters')]"
with webdriver.Chrome() as driver: # Type: webdriver
wait = WebDriverWait(driver, 10)
driver.get(url)
cubic = wait.until(ec.presence_of_element_located((By.XPATH, locator))) # Type: WebElement
print(cubic.text)
# The below examples are just for information
# and are not needed for the case
# Example with scroll. Scroll to the element to make it visible
cubic.location_once_scrolled_into_view
print(cubic.text)
# Example using JavaScript. Works for not visible elements.
text = driver.execute_script("return arguments[0].textContent", cubic)
print(text)
It would be correct to use the marinetraffic API.
I guess you should first scroll to that element and only after that try accessing it including getting it text.
from selenium.webdriver.common.action_chains import ActionChains
actions = ActionChains(driver)
element = driver.find_element_by_xpath("//div[contains(#class,'MuiTypography-body1')][last()]//div")
actions.move_to_element(element).build().perform()
text = element.text
In case the above still not good enough you can scroll page height one time like this:
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(0.5)
the_text = driver.find_element_by_xpath("//div[contains(#class,'MuiTypography-body1')][last()]//strong").text

Extracting text from a Python span tag

I'm doing the selenium robot, I need to extract information from the page after the robot does a search, but I'm having trouble.
I have the HTML in the following image
Image-here
I want to extract the texts from these italicized tags of "class = 'escapamentoLinhas'"
from selenium import webdriver
from bs4 import BeautifulSoup
from time import sleep
from selenium.webdriver.support.ui import Select
URL = '\x0\x0\x0\x0'
search = '\x0\x0'
print("Running...")
class ScrapingTJ:
def __init__(self):
self.browser = webdriver.Firefox()
self.browser.get(URL)
sleep(1)
select = Select(self.browser.find_element_by_id('cbPesquisa'))
select.select_by_value('NMPARTE')
sleep(1)
self.browser.find_element_by_xpath('//*[#id="campo_NMPARTE"]').send_keys(CNPJ_CLARO)
self.browser.find_element_by_id('pbEnviar').click()
sleep(2)
dados = self.browser.find_element_by_id('listagemDeProcessos')
HTML = dados.get_attribute("innerHTML")
scraping = BeautifulSoup(HTML, "html.parser")
# links
links = scraping.find_all('a')
for scrape in links:
print(scrape.get_text())
textos = scraping.find(class_ = 'espacamentoLinhas')
subtextos = scraping.find_all('span')
for ext in subtextos:
print(ext.get_text())
if __name__ == '__main__':
ScrapingTJ()
exit:
Exectdo:
Recebido em:
Exectda:
Recebido em:
in : I should get '30/04/2007 - Vara das Execuções Fiscais Estaduais ' is underlined in the image
Based on the HTML image you provided, it looks like the element text is in the div element and not the span. You will need to extract text from div instead of span. I would replace this block:
textos = scraping.find(class_ = 'espacamentoLinhas')
subtextos = scraping.find_all('span')
for ext in subtextos:
print(ext.get_text())
With this:
elements = self.browser.find_elements_by_xpath("//div[#class='espacamentoLinhas']")
for element in elements:
print(element.text)
span only contains the text 'Recebido em:', and not the text you are looking for, which is 30/04/2007 - Vara das Execuções Fiscais Estaduais. This text is actually contained in the div referenced in the XPath I included.
If you don't want to use self.browser.find_elements_by_xpath, you can probably just remove the scraping.find_all('span') part of your code:
textos = scraping.find(class_ = 'espacamentoLinhas')
for ext in textos:
print(ext.get_text())

Selenium no another page

I am scraping one page but the problem i came up today was that the page didn`t have another page and it gave me the previous page without any error from which i could determine that page was last one..
for ex: https://example/page-7
when i want to go to: https://example/page-8 which doesn`t exist it gives me
the last page: https://example/page-7
How could i determine that https://example/page-7 was the last page using python3???
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import urllib.request
page = "https://www.supermama.lt/forumas/topic/214375-vilma/page-1"
driver = webdriver.Firefox()
driver.get(page)
dd = driver.page_source
for j in range(100):
soup = BeautifulSoup(dd, "html.parser")
my_text = list(soup.findAll("div", class_ = "post-content"))
for i in my_text:
#collect some data
pass
page = "https://www.supermama.lt/forumas/topic/214375-vilma/page{0}".format(j+2)
driver.get(page)
dd = driver.page_source
At first i was thinking about checking dublicates of collected data but this is too slow cause i have 30 000 links from which i have to collect data. Maybe there is easier solution??
Found the answer to my own question.
To find the page url just use driver.current_url
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import urllib.request
page = "https://www.supermama.lt/forumas/topic/214375-vilma/"
driver = webdriver.Firefox()
driver.get(page)
dd = driver.page_source
current_pages = []
for j in range(100):
page_url = driver.current_url
if(page_url not in current_pages):
current_pages.append(page_url)
soup = BeautifulSoup(dd, "html.parser")
my_text = list(soup.findAll("div", class_ = "post-content"))
for i in my_text:
#collect some data
pass
page = "https://www.supermama.lt/forumas/topic/214375-vilma/page-{0}".format(j+2)
driver.get(page)
dd = driver.page_source
else:
print(current_pages)
driver.quit()
break

Categories