Get element text with a partial string match using Selenium (Python) - python

I am trying to extract the text from within a <strong> tag that is deeply nested in the HTML content of this webpage: https://www.marinetraffic.com/en/ais/details/ships/imo:9854612
For example:
The strong tag is the only one on the webpage that will contain the string 'cubic meters'.
My objective is to extract the entire text, i.e., "138124 cubic meters Liquid Gas". When I try the following, I get an error:
url = "https://www.marinetraffic.com/en/ais/details/ships/imo:9854612"
driver.get(url)
time.sleep(3)
element = driver.find_element_by_link_text("//strong[contains(text(),'cubic meters')]").text
print(element)
Error:
NoSuchElementException: Message: no such element: Unable to locate element: {"method":"link text","selector":"//strong[contains(text(),'cubic meters')]"}
What am I doing wrong here?
The following also throws an error:
element = driver.find_element_by_xpath("//strong[contains(text(),'cubic')]").text

Your code works on Firefox(), but not on Chrome().
The page uses lazy loading, so you have to scroll to Summary and then it loads the text with the expected strong.
I used a little slower method - I search all
elements with class='lazyload-wrapper, and in the loop scroll to the item and check if there is strong. If there isn't any strong, then I scroll to the next class='lazyload-wrapper.
from selenium import webdriver
import time
#driver = webdriver.Firefox()
driver = webdriver.Chrome()
url = "https://www.marinetraffic.com/en/ais/details/ships/imo:9854612"
driver.get(url)
time.sleep(3)
from selenium.webdriver.common.action_chains import ActionChains
actions = ActionChains(driver)
elements = driver.find_elements_by_xpath("//span[#class='lazyload-wrapper']")
for number, item in enumerate(elements):
print('--- item', number, '---')
#print('--- before ---')
#print(item.text)
actions.move_to_element(item).perform()
time.sleep(0.1)
#print('--- after ---')
#print(item.text)
try:
strong = item.find_element_by_xpath("//strong[contains(text(), 'cubic')]")
print(strong.text)
break
except Exception as ex:
#print(ex)
pass
Result:
--- item 0 ---
--- item 1 ---
--- item 2 ---
173400 cubic meters Liquid Gas
The result shows that I could use elements[2] to skip two elements, but I wasn't sure if this text will be always in the third element.
Before I created my version I tested other versions and here is the full working code:
from selenium import webdriver
import time
#driver = webdriver.Firefox()
driver = webdriver.Chrome()
url = "https://www.marinetraffic.com/en/ais/details/ships/imo:9854612"
driver.get(url)
time.sleep(3)
def test0():
elements = driver.find_elements_by_xpath("//strong")
for item in elements:
print(item.text)
print('---')
item = driver.find_element_by_xpath("//strong[contains(text(), 'cubic')]")
print(item.text)
def test1a():
from selenium.webdriver.common.action_chains import ActionChains
actions = ActionChains(driver)
element = driver.find_element_by_xpath("//div[contains(#class,'MuiTypography-body1')][last()]//div")
actions.move_to_element(element).build().perform()
text = element.text
print(text)
def test1b():
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(0.5)
text = driver.find_element_by_xpath("//div[contains(#class,'MuiTypography-body1')][last()]//strong").text
print(text)
def test2():
from bs4 import BeautifulSoup
import re
soup = BeautifulSoup(driver.page_source, "html.parser")
soup.find_all(string=re.compile(r"\d+ cubic meters"))
def test3():
from selenium.webdriver.common.action_chains import ActionChains
actions = ActionChains(driver)
elements = driver.find_elements_by_xpath("//span[#class='lazyload-wrapper']")
for number, item in enumerate(elements, 1):
print('--- number', number, '---')
#print('--- before ---')
#print(item.text)
actions.move_to_element(item).perform()
time.sleep(0.1)
#print('--- after ---')
#print(item.text)
try:
strong = item.find_element_by_xpath("//strong[contains(text(), 'cubic')]")
print(strong.text)
break
except Exception as ex:
#print(ex)
pass
#test0()
#test1a()
#test1b()
#test2()
test3()

You can use Beautiful Soup for this, and more precisely the string argument; from the documentation, "you can search for strings instead of tags".
As an argument, you can also pass a regex pattern.
>>> from bs4 import BeautifulSoup
>>> import re
>>> soup = BeautifulSoup(driver.page_source, "html.parser")
>>> soup.find_all(string=re.compile(r"\d+ cubic meters"))
['173400 cubic meters Liquid Gas']
If you're sure there is only one result, or you need just the first, you can also use find instead of find_all.

Your XPath expression is correct and works in Chrome. You get NoSuchElementException, because the element is not loaded within the 3 seconds you wait and does not exist.
To wait for the element, use the WebDriverWait class. It waits explicitly for a specific condition of the element, and in your case presents is enough.
In the code below, Selenium will wait for the element to be presented in the HTML for 10 seconds, polling every 500 milliseconds. You can read about WebDriverWait and conditions here.
Some useful information:
Not visible elements return an empty string. In such a case you need to wait for the visibility of the element, or if the element requires a scroll to scroll to it (example added).
You can also get the text from a not-visible element using JavaScript.
from selenium.webdriver.common.by import By
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec
from selenium import webdriver
url = "https://www.marinetraffic.com/en/ais/details/ships/imo:9854612"
locator = "//strong[contains(text(),'cubic meters')]"
with webdriver.Chrome() as driver: # Type: webdriver
wait = WebDriverWait(driver, 10)
driver.get(url)
cubic = wait.until(ec.presence_of_element_located((By.XPATH, locator))) # Type: WebElement
print(cubic.text)
# The below examples are just for information
# and are not needed for the case
# Example with scroll. Scroll to the element to make it visible
cubic.location_once_scrolled_into_view
print(cubic.text)
# Example using JavaScript. Works for not visible elements.
text = driver.execute_script("return arguments[0].textContent", cubic)
print(text)
It would be correct to use the marinetraffic API.

I guess you should first scroll to that element and only after that try accessing it including getting it text.
from selenium.webdriver.common.action_chains import ActionChains
actions = ActionChains(driver)
element = driver.find_element_by_xpath("//div[contains(#class,'MuiTypography-body1')][last()]//div")
actions.move_to_element(element).build().perform()
text = element.text
In case the above still not good enough you can scroll page height one time like this:
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(0.5)
the_text = driver.find_element_by_xpath("//div[contains(#class,'MuiTypography-body1')][last()]//strong").text

Related

How print as text an Selenium html element?

I'm making an Selenium project for fun. I want see all football score on my terminal. I use Selenium for scraping. But I cannot print the scraped element. How can I fix that?
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
import os
from bs4 import BeautifulSoup
import lxml
team_both = []
team_one = []
team_two = []
team_id = []
driver = webdriver.Chrome(ChromeDriverManager().install())
time.sleep(1)
#os.system('clear')
time.sleep(1)
driver.get('https://www.bet365.de/#/IP/B1')
time.sleep(1)
try:
driver.find_element_by_class_name('iip-IntroductoryPopup_Cross').click()
except:
pass
time.sleep(1)
# Scroll to bottom
element = driver.find_element_by_class_name("ovm-OverviewScroller-enabled")
actions = ActionChains(driver)
actions.move_to_element(element).click_and_hold().perform()
soup = BeautifulSoup(driver.page_source, 'lxml')
time.sleep(5)
os.system('clear')
#ovm-FixtureDetailsTwoWay_TeamName
teams = soup.find_all('div' , {"class" : "ovm-FixtureDetailsTwoWay_TeamName "})
for i in teams:
print(i.text)
The simplest way to extract a text from web element is by applying the .text method, but since you are using find_all method, the team_both is not a single web element rather a list of web elements.
In order to get texts from the web elements in a list you have to iterate over the elements in the list and extract text from each element, as following:
team_both = soup.find_all('div', {"class" : "ovm-FixtureDetailsTwoWay_TeamName"})
for team in team_both:
print(team.text)

How to scroll correctly in a dynamically-loading webpage with Selenium?

Here's the link of the website : website
I would like to have all the links of th hotels in this location.
Here's my script :
import pandas as pd
import numpy as np
from selenium import webdriver
import time
PATH = "driver\chromedriver.exe"
options = webdriver.ChromeOptions()
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1200,900")
options.add_argument('enable-logging')
driver = webdriver.Chrome(options=options, executable_path=PATH)
driver.get('https://fr.hotels.com/search.do?destination-id=10398359&q-check-in=2021-06-24&q-check-out=2021-06-25&q-rooms=1&q-room-0-adults=2&q-room-0-children=0&sort-order=BEST_SELLER')
cookie = driver.find_element_by_xpath('//button[#class="uolsaJ"]')
try:
cookie.click()
except:
pass
for i in range(30):
driver.execute_script("window.scrollBy(0, 1000)")
time.sleep(5)
time.sleep(5)
my_elems = driver.find_elements_by_xpath('//a[#class="_61P-R0"]')
links = [my_elem.get_attribute("href") for my_elem in my_elems]
X = np.array(links)
print(X.shape)
#driver.close()
But I cannot find a way to tell the script : scroll down until there is nothing more to scroll.
I tried to change this parameters :
for i in range(30):
driver.execute_script("window.scrollBy(0, 1000)")
time.sleep(30)
I changed the time.sleep(), the number 1000 and so on but my output keep changing and not in the right way.
output
As you can see, I have scraped a lot of numbers differents. How to make my script scraping a same amout each time ? Not necessarily each links but at last a stable number.
Here it scroll and at one point it seems blocked and scrape all the links it has at the moment. That's not appropriate.
There are several issues here.
You are getting the elements and their links only AFTER you finished scrolling while you should do that inside the scrolling loop.
You should wait until the cookies alert is appearing to close it.
You can scroll until the footer element is presented.
Something like this:
import pandas as pd
import numpy as np
from selenium import webdriver
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
PATH = "driver\chromedriver.exe"
options = webdriver.ChromeOptions()
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1200,900")
options.add_argument('enable-logging')
driver = webdriver.Chrome(options=options, executable_path=PATH)
wait = WebDriverWait(driver, 20)
driver.get('https://fr.hotels.com/search.do?destination-id=10398359&q-check-in=2021-06-24&q-check-out=2021-06-25&q-rooms=1&q-room-0-adults=2&q-room-0-children=0&sort-order=BEST_SELLER')
wait.until(EC.visibility_of_element_located((By.XPATH, '//button[#class="uolsaJ"]'))).click()
def is_element_visible(xpath):
wait1 = WebDriverWait(driver, 2)
try:
wait1.until(EC.visibility_of_element_located((By.XPATH, xpath)))
return True
except Exception:
return False
while not is_element_visible("//footer[#id='footer']"):
my_elems = driver.find_elements_by_xpath('//a[#class="_61P-R0"]')
links = [my_elem.get_attribute("href") for my_elem in my_elems]
X = np.array(links)
print(X.shape)
driver.execute_script("window.scrollBy(0, 1000)")
time.sleep(5)
#driver.close()
You can try this by directly calling the DOM and locate some element that will be only at the bottom of the page with .is_displayed() selenium method which returns true/false:
# https://stackoverflow.com/a/57076690/15164646
while True:
# it will be returning false until the element is located
# "#message" id = "No more results" at the bottom of the YouTube search
end_result = driver.find_element_by_css_selector('#message').is_displayed()
driver.execute_script("var scrollingElement = (document.scrollingElement || document.body);scrollingElement.scrollTop = scrollingElement.scrollHeight;")
# further code below
# once the element is found it returns True. If so, it will break out of the while loop
if end_result == True:
break
I wrote a blog post where I used this method to scrape YouTube Search.

Next Page Iteration in Selenium/BeautfulSoup for Scraping E-Commerce Website

I'm scraping an E-Commerce website, Lazada using Selenium and bs4, I manage to scrape on the 1st page but I unable to iterate to the next page. What I'm tyring to achieve is to scrape the whole pages based on the categories I've selected.
Here what I've tried :
# Run the argument with incognito
option = webdriver.ChromeOptions()
option.add_argument(' — incognito')
driver = webdriver.Chrome(executable_path='chromedriver', chrome_options=option)
driver.get('https://www.lazada.com.my/')
driver.maximize_window()
# Select category item #
element = driver.find_elements_by_class_name('card-categories-li-content')[0]
webdriver.ActionChains(driver).move_to_element(element).click(element).perform()
t = 10
try:
WebDriverWait(driver,t).until(EC.visibility_of_element_located((By.ID,"a2o4k.searchlistcategory.0.i0.460b6883jV3Y0q")))
except TimeoutException:
print('Page Refresh!')
driver.refresh()
element = driver.find_elements_by_class_name('card-categories-li-content')[0]
webdriver.ActionChains(driver).move_to_element(element).click(element).perform()
print('Page Load!')
#Soup and select element
def getData(np):
soup = bs(driver.page_source, "lxml")
product_containers = soup.findAll("div", class_='c2prKC')
for p in product_containers:
title = (p.find(class_='c16H9d').text)#title
selling_price = (p.find(class_='c13VH6').text)#selling price
try:
original_price=(p.find("del", class_='c13VH6').text)#original price
except:
original_price = "-1"
if p.find("i", class_='ic-dynamic-badge ic-dynamic-badge-freeShipping ic-dynamic-group-2'):
freeShipping = 1
else:
freeShipping = 0
try:
discount = (p.find("span", class_='c1hkC1').text)
except:
discount ="-1"
if p.find(("div", {'class':['c16H9d']})):
url = "https:"+(p.find("a").get("href"))
else:
url = "-1"
nextpage_elements = driver.find_elements_by_class_name('ant-pagination-next')[0]
np=webdriver.ActionChains(driver).move_to_element(nextpage_elements).click(nextpage_elements).perform()
print("- -"*30)
toSave = [title,selling_price,original_price,freeShipping,discount,url]
print(toSave)
writerows(toSave,filename)
getData(np)
The problem might be that the driver is trying to click the button before the element is even loaded correctly.
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Chrome(PATH, chrome_options=option)
# use this code after driver initialization
# this is make the driver wait 5 seconds for the page to load.
driver.implicitly_wait(5)
url = "https://www.lazada.com.ph/catalog/?q=phone&_keyori=ss&from=input&spm=a2o4l.home.search.go.239e359dTYxZXo"
driver.get(url)
next_page_path = "//ul[#class='ant-pagination ']//li[#class=' ant-pagination-next']"
# the following code will wait 5 seconds for
# element to become clickable
# and then try clicking the element.
try:
next_page = WebDriverWait(driver, 5).until(
EC.element_to_be_clickable((By.XPATH, next_page_path)))
next_page.click()
except Exception as e:
print(e)
EDIT 1
Changed the code to make the driver wait for the element to become clickable. You can add this code inside a while loop for iterating multiple times and break the loop if the button is not found and is not clickable.

How to avoid StaleElementReferenceError when getting elements from different page?

I want to get all the results from a race. The website shows 50 rows/page.
I navigate to the next page (same URL with suffix #page-x) using selenium, but I get a StaleElementReferenceException error whenever I try to find elements (cells of the table = td) on the next page.
I tried to close the driver between the steps to get just one list of elements at a time. I've also tried to load the pages separately with the URL+suffix, but it doesn't load correctly. I've tried building separate lists (at first I wanted one big list with all the results).
from selenium import webdriver
url = "https://tickets.justrun.ca/quidchrono.php?a=qcResult&raceid=8444"
#The block under works well and I get a list of cells as intended.
driver = webdriver.Chrome()
driver.maximize_window()
driver.get(url)
elements = driver.find_elements_by_tag_name("td")
course = []
for i in range(len(elements)):
course.append(elements[i].text)
to_2 = driver.find_element_by_link_text("2")
to_2.click()
print(driver.current_url)
#I'm trying similar code for the next chunk, but it doesn't work.
elements2 = driver.find_elements_by_tag_name("td")
print(len(elements2))
print(elements2[5].text)
course2 = []
for i in range(len(elements2)):
course2.append(elements2[i].text)
driver.close()
I would expect a new list (course2), with the results of the second page, but I get a stale element error. When I print the current URL, the result is as expected. When I print the len(elements2), it's also OK. Looks like the problem is when I try to get the text of an element.
Solution-1:
Using BeautifulSoup and selenium, WebDriverWait is waiting for a certain condition to occur before proceeding further in the code. for more details about BeautifulSoup.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
url = "https://tickets.justrun.ca/quidchrono.php?a=qcResult&raceid=8444"
driver = webdriver.Chrome()
driver.get(url)
data = []
while True:
course = []
WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, "tableJustrun")))
page_soup = BeautifulSoup(driver.page_source, 'lxml')
# get table data
tbody = page_soup.find("tbody",{"id":"searchResultBoxParticipants"})
rows = tbody.find_all("tr")
for row in rows:
rowData = []
for td in row.find_all("td"):
rowData.append(td.text)
course.append(rowData)
data.append(course)
try:
pagination = driver.find_element_by_class_name("simple-pagination")
next_page = pagination.find_element_by_link_text("Suivant")
# iterate next page
next_page.click()
except Exception as e:
break
print(data)
Solution-2:
Using pandas library.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
url = "https://tickets.justrun.ca/quidchrono.php?a=qcResult&raceid=8444"
driver = webdriver.Chrome()
driver.get(url)
data = []
while True:
WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, "tableJustrun")))
tables = pd.read_html(driver.page_source)
#append Participants table data
data.append(tables[0])
try:
pagination = driver.find_element_by_class_name("simple-pagination")
next_page = pagination.find_element_by_link_text("Suivant")
# iterate next page
next_page.click()
except Exception as e:
break
#Concat dataframe object
result = pd.concat(data)
print(result)

Selenium cannot get all elements of a page

i am using selenium to go search on agoda and scrape all the hotel name in the page, but the output only return 2 names.
Then i tried to add a line to scroll to the bottom, now the output gives me first 2 names and last 2 names (first two from beginning, last two from bottom)
I don't understand what's the problem, i added time.sleep() for each step so the whole page should have been loaded completely. Does selenium limit by page view that it can only scrape those element in sight?
my code below:
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(30)
def scrape():
r = requests.get(current_page)
if r.status_code == requests.codes.ok:
print('start scraping!')
hotel = driver.find_elements_by_class_name('hotel-name')
hotels = []
for h in hotel:
if hotel:
hotels.append(h.text)
print(hotels, file=open("output.txt", 'a', encoding="utf-8"))
scrape()
Here is the page i want to scrape
Try to use below script to scroll page down until no more results appeared on page and then scrape all available names:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait as wait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Chrome()
driver.maximize_window()
driver.get('https://www.agoda.com/pages/agoda/default/DestinationSearchResult.aspx?asq=8wUBc629jr0%2B3O%2BxycijdcaVIGtokeWrEO7ShJumN8xsNvkFkEV9bUgNnbx6%2Bx22ncbzTLOPBjT84OgAAKXmu6quf8aEKRA%2FQH%2BGoyXgowLt%2BXyB8OpN1h2WP%2BnBM%2FwNPzD%2BpaeII93w%2Bs4dMWI4QPJNbZJ8DWvRiPsrPVVBJY7ilpMPlUermwV1UKIKfuyeis3BqRkJh9FzJOs0E98zXQ%3D%3D&city=9590&cid=-142&tick=636818018163&languageId=20&userId=3c2c4cb9-ba6d-4519-8ef4-c85dfd280b8f&sessionId=d4qzq2tgymjrwsf22lnadxpc&pageTypeId=1&origin=HK&locale=zh-TW&aid=130589&currencyCode=HKD&htmlLanguage=zh-tw&cultureInfoName=zh-TW&ckuid=3c2c4cb9-ba6d-4519-8ef4-c85dfd280b8f&prid=0&checkIn=2019-01-16&checkOut=2019-01-17&rooms=1&adults=2&children=0&priceCur=HKD&los=1&textToSearch=%E5%A4%A7%E9%98%AA&productType=-1&travellerType=1')
# Get initial list of names
hotels = wait(driver, 15).until(EC.presence_of_all_elements_located((By.CLASS_NAME, 'hotel-name')))
while True:
# Scroll down to last name in list
driver.execute_script('arguments[0].scrollIntoView();', hotels[-1])
try:
# Wait for more names to be loaded
wait(driver, 15).until(lambda driver: len(wait(driver, 15).until(EC.presence_of_all_elements_located((By.CLASS_NAME, 'hotel-name')))) > len(hotels))
# Update names list
hotels = wait(driver, 15).until(EC.presence_of_all_elements_located((By.CLASS_NAME, 'hotel-name')))
except:
# Break the loop in case no new names loaded after page scrolled down
break
# Print names list
print([hotel.text for hotel in hotels])

Categories