Unable to scrape table using selenium - python

I'm trying to scrape the following webpage using selenium: https://www.stakingrewards.com/cryptoassets/, but I keep getting the same error. Does anyone know what I'm doing wrong? It seems like it can't find the class name for some reason. The error I am getting is the following:
NoSuchElementException: Message: no such element: Unable to locate element: {"method":"css selector","selector":".table-wrap"}
from bs4 import BeautifulSoup
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
options = webdriver.ChromeOptions()
options.add_argument('headless')
options.add_argument("--enable-javascript")
options.add_argument('--no-sandbox')
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)
driver.get(f'https://www.stakingrewards.com/cryptoassets/')
driver.implicitly_wait(10)
element = driver.find_element_by_class_name("table-wrap")
html = BeautifulSoup(driver.execute_script("return arguments[0].innerHTML;", element), 'html.parser')
for item in html.findAll('div', {"class": "rt-tr-group"}):
    print(item.text)

I always find elements using the full XPath. For me it has been a lot more reliable, and since there can be multiple occurrences of a class name, I recommend not using that. What you can do is right-click the webpage and choose Inspect, find the element you're looking for, right-click the element, and choose Copy full XPath.
Replace:
element = driver.find_element_by_class_name("table-wrap")
with:
element = driver.find_element_by_xpath('/html/body/div[1]/section/section/main/div/div/section/section[2]')
Furthermore, you are trying to act on the driver's output with BeautifulSoup; note that BeautifulSoup needs HTML markup (for example driver.page_source), not the driver object or a URL. I am unfamiliar with BeautifulSoup, though, and can't help you much there :/
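Put together, a rough sketch of that with an explicit wait and the BeautifulSoup parsing from the question (untested against the live site; the copied XPath and the rt-tr-group class will break if the page layout changes):
element = WebDriverWait(driver, 20).until(
    EC.presence_of_element_located(
        (By.XPATH, '/html/body/div[1]/section/section/main/div/div/section/section[2]')
    )
)
# hand the rendered table markup to BeautifulSoup and walk the row groups
html = BeautifulSoup(element.get_attribute("innerHTML"), "html.parser")
for item in html.find_all("div", {"class": "rt-tr-group"}):
    print(item.text)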

Related

Scrape #shadow-root (open)

I'm trying to scrape href tags that are hidden under a #shadow-root (open) on this website: https://iltacon2022.expofp.com/. I'm new to coding and was wondering if someone could help me out.
Here is the code I've been using to try to access the #shadow-root (open), but I'm out of my depth on what to do next.
Code:
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
url = "https://iltacon2022.expofp.com/"
options = webdriver.ChromeOptions()
options.add_argument("start-maximized")
options.add_experimental_option("detach", True)
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()),options=options)
driver.get(url)
time.sleep(6)
root1 = driver.find_element(By.XPATH,"/html/body/div[1]/div").shadow_root
root2 = driver.find_element(By.XPATH,"/html/body/div[1]/div//div")
print(root2)
driver.quit()
error:
selenium.common.exceptions.NoSuchElementException: Message: no such element: Unable to locate element: {"method":"xpath","selector":"/html/body/div[1]/div//div"}
Desired output:
?access-corp
?accruent-inc
?aceds-association-of-certified-e-discovery-specialists
?actionstep
?aderant
etc.
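One note on the code above: root1 is a Selenium 4 ShadowRoot object, and lookups inside it have to go through that object using CSS selectors (chromedriver rejects XPath inside a shadow root). A minimal sketch of that pattern, where the plain "a" selector is only an assumption about where the href tags live and may need adjusting:
host = driver.find_element(By.XPATH, "/html/body/div[1]/div")  # shadow host, as in the question
root = host.shadow_root
# search inside the shadow root itself, with a CSS selector (XPath is not supported here)
links = root.find_elements(By.CSS_SELECTOR, "a")  # assumed location of the href tags
for link in links:
    print(link.get_attribute("href"))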

Extracting an HTML table into a pandas DataFrame using Selenium

I'm trying to read an HTML table and extract it into a pd.DataFrame, but instead I'm getting something different. What am I doing wrong?
The output is: [<selenium.webdriver.remote.webelement.WebElement (session="38159852443c19167a9033a2b078fe45", element="ef6a42a1-2775-44c1-955f-5f01870bc758")>]
Here is my code:
import pandas as pd
import time
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
options = Options()
options.add_argument('--headless')
options.add_argument('--disable-gpu')
driver = webdriver.Chrome(executable_path = 'mypath/chromedriver.exe', options = options)
driver.get("https://ai.fmcsa.dot.gov/SMS")
wait = WebDriverWait(driver, 20)
wait.until(EC.element_to_be_clickable((By.XPATH, "//a[@title='Close']"))).click()
wait.until(EC.element_to_be_clickable((By.XPATH, "(//input[@name='MCSearch'])[2]"))).send_keys('1818437')
wait.until(EC.element_to_be_clickable((By.XPATH, "(//input[@name='search'])[2]"))).click()
wait.until(EC.element_to_be_clickable((By.XPATH, "//*[@id='BASICs']/p[2]/a"))).click()
tables=WebDriverWait(driver,20).until(EC.presence_of_all_elements_located((By.XPATH,'//*[@id="BASICs"]/table/tbody/tr[2]')))
print(tables)
Disregard the bunch of extra imports; I've been trying to approach the problem in different ways but keep failing.
I might have solved it, actually.
I changed the XPATH of the table from
'//*[#id="BASICs"]/table/tbody/tr[2]'
to
"//tr[#class='valueRow sumData']"
and I just realized I was printing the element and not its content so I changed the last line from
print(tables) to print(tables.text)
Now it's printing the table but for some reason it's not printing the very last value (0.05 in this case). Any ideas why?
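As for the original goal of a DataFrame, one option is to hand the whole table's outerHTML to pandas instead of walking the rows manually. A rough sketch, assuming the rows sit in a regular <table> under the BASICs element from your original XPath:
from io import StringIO

table = WebDriverWait(driver, 20).until(
    EC.presence_of_element_located((By.XPATH, "//*[@id='BASICs']/table"))
)
# read_html parses every <table> in the markup and returns a list of DataFrames
df = pd.read_html(StringIO(table.get_attribute("outerHTML")))[0]
print(df)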

Scraping flex-element Selenium Python

I am trying to scrape some tennis statistics starting from 01-01-2019.
For this I try to scrape the following webpage with selenium: https://www.sofascore.com/de/tennis/2019-01-01
When I click on the first match manually the container on the right side changes and shows the statistics.
This is what I want to access automatically.
When I try to click on the element with selenium it redirects me to another page.
Can anyone tell me why it is not just showing the same content as by manually clicking and how I can solve this issue?
Here is my code:
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait as wait
from selenium.webdriver.support import expected_conditions as EC
import time
options = Options()
options.binary_location = "C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe"
browser = webdriver.Chrome(chrome_options = options)
url = 'https://www.sofascore.com/de/tennis/2019-01-01'
browser.get(url)
browser.maximize_window()
xpath = '/html/body/div[1]/main/div/div[2]/div/div[3]/div[2]/div/div/div/div/div[2]/a/div'
browser.find_element_by_xpath(xpath).click()
time.sleep(2)
browser.close()
You can use the below xpath:
//div[contains(@class, 'Col-pm5mcz-')]//descendant::div[contains(@class, 'styles__StyledWidget-')]
and get the innerHTML of that using the get_attribute method.
Code :
url = "https://www.sofascore.com/de/tennis/2019-01-01"
driver.get(url)
xpath = '/html/body/div[1]/main/div/div[2]/div/div[3]/div[2]/div/div/div/div/div[2]/a/div'
driver.find_element_by_xpath(xpath).click()
sleep(2)
details = driver.find_element_by_xpath("//div[contains(@class, 'Col-pm5mcz-')]//descendant::div[contains(@class, 'styles__StyledWidget-')]").get_attribute('innerHTML')
print(details)
The xpath that you are using is an absolute xpath: /html/body/div[1]/main/div/div[2]/div/div[3]/div[2]/div/div/div/div/div[2]/a/div
Try to replace that with a relative xpath.
See if this works:
tableRows = driver.find_elements_by_xpath(".//div[@class='ReactVirtualized__Grid ReactVirtualized__List']//following::div/a[contains(@class,'EventCellstyles__Link')]")
for e in tableRows:
    e.click()
    # You can add an implicit wait here for the statistics section to load
    driver.find_element_by_xpath(".//a[text()='Statistiken']").click()

Crawling JavaScript site with selenium (python) returns error: Message: no such element: Unable to locate element:

I am new to Python and web crawling in general. I started with BeautifulSoup but quickly learned that sites that use JavaScript can't be crawled with bs4, so I started using Selenium. Selenium, however, also returns an error and can't find the elements (search box) I am trying to scrape. So far I have also learned that the page I am trying to crawl probably uses Angular, which somehow hides the elements I am looking for. Is there a way I could still use Selenium or another package to enter search queries and crawl the site?
Any element I try to find can't be found; I've also tried finding them via xpath or name, without luck. I believe anything inside <app-root></app-root> can't be found simply with Selenium.
Here is my code so far
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.expected_conditions import presence_of_element_located
import time
import sys
chrome_driver_path = "path"
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument('--no-sandbox')
webdriver = webdriver.Chrome(
    executable_path=chrome_driver_path,
    options=chrome_options
)
useBaseURL = "https://ec.europa.eu/info/funding-tenders/opportunities/portal/screen/home"
with webdriver as driver:
    # timeout
    wait = WebDriverWait(driver, 10)
    driver.get(useBaseURL)
    searchbox = driver.find_element_by_class_name("ng-tns-c6-0 ui-inputtext ui-widget ui-state-default ui-corner-all ui-autocomplete-input ng-star-inserted")
    driver.close()
The following sends keys to that element. Your error was caused by using a compound class name (several classes separated by spaces) where find_element_by_class_name expects a single class. I also added the next click.
driver.get(useBaseURL)
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, " p-autocomplete > span > input"))).send_keys("AAA")
driver.find_element_by_css_selector('button.btn.btn-accent.btn-search').click()
Import
from selenium.webdriver.support import expected_conditions as EC
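As a side note, if you do want to keep those classes, a compound class string can be turned into a single CSS selector by joining the classes with dots. A sketch using the class names from the question (they look auto-generated by Angular and will likely change between builds, which is why the structural selector above is more robust):
# find_element_by_class_name accepts a single class only;
# chain multiple classes in one CSS selector instead
searchbox = driver.find_element(
    By.CSS_SELECTOR,
    ".ng-tns-c6-0.ui-inputtext.ui-widget.ui-state-default"
    ".ui-corner-all.ui-autocomplete-input.ng-star-inserted"
)
searchbox.send_keys("AAA")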

BeautifulSoup scraping from a web page already opened by Selenium

I would like to scrape a web page which was opened by Selenium from a different webpage.
I entered a search term into a website using Selenium and this landed me on a new page. My aim is to create soup out of this new page. But the soup is getting created out of the previous page where I entered my search term. Help please!
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
driver = webdriver.Firefox()
driver.get('http://www.ratestar.in/')
inputElement = driver.find_element_by_css_selector("#txtStock")
inputElement.send_keys('GM Breweries')
inputElement.send_keys(Keys.ENTER)
driver.wait.until(staleness_of('txtStock')
source = driver.page_source
soup = BeautifulSoup(source)
You need to know the exact company name for your search. After you use send_keys, you try to check for staleness of an element; I did not understand how that statement is supposed to work. I added a WebDriverWait for an element of the new page.
The following works for me regarding the selenium part, up to getting the page source:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
driver = webdriver.Firefox()
driver.get('http://www.ratestar.in/')
inputElement = driver.find_element_by_css_selector("#txtStock")
inputElement.send_keys('GM Breweries Ltd.')
inputElement.send_keys(Keys.ENTER)
company = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, 'lblCompany')))
source = driver.page_source
You should add exception handling.
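A rough sketch of what that exception handling could look like, assuming a TimeoutException on the wait is the main failure mode:
from selenium.common.exceptions import TimeoutException

try:
    company = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, 'lblCompany'))
    )
    source = driver.page_source
except TimeoutException:
    # the results page never loaded; stop cleanly instead of failing later
    print("Timed out waiting for the company page")
    driver.quit()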
@Jens Dibbern has given a working solution. But it is not necessary to give the exact name of the company in the search. What happens is that when you type a non-exact name, a drop-down will pop up.
I have observed that unless this drop-down is present, the enter key does not work. You can check this by going to the site, pasting the name and, without waiting, pressing the enter key as fast as possible. Nothing happens.
You could also wait for this drop-down to be visible instead and then send the enter key. This also works perfectly. Note that this will end up selecting the first item in the drop-down if more than one is present.
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Firefox()
driver.get('http://www.ratestar.in/')
inputElement = driver.find_element_by_css_selector("#txtStock")
inputElement.send_keys('GM Breweries')
drop_down=driver.find_element_by_css_selector("#listPlacementStock")
WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, '#listPlacementStock:not([style*="display: none"])')))
inputElement.send_keys(Keys.ENTER)
WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, '//*[@id="CompanyLink"]')))
source = driver.page_source
soup = BeautifulSoup(source,'html.parser')
print(soup)
