Extracting an HTML table into a pandas DataFrame using Selenium - Python

I'm trying to read an HTML table and extract it into a pd.DataFrame, but instead I'm getting something different. What am I doing wrong?
Instead of the table, the output is: [<selenium.webdriver.remote.webelement.WebElement (session="38159852443c19167a9033a2b078fe45", element="ef6a42a1-2775-44c1-955f-5f01870bc758")>]
Here is my code:
import pandas as pd
import time
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
options = Options()
options.add_argument('--headless')
options.add_argument('--disable-gpu')
driver = webdriver.Chrome(executable_path='mypath/chromedriver.exe', options=options)
driver.get("https://ai.fmcsa.dot.gov/SMS")
wait = WebDriverWait(driver, 20)
wait.until(EC.element_to_be_clickable((By.XPATH, "//a[@title='Close']"))).click()
wait.until(EC.element_to_be_clickable((By.XPATH, "(//input[@name='MCSearch'])[2]"))).send_keys('1818437')
wait.until(EC.element_to_be_clickable((By.XPATH, "(//input[@name='search'])[2]"))).click()
wait.until(EC.element_to_be_clickable((By.XPATH, "//*[@id='BASICs']/p[2]/a"))).click()
tables = WebDriverWait(driver, 20).until(EC.presence_of_all_elements_located((By.XPATH, '//*[@id="BASICs"]/table/tbody/tr[2]')))
print(tables)
Disregard the extra imports; I've been trying to approach the problem in different ways but keep failing.

I might have solved it, actually.
I changed the XPath of the table from
'//*[@id="BASICs"]/table/tbody/tr[2]'
to
"//tr[@class='valueRow sumData']"
and I just realized I was printing the element rather than its content, so I changed the last line from print(tables) to print(tables.text).
Now it's printing the table, but for some reason it's not printing the very last value (0.05 in this case). Any ideas why?
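A likely reason for the missing last value: WebElement.text only returns text that is currently rendered as visible, so a hidden or off-screen cell comes back empty; reading the textContent attribute instead usually recovers it. And since the goal is a pandas DataFrame anyway, handing the rendered page to pd.read_html may be simpler than walking the cells by hand. A minimal sketch, reusing the driver and the locator from the update above:
from io import StringIO
import pandas as pd
# .text skips non-visible nodes; textContent returns the full cell contents.
row = driver.find_element(By.XPATH, "//tr[@class='valueRow sumData']")
print(row.get_attribute("textContent"))
# Alternatively, let pandas parse every <table> in the rendered page and
# pick the one you need by inspection.
frames = pd.read_html(StringIO(driver.page_source))
print(frames[0])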

Retrieving specific matches from a list in python

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from time import sleep
from datetime import datetime
import pandas as pd
import warnings
import os
os.chdir('C:/Users/paulc/Documents/Medium Football')
warnings.filterwarnings('ignore')
base_url = 'https://www.sportingindex.com/spread-betting/football/international-world-cup'
option = Options()
option.headless = False
driver = webdriver.Chrome("C:/Users/paulc/Documents/Medium Football/chromedriver.exe",options=option)
driver.get(base_url)
links = [elem.get_attribute("href") for elem in driver.find_elements(By.TAG_NAME,"a")]
This code retrieves all the href links on the page. I want to search the links list and return only the matches that start with 'https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a', using:
[x for x in links if x.startswith('https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a')]
However, I get: AttributeError: 'NoneType' object has no attribute 'startswith'
Help is appreciated.
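The immediate cause of the AttributeError is that some a elements have no href attribute, so get_attribute("href") returns None for them. A minimal None-safe filter (the prefix string is copied from the question):
prefix = 'https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a'
# Anchors without an href yield None from get_attribute("href"), which is
# what triggers the AttributeError; filter those out before startswith.
group_a_links = [x for x in links if x and x.startswith(prefix)]
print(group_a_links)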
Instead of collecting all a elements on the page, which yields a lot of irrelevant results, you can use a more precise locator.
So, instead of
driver.find_elements(By.TAG_NAME,"a")
Use this:
driver.find_elements(By.XPATH, "//a[contains(@href,'https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a')]")
This will give you desired elements only.
And this:
links = [elem.get_attribute("href") for elem in driver.find_elements(By.XPATH, "//a[contains(@href,'https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a')]")]
will give you only the wanted links directly.
UPD
In case this gives you an empty list, you are possibly missing a delay. You can simply add a pause before that line, like time.sleep(2), but it's better to use WebDriverWait with expected_conditions (an explicit wait) for that.
I can't check it since my computer blocks that link due to company policy (it's a gambling site), but normally something like this should work:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
wait = WebDriverWait(driver, 10)
links = [elem.get_attribute("href") for elem in wait.until(EC.visibility_of_all_elements_located((By.XPATH, "//a[contains(@href,'https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a')]")))]
The following code filters the page's links to grab only the right ones:
import time
from bs4 import BeautifulSoup
import pandas as pd
from selenium.webdriver.chrome.service import Service
from selenium import webdriver
webdriver_service = Service("./chromedriver") #Your chromedriver path
driver = webdriver.Chrome(service=webdriver_service)
driver.get('https://www.sportingindex.com/spread-betting/football/international-world-cup')
driver.maximize_window()
time.sleep(8)
soup = BeautifulSoup(driver.page_source,"lxml")
for u in soup.select('a[class="gatracking"]'):
    link = 'https://www.sportingindex.com' + u.get('href')
    if '-v-' in link:
        print(link)
Output:
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.24fdf8f5-b69b-4341-b6b4-d27605f7f7fd/spain-v-germany
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.c9bdf787-791a-47e0-b77c-a2d4cf567bfd/cameroon-v-serbia
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.5eddaa44-666b-47dc-8a0f-4ac758de00dc/south-korea-v-ghana
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.70cefd39-60f7-415e-9cb5-7a56acd403d6/brazil-v-switzerland
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.7fe0285e-366f-4f3c-b77f-4c96077a6c71/portugal-v-uruguay
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.dd7a995d-7478-45f8-af27-9f234d37cc76/ecuador-v-senegal
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.92232207-0f1e-4bb1-bacd-1332ef6b9007/netherlands-v-qatar
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.b913620e-69c7-4606-a153-7b48589b7c94/iran-v-usa
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.7a4a18fb-d4ee-4880-849f-f1afdea33cd5/wales-v-england
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.20c098b4-4e97-4fd1-97b0-f42d84424361/australia-v-denmark
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.5a7476e2-8d35-4a8e-8065-b4339e79f395/tunisia-v-france
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.8a869f02-9dd0-49c5-91bd-209ee224fc2a/poland-v-argentina
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.6379b787-f246-4ba4-a896-28a97396d02f/saudi-arabia-v-mexico
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.52737cfd-da19-42dd-b15b-c16c3e8e9a86/canada-v-morocco
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.168fab1f-8360-4e87-ba84-bfbd11a4a207/croatia-v-belgium
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.9fb541f0-43a4-409c-8e54-e34a43965714/costa-rica-v-germany
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.7379c8a7-ab5d-4653-b487-22bf7ff8eefe/japan-v-spain
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.e7e4c6be-98b7-4258-ba40-74c54a790fe1/ghana-v-uruguay
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.e4c18c81-565e-47ce-b08d-9aed62c88a5d/south-korea-v-portugal
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.18f44028-e23d-48d4-970b-e75c164589bd/cameroon-v-brazil
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.526f9b1b-6d95-4f44-abce-e0a6a30acfd4/serbia-v-switzerland
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.24fdf8f5-b69b-4341-b6b4-d27605f7f7fd/spain-v-germany
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.c9bdf787-791a-47e0-b77c-a2d4cf567bfd/cameroon-v-serbia
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.5eddaa44-666b-47dc-8a0f-4ac758de00dc/south-korea-v-ghana
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.70cefd39-60f7-415e-9cb5-7a56acd403d6/brazil-v-switzerland
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.7fe0285e-366f-4f3c-b77f-4c96077a6c71/portugal-v-uruguay
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.24fdf8f5-b69b-4341-b6b4-d27605f7f7fd/spain-v-germany
https://www.sportingindex.com/spread-betting/rugby-union/france-top-14/group_a.ad22f34f-9cd6-47b4-a826-0c0f0dce7df2/lyon-v-toulouse
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.24fdf8f5-b69b-4341-b6b4-d27605f7f7fd/spain-v-germany
https://www.sportingindex.com/spread-betting/rugby-union/france-top-14/group_a.ad22f34f-9cd6-47b4-a826-0c0f0dce7df2/lyon-v-toulouse
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.c9bdf787-791a-47e0-b77c-a2d4cf567bfd/cameroon-v-serbia
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.5eddaa44-666b-47dc-8a0f-4ac758de00dc/south-korea-v-ghana
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.70cefd39-60f7-415e-9cb5-7a56acd403d6/brazil-v-switzerland
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.7fe0285e-366f-4f3c-b77f-4c96077a6c71/portugal-v-uruguay

Selenium: WebElement still doesn't print any text even with WebDriverWait, though it is recognized as a string - Python

As the title says; I couldn't find a solution online. You'll see from the print statements that the program gets the text property from the WebElement object, but it is always an empty string even though I am using WebDriverWait.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
PATH = "C:\\Users\\Anthony\\Desktop\\chromedriver.exe"
driver = webdriver.Chrome(PATH)
website = "https://www.magicspoiler.com"
driver.get(website)
el_name = '//div[@class="set-card-2 pad5"]'
try:
    main = WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.XPATH, el_name)))
    articles = main.find_elements(By.XPATH, el_name)
    for article in articles:
        print(type(article.text))
        print(article.text)
finally:
    driver.quit()
I checked the HTML DOM and I don't see any text in the element you are looking for: they are simply images with anchor links, which is why you are getting blank responses.
As an alternative, I tried extracting the links instead, and that was successful.
So, after refactoring, your code looks like this:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
PATH = "C:\\Users\\Anthony\\Desktop\\chromedriver.exe"
driver = webdriver.Chrome(PATH)
website = "https://www.magicspoiler.com"
driver.get(website)
el_name = '//div[@class="set-card-2 pad5"]/a'
try:
    main = WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.XPATH, el_name)))
    articles = main.find_elements(By.XPATH, el_name)
    for article in articles:
        # print(type(article.text))
        print(article.get_attribute("href"))
finally:
    pass
driver.quit()
Here is the response I got (screenshot of the printed links omitted):
So, please check whether you want the links or really the text. If you want the text, then there is none that I can see in the DOM, as I said. You may have to traverse through each of the links by clicking on it and check whether what you need is on the navigated page.
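If it really is text you are after, a hedged sketch of that traversal (run before the driver.quit() call), collecting the hrefs first since the WebElements go stale on navigation; the h1 locator is purely illustrative, not taken from the site's DOM:
# Collect the hrefs up front: the WebElements go stale once we navigate away.
hrefs = [a.get_attribute("href") for a in articles]
for href in hrefs:
    driver.get(href)
    # "h1" is only a guess at where useful text lives on the landing page;
    # adjust it after inspecting the actual DOM.
    heading = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.TAG_NAME, "h1")))
    print(href, "->", heading.text)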

Find a value from a div where multiple divs have the same name

I want to find the value of TTM EPS from the link https://www.moneycontrol.com/india/stockpricequote/computers-software/infosys/IT
I wrote the following code:
import os
from webdriver_manager.chrome import ChromeDriverManager
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
options = Options()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--start-maximized')
options.page_load_strategy = 'eager'
driver = webdriver.Chrome(options=options)
company = "Infosys"
wait = WebDriverWait(driver, 20)
driver.get("https://www.moneycontrol.com")
inputElement = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#form_topsearch>.txtsrchbox.FL')))
inputElement.send_keys(company, Keys.ENTER)
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#proceed-button'))).click()
driver.implicitly_wait(30)
driver.find_element_by_class_name("oview_table")
Now, there are multiple tables with the same class name "oview_table". How can I get the value of TTM EPS?
Try the following XPath: //td[contains(text(),'TTM EPS')]/../td[contains(@class,'nseceps')]
This is the element you are looking for.
Now you can extract the text value from it.
In order to get the value of any other metric, just pass its label as a parameter into this string.
This xpath should also work. Pass in the text you are looking for.
//div[@class='oview_table']//td[contains(normalize-space(.), 'TTM EPS')]
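For illustration, a hedged sketch of that parameterization, combining the two XPaths above; the following-sibling::td step is an assumption about where the value cell sits, not something verified against the page:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def read_metric(driver, label, timeout=20):
    # Find the cell whose text contains the label, then read the next cell
    # in the same row. following-sibling::td is an assumption about layout.
    xpath = (f"//div[@class='oview_table']"
             f"//td[contains(normalize-space(.), '{label}')]/following-sibling::td")
    cell = WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.XPATH, xpath)))
    return cell.text

print(read_metric(driver, "TTM EPS"))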

Not able to scrape particular table

I am using Selenium and Python to scrape a website. I am not able to scrape a particular table using BeautifulSoup.
Here is the code
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
import pandas as pd
link='http://omms.nic.in/#'
browser=webdriver.Firefox()
browser.get(link)
time.sleep(15)
WebDriverWait(browser, 10).until(EC.element_to_be_clickable((By.XPATH, '/html/body/div[1]/div/div/ul/li[3]/a'))).click()
time.sleep(10)
WebDriverWait(browser, 10).until(EC.element_to_be_clickable((By.XPATH, '/html/body/div[1]/div/div/ul/li[3]/ul/li[1]/ul/li[5]/a'))).click()
select_state=Select(browser.find_element_by_xpath('/html/body/div[2]/div[1]/form/table/tbody/tr[1]/td[3]/select'))
select_state.select_by_index(35)
WebDriverWait(browser, 10).until(EC.element_to_be_clickable((By.XPATH, '/html/body/div[2]/div[1]/form/table/tbody/tr[3]/td[7]/input[1]'))).click()
select_district=Select(browser.find_element_by_xpath('/html/body/div[2]/div[1]/form/table/tbody/tr[1]/td[5]/select'))
options = [x.text for x in select_district.options]
select_district.select_by_index(3)
select_year=Select(browser.find_element_by_xpath('/html/body/div[2]/div[1]/form/table/tbody/tr[2]/td[3]/select'))
select_year.select_by_index(8)
time.sleep(10)
WebDriverWait(browser, 10).until(EC.element_to_be_clickable((By.XPATH, '/html/body/div[2]/div[1]/form/table/tbody/tr[4]/td[3]/input'))).click()
time.sleep(5)
soup=BeautifulSoup(browser.page_source,"html.parser")
table=soup.find_all('table',attrs={'class':"A35402edea1d24691942da96210fa88a3382"})
data_name = pd.read_html(str(table))[0]
I am getting the error: No tables found
This is probably because the table you are looking for is not in browser.page_source; it is loaded in a separate iframe. We can switch to the frame and then get the source.
browser.switch_to.frame(browser.find_element_by_xpath("//*[@id='loadReport']//iframe"))
print(browser.page_source)
soup = BeautifulSoup(browser.page_source,"html.parser")
In case you need to interact with the main page again, you can switch back using
browser.switch_to.default_content()
Also consider increasing the time to wait for the table to load.
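Rather than stretching fixed sleeps, an explicit wait can cover both the iframe and the table; a minimal sketch, reusing the loadReport locator from above:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

wait = WebDriverWait(browser, 30)
# Wait until the report iframe exists, then switch into it in one step.
wait.until(EC.frame_to_be_available_and_switch_to_it(
    (By.XPATH, "//*[@id='loadReport']//iframe")))
# Wait for at least one table to render inside the frame before parsing.
wait.until(EC.presence_of_element_located((By.TAG_NAME, "table")))
soup = BeautifulSoup(browser.page_source, "html.parser")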

WebDriverWait on finding element by CSS Selector

I want to retrieve the price of the flight on this webpage using Python 3: https://www.google.es/flights?lite=0#flt=/m/0h3tv./m/04jpl.2018-12-17;c:EUR;e:1;a:FR;sd:1;t:f;tt:o
At first I got an error, which after many hours I realized was because I wasn't giving the webdriver enough time to load all the elements. So to ensure that it had enough time, I added a time.sleep like so:
time.sleep(1)
This made it work! However, I've read and was advised not to use this solution and to use WebDriverWait instead. So after many hours and several tutorials, I'm stuck trying to pinpoint the exact CSS class the WebDriverWait should wait for.
The closest I think I've got is:
WebDriverWait(d, 1).until(EC.presence_of_element_located((By.CSS_SELECTOR, ".flt-subhead1.gws-flights-results__price.gws-flights-results__cheapest-price")))
Any ideas on what I'm missing?
You could use a CSS attribute = value selector to target it, or, if that value is dynamic, a CSS selector combination that matches by position.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Chrome()
driver.get("https://www.google.es/flights?lite=0#flt=/m/0h3tv./m/04jpl.2018-12-17;c:EUR;e:1;a:FR;sd:1;t:f;tt:o")
#element = WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.CSS_SELECTOR , '[jstcache="9322"]')))
element = WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.CSS_SELECTOR , '.flt-subhead1.gws-flights-results__price.gws-flights-results__cheapest-price span + jsl')))
print(element.text)
#driver.quit()
No results case:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
driver = webdriver.Chrome()
url ="https://www.google.es/flights?lite=0#flt=/m/0h3tv./m/04jpl.2018-12-17;c:EUR;e:1;a:FR;sd:1;t:f;tt:o" #"https://www.google.es/flights?lite=0#flt=/m/0h3tv./m/04jpl.2018-11-28;c:EUR;e:1;a:FR;sd:1;t:f;tt:o"
driver.get(url)
try:
    status = WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.CSS_SELECTOR, 'p[role=status]')))
    print(status.text)
except TimeoutException as e:
    element = WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.CSS_SELECTOR, '.flt-subhead1.gws-flights-results__price.gws-flights-results__cheapest-price span + jsl')))
    print(element.text)
#driver.quit()
I may be wrong, but I think you are trying to get the price of the flight trip.
If my assumption is correct, take a look at my approach: I find the search results list, then all the itineraries inside it, loop over them, and get the price information for each. This is the best approach I can come up with while avoiding all the dynamic attributes.
from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
wait = 20
driver = Chrome()
driver.get("https://www.google.es/flights?lite=0#flt=/m/0h3tv./m/04jpl.2018-12-17;c:EUR;e:1;a:FR;sd:1;t:f;tt:o")
# Get the Search Result List
search_results = WebDriverWait(driver, wait).until(EC.presence_of_element_located((By.CSS_SELECTOR, 'ol[class="gws-flights-results__result-list"]')))
# loop through all the itineraries
for result in search_results.find_elements_by_css_selector('div[class*="gws-flights-results__collapsed-itinerary"]'):
    price = result.find_element_by_css_selector('div[class="gws-flights-results__itinerary-price"]')
    print(price.text)
Output
€18
