I am trying to scrape a table on a website. The problem is that there is no <tbody> in the HTML code. I tried with both requests and Selenium, but always with the same result. Does anybody have any idea?
This is the code (with requests). Website: https://bscscan.com/token/0xAdeaE50E0097fBf8139Bdff45e7ed00de4b14170#balances
from urllib.request import Request, urlopen
import bs4

link = "https://bscscan.com/token/0xAdeaE50E0097fBf8139Bdff45e7ed00de4b14170#balances"
# Spoof a browser User-Agent so the request is not rejected
req = Request(link, headers={'User-Agent': 'Mozilla/5.0'})
webpage = urlopen(req).read()
soup = bs4.BeautifulSoup(webpage, "html.parser")
print(soup)
This is with Selenium:
import time
import bs4
from selenium import webdriver
from webdriver_manager.microsoft import EdgeChromiumDriverManager
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Edge(service=Service(EdgeChromiumDriverManager().install()))
driver.get("https://bscscan.com/token/0xAdeaE50E0097fBf8139Bdff45e7ed00de4b14170#balances")
time.sleep(7)
html = driver.page_source
soup = bs4.BeautifulSoup(html, "lxml")
print(soup)
WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.CSS_SELECTOR, ".table > tbody:nth-child(2)")))
print(driver.page_source)
The table you are trying to access is inside an iframe. You will need to switch to that iframe in order to access that element:
import time
import bs4
from selenium import webdriver
from webdriver_manager.microsoft import EdgeChromiumDriverManager
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Edge(service=Service(EdgeChromiumDriverManager().install()))
driver.get("https://bscscan.com/token/0xAdeaE50E0097fBf8139Bdff45e7ed00de4b14170#balances")
time.sleep(7)
# Switch into the iframe that hosts the holders table, then wait for its tbody
WebDriverWait(driver, 10).until(EC.frame_to_be_available_and_switch_to_it((By.CSS_SELECTOR, "iframe#tokeholdersiframe")))
WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.CSS_SELECTOR, ".table > tbody:nth-child(2)")))
print(driver.page_source)
When finished, you will have to switch back to the default content with
driver.switch_to.default_content()
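Before switching back, you can hand the frame's HTML to pandas to pull the rows out. A minimal sketch, assuming pandas and lxml are installed and that the holders table is the first table in the frame:
import pandas as pd

# Still inside the iframe: parse every <table> in the frame's HTML
tables = pd.read_html(driver.page_source)
holders = tables[0]  # assumption: the holders table is the first one
print(holders.head())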
I am trying to get the href attributes, but they give me nothing. This is the page link: https://www.nascc.aisc.org/trade-show
import time
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait

driver = webdriver.Chrome(service=Service(r"C:\Program Files (x86)\chromedriver.exe"))
URL = 'https://www.nascc.aisc.org/trade-show'
driver.get(URL)
page_links = [element.get_attribute('href') for element in
              driver.find_elements(By.XPATH, "//table[@class='ffTableSet table table-striped']//a[starts-with(@href, 'https://n2a.goexposoftware.com/events/nascc23/goExpo/exhibitor')]")]
print(page_links)
Here's how to avoid dealing with iframes on that page: go directly to the inner site.
from selenium import webdriver
driver = webdriver.Chrome()
URL = 'https://n2a.goexposoftware.com/events/nascc23/goExpo/public/listExhibitorsFrame.php'
driver.get(URL)
page_links = [element.get_attribute("href") for element in
              driver.find_elements("css selector",
                                   '[href*="nascc23/goExpo/exhibitor"]')]
print(page_links)
driver.quit()
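Since that inner page appears to be served as static HTML, you may not need a browser at all. A minimal requests + BeautifulSoup sketch under that assumption:
import requests
from bs4 import BeautifulSoup

URL = 'https://n2a.goexposoftware.com/events/nascc23/goExpo/public/listExhibitorsFrame.php'
soup = BeautifulSoup(requests.get(URL).text, 'html.parser')
# Same CSS filter as above, applied to the static HTML
page_links = [a.get('href') for a in soup.select('[href*="nascc23/goExpo/exhibitor"]')]
print(page_links)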
The element is inside nested iframes; you need to switch to both frames.
URL = 'https://www.nascc.aisc.org/trade-show'
driver.get(URL)
WebDriverWait(driver,10).until(EC.frame_to_be_available_and_switch_to_it((By.CSS_SELECTOR,"iframe[name='htmlComp-iframe']")))
WebDriverWait(driver,10).until(EC.frame_to_be_available_and_switch_to_it((By.CSS_SELECTOR,"iframe#geFrame1")))
time.sleep(5)
page_links = [element.get_attribute('href') for element in
              driver.find_elements(By.XPATH, "//table[@class='ffTableSet table table-striped']//a[starts-with(@href, 'https://n2a.goexposoftware.com/events/nascc23/goExpo/exhibitor')]")]
print(page_links)
You need to import the below libraries:
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
import time
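When you are finished inside the nested frames, switching back to the top-level document works the same way as in the first answer:
driver.switch_to.default_content()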
I've been having trouble trying to extract the phone number after clicking the "llamar" ("call") button. So far I've used the XPath method with Selenium and also tried using Beautiful Soup to extract the number, but unfortunately nothing has worked. I usually get an invalid selector error (if I use an XPath selector with Selenium), and with BS4 I get: AttributeError: 'NoneType' object has no attribute 'text' ...
I hope you can help me out!
Here is the URL: https://www.milanuncios.com/venta-de-pisos-en-malaga-malaga/portada-alta-carlos-de-haya-carranque-386352344.htm
Here's the code that I tried:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import pandas as pd
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import UnexpectedAlertPresentException
url = 'https://www.milanuncios.com/venta-de-pisos-en-malaga-malaga/portada-alta-carlos-de-haya-carranque-386352344.htm'
path = r'C:\Users\WL-133\anaconda3\Lib\site-packages\selenium\webdriver\chrome\chromedriver.exe'
path1 = r'C:\Users\WL-133\anaconda3\Lib\site-packages\selenium\webdriver\firefox'
# driver = webdriver.Chrome(path)
options = Options()
driver = webdriver.Chrome(path)
driver.get(url)
a = []
mah_div = driver.page_source
soup = BeautifulSoup(mah_div, features='lxml')
cookie_button = '//*[@id="sui-TcfFirstLayerModal"]/div/div/footer/div/button[2]'
btn_press = driver.find_element_by_xpath(cookie_button)
btn_press.click()
llam_button = '//*[@id="ad-detail-contact"]/a[2]'
llam_press = driver.find_element_by_xpath(llam_button)
llam_press.click()
time.sleep(10)
for item in soup.find_all("div", {"class": "contenido"}):
    a.append(item.find("div", {"class": "plaincontenido"}).text)
print(a)
The phone is stored inside JavaScript. You can use the re module to extract it:
import re
import requests
from bs4 import BeautifulSoup
url = "https://www.milanuncios.com/venta-de-pisos-en-malaga-malaga/portada-alta-carlos-de-haya-carranque-386352344.htm"
phone_url = "https://www.milanuncios.com/datos-contacto/?usePhoneProxy=0&from=detail&includeEmail=false&id={}"
ad_id = re.search(r"(\d+)\.htm", url).group(1)
html_text = requests.get(phone_url.format(ad_id)).text
soup = BeautifulSoup(html_text, "html.parser")
phone = re.search(r"getTrackingPhone\((.*?)\)", html_text).group(1)
print(soup.select_one(".texto").get_text(strip=True), phone)
Prints:
ana (Particular) 639....
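If you need this for several listings, the same endpoint trick wraps naturally into a helper (a sketch; get_phone is a hypothetical name, and it reuses the phone_url defined above):
def get_phone(ad_url):
    # Hypothetical helper: extract the ad id, hit the contact endpoint,
    # and pull the number out of the getTrackingPhone(...) call
    ad_id = re.search(r"(\d+)\.htm", ad_url).group(1)
    html_text = requests.get(phone_url.format(ad_id)).text
    return re.search(r"getTrackingPhone\((.*?)\)", html_text).group(1)

print(get_phone(url))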
With Selenium, you will need to click the button and switch to the iframe.
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

wait = WebDriverWait(driver, 10)
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, ".def-btn.phone-btn")))
tel_button = driver.find_element_by_css_selector(".def-btn.phone-btn")
tel_button.click()
wait.until(EC.frame_to_be_available_and_switch_to_it((By.ID, "ifrw")))
wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, ".texto>.telefonos")))
tel_number = driver.find_element_by_css_selector(".texto>.telefonos").text
Please note, I used more stable locators.
I'm trying to find a way to extract the book's summary from the Goodreads page. I have tried Beautiful Soup / Selenium, unfortunately to no avail.
Link: https://www.goodreads.com/book/show/67896.Tao_Te_Ching?from_search=true&from_srp=true&qid=D19iQu7KWI&rank=1
Code:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import requests

link = 'https://www.goodreads.com/book/show/67896.Tao_Te_Ching?from_search=true&from_srp=true&qid=D19iQu7KWI&rank=1'
driver = webdriver.Chrome()  # driver was missing from the original snippet
driver.get(link)
Description = driver.find_element_by_xpath("//div[contains(text(),'TextContainer')]")
# first TextContainer contains the summary of the book
book_page = requests.get(link)
soup = BeautifulSoup(book_page.text, "html.parser")
print(soup)
Container = soup.find('div', class_='leftContainer')
print(Container)
Error:
the Container prints empty, and Selenium raises:
NoSuchElementException: no such element: Unable to locate element:
{"method":"xpath","selector":"//div[contains(text(),'TextContainer')]"}
(Session info: chrome=83.0.4103.116)
You can get the description like so:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
...
driver.get("https://www.goodreads.com/book/show/67896.Tao_Te_Ching?from_search=true&from_srp=true&qid=D19iQu7KWI&rank=1")
description = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, 'div#description span[style="display:none"]'))
)
print(description.get_attribute('textContent'))
I have utilised a CSS Selector to get the specific hidden span that contains the full description. I have also used an explicit wait to give the element time to load.
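The same hidden span exists in the static HTML, so a requests-only version may also work, assuming the (older) Goodreads layout that serves div#description without JavaScript:
import requests
from bs4 import BeautifulSoup

link = 'https://www.goodreads.com/book/show/67896.Tao_Te_Ching'
soup = BeautifulSoup(requests.get(link).text, 'html.parser')
# The hidden span holds the untruncated description
description = soup.select_one('div#description span[style="display:none"]')
if description:  # the layout may have changed since this was written
    print(description.get_text(strip=True))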
I am using Selenium and Python to scrape a website. I am not able to scrape a particular table using Beautiful Soup.
Here is the code
import time
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

link = 'http://omms.nic.in/#'
browser = webdriver.Firefox()
browser.get(link)
time.sleep(15)
WebDriverWait(browser, 10).until(EC.element_to_be_clickable((By.XPATH, '/html/body/div[1]/div/div/ul/li[3]/a'))).click()
time.sleep(10)
WebDriverWait(browser, 10).until(EC.element_to_be_clickable((By.XPATH, '/html/body/div[1]/div/div/ul/li[3]/ul/li[1]/ul/li[5]/a'))).click()
select_state = Select(browser.find_element_by_xpath('/html/body/div[2]/div[1]/form/table/tbody/tr[1]/td[3]/select'))
select_state.select_by_index(35)
WebDriverWait(browser, 10).until(EC.element_to_be_clickable((By.XPATH, '/html/body/div[2]/div[1]/form/table/tbody/tr[3]/td[7]/input[1]'))).click()
select_district = Select(browser.find_element_by_xpath('/html/body/div[2]/div[1]/form/table/tbody/tr[1]/td[5]/select'))
options = [x.text for x in select_district.options]
select_district.select_by_index(3)
select_year = Select(browser.find_element_by_xpath('/html/body/div[2]/div[1]/form/table/tbody/tr[2]/td[3]/select'))
select_year.select_by_index(8)
time.sleep(10)
WebDriverWait(browser, 10).until(EC.element_to_be_clickable((By.XPATH, '/html/body/div[2]/div[1]/form/table/tbody/tr[4]/td[3]/input'))).click()
time.sleep(5)
soup = BeautifulSoup(browser.page_source, "html.parser")
table = soup.find_all('table', attrs={'class': "A35402edea1d24691942da96210fa88a3382"})
data_name = pd.read_html(str(table))[0]
I am getting the error: No tables found.
This is probably because the table you are looking for is not under browser.page_source. It is loaded from a separate iframe. We can switch to the frame and then get the source.
browser.switch_to.frame(browser.find_element_by_xpath("//*[@id='loadReport']//iframe"))
print(browser.page_source)
soup = BeautifulSoup(browser.page_source,"html.parser")
In case you need to interact with the page again, you can switch back using
browser.switch_to.default_content()
Also consider increasing the time to wait for the table to load.
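For example, instead of a fixed sleep you can wait explicitly for the frame and switch into it in one step (a sketch reusing the locator from above):
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

# Waits up to 30 seconds for the report iframe, then switches into it
WebDriverWait(browser, 30).until(EC.frame_to_be_available_and_switch_to_it(
    (By.XPATH, "//*[@id='loadReport']//iframe")))
soup = BeautifulSoup(browser.page_source, "html.parser")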
I am scraping one of the URLs like this, which loads data via Ajax. When using Firefox it's able to scrape the HTML, but with PhantomJS it returns:
<html><head></head><body></body></html>
Code is below:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import selenium.webdriver.support.ui as ui
import sys
import os
from time import sleep
driver = None
url = 'https://sports.bovada.lv/live-betting/event/2391243'
driver = webdriver.PhantomJS('/Setups/phantomjs-1.9.8-macosx/bin/phantomjs')
driver.set_window_size(1128, 768) # optional
driver.get(url)
wait = ui.WebDriverWait(driver, 3000)
sleep(40)
#wait.until(EC.staleness_of(driver.find_element_by_id("coupon")), 'visible')
html = driver.page_source
#userElement = WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.ID, "coupon")))
print(html)
Update: OK, this is happening with every URL, regardless of Ajax or non-Ajax content.
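For what it's worth, an empty <html><head></head><body></body></html> from PhantomJS 1.9.x is often an SSL/TLS problem, since it defaults to SSLv3. Passing PhantomJS's SSL flags is a common workaround to try (a sketch, not a confirmed fix for this site):
from selenium import webdriver

# --ignore-ssl-errors and --ssl-protocol are standard PhantomJS CLI flags
driver = webdriver.PhantomJS(
    '/Setups/phantomjs-1.9.8-macosx/bin/phantomjs',
    service_args=['--ignore-ssl-errors=true', '--ssl-protocol=any'])
driver.get('https://sports.bovada.lv/live-betting/event/2391243')
print(driver.page_source)
driver.quit()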