I am trying to find a way to extract the book's summary from its Goodreads page. I have tried Beautiful Soup and Selenium, unfortunately to no avail.
link:https://www.goodreads.com/book/show/67896.Tao_Te_Ching?from_search=true&from_srp=true&qid=D19iQu7KWI&rank=1
code:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import requests
link = 'https://www.goodreads.com/book/show/67896.Tao_Te_Ching?from_search=true&from_srp=true&qid=D19iQu7KWI&rank=1'
driver = webdriver.Chrome()
driver.get(link)
Description = driver.find_element_by_xpath("//div[contains(text(),'TextContainer')]")
# the first TextContainer contains the summary of the book
book_page = requests.get(link)
soup = BeautifulSoup(book_page.text, "html.parser")
print(soup)
Container = soup.find('class', class_='leftContainer')
print(Container)
Error:
Container is empty, plus:
NoSuchElementException: no such element: Unable to locate element:
{"method":"xpath","selector":"//div[contains(text(),'TextContainer')]"}
(Session info: chrome=83.0.4103.116)
You can get the description like so:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
...
driver.get("https://www.goodreads.com/book/show/67896.Tao_Te_Ching?from_search=true&from_srp=true&qid=D19iQu7KWI&rank=1")
description = WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.CSS_SELECTOR, 'div#description span[style="display:none"]'))
)
print(description.get_attribute('textContent'))
I have utilised a CSS Selector to get the specific hidden span that contains the full description. I have also used an explicit wait to give the element time to load.
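If you would rather stay with requests and BeautifulSoup, the same hidden span can usually be read straight from the static HTML. This is only a sketch assuming the old Goodreads layout (the div#description container); the selector may need adjusting if the markup changes:
import requests
from bs4 import BeautifulSoup

link = "https://www.goodreads.com/book/show/67896.Tao_Te_Ching"
# a User-Agent header helps avoid being served a stripped-down page
html = requests.get(link, headers={"User-Agent": "Mozilla/5.0"}).text
soup = BeautifulSoup(html, "html.parser")
# the full description sits in a hidden span inside div#description (old layout)
hidden_span = soup.select_one('div#description span[style="display:none"]')
if hidden_span is not None:
    print(hidden_span.get_text(strip=True))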
Related
I am trying to get the href attributes but they give me nothing. This is the page link: https://www.nascc.aisc.org/trade-show
import time
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
driver = webdriver.Chrome("C:\Program Files (x86)\chromedriver.exe")
URL = 'https://www.nascc.aisc.org/trade-show'
driver.get(URL)
page_links = [element.get_attribute('href') for element in
              driver.find_elements(By.XPATH, "//table[@class='ffTableSet table table-striped']//a[starts-with(@href, 'https://n2a.goexposoftware.com/events/nascc23/goExpo/exhibitor')]")]
print(page_links)
Here's how to avoid dealing with iframes on that page: (Go directly to the inner site)
from selenium import webdriver
driver = webdriver.Chrome()
URL = 'https://n2a.goexposoftware.com/events/nascc23/goExpo/public/listExhibitorsFrame.php'
driver.get(URL)
page_links = [element.get_attribute("href") for element in
driver.find_elements("css selector",
'[href*="nascc23/goExpo/exhibitor"]')]
print(page_links)
driver.quit()
The element is inside nested iframes; you need to switch to both frames.
URL = 'https://www.nascc.aisc.org/trade-show'
driver.get(URL)
WebDriverWait(driver,10).until(EC.frame_to_be_available_and_switch_to_it((By.CSS_SELECTOR,"iframe[name='htmlComp-iframe']")))
WebDriverWait(driver,10).until(EC.frame_to_be_available_and_switch_to_it((By.CSS_SELECTOR,"iframe#geFrame1")))
time.sleep(5)
page_links = [element.get_attribute('href') for element in
              driver.find_elements(By.XPATH, "//table[@class='ffTableSet table table-striped']//a[starts-with(@href, 'https://n2a.goexposoftware.com/events/nascc23/goExpo/exhibitor')]")]
print(page_links)
You need to import the libraries below:
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
import time
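If you need to interact with the top-level page again after collecting the links inside the nested frames, remember to switch back out:
driver.switch_to.default_content()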
Hi, I have the following script that extracts the name and address of each site, but I want to be able to also extract the href for each site so that I can link to the individual sites. Any suggestions?
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
driver = webdriver.Chrome(ChromeDriverManager().install())
driver.get("https://order.marstons.co.uk/")
try:
    element = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.XPATH, '//*[@id="app"]/div/div/div/div[2]/div'))
    ).find_elements_by_tag_name('a')
    for el in element:
        print("heading", el.find_element_by_tag_name('h3').text)
        print("address", el.find_element_by_tag_name('p').text)
finally:
    driver.quit()
You mean like this?
print(el.get_attribute("href"))
You can get an attribute of an element like this.
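Put together with the question's loop, it could look roughly like this (a sketch reusing the h3/p structure from the original script):
for el in element:
    print("heading", el.find_element_by_tag_name('h3').text)
    print("address", el.find_element_by_tag_name('p').text)
    # the link to the individual site
    print("link", el.get_attribute("href"))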
I've been having trouble trying to extract the phone number after clicking the "llamar" button. So far I've used the XPath method with Selenium and also tried using Beautiful Soup to extract the number, but unfortunately nothing has worked. I usually get an invalid selector error (if I use an XPath selector with Selenium), and with BS4 I get: AttributeError: 'NoneType' object has no attribute 'text' ...
I hope you can help me out!
Here is the url to the link - https://www.milanuncios.com/venta-de-pisos-en-malaga-malaga/portada-alta-carlos-de-haya-carranque-386352344.htm
Here's the code that I tried:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import pandas as pd
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import UnexpectedAlertPresentException
url = 'https://www.milanuncios.com/venta-de-pisos-en-malaga-malaga/portada-alta-carlos-de-haya-carranque-386352344.htm'
path = r'C:\Users\WL-133\anaconda3\Lib\site-packages\selenium\webdriver\chrome\chromedriver.exe'
path1 = r'C:\Users\WL-133\anaconda3\Lib\site-packages\selenium\webdriver\firefox'
# driver = webdriver.Chrome(path)
options = Options()
driver = webdriver.Chrome(path)
driver.get(url)
a = []
mah_div = driver.page_source
soup = BeautifulSoup(mah_div, features='lxml')
cookie_button = '//*[@id="sui-TcfFirstLayerModal"]/div/div/footer/div/button[2]'
btn_press = driver.find_element_by_xpath(cookie_button)
btn_press.click()
llam_button = '//*[@id="ad-detail-contact"]/a[2]'
llam_press = driver.find_element_by_xpath(llam_button)
llam_press.click()
time.sleep(10)
for item in soup.find_all("div", {"class": "contenido"}):
    a.append(item.find("div", {"class": "plaincontenido"}).text)
print(a)
The phone is stored inside JavaScript. You can use the re module to extract it:
import re
import requests
from bs4 import BeautifulSoup
url = "https://www.milanuncios.com/venta-de-pisos-en-malaga-malaga/portada-alta-carlos-de-haya-carranque-386352344.htm"
phone_url = "https://www.milanuncios.com/datos-contacto/?usePhoneProxy=0&from=detail&includeEmail=false&id={}"
ad_id = re.search(r"(\d+)\.htm", url).group(1)
html_text = requests.get(phone_url.format(ad_id)).text
soup = BeautifulSoup(html_text, "html.parser")
phone = re.search(r"getTrackingPhone\((.*?)\)", html_text).group(1)
print(soup.select_one(".texto").get_text(strip=True), phone)
Prints:
ana (Particular) 639....
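If the regex ever fails to match (the contact endpoint does not always embed a phone number), it is safer to guard against None before calling group(); a small defensive variant:
match = re.search(r"getTrackingPhone\((.*?)\)", html_text)
phone = match.group(1) if match else None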
With Selenium you will need to click the button and switch to the iframe.
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

wait = WebDriverWait(driver, 10)
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, ".def-btn.phone-btn")))
tel_button = driver.find_element_by_css_selector(".def-btn.phone-btn")
tel_button.click()
wait.until(EC.frame_to_be_available_and_switch_to_it((By.ID, "ifrw")))
wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, ".texto>.telefonos")))
tel_number = driver.find_element_by_css_selector(".texto>.telefonos").text
Please note, I used much more stable locators.
I am using Selenium and Python to scrape a website. I am not able to scrape a particular table using Beautiful Soup.
Here is the code:
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
import pandas as pd
import time

link = 'http://omms.nic.in/#'
browser = webdriver.Firefox()
browser.get(link)
time.sleep(15)
WebDriverWait(browser, 10).until(EC.element_to_be_clickable((By.XPATH, '/html/body/div[1]/div/div/ul/li[3]/a'))).click()
time.sleep(10)
WebDriverWait(browser, 10).until(EC.element_to_be_clickable((By.XPATH, '/html/body/div[1]/div/div/ul/li[3]/ul/li[1]/ul/li[5]/a'))).click()
select_state=Select(browser.find_element_by_xpath('/html/body/div[2]/div[1]/form/table/tbody/tr[1]/td[3]/select'))
select_state.select_by_index(35)
WebDriverWait(browser, 10).until(EC.element_to_be_clickable((By.XPATH, '/html/body/div[2]/div[1]/form/table/tbody/tr[3]/td[7]/input[1]'))).click()
select_district=Select(browser.find_element_by_xpath('/html/body/div[2]/div[1]/form/table/tbody/tr[1]/td[5]/select'))
options = [x.text for x in select_district.options]
select_district.select_by_index(3)
select_year=Select(browser.find_element_by_xpath('/html/body/div[2]/div[1]/form/table/tbody/tr[2]/td[3]/select'))
select_year.select_by_index(8)
time.sleep(10)
WebDriverWait(browser, 10).until(EC.element_to_be_clickable((By.XPATH, '/html/body/div[2]/div[1]/form/table/tbody/tr[4]/td[3]/input'))).click()
time.sleep(5)
soup=BeautifulSoup(browser.page_source,"html.parser")
table=soup.find_all('table',attrs={'class':"A35402edea1d24691942da96210fa88a3382"})
data_name = pd.read_html(str(table))[0]
I am getting the error: No tables found.
This is probably because the table you are looking for is not under browser.page_source. It is loaded from a separate iframe. We can switch to the frame and then get the source.
browser.switch_to.frame(browser.find_element_by_xpath("//*[@id='loadReport']//iframe"))
print(browser.page_source)
soup = BeautifulSoup(browser.page_source,"html.parser")
In case you need to interact with the page again, you can switch back using:
browser.switch_to.default_content()
Also consider increasing the time to wait for the table to load.
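For example, instead of a fixed time.sleep you could wait explicitly for the report iframe before switching; a sketch reusing the loadReport locator from above:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

# wait until the iframe exists, then switch into it in one step
WebDriverWait(browser, 30).until(
    EC.frame_to_be_available_and_switch_to_it((By.XPATH, "//*[@id='loadReport']//iframe")))
soup = BeautifulSoup(browser.page_source, "html.parser")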
I am trying to scrape this website:
https://script.google.com/a/macros/cprindia.org/s/AKfycbzgfCVNciFRpcpo8P7joP1wTeymj9haAQnNEkNJJ2fQ4FBXEco/exec
I am using Selenium and Python. I am not able to view the entire page source. Basically, I have to scrape the table inside it and click on the next button, but the code for the next button and the table is not visible in the page source. Here is my code:
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
import time

link = 'https://script.google.com/a/macros/cprindia.org/s/AKfycbzgfCVNciFRpcpo8P7joP1wTeymj9haAQnNEkNJJ2fQ4FBXEco/exec'
browser = webdriver.PhantomJS()
browser.get(link)
pass1 = browser.find_element_by_xpath("/html/body/div[2]/table[2]/tbody/tr[1]/td/div/div/div[2]/div[2]")
pass1.click()
time.sleep(30)
I am getting this error: NoSuchElementException.
There are two iframes present on the page, so you need to first switch to those iframes and then click on the element.
You can also apply an explicit wait on the element so that the script waits until the element is visible on the page.
You can do it like this:
browser = webdriver.PhantomJS()
browser.get(link)
browser.switch_to.frame(browser.find_element_by_id('sandboxFrame'))
browser.switch_to.frame(browser.find_element_by_id('userHtmlFrame'))
WebDriverWait(browser, 20).until(EC.presence_of_element_located((By.XPATH, "//div[contains(@class,'charts-custom-button-collapse-left')]//div"))).click()
Note: You have to add the following imports:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
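Once inside the inner frame you can read the table with BeautifulSoup and pandas. This is only a rough sketch under the assumption that the data is rendered as a plain HTML table inside the user frame; the locator may need adjusting:
from bs4 import BeautifulSoup
import pandas as pd

# wait for the table to appear inside the frame we already switched to
WebDriverWait(browser, 20).until(EC.presence_of_element_located((By.TAG_NAME, "table")))
soup = BeautifulSoup(browser.page_source, "html.parser")
table = soup.find("table")
if table is not None:
    df = pd.read_html(str(table))[0]
    print(df.head())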