Scraping hidden elements in dynamically changing HTML - Python

I need to scrape some information from dynamically changing HTML. The website in question is https://www.mitartlending.com/featuredartworks. Here, when you click on a given image and then hover your mouse over the enlarged image, a text overlay pops up. I am trying to scrape that text. After trying to do this with BeautifulSoup, I decided that I am probably going to have to use Selenium. How would you go about solving this problem? So far, I have:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome('/Users/Abramo/SeleniumDrivers/chromedriver')
driver.get('https://www.mitartlending.com/featuredartworks')
driver.implicitly_wait(3)
my_element = driver.find_element_by_xpath('/html/body/div[5]/div[2]/div/main/section/div/div/div/div[3]/div/div/div/div[1]/div/a/img')
my_element.click()
copy_from = driver.find_element_by_class_name('sqs-lightbox-meta overlay-description-visible')  # fails: compound class names are not allowed here
my_next_button = driver.find_element_by_class_name('sqs-lightbox-next')

The data is all there in the attributes; you just need to extract the appropriate ones. No need for the overhead of Selenium.
import requests
from bs4 import BeautifulSoup as bs
r = requests.get('https://www.mitartlending.com/featuredartworks')
soup = bs(r.content, 'lxml')
results = {i['data-title']:' '.join(bs(i['data-description'], 'lxml').text.split('\n')) for i in soup.select('.margin-wrapper > a')}
print(results)
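If the dictionary comprehension is hard to read, the same extraction can be written as an explicit loop (same selectors and attributes as above, just unrolled):

import requests
from bs4 import BeautifulSoup as bs

r = requests.get('https://www.mitartlending.com/featuredartworks')
soup = bs(r.content, 'lxml')

results = {}
for a in soup.select('.margin-wrapper > a'):
    title = a['data-title']
    # data-description holds an HTML fragment; parse it and flatten the text
    description = ' '.join(bs(a['data-description'], 'lxml').text.split('\n'))
    results[title] = description
print(results)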

You can locate any of those images with:
images = driver.find_elements_by_xpath('//img[contains(@class, "thumb-image loaded")]')
So, for example, to click on the second image you can do:
images[1].click()
To hover over the element you can do this:
from selenium.webdriver.common.action_chains import ActionChains
hover = ActionChains(driver).move_to_element(images[1])
hover.perform()
Now, once the text has appeared, you can locate and get it with:
text = driver.find_element_by_xpath('(//img[contains(@class, "thumb-image loaded")])[2]/..//p').text
The same can be done for any other image there.
Altogether the code will look like:
import time
from selenium.webdriver.common.action_chains import ActionChains

images = driver.find_elements_by_xpath('//img[contains(@class, "thumb-image loaded")]')
images[1].click()
time.sleep(2)
hover = ActionChains(driver).move_to_element(images[1])
hover.perform()
time.sleep(2)
text = driver.find_elements_by_xpath('(//img[contains(@class, "thumb-image loaded")])[2]/..//p')
for t in text:
    print(t.text)
I added sleeps just to keep it simple; it's preferable to use expected-conditions waits instead, as sketched below.
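For reference, a sketch of the same flow with explicit waits instead of sleeps (the thumbnail XPath comes from the snippets above; the lightbox selectors are assumptions about Squarespace's markup):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains

driver = webdriver.Chrome()
driver.get('https://www.mitartlending.com/featuredartworks')
wait = WebDriverWait(driver, 10)

# wait for the thumbnails instead of sleeping
images = wait.until(EC.presence_of_all_elements_located(
    (By.XPATH, '//img[contains(@class, "thumb-image loaded")]')))
images[1].click()

# wait for the enlarged image, then hover to trigger the overlay
enlarged = wait.until(EC.visibility_of_element_located(
    (By.CSS_SELECTOR, '.sqs-lightbox-slide img')))  # selector is an assumption
ActionChains(driver).move_to_element(enlarged).perform()

# wait for the overlay text to become visible
overlay = wait.until(EC.visibility_of_element_located(
    (By.CSS_SELECTOR, '.sqs-lightbox-meta')))  # class name taken from the question
print(overlay.text)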

Use Python Selenium to extract span text

Hi, I'm new to Selenium and web scraping and I need some help. I'm trying to scrape one site and I don't know how to get the text of a span class.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time

PATH = "/Users/bobo/Downloads/chromedriver"
driver = webdriver.Chrome(PATH)
driver.get("https://neonet.pl")
print(driver.title)
search = driver.find_element_by_class_name("inputCss-input__label-263")
search.send_keys(Keys.RETURN)
time.sleep(5)
I'm trying to extract this span:
<span class="inputCss-input__label-263">Szukaj produktu</span>
I can see that you are trying to type something into the search bar.
First, I recommend using an XPath instead of the class name. Here is a simple technique to get the XPath of any element on a webpage: right-click and choose Inspect Element, click the mouse-in-a-box icon in the upper left, click the element on the webpage so the corresponding HTML is highlighted, then right-click the highlighted HTML and choose Copy > Copy XPath.
Here is a code example that searches for something on the webpage. I also included WebDriverWait, because sometimes the code runs too fast and can't find the next element, so the wait makes the code pause until the element is visible:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from time import sleep

driver = webdriver.Chrome(executable_path="/Users/bobo/Downloads/chromedriver")
driver.get("https://neonet.pl")  # loading page
wait = WebDriverWait(driver, 20)  # defining webdriver wait
search_word = 'iphone\n'  # \n is going to act as an enter key

wait.until(EC.visibility_of_element_located((By.XPATH, '//*[@id="root"]/main/div[1]/div[4]/div/div/div[2]/div/button[1]'))).click()  # clicking on cookies popup
wait.until(EC.visibility_of_element_located((By.XPATH, '//*[@id="root"]/main/header/div[2]/button'))).click()  # clicking on search button
wait.until(EC.visibility_of_element_located((By.XPATH, '//*[@id="root"]/aside[2]/section/form/label/input'))).send_keys(search_word)  # typing into the search input
print('done!')
sleep(10)
Hope this helped you!
wait = WebDriverWait(driver, 10)
driver.get('https://neonet.pl')
elem = wait.until(EC.visibility_of_element_located((By.XPATH, "//span[contains(@class, 'inputCss-input')]"))).text
print(elem)
To output the label text of the search bar, you use .text on the Selenium WebElement.
Imports:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
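Put together, a minimal runnable version of this answer (assuming chromedriver is on your PATH) would look like:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
wait = WebDriverWait(driver, 10)
driver.get('https://neonet.pl')
# wait until the span is visible, then read its text
elem = wait.until(EC.visibility_of_element_located(
    (By.XPATH, "//span[contains(@class, 'inputCss-input')]"))).text
print(elem)  # expected output: Szukaj produktu
driver.quit()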

Why does Selenium think this HTML span is empty?

I am trying to scrape NASDAQ's website for real-time stock quotes. When I use Chrome developer tools, I can see the span I want to target is (for example, with Alphabet as of writing this) <span class="symbol-page-header__pricing-price">$2952.77</span>. I want to extract the $2952.77. My Python code is:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By

s = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=s)

def get_last_price(ticker):
    driver.get(f"https://www.nasdaq.com/market-activity/stocks/{ticker}")
    price = driver.find_element(By.CLASS_NAME, "symbol-page-header__pricing-last-price")
    print(price.get_attribute('text'))
    # p = price.get_attribute('innerHTML')

get_last_price('googl')
The above code prints None. If you uncomment the line defining p and print its output, it shows that Selenium thinks the span is empty:
<span class="symbol-page-header__pricing-price"></span>
I don't understand why this is happening. My thought is that it has something to do with the fact that the page is probably rendered dynamically with JavaScript, but I thought handling that was an advantage of Selenium as opposed to BeautifulSoup... there shouldn't be an issue, right?
If you look into the HTML DOM of NASDAQ's Coinbase Global page, your locator strategy selects 2 nodes, and the one you don't want is the empty placeholder span.
Solution
To print the price information you can use the following Locator Strategy:
Using XPATH and text attribute:
driver.get("https://www.nasdaq.com/market-activity/stocks/coin")
print(WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//span[@class='symbol-page-header__pricing-price' and text()]"))).text)
Using XPATH and get_attribute("innerHTML"):
driver.get("https://www.nasdaq.com/market-activity/stocks/coin")
print(WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//span[@class='symbol-page-header__pricing-price' and text()]"))).get_attribute("innerHTML"))
Console Output:
$263.91
Note: you have to add the following imports:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
There are 2 nodes with the class symbol-page-header__pricing-price. The node that you want is under:
<div class="symbol-page-header__pricing-details symbol-page-header__pricing-details--current symbol-page-header__pricing-details--decrease"></div>
So, you need to get inside this div first to ensure you scrape the right one.
Anyway, I'd recommend using BeautifulSoup to parse the HTML text once you've finished interacting with the dynamic website through Selenium. This will save you time and memory: there is no need to keep the browser running, so it's better to terminate it (i.e. driver.close()) and explore the static HTML text with BeautifulSoup.
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup

s = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=s)

def get_last_price(ticker):
    driver.get(f"https://www.nasdaq.com/market-activity/stocks/{ticker}")
    time.sleep(1)
    # grab the rendered HTML before closing the browser
    soup = BeautifulSoup(driver.page_source, "lxml")
    driver.close()
    header = soup.find('div', attrs={'class': 'symbol-page-header__pricing-details symbol-page-header__pricing-details--current symbol-page-header__pricing-details--decrease'})
    price = header.find('span', attrs={'class': 'symbol-page-header__pricing-price'})
    print(price)
    print(price.text)

get_last_price('googl')
Output:
>>> <span class="symbol-page-header__pricing-price">$2952.77</span>
>>> $2952.77

How do I retrieve the link of an image through Selenium

I'm trying to make my program fetch the link of an image and then store it as a string in a variable.
This is the XPath of the image. I need to do it through XPaths because the XPaths on the website are very similar bar the "/article[x]" part. This allows me to increase the number with a variable so that I can go through all the XPaths on the page.
/html/body/div[2]/div[2]/div[3]/div[2]/div[2]/div[1]/div/article[1]/div[2]/div[1]/a/img
[Screenshot of the website showing the images whose links I'm trying to retrieve]
My code:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import tkinter
import time
Anime = input("Enter Anime:")
driver = webdriver.Chrome(executable_path=r"C:\Users\amete\Documents\chromedriver.exe")
driver.get("https://myanimelist.net/search/all?q=one%20piece&cat=all")
search = driver.find_element_by_xpath('//input[@name="q"]')
wait = WebDriverWait(driver, 20)
wait.until(EC.element_to_be_clickable((By.XPATH, '//input[@name="q"]')))
# Clears the field
search.send_keys(Keys.CONTROL, 'a')
search.send_keys(Keys.DELETE)
# The field is now cleared and the program can type whatever it wants
search.send_keys(Anime)
search.send_keys(Keys.RETURN)
# Accept the cookies
wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="qc-cmp2-ui"]/div[2]/div/button[3]'))).click()
# Added this wait
wait.until(EC.element_to_be_clickable((By.XPATH, '//h2[@id="anime"]//ancestor::div[@class="content-left"]//article[1]/div[contains(@class, "list")][1]/div[contains(@class, "information")]/a[1]')))
link = driver.find_element_by_xpath('//h2[@id="anime"]//ancestor::div[@class="content-left"]//article[1]/div[contains(@class, "list")][1]/div[contains(@class, "information")]/a[1]').text
piclink = driver.find_element_by_xpath('/html/body/div[2]/div[2]/div[3]/div[2]/div[2]/div[1]/div/article[1]/div[2]/div[1]/a/img')  # this returns a WebElement, not the link itself
print(piclink)
You can get it like this (specify the attribute):
piclink = driver.find_element_by_xpath('/html/body/div[2]/div[2]/div[3]/div[2]/div[2]/div[1]/div/article[1]/div[2]/div[1]/a/img').get_attribute('src')
print(piclink)
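If you want the links for all of the result images at once instead of walking /article[x] one index at a time, you could collect them with find_elements (the XPath below reuses the anime-section locator from the question, so treat it as an assumption about the page structure):

# collect the src of every result image under the anime heading
thumbs = driver.find_elements_by_xpath('//h2[@id="anime"]//ancestor::div[@class="content-left"]//article//a/img')
links = [img.get_attribute('src') for img in thumbs]
print(links)

If some entries come back empty, the site may be lazy-loading images, in which case the real URL often sits in a data-src attribute instead of src.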

How to find window/iframe from Chrome DevTools

I'm trying to web scrape using Selenium, Python and Beautiful Soup. I am scraping this page, but I want to scrape information off the pop-up window that appears when you click on the 'i' (information) icons in the corner of each product. My code is as follows:
import requests
from bs4 import BeautifulSoup
import time
import selenium
import math
from selenium.webdriver.support.ui import WebDriverWait
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
import chromedriver_binary
import re
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Chrome(ChromeDriverManager().install())
r = requests.get('https://dmarket.com/csgo-skins/product-card/ak-47-redline/field-tested')
driver.get('https://dmarket.com/csgo-skins/product-card/ak-47-redline/field-tested')
html_getter = BeautifulSoup(r.text, "html.parser")
data = html_getter.findAll(attrs={"class":"c-asset__priceNumber"})
dataskin = html_getter.findAll(attrs={"class" : "c-asset__exterior"})
time.sleep(2)
driver.find_element_by_id("onesignal-slidedown-cancel-button").click()
time.sleep(2)
driver.find_element_by_class_name("c-dialogHeader__close").click()
time.sleep(30)
driver.find_element_by_class_name("c-asset__action--info").click()
time.sleep(30)
price_element = driver.switch_to.active_element
print("<<<<<TEXT>>>>>")
print(price_element.text)
print("<<<<<END>>>>>")
driver.close()
However, when I run this, the only text that prints is "close". If you inspect the information pop-up, it should print the price, data from the chart, etc. How can I get it to print this info? Specifically, I want the amount sold on the most recent day and the price listed on the chart for the most recent day (both seem to be accessible in Chrome DevTools). I don't think I'm looking at the wrong frame, since I switch to the active element, so I'm not sure how to fix this!
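One thing worth checking is whether the pop-up actually lives inside an iframe: switch_to.active_element only follows focus, it does not change frames. A generic way to enumerate the frames on the page and switch into one (a sketch, not specific to dmarket's actual markup):

# list all iframes currently in the DOM
iframes = driver.find_elements_by_tag_name('iframe')
for i, frame in enumerate(iframes):
    print(i, frame.get_attribute('src'))

# switch into a frame before locating elements inside it
driver.switch_to.frame(iframes[0])  # index 0 chosen only for illustration
# ... locate the chart/price elements here ...
driver.switch_to.default_content()  # switch back when done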

Is there a broader way to get the src of a WebElement through Selenium?

I have pulled 'src' from other WebElements before, however this one seems to elude me. I am trying to grab all the links to all the images on this webpage. However, even though the WebElement is highlighted by a CSS selector, it does not return the link when I try to grab it through get_attribute('src'). I can grab one image if I get specific with an XPath or CSS selector, but I want to grab all of them (7 URLs) at once and toss them into a list.
Any suggestions?
Here is my code:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.action_chains import ActionChains
from json import dumps, loads, JSONEncoder, JSONDecoder
import re
import json
import timeit
import time
start = timeit.default_timer()
driver = webdriver.Chrome()
wait = WebDriverWait(driver, 5)
def ImageFind():
    driver.get('https://shop.freedommobile.ca/devices/Apple/iPhone_11?sku=190199220546&planSku=Freedom%205GB')
    phoneImage = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '.slide')))
    imageLink = phoneImage[0].get_attribute('src')  # test for grabbing url for just one webelement
    print(len(phoneImage))  # check to make sure I have the right amount of elements being selected
    print(phoneImage)
    print(imageLink)
    return imageLink

ImageFind()
You used the wrong selector: .slide is an li element and doesn't have a src attribute. To get the images, use .slide img as in the example below:
images = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '.slide img')))
for image in images:
    print(image.get_attribute('src'))
Your method:
def image_find():
    driver.get('https://shop.freedommobile.ca/devices/Apple/iPhone_11?sku=190199220546&planSku=Freedom%205GB')
    images = [image.get_attribute('src') for image in wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '.slide img')))]
    return images
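Calling it then gives you the list of URLs directly:

for url in image_find():
    print(url)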
