I want to write a parser for scraping prices; however, I can't find a working method for reading an element's innerHTML.
I don't know why, but Selenium (getAttribute(innerHTML)), PhantomJS (page.evaluate(function(){return document.ElementToParse.innerHTML})) and scrapy-splash (loading the web page with WebPageEngine and parsing the HTML) don't work. The result is always an empty "[]", null, or a WebElement object.
I test my code on banggood's products and also on landing page but result is always the same.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Firefox()
driver.get("https://www.banggood.com/BlitzWolf-Ampcore-Turbo-TC10-3A-Durable-USB-Type-C-Charging-Data-Cable-p-1188424.html?rmmds=category&cur_warehouse=CN") #random url
try:
    # Wait up to 10 seconds for an element with class "item_now_price" to be present.
    element = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, "item_now_price"))
    )
finally:
    # Always close the browser, whether or not the wait succeeded.
    driver.quit()
# Prints the WebElement object itself, not the text it contains.
print(element)
and output:
<selenium.webdriver.firefox.webelement.FirefoxWebElement (session="b0593791-138b-4177-a8f3-e7983143824a", element="d08f4717-d3f1-4594-8f2b-1bf943deb9f9")>
when I need something like:
6.59(or US$6.59)
i also tried
price = driver.find_element_by_class_name('item_now_price').getAttribute("innerHTML")
and
var page = require('webpage').create();
page.open('https://www.banggood.com/BlitzWolf-Ampcore-Turbo-TC10-3A-Durable-USB-Type-C-Charging-Data-Cable-p-1188424.html?rmmds=category&cur_warehouse=CN', function(status) {
    // The function passed to page.evaluate is the one that touches `document`.
    var price = page.evaluate(function() {
        // NOTE(review): the standard DOM method is getElementsByClassName (plural,
        // returns a collection); the call is left as written because it is the
        // code the question is asking about.
        return document.getElementByClassName('item_now_price').innerHTML;
    });
    console.log('price is ' + price);
    phantom.exit();
});
but the result is null, and when I add
page.includeJs(/url/to/js)
terminal stops working
s
Once you get the element in selenium, you can get the text of that element with .text
See the slight adjustment to your first example below:
try:
    element = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, "item_now_price"))
    )
    # .text gives the element's text instead of the WebElement object.
    print(element.text)
finally:
    # Close the browser even if the wait times out.
    driver.quit()
See if that gets the results you're looking for.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get("https://www.banggood.com/BlitzWolf-Ampcore-Turbo-TC10-3A-Durable-USB-Type-C-Charging-Data-Cable-p-1188424.html?rmmds=category&cur_warehouse=CN") #random url
try:
    # Read .text while the session is still open; the browser is closed in `finally`.
    element = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, "item_now_price"))
    ).text
finally:
    driver.quit()
# `element` now holds the price text (a string), not a WebElement.
print(element)
Related
I am currently working on a demo Selenium project with Python. I have been able to navigate to a page but when trying to collect text within a "div class" selenium fails to find the HTML :
Code to be collected
I have made use of the wait functionality but the code still does not find the Html element.
Any suggestions on how to resolve this issue would be appreciated, please see my code below :
Image of my selenium
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import time
import json
# establish Json dict
global data
data = {}
global date
date = '''&checkin=2021-02-22&checkout=2021-02-28&adults=1&source'''
def find_info(place):
    """Search Airbnb for *place*, apply the module-level date range and print the price element.

    Resets ``data[place]`` to an empty list, drives a Chrome session through the
    airbnb.co.uk search form, reloads the results URL with ``date`` substituted
    for the ``&source`` marker, clicks the price-range menu button and prints
    whatever element the final XPath wait returns.

    Args:
        place: Text typed into the search bar; also used as the key in ``data``.
    """
    data[place] = []
    driver = webdriver.Chrome('chromedriver.exe')
    driver.get("https://www.airbnb.co.uk/")
    time.sleep(2)
    #first_page_search_bar
    search_bar = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, "_1xq16jy")))
    time.sleep(2)
    search_bar.clear()
    time.sleep(2)
    search_bar.send_keys(f"{place}")
    time.sleep(2)
    enter_button = driver.find_element_by_class_name("_1mzhry13")
    enter_button.click()
    #load page
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, "_ljad0a")))
    page = driver.current_url
    # Splice the check-in/check-out query parameters in where "&source" appears.
    new_url = page.replace("&source", date)
    # driver = webdriver.Chrome('chromedriver.exe')
    driver.get(new_url)
    time.sleep(3)
    # XPath attribute tests are written with "@"; "#id" is not valid XPath.
    click_button = driver.find_element_by_xpath('//*[@id="menuItemButton-price_range"]/button')
    click_button.click()
    time.sleep(5)
    price = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.XPATH, '/html/body/div[16]/section/div/div/div[2]/div/section/div[2]/div/div/div[1]')))
    # Prints the WebElement object returned by the wait, not its text.
    print(price)
find_info("London, United Kingdom")
I've fixed the xpath at the end of your script:
# Take the first <div dir="ltr"> under the role="menu" div and select its preceding sibling.
price = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.XPATH, '(//div[@role="menu"]//div[@dir="ltr"])[1]/preceding-sibling::div')))
print(price.text)
Explanation: Under the <div role="menu" ... there are 3 <div dir="ltr">elements and the first one happens to be just after the div you are looking for. So we find that one and select the preceding sibling.
Another recommendation: if you replace EC.presence_of_element_located with EC.element_to_be_clickable when you are looking for the input fields at the start, you can get rid of a few time.sleep statements.
I can't get the text from the element. I think the text is added to the element dynamically (by Angular) and is therefore not loaded directly in the element. The text inside the element is in the format of e.g. "3" with quotation marks around it.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
import xlsxwriter
import re
pattern = r"[\"\d{1, 2}\"]"
# Raw string so the backslashes in the Windows path are taken literally.
PATH = r"C:\Program Files (x86)\chromedriver.exe"
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--no-sandbox')
driver = webdriver.Chrome(PATH, chrome_options=chrome_options)
driver.get("some-url")
xpathPain = "/html/body/div[2]/div/div/div[1]/div/div/div[1]/div[3]/div/div/div[1]/div[3]/development-numbers/status-numbers/div/div[2]/div/h4"
try:
    # Wait up to 20 seconds for the element to be present, then look it up again and print its text.
    element = WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.XPATH, xpathPain)))
    elementPain = driver.find_element_by_xpath(xpathPain)
    print(elementPain.text)
except TimeoutException:
    print("Failed to load elementPain")
I get the output: (blank, like an empty string).
I have tried to wait until the text is loaded with the EC text_to_be_present_in_element(locator, text_) and tried to use a regular expression for the text part.
The page source for the element is:
<h4 class="status-numbers__number">
"6"
<!---->
</h4>
So how do I get the number 6 from this element?
I have tried print(elementPain.get_attribute("innerHTML")) and that gets the "<!---->" part of the text but not the '"6"' part. I have also tried .getAttribute("innerText"), .getAttribute("textContent").
I have tried using the firefox geckodriver instead as well. No result.
I have managed to solve the issue using Firefox and this code:
try:
    # Wait for the element to be clickable rather than merely present.
    element = WebDriverWait(driver, 20).until(
        EC.element_to_be_clickable((By.XPATH, xpathPain)))
    elementPain = driver.find_element_by_xpath(xpathPain)
    print(elementPain.get_attribute("innerHTML"))
except TimeoutException:
    print("Failed to load elementPain")
I don't know if it had to do with the element being out of the viewport.
I have managed to solve the issue using Firefox and this code:
try:
    # Wait for the element to be clickable rather than merely present.
    element = WebDriverWait(driver, 20).until(
        EC.element_to_be_clickable((By.XPATH, xpathPain)))
    elementPain = driver.find_element_by_xpath(xpathPain)
    print(elementPain.get_attribute("innerHTML"))
except TimeoutException:
    print("Failed to load elementPain")
I don't know if it had to do with the element being out of the viewport.
Use the following XPath to identify the element.
You can use element.text or element.get_attribute("textContent") to get the text.
try:
    # Locate the <h4> by its class attribute ("@class") instead of an absolute path.
    WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.XPATH, "//h4[@class='status-numbers__number']")))
    elementPain = driver.find_element_by_xpath("//h4[@class='status-numbers__number']")
    print(elementPain.text) #To get the text using text
    print(elementPain.get_attribute("textContent")) #To get the text using get_attribute()
except TimeoutException:
    print("Failed to load elementPain")
Pic of inspect element
from selenium.webdriver.common.keys import Keys
import time
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Raw string so the backslashes in the Windows path are taken literally.
PATH = r"C:\Program Files (x86)\chromedriver.exe"
driver = webdriver.Chrome(PATH)
driver.get('https://website.com/')
driver.maximize_window()
# Fill in the login form and submit it with the Return key.
search = driver.find_element_by_id('UserName')
search.send_keys('UserName')
search = driver.find_element_by_id('Password')
search.send_keys('Password')
search.send_keys(Keys.RETURN)
try:
    element = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.LINK_TEXT, "Admin"))
    )
    # NOTE(review): without parentheses this only references the method and does
    # not call it; the actual click is performed by link.click() below.
    element.click
    link = driver.find_element_by_link_text('Admin')
    link.click()
    element = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.LINK_TEXT, "Reports"))
    )
    # NOTE(review): same as above, a no-op attribute reference.
    element.click
    link = driver.find_element_by_link_text('Reports')
    link.click()
except:
    # NOTE(review): a bare except hides the real error, and the code after this
    # block still uses `driver` after it has been quit.
    driver.quit()
driver.implicitly_wait(5)
sales_link = driver.find_element_by_link_text('Sales').click()
Below is the info from the website, I want to click on Sales but can't seem to do so any help would be appreciated
<a _ngcontent-hyf-c12="" routerlink="./SalesReport" routerlinkactive="active" href="/Reports/SalesReport">Sales </a>
Pic of error
This what appears if I try to click on it with XPATH
Error Pic
In your html pic, I see a whitespace after Sales.
Look carefully: href="/Reports/SalesReport">Sales </a>.
So find_element_by_link_text('Sales') will not work.
You can change it to find_element_by_link_text('Sales ').
However, this will be better:
driver.find_elements_by_xpath("//a[contains(text(), 'Sales')]")
A web page has page numbers which can be clicked to advance to the next page.
The page numbers are in an <a> tag whose href is a JavaScript call.
After this...
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
xpath = 'appropriate_xpath_string'
wait_presence = WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.XPATH, xpath))
)
wait_clickable = WebDriverWait(driver, 10).until(
EC.element_to_be_clickable((By.XPATH, xpath))
)
And calculating scroll_y = distance needed to make element visible...
driver.execute_script('window.scrollTo(0,', scroll_y, ')')
I've tried the following:
driver_find_element_by_xpath('xpath').click()
driver_find_element_by_xpath('xpath').send_keys('\ue007') (Sending the Enter key.)
driver.execute_script("arguments[0].click()", element) (The JavaScript trick.)
ActionChains clicking
Problematic Element:
2
__doThis() function essentially does a form .submit() with some parameters.
Edit:
After sending the click event Developer Tools shows an error:
VM1560:1 Uncaught ReferenceError: __doThis is not defined
at <anonymous>:1:1
After scanning through the HTML/js I noticed that the __doThis function is not defined in ChromeDriver. When I load the page in Edge or Chrome the tag with __doThis function is present. So why is the javascript function not loading in ChromeDriver?
EDIT 2:
I loaded chromedriver.exe without configuring special options and this solved the problem. Now I need to figure out what option was breaking the page's javascript function.
EDIT 3:
It seems this section of code breaks the in-page javascript:
driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
"source": """
Object.defineProperty(navigator, 'webdriver', {
get: () => undefined
})
"""
})
driver.execute_cdp_cmd("Network.enable", {})
driver.execute_cdp_cmd("Network.setExtraHTTPHeaders", {"headers": {"User-Agent": "browser1"}})
I need to do some research to understand why.
Try below code:
wait = WebDriverWait(driver, 10)
wait.until(EC.element_to_be_clickable((By.LINK_TEXT, "2"))).click()
Note : please add below imports to your solution
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
Try the following code, which scrolls the element into view using location_once_scrolled_into_view:
wait_presence = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, xpath)))
wait_presence.location_once_scrolled_into_view
driver.execute_script("arguments[0].click();", wait_presence)
Or use the JavaScript executor to scroll:
wait_presence = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, xpath)))
driver.execute_script("arguments[0].scrollIntoView()", wait_presence)
driver.execute_script("arguments[0].click();", wait_presence)
I am attempting to click the "View More Results" button on the following page: http://www.chadbourne.com/search/people?az[b]=b
My code is straight forward, and I have tried a number of iterations.
driver = driver.Firefox()
driver.get("http://www.chadbourne.com/search/people?az[b]=b")
element = driver.find_element_by_partial_link_text("View more results")
or
element = driver.find_element_by_partial_link_text("view")
or
element = driver.find_element_by_partial_link_text("results")
No matter which of the above options I try, I get a NoSuchElementException.
This is odd, because the element clearly exists on the page:
View more results
Any thoughts?
It takes some time for the page to load, the element is not immediately available. Let's wait for it to be clickable and change the locator to a CSS selector:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
driver = webdriver.Firefox()
driver.get("http://www.chadbourne.com/search/people?az[b]=b")
wait = WebDriverWait(driver, 20)

# get more results: wait for the pager's "next" control to become clickable, then click it
next_page_locator = (By.CSS_SELECTOR, ".load-more-pager-wrapper .pager-next")
more_results = wait.until(EC.element_to_be_clickable(next_page_locator))
more_results.click()