Extract data from a table by field name (XPath, Python)

I want to extract data from this page: https://mbasic.facebook.com/kristina.layus
There is a "Places lived" table with two rows:
Current city --- Moscow, Russia
Home town --- Saint Petersburg, Russia
I can extract the data with a full XPath (it returns "Moscow, Russia"):
/html/body/div/div/div[2]/div/div[1]/div[4]/div/div/div[1]/div/table/tbody/tr/td[2]/div/a
But I want to extract the data by the field names in the table. I tried this:
//div[@id='living']//div[@title='Current City']//a/text()
but received this error:
NoSuchElementException: Message: no such element: Unable to locate element: {"method":"xpath","selector":"//div[@id='living']//div[@title='Current City']//a/text()"}
(Session info: chrome=84.0.4147.89)
My code:
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

class FacebookParser:
    LOGIN_URL = 'https://www.facebook.com/login.php'

    def __init__(self, login, password):
        chrome_options = webdriver.ChromeOptions()
        prefs = {"profile.default_content_setting_values.notifications": 2}
        chrome_options.add_experimental_option("prefs", prefs)
        self.driver = webdriver.Chrome(chrome_options=chrome_options)
        self.wait = WebDriverWait(self.driver, 10)
        self.login(login, password)

    def login(self, login, password):
        self.driver.get(self.LOGIN_URL)
        # wait for the login page to load
        self.wait.until(EC.visibility_of_element_located((By.ID, "email")))
        self.driver.find_element_by_id('email').send_keys(login)
        self.driver.find_element_by_id('pass').send_keys(password)
        self.driver.find_element_by_id('loginbutton').click()

    def get_user_by_id(self, id):
        self.driver.get(BASIC_URL + 'profile.php?id=' + str(id))

    def get_user_by_url(self, url):
        self.driver.get(url)

    def find_element_by_xpath_safe(self, path):
        try:
            # was: parser.driver -- use self.driver so the method does not
            # depend on the module-level instance
            return self.driver.find_element_by_xpath(path)
        except NoSuchElementException:
            return None

    def get_first_name(self):
        res = self.find_element_by_xpath_safe('//span/div/span/strong')
        if res:
            vec = res.text.split()
            if len(vec) > 0:
                return vec[0]
            else:
                print("Can't split {}".format(res.text))
        return ""

    def get_second_name(self):
        res = self.find_element_by_xpath_safe('//span/div/span/strong')
        if res:
            vec = res.text.split()
            if len(vec) > 1:
                return vec[1]
            else:
                print("Can't split {}".format(res.text))
        return ""

    def get_curr_city(self):
        res = self.find_element_by_xpath_safe('/html/body/div/div/div[2]/div/div[1]/div[4]/div/div/div[1]/div/table/tbody/tr/td[2]/div/a')
        if res:
            return res.text
        return ""

    def get_home_town(self):
        res = self.find_element_by_xpath_safe('/html/body/div/div/div[2]/div/div[1]/div[4]/div/div/div[2]/div/table/tbody/tr/td[2]/div/a')
        if res:
            return res.text
        return ""

#####################################
LOGIN = '----.com'
PASSWORD = '----'
BASIC_URL = 'https://mbasic.facebook.com/'
#####################################

parser = FacebookParser(login=LOGIN, password=PASSWORD)
parser.driver.get("https://mbasic.facebook.com/kristina.layus")
print(parser.get_curr_city())

To print the text Moscow, Russia you need to induce WebDriverWait for visibility_of_element_located(), and you can use the following XPath-based locator strategy:
Printing Moscow, Russia:
driver.get('https://mbasic.facebook.com/kristina.layus')
print(WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.XPATH, "//span[text()='Current City']//following::td//a"))).text)
Note: you have to add the following imports:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
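The same label-based strategy generalizes to both table rows. A minimal sketch, assuming the 'Current City' label from the locator above and a 'Hometown' label for the second row (the exact label text for the second row is an assumption and should be checked against the page source):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def get_place(driver, label, timeout=20):
    # locate the value cell by its row label instead of a brittle full XPath
    xpath = "//span[text()='{}']//following::td//a".format(label)
    return WebDriverWait(driver, timeout).until(
        EC.visibility_of_element_located((By.XPATH, xpath))
    ).text

driver = webdriver.Chrome()
driver.get('https://mbasic.facebook.com/kristina.layus')
print(get_place(driver, 'Current City'))  # Moscow, Russia
print(get_place(driver, 'Hometown'))      # label text is an assumption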
References
You can find a couple of relevant discussions on NoSuchElementException in:
selenium.common.exceptions.NoSuchElementException: Message: no such element: Unable to locate element while trying to click Next button with selenium
selenium in python : NoSuchElementException: Message: no such element: Unable to locate element

Try adding the following code between the login step (loginbutton.click()) and opening the target page:
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
WebDriverWait(wd, DELAY).until(EC.presence_of_element_located((By.ID, "mount_0_0")))
This code waits until the login process has finished; only after that should the target page be opened.
Also check your XPath expression: when inspecting the page source, a div element with id="living" can be found, but a div with the attribute title="Current City" is absent.
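Applied to the FacebookParser class above, the fix could look like this (a sketch; the "mount_0_0" id is taken from the suggestion above and may vary between Facebook page revisions):

def login(self, login, password):
    self.driver.get(self.LOGIN_URL)
    # wait for the login page to load
    self.wait.until(EC.visibility_of_element_located((By.ID, "email")))
    self.driver.find_element_by_id('email').send_keys(login)
    self.driver.find_element_by_id('pass').send_keys(password)
    self.driver.find_element_by_id('loginbutton').click()
    # block until the logged-in page has rendered before navigating away
    self.wait.until(EC.presence_of_element_located((By.ID, "mount_0_0")))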

Related

Can't locate HTML element on Rotten Tomatoes

I would like to find a movie and get its ratings on Rotten Tomatoes, but I'm stuck because I don't know how to click on it in the search results section. I tried almost every XPath and class name, but every time I got an error message that the element couldn't be found. I'm using Python Selenium.
My code:
import sys
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def element(driver, by_x, html_element):
    try:
        element = WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((by_x, html_element))
        )
        return element
    except:
        print("Can not locate this element")

class rottenTomatoes:
    def __init__(self, film):
        options = webdriver.ChromeOptions()
        options.add_experimental_option("excludeSwitches", ["enable-logging"])
        self.driver = webdriver.Chrome(executable_path="./drivers/chromedriver", options=options)
        self.film = film
        self.driver.get("https://www.rottentomatoes.com/")

    def search(self):
        # search for a film
        search_bar = self.driver.find_element_by_class_name("search-text")
        search_bar.click()
        search_bar.send_keys(self.film, Keys.RETURN)
        # filter movies only
        element(self.driver, By.XPATH, "//*[@id='main-page-content']/div/section[1]/search-result-container/nav/ul/li[3]/span").click()
        # accept cookies
        self.driver.find_element_by_id("truste-consent-button").click()
        # click on film (THE PROBLEM)
        element(self.driver, By.CLASS_NAME, "media-col thumbnail-group").click()

rottentomatoes = rottenTomatoes("Shawshank")
rottentomatoes.search()
EDIT
Error message:
Can not locate this element
Traceback (most recent call last):
File "d:\Programovanie\selenium\movie_info\rotten_tomat.py", line 38, in <module>
rottentomatoes.search()
File "d:\Programovanie\selenium\movie_info\rotten_tomat.py", line 35, in search
element(self.driver, By.CLASS_NAME, "media-col thumbnail-group").click()
AttributeError: 'NoneType' object has no attribute 'click'
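A side note on the traceback: the element() helper swallows the TimeoutException and returns None, which is why the call site fails with 'NoneType' object has no attribute 'click'. A minimal hardening sketch that lets the timeout surface at the wait instead of at the later .click():

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def element(driver, by_x, html_element, timeout=5):
    # re-raise instead of returning None so failures surface here,
    # not at a later .click()
    return WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((by_x, html_element))
    )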
import sys
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

def element(driver, by_x, html_element):
    try:
        element = WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((by_x, html_element))
        )
        return element
    except:
        print("Can not locate this element")

class rottenTomatoes:
    def __init__(self, film):
        options = webdriver.ChromeOptions()
        options.add_experimental_option("excludeSwitches", ["enable-logging"])
        self.driver = webdriver.Chrome(options=options)
        self.film = film
        self.driver.get("https://www.rottentomatoes.com/")

    def search(self):
        # search for a film
        search_bar = self.driver.find_element_by_class_name("search-text")
        search_bar.click()
        search_bar.send_keys(self.film, Keys.RETURN)
        # filter movies only
        element(self.driver, By.XPATH, "//*[@id='main-page-content']/div/section[1]/search-result-container/nav/ul/li[3]/span").click()
        # accept cookies
        time.sleep(5)
        try:
            self.driver.find_element_by_id("truste-consent-button").click()
        except:
            pass
        ele = self.driver.execute_script(
            "return document.querySelector('search-result-container').shadowRoot.querySelector('[type=\"movie\"]').shadowRoot.querySelector('media-row').shadowRoot.querySelector('[class=\"media-row center\"]')")
        # click on film (THE PROBLEM)
        ele.click()
        time.sleep(10)

rottentomatoes = rottenTomatoes("Shawshank")
rottentomatoes.search()
I added the full code. The element was inside a shadowRoot, so you have to use JavaScript to reach it.
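For readability, the chained shadow-root lookup can be factored into a small helper; a sketch, assuming the same selector chain as the execute_script call above:

def shadow_query(driver, *selectors):
    # resolve the first selector in the document, then descend one
    # shadow root per remaining selector
    element = driver.execute_script(
        "return document.querySelector(arguments[0])", selectors[0])
    for selector in selectors[1:]:
        element = driver.execute_script(
            "return arguments[0].shadowRoot.querySelector(arguments[1])",
            element, selector)
    return element

ele = shadow_query(driver, 'search-result-container', '[type="movie"]',
                   'media-row', '[class="media-row center"]')
ele.click()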
You can use requests:
import requests

search = "Shawshank"
choose_type = "movie"
url = f"https://www.rottentomatoes.com/napi/search/all?type={choose_type}&searchQuery={search}"
r = requests.get(url)
r_json = r.json()
print(r_json)
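A possible follow-up for picking fields out of the response. The napi payload is undocumented, so the key names below are assumptions; inspect r_json.keys() first and adjust:

# the "movies"/"items" key names are assumptions about the undocumented
# napi payload; guard the lookups so a shape change fails quietly
for item in r_json.get("movies", {}).get("items", []):
    print(item.get("name"), item.get("url"))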

Parsing a site where URL doesn't change with Selenium Python

I'm trying to scrape a site whose URL doesn't change when the next page is clicked. So I used Selenium to click on the next page, but doing that doesn't help: my driver keeps getting the old page even after the next page is clicked. Is there any other way to get to the next page and scrape it?
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup

driver = webdriver.Safari()
store_pages = []

#10306 is total number of pages.
for i in range(10306):
    Starting_url = 'site'
    driver.get(Starting_url)
    html = driver.page_source
    soup = BeautifulSoup(html, "lxml")
    print(store_pages.append(i))
    timeout = 20
    try:
        WebDriverWait(driver, timeout).until(EC.visibility_of_element_located((By.XPATH, "//*[@id='ctl00_SPWebPartManager1_g_d6877ff2_42a8_4804_8802_6d49230dae8a_lblDisclaimerMsg']")))
    except TimeoutException:
        print("Timed out waiting for page to load")
        driver.quit()
    nextpage_url = driver.find_element_by_name("ctl00$SPWebPartManager1$g_d6877ff2_42a8_4804_8802_6d49230dae8a$ctl00$imgbtnNext").click()
    timeout = 20
    wait = WebDriverWait(driver, 10).until(EC.text_to_be_present_in_element_value((By.CSS_SELECTOR, '#ctl00_SPWebPartManager1_g_d6877ff2_42a8_4804_8802_6d49230dae8a > div.act_search_results > div.act_search_header'), "206113 Record(s) | Page [2 of 10306]"))
    NGO_element = driver.find_element_by_class_name("faq-sub-content exempted-result")
    NGO_name = NGO_element.find_elements_by_tag_name("h1")
    NGO_name_pancard = driver.find_elements_by_class_name("pan-id")
    NGO_data = NGO_element.find_elements_by_tag_name("ul")
    NGO_sub_data = NGO_element.find_elements_by_tag_name("li")
    for i, p, t in zip(NGO_name, NGO_name_pancard, NGO_data):
        n_name = i.text.replace(p.text, '')
        n_data = t.text
        n_pan = p.text
        print("Name of NGO:", n_name, "Fields of NGO:", n_data, "Pancard number:", n_pan)
    nextpage_url = driver.find_element_by_name("ctl00$SPWebPartManager1$g_d6877ff2_42a8_4804_8802_6d49230dae8a$ctl00$imgbtnNext").click()
    #timeout = 2
#timeout = 2
You need to make sure that when you reach the next page, the content of the earlier page has become stale; otherwise, you will get a stale element error or scrape the same content repeatedly. Try the approach below; it should get you there. The rest you can modify yourself.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
wait = WebDriverWait(driver, 10)
driver.get("http://www.incometaxindia.gov.in/Pages/utilities/exempted-institutions.aspx")

while True:
    for elem in wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "[id^='arrowex']"))):
        print(elem.text)
    try:
        wait.until(EC.presence_of_element_located((By.ID, "ctl00_SPWebPartManager1_g_d6877ff2_42a8_4804_8802_6d49230dae8a_ctl00_imgbtnNext"))).click()
        # wait for the last element of the old page to go stale before re-reading
        wait.until(EC.staleness_of(elem))
    except:
        break

driver.quit()

How to wait until element is available in selenium python

I am writing a script using Selenium with Python, but there is a problem. I have tried to find a solution, but I cannot find one that is helpful to me. Here is the code:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import unittest
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

class sulekhastart(unittest.TestCase):
    def setUp(self):
        self.driver = webdriver.Firefox()

    def test_parse_contact_urls_and_go_to_next_page(self):
        pagenumber = 'Page'
        #assign WEBDRIVER to local webdriver
        driver = self.driver
        #Website open by below url
        driver.get("http://www.sulekha.com/ac-dealers/bangalore")
        self.assertIn("Sulekha Bangalore", driver.title)
        #close the lightbox that appears on the first load of the page
        startlightbox = driver.find_element_by_xpath('//a[@class="lcf-close"]')
        startlightbox.click()
        while True:
            #get the page number
            pageno = driver.find_element_by_xpath('//li[@id="numberPage"]/strong')
            print(pageno.text)
            print(pagenumber)
            #check if page same as last page or not
            if str(pageno.text) != pagenumber:
                pagenumber = str(pageno.text)
                businessname = driver.find_elements_by_xpath('//li/div/div[@class="busi-name"]/h3/a')
                records = len(businessname)
                #print all data that are available on the webpage
                for i in range(0, records):
                    print(businessname[i].get_attribute('href'))
                    print(businessname[i].text)
                nextpage = driver.find_element_by_xpath('//li[@id="nextPage"]')
                nextpage.click()
            else:
                print('This is last page all data is scraped change url and get another data')
                break
            element = WebDriverWait(driver, 10).until_not(EC.presence_of_element_located((By.XPATH, "/html/body/div/div/svg")))

    def tearDown(self):
        self.driver.close()
        print('page not be closed')

if __name__ == "__main__":
    unittest.main()
I want the script to wait, after clicking the next button, until the element at By.XPATH, "/html/body/div/div/svg" is gone from the DOM or page source, and then wait a further 3 seconds.
As andersson commented, replacing
element = WebDriverWait(driver, 10).until_not(
    EC.presence_of_element_located((
        By.XPATH, "/html/body/div/div/svg")))
with
element = WebDriverWait(driver, 10).until_not(
    EC.presence_of_element_located((
        By.XPATH, "/html/body/div/div/*[name()='svg']")))
solves the problem: svg lives in the SVG namespace, so the plain svg node test never matches, while *[name()='svg'] does.
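Putting it together with the extra 3-second pause the question asks for (a sketch; time.sleep is a blunt instrument, but it matches the stated requirement):

import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

nextpage.click()  # 'nextpage' is the next-button element from the question's code
# wait until the svg loader has left the DOM...
WebDriverWait(driver, 10).until_not(
    EC.presence_of_element_located((
        By.XPATH, "/html/body/div/div/*[name()='svg']")))
# ...then pause the additional 3 seconds the question asks for
time.sleep(3)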

Selenium: how to get the new page after clicking the more button?

I'm using Selenium to get a page with content; after I click the more button, the page loads more content. How do I get the updated page through the webdriver?
Some code like this:
def parase_questions(self):
    driver = self.login()
    driver.implicitly_wait(2)
    more_btn = driver.find_element_by_css_selector(".zg-btn-white.zg-r3px.zu-button-more")
    more_btn.click()
    # should I do something to get the new driver ?
    print(driver.page_source)
    question_links = driver.find_elements_by_css_selector('.question_link')
    print(len(question_links))
If I understand you correctly, after you click the More button, there are more elements with question_link class loaded. You would need a way to wait for the question links to be loaded.
Here is one idea: a custom expected condition that helps you wait until there are more than N elements:
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.support import expected_conditions as EC

class wait_for_more_than_n_elements(object):
    def __init__(self, locator, count):
        self.locator = locator
        self.count = count

    def __call__(self, driver):
        try:
            count = len(EC._find_elements(driver, self.locator))
            return count > self.count
        except StaleElementReferenceException:
            return False
Usage:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait

driver = self.login()
driver.implicitly_wait(2)

question_links = driver.find_elements_by_css_selector('.question_link')

more_btn = driver.find_element_by_css_selector(".zg-btn-white.zg-r3px.zu-button-more")
more_btn.click()

# wait until more question links than before have loaded
wait = WebDriverWait(driver, 10)
wait.until(wait_for_more_than_n_elements((By.CSS_SELECTOR, ".question_link"), len(question_links)))

# now more question links were loaded, get the page source
print(driver.page_source)

Can I make Selenium wait for completion of XHR requests?

I'm trying to scrape pricing data for numerous vehicles, for example:
http://www.leasingcar.dk/privatleasing/Citro%C3%ABn-C1/VTi-68-Feel
I'm iterating over the selection boxes "leasingPeriod" and then "annualMileage".
My problem is that by the time the request has returned, I've already scraped the data, so I'm retrieving the same price every time. I've tried to use an implicit wait, but it doesn't seem to have any effect. I've also tried to wait for the completion of the Ajax calls, but to no avail.
My code looks like this:
# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.support.select import Select
from selenium.common.exceptions import StaleElementReferenceException, WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import unittest

class DataTest(unittest.TestCase):
    def setUp(self):
        self.driver = webdriver.Firefox()
        self.driver.get("http://www.leasingcar.dk/privatleasing")

    def testData(self):
        driver = self.driver
        vehicleLinksList = []
        vehicleLinks = driver.find_elements_by_css_selector('div.vehicle[data-nice_url]')
        for linkElement in vehicleLinks:
            vehicleLinksList.append(linkElement.get_attribute("data-nice_url"))
        for link in vehicleLinksList:
            fullUrl = "http://www.leasingcar.dk" + str(link)
            driver.get(fullUrl)
            leasingPeriodElements = driver.find_element_by_css_selector("select[id=leasingPeriod]") #get the select element
            periodsOptions = leasingPeriodElements.find_elements_by_tag_name("option") #get all the options into a list
            mileageElements = driver.find_element_by_css_selector("select[id=annualMileage]") #get the select element
            mileageOptions = mileageElements.find_elements_by_tag_name("option") #get all the options into a list
            periodOptionsList = []
            mileageOptionList = []
            for option in periodsOptions:
                periodOptionsList.append(option.get_attribute("value"))
            for option in mileageOptions:
                mileageOptionList.append(option.get_attribute("value"))
            for optionValue in periodOptionsList:
                print("starting loop on option %s" % optionValue)
                leasingPeriodElement = Select(driver.find_element_by_css_selector("select[id=leasingPeriod]"))
                leasingPeriodElement.select_by_value(optionValue)
                for mileageValue in mileageOptionList:
                    mileageElement = Select(driver.find_element_by_css_selector("select[id=annualMileage]"))
                    mileageElement.select_by_value(mileageValue)
                    #driver.implicitly_wait(10)
                    #WebDriverWait(driver, 10).until(ajax_complete, "Timeout waiting for page to load")
                    wait = WebDriverWait(driver, 10)
                    price = wait.until(wait_for_visible_element_text_to_contain((By.CSS_SELECTOR, "span.total-price"), "Kr."))
                    print(price.text)
                    #driver.refresh()
                    #driver.implicitly_wait(10)

    def tearDown(self):
        self.driver.quit()

class wait_for_visible_element_text_to_contain(object):
    def __init__(self, locator, text):
        self.locator = locator
        self.text = text

    def __call__(self, driver):
        try:
            elements = EC._find_elements(driver, self.locator)
            for element in elements:
                if self.text in element.text and element.is_displayed():
                    return element
        except StaleElementReferenceException:
            return False

def ajax_complete(driver):
    try:
        return 0 == driver.execute_script("return jQuery.active")
    except WebDriverException:
        pass

# keep the entry point last so the helper classes above are defined
# before the tests run
if __name__ == '__main__':
    unittest.main()
Is there any way to check whether requests have been completed, or if a value has been refreshed?
Frank,
Use an explicit wait to determine when the "Leasing Period" changes in the data table. Something like:
from selenium.webdriver.support.ui import WebDriverWait
xpath = "//div[@class='data-last']/span[@class='period']"
for elm in driver.find_elements_by_xpath(xpath):
    if elm.is_displayed():
        WebDriverWait(driver, 10).until(
            lambda _: elm.text == "48"
        )
NOTE: I had to use find_elements_by_xpath and check if the elements are displayed because there is a hidden element with that same xpath.
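Alternatively, the ajax_complete helper already defined in the question can be used directly as an explicit wait condition, assuming the site loads jQuery. A sketch of the inner loop, reusing WebDriverWait and the question's own names:

from selenium.webdriver.support.ui import WebDriverWait

mileageElement.select_by_value(mileageValue)
# block until jQuery reports no active XHR requests
WebDriverWait(driver, 10).until(
    ajax_complete, "Timeout waiting for pending requests")
price = driver.find_element_by_css_selector("span.total-price")
print(price.text)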
