Find specific text in a web page via Selenium - Python

How can I get Selenium to search for a specific line in a website's source code? I am searching for the line shown in the attached screenshot.
Thanks in advance!
(screenshot of the target element in the page source attached)

You can get the inner text of an element in either of two ways:
inner_text = element.get_attribute('innerText')
inner_text = element.text
Therefore, you can scan all those divs and keep the one where inner_text == "Fehler".
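As an aside, the two calls are not always interchangeable: element.text returns only the rendered, visible text, while the textContent attribute also includes text hidden via CSS. A minimal sketch, assuming element is any WebElement you have already located:

visible_text = element.text                        # what the user actually sees
full_text = element.get_attribute('textContent')   # includes hidden text as well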
Since the ids follow a pattern, here's how you can scan the divs and select the element:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def id_value(index):
    return f"cdk-describe-message-{index}"

def find_innertext(driver, url, text, N):
    driver.get(url)
    for idx in range(N):
        # find_element (singular) returns a single WebElement
        element = driver.find_element(By.ID, id_value(idx))
        if element.text == text:
            return element

def main():
    driver = webdriver.Firefox()
    url = "https://...."
    target = "Fehler"
    number_of_divs = 40
    return find_innertext(driver, url, target, number_of_divs)

if __name__ == "__main__":
    element = main()
Notes:
you have to know the number of div elements in advance;
consider using waits (a sketch follows below).
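If you would rather not hard-code the number of divs, an alternative is to match the text directly with an XPath and wait for it. A minimal sketch, assuming the target text "Fehler" is the complete text of the div (the 30-second timeout is an arbitrary choice):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# wait until a div whose entire text is "Fehler" appears, then return it
element = WebDriverWait(driver, 30).until(
    EC.presence_of_element_located((By.XPATH, '//div[text()="Fehler"]'))
)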

This is what I've got so far:
import time

from selenium import webdriver
from selenium.webdriver.common.by import By

def main():
    chrome_options = webdriver.ChromeOptions()
    #chrome_options.add_argument("--incognito")
    chrome_options.add_argument('ignore-certificate-errors')
    #chrome_options.add_argument("--headless")
    driver = webdriver.Chrome(options=chrome_options)
    smalogin = "https://172.16.63.100/webui/login"
    driver.get(smalogin)
    driver.implicitly_wait(100)
    email = driver.find_element(By.NAME, "username")
    email.send_keys('user')
    password = driver.find_element(By.NAME, "password")
    password.send_keys('pass')
    submit = driver.find_element("xpath", '//*[@id="login"]/button')
    submit.click()
    monitoring = driver.find_element(By.ID, "ennexos-element-monitoring")
    monitoring.click()
    statusliste = driver.find_element("xpath", '//*[@id="cdk-accordion-child-0"]/div/div/sma-feature-board-slot[1]/sma-navigation-link')
    statusliste.click()
    time.sleep(10)

if __name__ == '__main__':
    main()
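As a side note, the long implicitly_wait(100) could be replaced with targeted explicit waits. A minimal sketch using the same selectors as the snippet above (the credentials are the same placeholders):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

wait = WebDriverWait(driver, 30)
# wait until the username field is present before typing into it
wait.until(EC.presence_of_element_located((By.NAME, "username"))).send_keys('user')
driver.find_element(By.NAME, "password").send_keys('pass')
# wait until the login button is clickable before clicking it
wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="login"]/button'))).click()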

Related

Instagram Comment Scraping, Scrapes Username instead of comment

I am trying to scrape usernames and comments from multiple Instagram posts using the code below.
from selenium.webdriver.common.by import By
from selenium import webdriver
import time
import sys
import pandas as pd
from pandas import ExcelWriter
import os.path
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains

url = ['https://www.instagram.com/p/CRLe53_hmMH',
       'https://www.instagram.com/p/CRX7VL1sL54/?utm_medium=share_sheet',
       'https://www.instagram.com/p/CRVB7ykM7-R/?utm_medium=share_sheet',
       'https://www.instagram.com/p/CRQ9Bq5M6ce/?utm_medium=share_sheet',
       'https://www.instagram.com/p/CRQT1BJMmSi/?utm_medium=share_sheet',
       'https://www.instagram.com/p/CM8T3HgMQG0/?utm_medium=copy_link',
       'https://www.instagram.com/p/COrn5fYs78O/?utm_medium=share_sheet']
user_names = []
user_comments = []
driver = webdriver.Chrome('E:/chromedriver')
driver.get(url[0])
time.sleep(3)
username = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "input[name='username']")))
password = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "input[name='password']")))
username.clear()
username.send_keys('myuname')
password.clear()
password.send_keys('mypassword')
Login_button = WebDriverWait(driver, 2).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "button[type='submit']"))).click()
time.sleep(4)
not_now = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, '//button[contains(text(), "Not Now")]'))).click()
not_now2 = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, '//button[contains(text(), "Not Now")]'))).click()

for n in url:
    try:
        driver.get(n)
        time.sleep(3)
        load_more_comment = driver.find_element_by_class_name('glyphsSpriteCircle_add__outline__24__grey_9')
        print("Found {}".format(str(load_more_comment)))
        i = 0
        while load_more_comment.is_displayed() and i < 10:
            load_more_comment.click()
            time.sleep(1.5)
            load_more_comment = driver.find_element_by_class_name('glyphsSpriteCircle_add__outline__24__grey_9')
            print("Found {}".format(str(load_more_comment)))
            i += 1
        user_names.pop(0)
        user_comments.pop(0)
    except Exception as e:
        print(e)
        pass
    comment = driver.find_elements_by_class_name('gElp9 ')
    for c in comment:
        container = c.find_element_by_class_name('C4VMK')
        name = container.find_element_by_class_name('_6lAjh ').text
        content = container.find_element_by_tag_name('span').text
        content = content.replace('\n', ' ').strip().rstrip()
        user_names.append(name)
        user_comments.append(content)
        print(content)
    user_names.pop(0)
    user_comments.pop(0)
    #export(user_names, user_comments)
driver.close()
df = pd.DataFrame(list(zip(user_names, user_comments)), columns=['Name', 'Comments'])
#df.to_excel('ujicoba_gabung_IG_6.xlsx')
print(df)
But somehow, instead of returning usernames and comments, both user_names and user_comments end up containing usernames. Where did I make a mistake?
(screenshot of my outputs attached)
I think the problem is in the for loop where I select the container by the class C4VMK, but when I inspect the element on Instagram it is the same.
There are two spans in the C4VMK class. The first is at h3 -> first div -> span, and the second is the one you want.
To get the second span, which is the comment, replace your code with the line below and take the second element:
content = container.find_elements_by_tag_name('span')[1].text
Your container is correct. However, when you search for a span by tag name like this:
content = container.find_element_by_tag_name('span').text
Selenium will find the first span under the container, which in this case is the username span with the class 'Jv7Aj mArmR MqpiF '.
What you are looking for is the other span highlighted in the image, which is a direct child of the container with an empty class.
You can select it like this:
content = container.find_element_by_xpath("./span[@class='']")
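As a side note, the find_element_by_* helpers used throughout these snippets were deprecated and later removed in Selenium 4; the equivalent call in the current API would presumably look like this (same selector as above):

from selenium.webdriver.common.by import By

content = container.find_element(By.XPATH, "./span[@class='']").text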

Conditional dropdown for loop is not working in the expected way

I posted this on Stack Exchange earlier but did not get much response, so I am posting it here.
I am trying to scrape some data using the following code. When I run the code line by line, it works fine. However, when I run it all in one go, the dropdown options go blank and, as a result, the last line returns an error. Your help would be much appreciated. The code is below.
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.select import Select
import os

path = os.path.join(r"D:\ScrapedData\TN\SocialAudit")
path_to_chromedriver = 'D:\ScrapedData/chromedriver'
options = webdriver.ChromeOptions()
prefs = {'download.default_directory': path}
options.add_experimental_option('prefs', prefs)
browser = webdriver.Chrome(chrome_options=options, executable_path=path_to_chromedriver)

url = "http://mnregaweb4.nic.in/netnrega/SocialAudit/StateList.aspx"
browser.get(url)
browser.set_page_load_timeout(45)
browser.maximize_window()
browser.find_element_by_link_text("BIHAR").click()
browser.implicitly_wait(5)
year = ['2016-2017', '2017-2018', '2018-2019', '2019-2020']
elem2 = browser.find_element_by_name("ctl00$ContentPlaceHolder1$ddlFin")
elem2.send_keys(year[0])
browser.implicitly_wait(5)
select_dist = browser.find_element_by_name("ctl00$ContentPlaceHolder1$ddldist")
options = [x for x in select_dist.find_elements_by_tag_name("option")]
dist = []
for e in range(len(options)):
    select_dist = Select(browser.find_element_by_name("ctl00$ContentPlaceHolder1$ddldist"))
    select_dist.select_by_index(e)
    select_block = Select(browser.find_element_by_name("ctl00$ContentPlaceHolder1$ddlblock"))
    options1 = select_block.options
    for f in range(len(options1)):
        select_block = Select(browser.find_element_by_name("ctl00$ContentPlaceHolder1$ddlblock"))
        select_block.select_by_index(f)
        select_gp = Select(browser.find_element_by_name("ctl00$ContentPlaceHolder1$ddlpanchayat"))
        options2 = select_gp.options
        for g in range(len(options2)):
            select_gp = Select(browser.find_element_by_name("ctl00$ContentPlaceHolder1$ddlpanchayat"))
            select_gp.select_by_index(g)
            browser.find_element_by_css_selector("#ctl00_ContentPlaceHolder1_rbLoginLevel_1").click()
            browser.implicitly_wait(10)
            elem6 = browser.find_element_by_name("ctl00$ContentPlaceHolder1$txtperiodFrom")
            elem6.send_keys('01/04/2016')
            browser.implicitly_wait(10)
            elem7 = browser.find_element_by_name("ctl00$ContentPlaceHolder1$txtperiodTo")
            elem7.send_keys('31/03/2017')
            browser.implicitly_wait(10)
            browser.find_element_by_css_selector("#ctl00_ContentPlaceHolder1_login").click()
            browser.implicitly_wait(10)
            browser.find_element_by_link_text("Download All Reports").click()
Besides the fact that the target page is slower than an aged snail and those 10-second waits are barely enough for anything, there are two things you missed, and those caused your troubles:
you did not take into account that the first option of each select is a "select an option" placeholder. So if you cycle through all of them, you must skip the option at index 0, otherwise it will look like nothing is selected;
wait for that spinner. After the spinner is gone, the page is refreshed. Do not grab elements before the page refresh is complete; wait until the spinner is gone.
With these two helper functions it is possible to press the "Get Reports" button without issues:
def is_spinner_gone(arg):
    loaded_spinner = browser.find_element_by_xpath('//div[//div[@class="loader"]]')
    if loaded_spinner:
        return loaded_spinner.get_attribute('style') == 'display: none;'
    return True

def wait_for_element(xpath):
    # this is necessary because the spinner does not pop up instantly
    time.sleep(1)
    no_spinner = WebDriverWait(browser, 500).until(is_spinner_gone)
    element = WebDriverWait(browser, 500).until(
        EC.element_to_be_clickable((By.XPATH, xpath)))
    return element
If you get your elements via the wait_for_element call, you'll be able to interact with them without errors. I guess you know that pressing that button is not the end of the road yet; you'll have to choose the report format and who knows what else later on.
Adjusted code:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.select import Select
import os
import time

path = os.path.join(r"D:\ScrapedData\TN\SocialAudit")
path_to_chromedriver = 'D:\ScrapedData/chromedriver'
options = webdriver.ChromeOptions()
prefs = {'download.default_directory': path}
options.add_experimental_option('prefs', prefs)
browser = webdriver.Chrome(chrome_options=options, executable_path=path_to_chromedriver)

start = time.time()
url = "http://mnregaweb4.nic.in/netnrega/SocialAudit/StateList.aspx"
browser.get(url)
browser.set_page_load_timeout(45)
browser.maximize_window()
loaded = time.time()
print(f'PAGE LOADED IN {loaded-start} seconds')
browser.find_element_by_link_text("BIHAR").click()

def is_spinner_gone(arg):
    loaded_spinner = browser.find_element_by_xpath('//div[//div[@class="loader"]]')
    if loaded_spinner:
        return loaded_spinner.get_attribute('style') == 'display: none;'
    return True

def wait_for_element(xpath):
    # this is necessary because the spinner does not pop up instantly
    time.sleep(1)
    no_spinner = WebDriverWait(browser, 500).until(is_spinner_gone)
    element = WebDriverWait(browser, 500).until(
        EC.element_to_be_clickable((By.XPATH, xpath)))
    return element

year = ['2016-2017', '2017-2018', '2018-2019', '2019-2020']
elem2 = wait_for_element('//*[@name="ctl00$ContentPlaceHolder1$ddlFin"]')
selector_page_loaded = time.time()
print(f'WORK AREA LOADED IN {selector_page_loaded-loaded} seconds')
elem2.send_keys(year[0])
select_dist = wait_for_element('//*[@name="ctl00$ContentPlaceHolder1$ddldist"]')
options = [x for x in select_dist.find_elements_by_tag_name("option")]
dist = []
# ISSUE: default fields are included in the options!
for e in range(1, len(options)):
    select_dist = Select(wait_for_element('//*[@name="ctl00$ContentPlaceHolder1$ddldist"]'))
    select_dist.select_by_index(e)
    select_block = Select(wait_for_element('//*[@name="ctl00$ContentPlaceHolder1$ddlblock"]'))
    options1 = select_block.options
    for f in range(1, len(options1)):
        select_block = Select(wait_for_element('//*[@name="ctl00$ContentPlaceHolder1$ddlblock"]'))
        select_block.select_by_index(f)
        select_gp = Select(wait_for_element('//*[@name="ctl00$ContentPlaceHolder1$ddlpanchayat"]'))
        options2 = select_gp.options
        for g in range(1, len(options2)):
            select_gp = Select(wait_for_element('//*[@name="ctl00$ContentPlaceHolder1$ddlpanchayat"]'))
            select_gp.select_by_index(g)
            wait_for_element('//*[@id="ctl00_ContentPlaceHolder1_rbLoginLevel_1"]').click()
            elem6 = wait_for_element('//*[@name="ctl00$ContentPlaceHolder1$txtperiodFrom"]')
            elem6.send_keys('01/04/2016')
            elem7 = wait_for_element('//*[@name="ctl00$ContentPlaceHolder1$txtperiodTo"]')
            elem7.send_keys('31/03/2017')
            wait_for_element('//*[@value="Get Reports"]').click()
            print(f'FIRST RUN IN {time.time()-selector_page_loaded}')
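As an aside, if the spinner reliably toggles display: none, the custom is_spinner_gone predicate could arguably be replaced with Selenium's built-in invisibility condition. A minimal sketch (the div.loader selector is an assumption based on the class name used above):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# waits until the loader element is either invisible or absent from the DOM
WebDriverWait(browser, 500).until(
    EC.invisibility_of_element_located((By.CSS_SELECTOR, "div.loader")))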

How to wait until element is available in selenium python

I am writing a script using Selenium with Python, but there is a problem I have tried to find a solution for and could not find one that was helpful to me. Here is the code:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import unittest
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

class sulekhastart(unittest.TestCase):
    def setUp(self):
        self.driver = webdriver.Firefox()

    def test_parse_contact_urls_and_go_to_next_page(self):
        pagenumber = 'Page'
        # assign the webdriver to a local variable
        driver = self.driver
        # open the website at the url below
        driver.get("http://www.sulekha.com/ac-dealers/bangalore")
        self.assertIn("Sulekha Bangalore", driver.title)
        # close the lightbox that appears on the first load of the page
        startlightbox = driver.find_element_by_xpath('//a[@class="lcf-close"]')
        startlightbox.click()
        while True:
            # get the page number
            pageno = driver.find_element_by_xpath('//li[@id="numberPage"]/strong')
            print pageno.text
            print pagenumber
            # check whether this page is the same as the last page
            if str(pageno.text) != pagenumber:
                pagenumber = str(pageno.text)
                businessname = driver.find_elements_by_xpath('//li/div/div[@class="busi-name"]/h3/a')
                records = len(businessname)
                # print all the data available on the page
                for i in range(0, records):
                    print businessname[i].get_attribute('href')
                    print businessname[i].text
                nextpage = driver.find_element_by_xpath('//li[@id="nextPage"]')
                nextpage.click()
            else:
                print 'This is last page all data is scraped change url and get another data'
                break
        element = WebDriverWait(driver, 10).until_not(EC.presence_of_element_located((By.XPATH, "/html/body/div/div/svg")))

    def tearDown(self):
        self.driver.close()
        print 'page not be closed'

if __name__ == "__main__":
    unittest.main()
I want the script to wait, after clicking the next button, until the element at By.XPATH, "/html/body/div/div/svg" is gone from the DOM or page source, and then wait up to 3 more seconds.
As andersson commented, replacing
element = WebDriverWait(driver, 10).until_not(
    EC.presence_of_element_located((
        By.XPATH, "/html/body/div/div/svg")))
with
element = WebDriverWait(driver, 10).until_not(
    EC.presence_of_element_located((
        By.XPATH, "/html/body/div/div/*[name()='svg']")))
solves the problem. The reason is that svg elements live in the SVG namespace, so a plain svg tag name in XPath does not match them; matching on *[name()='svg'] sidesteps the namespace issue.
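If you prefer CSS selectors, they are not affected by this XPath namespace quirk, so an equivalent wait could presumably be written as below (note the bare svg selector is broader than the original absolute XPath; scope it with parent selectors if needed):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# wait until no matching svg element is present in the DOM
WebDriverWait(driver, 10).until_not(
    EC.presence_of_element_located((By.CSS_SELECTOR, "svg")))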

Selenium: how to get the updated page after clicking the more button?

Q:
I'm using Selenium to load a page with content, and after I click the more button, the page loads more content. How do I get the new page through webdriver?
Some code like this:
def parase_questions(self):
    driver = self.login()
    driver.implicitly_wait(2)
    more_btn = driver.find_element_by_css_selector(".zg-btn-white.zg-r3px.zu-button-more")
    more_btn.click()
    # should I do something to get the new driver?
    print driver.page_source
    question_links = driver.find_elements_by_css_selector('.question_link')
    print len(question_links)
If I understand you correctly, after you click the More button, more elements with the question_link class are loaded, so you need a way to wait until those question links have loaded.
Here is one idea - a custom Expected Condition that would help you to wait until there are more than N number of elements:
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.support import expected_conditions as EC

class wait_for_more_than_n_elements(object):
    def __init__(self, locator, count):
        self.locator = locator
        self.count = count

    def __call__(self, driver):
        try:
            count = len(EC._find_elements(driver, self.locator))
            return count > self.count
        except StaleElementReferenceException:
            return False
Usage:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait

driver = self.login()
driver.implicitly_wait(2)
question_links = driver.find_elements_by_css_selector('.question_link')
more_btn = driver.find_element_by_css_selector(".zg-btn-white.zg-r3px.zu-button-more")
more_btn.click()
# wait
wait = WebDriverWait(driver, 10)
wait.until(wait_for_more_than_n_elements((By.CSS_SELECTOR, ".question_link"), len(question_links)))
# now that more question links have loaded, get the page source
print(driver.page_source)
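For what it's worth, the same wait can be expressed without a custom class by handing WebDriverWait.until a lambda; a minimal sketch under the same assumptions as the snippet above:

# remember how many links existed before the click
n = len(question_links)
# the lambda receives the driver and succeeds once more links are present
wait.until(lambda d: len(d.find_elements_by_css_selector(".question_link")) > n)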

Can I make Selenium wait for completion of Xhr requests

I'm trying to scrape pricing data for numerous vehicles, for example:
http://www.leasingcar.dk/privatleasing/Citro%C3%ABn-C1/VTi-68-Feel
I'm iterating over the selection boxes "leasingPeriod" and then "annualMileage".
My problem is that by the time the request has returned, I've already scraped the data, so I'm retrieving the same price every time. I've tried to use an implicit wait, but it doesn't seem to have any effect. I've also tried to wait for the completion of the Ajax calls, but to no avail.
My code looks like this:
# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.support.select import Select
from selenium.common.exceptions import StaleElementReferenceException, WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import unittest

class DataTest(unittest.TestCase):
    def setUp(self):
        self.driver = webdriver.Firefox()
        self.driver.get("http://www.leasingcar.dk/privatleasing")

    def testData(self):
        driver = self.driver
        vehicleLinksList = []
        vehicleLinks = driver.find_elements_by_css_selector('div.vehicle[data-nice_url]')
        for linkElement in vehicleLinks:
            vehicleLinksList.append(linkElement.get_attribute("data-nice_url"))
        for link in vehicleLinksList:
            fullUrl = "http://www.leasingcar.dk" + str(link)
            driver.get(fullUrl)
            leasingPeriodElements = driver.find_element_by_css_selector("select[id=leasingPeriod]")  # get the select element
            periodsOptions = leasingPeriodElements.find_elements_by_tag_name("option")  # get all the options into a list
            mileageElements = driver.find_element_by_css_selector("select[id=annualMileage]")  # get the select element
            mileageOptions = mileageElements.find_elements_by_tag_name("option")  # get all the options into a list
            periodOptionsList = []
            mileageOptionList = []
            for option in periodsOptions:
                periodOptionsList.append(option.get_attribute("value"))
            for option in mileageOptions:
                mileageOptionList.append(option.get_attribute("value"))
            for optionValue in periodOptionsList:
                print "starting loop on option %s" % optionValue
                leasingPeriodElement = Select(driver.find_element_by_css_selector("select[id=leasingPeriod]"))
                leasingPeriodElement.select_by_value(optionValue)
                for mileageValue in mileageOptionList:
                    mileageElement = Select(driver.find_element_by_css_selector("select[id=annualMileage]"))
                    mileageElement.select_by_value(mileageValue)
                    #driver.implicitly_wait(10)
                    #WebDriverWait(driver, 10).until(ajax_complete, "Timeout waiting for page to load")
                    wait = WebDriverWait(driver, 10)
                    price = wait.until(wait_for_visible_element_text_to_contain((By.CSS_SELECTOR, "span.total-price"), "Kr."))
                    print price.text
                    #driver.refresh()
                    #driver.implicitly_wait(10)

    def tearDown(self):
        self.driver.quit()

class wait_for_visible_element_text_to_contain(object):
    def __init__(self, locator, text):
        self.locator = locator
        self.text = text

    def __call__(self, driver):
        try:
            elements = EC._find_elements(driver, self.locator)
            for element in elements:
                if self.text in element.text and element.is_displayed():
                    return element
        except StaleElementReferenceException:
            return False

def ajax_complete(driver):
    try:
        return 0 == driver.execute_script("return jQuery.active")
    except WebDriverException:
        pass

if __name__ == '__main__':
    unittest.main()
Is there any way to check whether requests have been completed, or if a value has been refreshed?
Frank,
Use an explicit wait to determine when the "Leasing Period" changes in the data table. Something like:
from selenium.webdriver.support.ui import WebDriverWait

xpath = "//div[@class='data-last']/span[@class='period']"
for elm in driver.find_elements_by_xpath(xpath):
    if elm.is_displayed():
        WebDriverWait(driver, 10).until(
            lambda _: elm.text == "48"
        )
NOTE: I had to use find_elements_by_xpath and check whether the elements are displayed because there is a hidden element with that same xpath.
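Alternatively, the ajax_complete helper already defined (but left commented out) in the question can be wired into an explicit wait; this sketch assumes the page really does use jQuery, since it polls jQuery.active:

from selenium.webdriver.support.wait import WebDriverWait

# block until jQuery reports no in-flight XHR requests
WebDriverWait(driver, 10).until(ajax_complete, "Timeout waiting for XHR requests to finish")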
