I am working on a scraping project using selenium in python, but I am running into an error when I try to print text that I get from an XPath element. The error says:
print(AdditionalCerts.text)
AttributeError: 'str' object has no attribute 'text'
Here is my code:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from time import sleep
from selenium.common.exceptions import NoSuchElementException
driver = webdriver.Chrome("/Users/nzalle/Downloads/chromedriver")
driver.get("https://directory.bcsp.org/")
count = int(input("Number of Pages to Scrape: "))
body = driver.find_element_by_xpath("//body")
profile_count = driver.find_elements_by_xpath("//div[#align='right']/a")
while len(profile_count) < count: # Get links up to "count"
body.send_keys(Keys.END)
sleep(1)
profile_count = driver.find_elements_by_xpath("//div[#align='right']/a")
for link in profile_count: # Calling up links
temp = link.get_attribute('href') # temp for
driver.execute_script("window.open('');") # open new tab
driver.switch_to.window(driver.window_handles[1]) # focus new tab
driver.get(temp)
# Scraping Code
try:
Name = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[1]/div[2]/div')
IssuedBy = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[1]/td[1]/div[2]')
CertificationNumber = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[1]/td[3]/div[2]')
CertfiedSince = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[3]/td[1]/div[2]')
RecertificationCycle = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[3]/td[3]/div[2]')
Expires = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[5]/td[1]/div[2]')
AccreditedBy = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[5]/td[3]/div[2]/a')
AdditionalCerts = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[1]/div[8]/div[1]/a/div/div')
except NoSuchElementException:
Name = "N/A"
IssuedBy = "N/A"
CertificationNumber = "N/A"
CertfiedSince = "N/A"
RecertificationCycle = "N/A"
Expires = "N/A"
AccreditedBy = "N/A"
AdditionalCerts = "N/A"
print(Name.text + " ; " + IssuedBy.text + " : " + CertificationNumber.text + " : " + CertfiedSince.text + " : " + RecertificationCycle + " : " + Expires.text + " : " + AccreditedBy.text + " : " + AdditionalCerts.text)
driver.close()
driver.switch_to.window(driver.window_handles[0])
driver.close()
Please let me know how to view the text from my Name, IssuedBy, CertificationNumber, ext. Thank You :)
Strings in Python don't have a value .text you need to move these into the try.
Name = driver.find_element_by_xpath(...).text
print(Name + " ; " + ...)
I have created simple automation which will first read the read subdomain folder and then it will get into it and then find for the specific html file. If it will find it (i.e: .html) then edit it and i want to add the specific tag somewhere in that file but i am unable to do it.
staticWord = "Hair"
htmlTag = "<a href='"+ staticRandomPathList[0] + "'>" + staticWord + "</a>"
print(htmlTag)
# Now on we are working statically
folderfound = 0
filefound = 0
for domainNameFolder in range(len(staticRandomPathList)):
subDomainSelectedFilesAddress = driver.find_element(By.XPATH,"//table/tbody/tr[" + str(domainNameFolder + 1) + "]/td[" + str(1) + "]")
subDomainName = new_list[domainNameFolder] + '.' + domain_list[domain_variable]
if subDomainSelectedFilesAddress.text == "logs" or subDomainSelectedFilesAddress.text == "public_html":
continue
else:
if subDomainSelectedFilesAddress.text == "test1.testlab.com":
action = ActionChains(driver)
action.double_click(subDomainSelectedFilesAddress).perform()
time.sleep(1)
for file in range(0, 10):
time.sleep(1)
selectedFile = driver.find_element(By.XPATH, "//table/tbody/tr[" + str(
file + 1) + "]/td[" + str(1) + "]")
if selectedFile.text == "5.html":
selectedFile.click()
editFile = driver.find_element(By.XPATH, "//a[#ng-click='showHTMLEditorModal()']")
editFile.click()
# addHtmlTag = WebDriverWait(driver, 20).until(
# EC.visibility_of_element_located((By.CLASS_NAME, "ace_content")))
# insertAnchorTag = driver.find_element(By.CLASS_NAME, "ace_content")
# insertAnchorTag.click()
#
time.sleep(2)
textinput = driver.find_element(By.CLASS_NAME, "ace_text-layer")
print(textinput.text)
gettingTextFromServer = textinput.text
Html_file = open("HTMLParsing.html", "w")
newHTMLFile = Html_file.write(gettingTextFromServer)
html = newHTMLFile
print(html)
# soup = Soup(html)
# bodyTag = soup.find('body')
# anchor = soup.new_tag('a')
# anchor['href'] = staticRandomPathList[0]
# bodyTag.insert(anchor)
Html_file.close()
# print(insertAnchorTag.text)
# mapHTMLTag = driver.find_element(By.ID, "id='htmlEditorContent'")
# mapHTMLTag.send_keys(htmlTag)
# addHtmlTag.send_keys(htmlTag)
filefound = 1
break
else:
continue
if filefound == 1:
break
folderfound = 1
break
else:
continue
print("Successfully Outside Loop")
I am attaching the picture so you would be able to see where I want to place that tag.
This is nodejs selenium code that works. May be you can reproduce in python. Don't forget to navigate to any page page before editing page html. Good luck.
async navigateToHomePage() {
logger.info(`inside navigateToHomePage`)
await driver.get(this.baseUrl)
}
async insertHtmlIntoDocumentBody(html) {
logger.info(`inside insertHtmlIntoDocumentBody`)
let htmlElement = driver.findElement(By.css('html'))
let headElement = driver.findElement(By.css('head'))
let bodyElement = driver.findElement(By.css('body'))
logger.info(`starting html edition`)
await driver.executeScript(`let div =
document.createElement('html');
div.innerHTML='${JSON.stringify(html)}';
arguments[0].removeChild(arguments[1]);
arguments[0].removeChild(arguments[2]);
arguments[0].appendChild(div)`, htmlElement, headElement,
bodyElement)
logger.info(`completed html edition`)
}
I have developed following code. Normally it works fine. But, sometimes while downloading the records (record()), it fails as server fails to respond or if internet is disconnected. To avoid breaking of code I used 'try-exceptions' but then, I can not break out from record() as well as outer function simultaneously. So, looking for solution. Code around 197th line doesn't help in continuing the code from outer while loop. Code is here:
import glob
import datetime
import cv2
import base64
from PIL import Image
from io import BytesIO
import time
import selenium
import self as self
from pytesseract import pytesseract
from selenium.webdriver.common.keys import Keys
import os
from selenium.webdriver.support import expected_conditions as EC, expected_conditions
from selenium.webdriver.support.select import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException, TimeoutException, StaleElementReferenceException, \
WebDriverException, ElementNotInteractableException, UnexpectedAlertPresentException
main_Directory = r'/home/sangharshmanuski/Documents/e_courts/mha/downloads4'
log_Directory = r'/home/sangharshmanuski/Documents/e_courts/mha/log'
driver = selenium.webdriver.Firefox()
url = r'https://districts.ecourts.gov.in/'
driver.get(url)
# create wait time variable for regular, short and mid
wait = WebDriverWait(driver, 180)
waitShort = WebDriverWait(driver, 3)
wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#sateist > option:nth-child(22)")))
select = Select(driver.find_element_by_css_selector('#sateist'))
options = select.options
select.select_by_visible_text('Maharashtra')
wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.region')))
wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#sateist')))
districtListDropdown = Select(driver.find_element_by_css_selector("#sateist"))
distOptions = districtListDropdown.options
# iterate over each district
i = 1
while i < len(distOptions):
try:
wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#sateist')))
newDistDropDown = Select(driver.find_element_by_css_selector("#sateist"))
except:
continue
newDistOptions = newDistDropDown.options
distName = newDistOptions[i].text
print(distName)
newDistDropDown.select_by_index(i)
# for creating directory as per each district.
district_directory = os.path.join(
main_Directory, distName) # create new
if not os.path.exists(district_directory): # if not directory exists, create one
os.mkdir(district_directory)
district_log_directory = os.path.join(log_Directory, distName)
if not os.path.exists(district_log_directory): # if not directory exists, create one
os.mkdir(district_log_directory)
headingDist = driver.find_element_by_css_selector('.heading')
if headingDist.text.lower() == distName.lower():
wait.until(
EC.element_to_be_clickable((By.CSS_SELECTOR, 'button.accordion2:nth-child(2)'))).click()
current = driver.window_handles[0]
wait.until(EC.element_to_be_clickable(
(By.CSS_SELECTOR,
'div.panel:nth-child(3) > ul:nth-child(1) > li:nth-child(6) > a:nth-child(1)'))).click()
# wait until new tab opens.
wait.until(EC.number_of_windows_to_be(2))
# define new tab by differentiating from current tab.
newWindow = [window for window in driver.window_handles if window != current][0]
# switch to the new tab. ref: https://stackoverflow.com/questions/41571217/python-3-5-selenium-how-to-handle-a-new-window-and-wait-until-it-is-fully-lo
driver.switch_to.window(newWindow)
# wait till court complex list appears.
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#court_complex_code')))
# create list of all court complex.
# 2 approaches - 1 select 2 click.
time.sleep(3)
def complex_and_act():
this = driver.current_window_handle
def imgtotxt():
elem = driver.find_element_by_id("captcha_image")
loc = elem.location
size = elem.size
left = loc['x']
top = loc['y']
width = size['width']
height = size['height']
box = (int(left), int(top), int(left + width), int(top + height))
screenshot = driver.get_screenshot_as_base64()
img = Image.open(BytesIO(base64.b64decode(screenshot)))
area = img.crop(box)
area.save('/home/sangharshmanuski/Documents/e_courts/captcha/file_trial.png', 'PNG')
fullPath = r'/home/sangharshmanuski/Documents/e_courts/captcha'
f = os.listdir(fullPath)
desPath = r"/home/sangharshmanuski/Documents/e_courts/editC"
img = cv2.imread(os.path.join(fullPath, 'file_trial.png'), 0)
ret, thresh1 = cv2.threshold(img, 111, 255, cv2.THRESH_BINARY)
cv2.imwrite('/home/sangharshmanuski/Documents/e_courts/editC/oneDisNoLoop.png', thresh1)
# know the text with pytesseract
captchaText = pytesseract.image_to_string(
Image.open('/home/sangharshmanuski/Documents/e_courts/editC/oneDisNoLoop.png'))
captcha = driver.find_element_by_id('captcha')
captcha.send_keys(captchaText)
driver.find_element_by_css_selector('input.button:nth-child(1)').click()
time.sleep(1)
def proceed():
while True:
try:
waitShort.until(EC.alert_is_present())
driver.switch_to.alert.accept()
driver.switch_to.window(this)
driver.find_element_by_css_selector(
'#captcha_container_2 > div:nth-child('
'1) > div:nth-child(1) > span:nth-child(3) > a:nth-child(7) > img:nth-child(1)').click()
log_file = open(os.path.join(log_Directory, nameCourtComp + '.txt'), 'a')
log_file.write('alert was present' + '\n')
print('alert was present')
imgtotxt()
except:
# if the waitmsg is on, wait for 5 sec
log_file = open(os.path.join(log_Directory, nameCourtComp + '.txt'), 'a')
log_file.write('no alert' + '\n')
print('no alert')
waitmsg = 0
while driver.find_element_by_css_selector('#waitmsg').is_displayed():
if waitmsg < 7:
log_file = open(
os.path.join(log_Directory, nameCourtComp + '.txt'), 'a')
log_file.write('wait' + '\n')
print('waitmsg')
time.sleep(1)
waitmsg += 1
else:
log_file = open(os.path.join(
log_Directory, nameCourtComp + '.txt'), 'a')
log_file.write('waiting finished' + '\n')
print('waiting finished')
break
invalidCaptcha = "Invalid Captcha"
norecord = "Record Not Found"
try:
waitShort.until(
EC.presence_of_element_located((By.CSS_SELECTOR, '#errSpan > p:nth-child(1)')))
incorrect = driver.find_element_by_css_selector('#errSpan > p:nth-child(1)').text
if incorrect == invalidCaptcha:
log_file = open(
os.path.join(log_Directory, nameCourtComp + '.txt'), 'a')
log_file.write('Invalid Captcha' + '\n')
print('invalid captcha')
imgtotxt()
continue
else:
if incorrect == norecord:
log_file = open(
os.path.join(log_Directory, nameCourtComp + '.txt'), 'a')
log_file.write('Record not Found' + '\n')
return print('record not found')
except:
pass
def record():
log_file = open(
os.path.join(log_Directory, nameCourtComp + '.txt'), 'a')
log_file.write('Record Found' + '\n')
print('record fun started')
wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'a.someclass')))
listAllView = driver.find_elements_by_css_selector(
'a.someclass')
# make new dirctory by name of Court Complex
distDir2 = os.path.join(
main_Directory, distName, nameCourtComp)
if not os.path.exists(distDir2):
os.makedirs(distDir2)
x = 0
for view in listAllView:
try:
view.click()
wait.until(EC.presence_of_element_located((By.ID, 'back_top')))
openFile = open(
os.path.join(distDir2, "file_" + str(x) + ".html"), "w")
openFile.write(driver.page_source)
openFile.close()
back = driver.find_element_by_id('back_top')
back.click()
x += 1
except (TimeoutException, ElementNotInteractableException):
driver.refresh()
log_file = open(
os.path.join(log_Directory, nameCourtComp + '.txt'), 'a')
log_file.write(
'While Downloading record for '
+ nameCourtComp + ' error occured, retrying now...' + '\n')
nonlocal courtComp
courtComp -= 1
return print(
'While Downloading record for '
+ nameCourtComp + ' error occured, retrying now...')
log_file = open(
os.path.join(log_Directory, nameCourtComp + '.txt'), 'a')
log_file.write('record completed, ' + str(x) + ' records found' + '\n')
print('record completed, ' + str(x) + ' records found')
return
record()
return
courtComp = 1
courtComplexDownload = Select(
driver.find_element_by_css_selector('#court_complex_code'))
courtComplexDownloadList = courtComplexDownload.options
courtComplexLen = len(courtComplexDownloadList)
while courtComp < courtComplexLen:
nameCourtComp = courtComplexDownloadList[courtComp].text
log_file = open(os.path.join(log_Directory, nameCourtComp + '.txt'), 'w')
log_file.write(nameCourtComp + '\n' + '\n')
print(nameCourtComp)
courtComplexDownload.select_by_index(courtComp)
acts = Select(driver.find_element_by_css_selector('#actcode'))
actsOpt = acts.options
act = 0
while len(actsOpt) < 2:
if act < 10:
time.sleep(1)
act += 1
else:
#if there is no list to populate break out of this loop & go to next complex
raise Exception()
try:
acts.select_by_value('33')
except NoSuchElementException:
print('PoA not applicable')
log_file = open(
os.path.join(log_Directory, nameCourtComp + '.txt'), 'a')
log_file.write('No PoA' + '\n')
courtComp += 1
continue
imgtotxt()
proceed()
courtComp += 1
complex_and_act()
driver.close()
print("all court complexes in " + distName + " completed")
driver.switch_to.window(current)
driver.back()
else:
time.sleep(5)
continue
i += 1
wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#sateist > option:nth-child(22)")))
select = Select(driver.find_element_by_css_selector('#sateist'))
options = select.options
select.select_by_visible_text('Maharashtra')
wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.region')))
wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#sateist')))
This is exactly what exceptions are for. Whenever you raise an exception, it's going to go to the containing scope, and if that scope doesn't catch it, it'll stop whatever it's doing and the exception will go up to the next scope, et cetera. It's a very easy way to make sure that if something goes wrong, you can break out of multiple levels of loop or multiple levels of function call without having to have each level check a return value.
When you swallow an exception, as you're doing here:
except (TimeoutException, ElementNotInteractableException):
...
return print(
'While Downloading record for '
+ nameCourtComp + ' error occured, retrying now...')
you're thwarting that process. (Note that you're returning None, so the caller gets no information whatsoever!) Instead, maybe do something like:
except (TimeoutException, ElementNotInteractableException):
...
print(
'While Downloading record for '
+ nameCourtComp + ' error occured, retrying now...')
raise
The raise will just re-raise the same exception up to the next level, where they can do their own handling. Alternatively, you could create your own exception (maybe with more information) and raise that instead; it's completely valid to catch an exception and then raise a different type of exception as a way of translating it for the caller.
I try save screenshot on test failure in python with 'splinter'
1) This code works for Selenium:
# #pytest.fixture(scope="function")
# def browser(request):
# options = Options()
# options.add_argument("--headless")
# options.add_argument("--start-maximized")
# # browser = webdriver.Chrome(ChromeDriverManager().install())
# browser = webdriver.Chrome(options=options)
# browser.implicitly_wait(5)
# failed_before = request.session.testsfailed
# yield browser
# if request.session.testsfailed != failed_before:
# test_name = request.node.name
# take_screenshot(browser, test_name)
# browser.quit()
#
# def take_screenshot(browser, test_name):
# screenshots_dir = "C:\\Users\Ark\\PycharmProjects\\Gop\\Reports"
# screenshot_file_path = "{}/{}.png".format(screenshots_dir, test_name)
# browser.save_screenshot(
# screenshot_file_path)
But doesn't works with Splinter (browser don't close and don't make screenshot):
#pytest.fixture(scope="function")
def browser(request):
options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")
browser = Browser("chrome", headless=False, incognito=True, options=options)
failed_before = request.session.testsfailed
yield browser
if request.session.testsfailed != failed_before:
test_name = request.node.name
take_screenshot(browser, test_name)
browser.quit()
def take_screenshot(browser, test_name):
screenshots_dir = "C:\\Users\Ark\\PycharmProjects\\Gop\\Reports"
screenshot_file_path = "{}/{}.png".format(screenshots_dir, test_name)
browser.save_screenshot(
screenshot_file_path)
print("\n!!! SCREENSHOT OF FAILURE '" + test_name + "' SAVED INTO: '" + screenshots_dir + "' WITH NAME '" + test_name + "'")
2) Or how do this function working? (pytest-splinter)
splinter_make_screenshot_on_failure
https://github.com/pytest-dev/pytest-splinter
Can you help?
i'm trying to get files name's in a ftp directory.acctualy my problem is that i get eatch time an empty string when i use a loop.in case that i run my program without a lopp i get the right files name's.
This is my program
class Watch:
def __init__(self):
self.m=""
def goh(self):
while True:
j = 0
ftp = FTP('')
ftp.connect('127.0.0.1', 1026)
ftp.login(user='user', passwd='12345')
ftp.cwd("/FTM/Simulateur/1.MPTC_ACK")
files = ftp.nlst()
while j < len(files):
timestamp = ftp.voidcmd("MDTM " + files[j])[4:].strip()
time = parser.parse(timestamp)
self.time_dic = str(time)
self.tab_file = files[j]
os.chdir("/Users/ouhejjouyou/Desktop/eleclink/Fichier_in/1.MPTC_ACK")
fhandle = open(files[j], 'wb')
ftp.retrbinary("RETR " + str(self.tab_file), fhandle.write)
fhandle.close()
ftp.delete(self.tab_file)
self.m = self.time_dic + " Reception du fichier " + self.tab_file + " réussi\n"
j = j + 1
a=Watch()
t = Thread(target=a.goh)
t.start()
print(a.m)
t.join()