Python Selenium Function In Separate File - NameError - python

I am building a Python script and want to split up certain functions into separate files to make maintenance easier.
I have two files currently called main.py and function1.py
main.py
#Setup Imports
import os
import os.path
import sys

# Import Functions
from function1 import myfunction

#Setup Selenium
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.keys import Keys

#Launch Firefox
def init_driver():
    driver = webdriver.Firefox()
    return driver

url_list = ['http://www.example.com/page1', 'http://www.example.com/contact', 'http://www.example.com/about', 'http://www.example.com/test'];

driver = init_driver()

# Init Blank List
checked_urls = []

for url in url_list:
    myfunction(driver)

print(checked_urls)
function1.py
def myfunction(driver):
    driver.get(url)
    htmlText = driver.find_element_by_css_selector("#phrase").text

    if "This Is My Phrase" in htmlText:
        checked_urls.extend(['PHRASE_FOUND'])
    else:
        checked_urls.extend(['PHRASE_FOUND'])
I am trying to get it to visit each URL in the list and check for the phrase "This Is My Phrase" on the page. If it finds the phrase, it should add an entry to the list.
I am seeing the following error when running the script...
NameError: name 'url' is not defined
I am pretty sure it's related to the way I am importing the separate function, but I can't work out what's wrong. Can anyone help?

You also have to pass the url variable to myfunction:
def myfunction(driver, url):
    driver.get(url)
    htmlText = driver.find_element_by_css_selector("#phrase").text

    if "This Is My Phrase" in htmlText:
        checked_urls.extend(['PHRASE_FOUND'])
    else:
        checked_urls.extend(['PHRASE_NOT_FOUND'])
Then in the main file:
for url in url_list:
    myfunction(driver, url)

I think some of the code should be corrected:
First, delete the blank space before url_list:
#   url_list = ['http://www.example.com/page1', 'http://www.example.com/contact', 'http://www.example.com/about', 'http://www.example.com/test'];
url_list = ['http://www.example.com/page1', 'http://www.example.com/contact', 'http://www.example.com/about', 'http://www.example.com/test'];
Then, url is a local variable in main.py; it is not directly accessible inside myfunction, but it can be passed in as a function parameter:
def myfunction(driver, url):
    ...
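Putting both answers together, here is a minimal sketch of the two files. Note that checked_urls is also only defined in main.py, so extending it inside function1.py would raise its own NameError; returning a value avoids that. The 'PHRASE_NOT_FOUND' label is an assumption about the intended else branch, since the original code appended 'PHRASE_FOUND' in both branches.
# function1.py
def myfunction(driver, url):
    driver.get(url)
    htmlText = driver.find_element_by_css_selector("#phrase").text
    # Return a status instead of mutating a list that lives in main.py
    if "This Is My Phrase" in htmlText:
        return 'PHRASE_FOUND'
    return 'PHRASE_NOT_FOUND'

# main.py (after the imports and init_driver shown above)
driver = init_driver()
checked_urls = []
for url in url_list:
    checked_urls.append(myfunction(driver, url))
print(checked_urls)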

Related

How to download a PDF from a URL in Python

Note: this is a very different problem from other SO answers (Selenium Webdriver: How to Download a PDF File with Python?) to similar questions.
This is because the URL https://webice.ongc.co.in/pay_adv?TRACKNO=8262# does not return the PDF directly; it makes several other calls, and one of them is the URL that returns the PDF file.
I want to call the URL with a variable for the query param TRACKNO and save the PDF file using Python.
I was able to do this using Selenium, but my code fails when the browser is used in headless mode, and I need it to work in headless mode. The code that I wrote is as follows:
import requests
from urllib3.exceptions import InsecureRequestWarning
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import time

def extract_url(driver):
    advice_requests = driver.execute_script("var performance = window.performance || window.mozPerformance || window.msPerformance || window.webkitPerformance || {}; var network = performance.getEntries() || {}; return network;")
    print(advice_requests)
    for request in advice_requests:
        if request.get('initiatorType', "") == 'object' and request.get('entryType', "") == 'resource':
            link_split = request['name'].split('-')
            if link_split[-1] == 'filedownload=X':
                print("..... Successful")
                return request['name']
    print("..... Failed")

def save_advice(advice_url, tracking_num):
    requests.packages.urllib3.disable_warnings(category=InsecureRequestWarning)
    response = requests.get(advice_url, verify=False)
    with open(f'{tracking_num}.pdf', 'wb') as f:
        f.write(response.content)

def get_payment_advice(tracking_nums):
    options = webdriver.ChromeOptions()
    # options.add_argument('headless')  # DOES NOT WORK IN HEADLESS MODE SO COMMENTED OUT
    driver = webdriver.Chrome(options=options)
    for num in tracking_nums:
        print(num, end=" ")
        driver.get(f'https://webice.ongc.co.in/pay_adv?TRACKNO={num}#')
        try:
            WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, 'ls-highlight-domref')))
            time.sleep(0.1)
            advice_url = extract_url(driver)
            save_advice(advice_url, num)
        except:
            pass
    driver.quit()

get_payment_advice(['8262'])
As can be seen, I get all the network calls the browser makes in the first line of the extract_url function and then parse each request to find the correct one. However, this does not work in headless mode.
Is there any other way of doing this, as this seems like a workaround? If not, can this be fixed to work in headless mode?
I fixed it; I only changed one function. The correct URL is in the driver's page_source (with BeautifulSoup you can parse HTML, XML, etc.):
from bs4 import BeautifulSoup

def extract_url(driver):
    soup = BeautifulSoup(driver.page_source, "html.parser")
    object_element = soup.find("object")
    data = object_element.get("data")
    return f"https://webice.ongc.co.in{data}"
The hostname part can probably be extracted from the driver as well; see the sketch below.
I don't think I changed anything else, but if it doesn't work for you, I can paste the full code.
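A minimal sketch of how the hostname could be derived from the driver instead of being hard-coded (this is my addition, not part of the original answer):
from urllib.parse import urlsplit
from bs4 import BeautifulSoup

def extract_url(driver):
    soup = BeautifulSoup(driver.page_source, "html.parser")
    data = soup.find("object").get("data")
    # Reuse the scheme and host of the page the driver is currently on
    parts = urlsplit(driver.current_url)
    return f"{parts.scheme}://{parts.netloc}{data}"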
Old Answer:
If you print the text of the returned page (print(driver.page_source)), I think you would get a message that says something like:
"Because of your system configuration the pdf can't be loaded"
This is because the requested site checks some preferences to decide whether you are a robot. Maybe it helps to change some arguments (screen size, user agent) to fix this; here is some information about how to detect a headless browser.
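As a rough sketch (assuming Chrome; the exact values are illustrative, not from the original answer), option tweaks like these are what is meant:
options = webdriver.ChromeOptions()
options.add_argument('--headless')
# Give the headless browser a realistic screen size
options.add_argument('--window-size=1920,1080')
# Replace the default "HeadlessChrome" user agent with a regular desktop one
options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36')
driver = webdriver.Chrome(options=options)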
And next time, you should paste all relevant code (including imports) into the question to make it easier to test.

Driver.get a group of links?

How do I use driver.get to open several URLs in Chrome?
My code:
import requests
import json
import pandas as pd
from selenium import webdriver

chromeOptions = webdriver.ChromeOptions()
chromedriver = r"C:\Users\Harrison Pollock\Downloads\Python\chromedriver_win32\chromedriver.exe"
driver = webdriver.Chrome(executable_path=r"C:\Users\Harrison Pollock\Downloads\Python\chromedriver_win32\chromedriver.exe", chrome_options=chromeOptions)

links = []
request1 = requests.get('https://api.beta.tab.com.au/v1/recommendation-service/featured-events?jurisdiction=NSW')
json1 = request1.json()
for n in json1['nextToGoRaces']:
    if n['meeting']['location'] in ['VIC','NSW','QLD','SA','WA','TAS','IRL']:
        links.append(n['_links']['self'])
driver.get('links')
Based on the comments - you'll want a class to manage your browsers, a class for your tests, then a runner to run in parallel.
Try this:
import unittest
import time
import testtools
from selenium import webdriver

class BrowserManager:
    browsers = []

    def createBrowser(self, url):
        browser = webdriver.Chrome()
        browser.get(url)
        self.browsers.append(browser)

    def getBrowserByPartialURL(self, url):
        for browser in self.browsers:
            if url in browser.current_url:
                return browser

    def CloseItAllDown(self):
        for browser in self.browsers:
            browser.close()

class UnitTest1(unittest.TestCase):
    def test_DoStuffOnGoogle(self):
        browser = b.getBrowserByPartialURL("google")
        # Point of this is to watch the output! You'll see this + the other test intermingled (proves parallel run)
        for i in range(10):
            print(browser.current_url)
            time.sleep(1)

    def test_DoStuffOnYahoo(self):
        browser = b.getBrowserByPartialURL("yahoo")
        # Point of this is to watch the output! You'll see this + the other test intermingled (proves parallel run)
        for i in range(10):
            print(browser.current_url)
            time.sleep(1)

# Create a global variable for the browsers
b = BrowserManager()

# To run the tests
if __name__ == "__main__":
    ## Move to an init to create your browsers
    b.createBrowser("https://www.google.com")
    b.createBrowser("https://www.yahoo.com")
    time.sleep(5)  # This is so you can see both open at the same time
    suite = unittest.TestLoader().loadTestsFromTestCase(UnitTest1)
    concurrent_suite = testtools.ConcurrentStreamTestSuite(lambda: ((case, None) for case in suite))
    concurrent_suite.run(testtools.StreamResult())
This code doesn't do anything exciting - it's an example of how to manage multiple browsers and run tests in parallel. It goes to the specified URLs (which you should move to an init/setup), then prints out the URL it's on 10 times.
This is how you add a browser to the manager: b.createBrowser("https://www.google.com")
This is how you retrieve your browser: browser = b.getBrowserByPartialURL("google") - note it's a partial URL so you can use the domain as a keyword.
This is the output (just the first few lines, not all of it). It prints the URL for Google, then Yahoo, then Google, then Yahoo, showing that they're running at the same time:
PS C:\Git\PythonSelenium\BrowserManager> cd 'c:\Git\PythonSelenium'; & 'C:\Python38\python.exe' 'c:\Users\User\.vscode\extensions\ms-python.python-2020.7.96456\pythonFiles\lib\python\debugpy\launcher' '62426' '--' 'c:\Git\PythonSelenium\BrowserManager\BrowserManager.py'
DevTools listening on ws://127.0.0.1:62436/devtools/browser/7260dee3-368c-4f21-bd59-2932f3122b2e
DevTools listening on ws://127.0.0.1:62463/devtools/browser/9a7ce919-23bd-4fee-b302-8d7481c4afcd
https://www.google.com/
https://consent.yahoo.com/collectConsent?sessionId=3_cc-session_d548b656-8315-4eef-bb1d-82fd4c6469f8&lang=en-GB&inline=false
https://www.google.com/
https://consent.yahoo.com/collectConsent?sessionId=3_cc-session_d548b656-8315-4eef-bb1d-82fd4c6469f8&lang=en-GB&inline=false
https://www.google.com/

How to download website when URL doesn't change after data addition

I would like to download data from the http://ec.europa.eu/taxation_customs/vies/ site. The issue is that when I enter data on it through the program, the URL doesn't change, so the file saved to disk contains the same page that was opened at the beginning, without the data. Maybe I don't know how to access this site after adding data? I'm new to Python and tried to look for a solution, but with no result, so if there was such an issue, please link me. Here's my code. I appreciate all responses :)
import requests
import selenium
import select as something
from selenium import webdriver
from selenium.webdriver.support.ui import Select
import pdfkit
url = "http://ec.europa.eu/taxation_customs/vies/?locale=pl"
driver = webdriver.Chrome(executable_path="C:\\Users\\Python\\Chromedriver.exe")
driver.get("http://ec.europa.eu/taxation_customs/vies/")
#wait = WebDriverWait(driver, 10)
obj = Select(driver.find_element_by_id("countryCombobox"))
obj = obj.select_by_index(1)
vies_r = requests.get(url)
vies_vat = driver.find_element_by_id("number")
vies_vat.send_keys('U54799909')
vies_verify = driver.find_element_by_id("submit")
vies_verify.click()
path_wkhtmltopdf = r'C:\Users\Python\wkhtmltox\wkhtmltox\bin\wkhtmltopdf.exe'
config = pdfkit.configuration(wkhtmltopdf=path_wkhtmltopdf)
print(driver.current_url)
pdfkit.from_url(driver.current_url, "out.pdf", configuration=config)
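One likely fix (a sketch, not an answer from the original thread): pdfkit.from_url re-downloads the URL from scratch, losing the form state rendered in the browser, so rendering the driver's current page_source instead should capture the page after submission:
# Render the HTML the browser is actually showing, instead of re-fetching the URL
html_after_submit = driver.page_source
pdfkit.from_string(html_after_submit, "out.pdf", configuration=config)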

Selenium loop page refreshed in python

I have some questions about looping with Selenium in Python. I want to iterate over a list of links found by driver.find_elements_by_id and click on them one by one, but each time I click a link ('linklist' in the code), the page is refreshed, so there is an error message indicating that
'Message: The element reference is stale. Either the element is no longer attached to the DOM or the page has been refreshed.'
I know the reason is that the list of links disappears after the click. But how, in general, can I iterate over the list in Selenium even though the page no longer exists? I used driver.back() and apparently it doesn't work.
The error message pops up after this line in the code:
link.click()
The linklist is located at this URL (I want to click on the Documents button and then download the first file after the refreshed page is displayed): 'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=0001467373&type=10-K&dateb=20101231&owner=exclude&count=40'
Can someone have a look at this problem?
Thank you!
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
import unittest
import os
import time
from bs4 import BeautifulSoup
from selenium.webdriver.common.keys import Keys
import requests
import html2text

class LoginTest(unittest.TestCase):
    def setUp(self):
        self.driver = webdriver.Firefox()
        self.driver.get("https://www.sec.gov/edgar/searchedgar/companysearch.html")

    def test_Login(self):
        driver = self.driver
        cikID = "cik"
        searchButtonID = "cik_find"
        typeID = "//*[@id='type']"
        priorID = "prior_to"
        cik = "00001467373"
        Type = "10-K"
        prior = "20101231"
        search2button = "//*[@id='contentDiv']/div[2]/form/table/tbody/tr/td[6]/input[1]"
        documentsbuttonid = "documentsbutton"
        formbuttonxpath = '//a[text()="d10k.htm"]'
        cikElement = WebDriverWait(driver, 30).until(lambda driver: driver.find_element_by_id(cikID))
        cikElement.clear()
        cikElement.send_keys(cik)
        searchButtonElement = WebDriverWait(driver, 20).until(lambda driver: driver.find_element_by_id(searchButtonID))
        searchButtonElement.click()
        typeElement = WebDriverWait(driver, 30).until(lambda driver: driver.find_element_by_xpath(typeID))
        typeElement.clear()
        typeElement.send_keys(Type)
        priorElement = WebDriverWait(driver, 30).until(lambda driver: driver.find_element_by_id(priorID))
        priorElement.clear()
        priorElement.send_keys(prior)
        search2Element = WebDriverWait(driver, 30).until(lambda driver: driver.find_element_by_xpath(search2button))
        search2Element.send_keys(Keys.SPACE)
        time.sleep(1)
        documentsButtonElement = WebDriverWait(driver, 20).until(lambda driver: driver.find_element_by_id(documentsbuttonid))
        a = driver.current_url
        window_be1 = driver.window_handles[0]
        linklist = driver.find_elements_by_id(documentsbuttonid)
        with open("D:/doc2/" + "a" + ".txt", mode="w", errors="ignore") as newfile:
            for link in linklist:
                link.click()
                formElement = WebDriverWait(driver, 30).until(lambda driver: driver.find_element_by_xpath(formbuttonxpath))
                formElement.click()
                time.sleep(1)
                t = driver.current_url
                r = requests.get(t)
                data = r.text
                newfile.write(html2text.html2text(data))
                driver.back()
                driver.back()

    def tearDown(self):
        self.driver.quit()

if __name__ == '__main__':
    unittest.main()
You should not use a list of web elements, but a list of links. Try something like this:
linklist = []
for link in driver.find_elements_by_xpath('//h4[@class="title"]/a'):
    linklist.append(link.get_attribute('href'))
And then you can iterate through the list of links:
for link in linklist:
    driver.get(link)
    # do some actions on page
If you want to physically click on each link, you might need to use:
for link in linklist:
    driver.find_element_by_xpath('//h4[@class="title"]/a[@href="%s"]' % link).click()
    # do some actions on page
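An alternative sketch (my addition, not from the original answer) that re-finds the buttons on every pass, so the references are never stale; it reuses the documentsbutton id from the question:
num_links = len(driver.find_elements_by_id("documentsbutton"))
for i in range(num_links):
    # Re-locate the list after every navigation so the element is fresh
    driver.find_elements_by_id("documentsbutton")[i].click()
    # ... do some actions on the page ...
    driver.back()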

NameError: name 'driver' is not defined

I use the code below to scrape a friend list from a Facebook UID and am getting an error:
File "C:\Users\Tn\PycharmProjects\untitled\test\1.py", line 15, in friend_uid_list
soup = from_uid(uid)
File "C:\Users\Tn\PycharmProjects\untitled\test\1.py", line 11, in from_uid
driver.get('https://www.facebook.com/' + uid + '/friends')
NameError: name 'driver' is not defined
"""
Can you show me how to fix it? Thank you very much! Below is my code:
import multiprocessing
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By

def from_uid(uid):
    driver.get('https://www.facebook.com/' + uid + '/friends')
    return BeautifulSoup(driver.page_source, "html5lib")

def friend_uid_list(uid):
    soup = from_uid(uid)
    friends = soup.find_all("div", class_="fsl fwb fcb")
    target = open('C:/friend_uid_list.txt', 'a')
    for href in friends:
        href = href.find('a')
        try:
            target.write(href + "\n")
        except:
            pass
    target.close()

if __name__ == '__main__':
    driver = webdriver.Firefox()
    driver.get("https://www.facebook.com/")
    driver.find_element_by_css_selector("#email").send_keys("myemail@gmail.com")
    driver.find_element_by_css_selector("#pass").send_keys("mypass")
    driver.find_element_by_css_selector("#u_0_m").click()
    pool = multiprocessing.Pool(3)
    pool.map(friend_uid_list, [100004159542140, 100004159542140, 100004159542140])
The reason is simple: you create new processes, and they can't see the variables in the main process.
There are several solutions:
1. Pass the variables you need as arguments. But this is not possible here, since driver is not picklable.
2. Create a new driver for each process (see the sketch below).
3. Use multithreading instead of multiprocessing. However, I'm not sure whether Selenium works this way; you'll have to test it yourself.
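A minimal sketch of option 2 (my addition; the initializer pattern is an assumption about how you'd wire it up): give each worker process its own driver via the pool's initializer, so no driver ever needs to be pickled.
import multiprocessing
from selenium import webdriver

driver = None  # one driver per worker process

def init_worker():
    # Runs once in each worker process; do the Facebook login here as well if needed
    global driver
    driver = webdriver.Firefox()

def friend_uid_list(uid):
    driver.get('https://www.facebook.com/' + str(uid) + '/friends')
    # ... parse driver.page_source as before ...

if __name__ == '__main__':
    pool = multiprocessing.Pool(3, initializer=init_worker)
    pool.map(friend_uid_list, [100004159542140, 100004159542140, 100004159542140])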
