Can't attach to a detached Selenium window in Python

I can't send commands to the Selenium WebDriver in a detached session because the link http://localhost:port has died.
But if I pause at breakpoint 1, the link stays alive:
import multiprocessing
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

def create_driver_pool(q):
    options = Options()
    driver = webdriver.Chrome(options=options)
    pass  # breakpoint 1
    return driver.command_executor._url

windows_pool = multiprocessing.Pool(processes=1)
result = windows_pool.map(create_driver_pool, [1])
print(result)
pass  # breakpoint 2 for testing link
Why is this happening, and what can I do about it?

After some research I finally found the reason for this behavior.
Thanks to https://bentyeh.github.io/blog/20190527_Python-multiprocessing.html and some googling about signals.
It turns out this is not about signals at all.
I found this code in selenium.webdriver.common.service:
def __del__(self):
    print("del detected")
    # `subprocess.Popen` doesn't send signal on `__del__`;
    # so we attempt to close the launched process when `__del__`
    # is triggered.
    try:
        self.stop()
    except Exception:
        pass
This is a hook for the garbage collector: when the Service object is collected, stop() kills the driver subprocess via SIGTERM:
self.process.terminate()
self.process.wait()
self.process.kill()
self.process = None
But if you are paused in debug mode at a breakpoint, the garbage collector has not collected this object yet, so __del__ doesn't run and the link stays alive.
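If you need the browser to outlive the pool call, one workaround is to keep the worker process, and with it the driver reference, alive so the Service object is never garbage-collected and __del__ never kills chromedriver. A minimal sketch, assuming a Pipe for the hand-off (this wiring is illustrative, not the original code):

import multiprocessing
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

def driver_worker(conn):
    # The driver (and its Service) stays referenced for the life of this
    # process, so __del__ is never triggered while we are blocked below.
    driver = webdriver.Chrome(options=Options())
    conn.send((driver.command_executor._url, driver.session_id))
    conn.recv()          # block until the parent tells us to shut down
    driver.quit()

if __name__ == '__main__':
    parent_conn, child_conn = multiprocessing.Pipe()
    proc = multiprocessing.Process(target=driver_worker, args=(child_conn,))
    proc.start()
    url, session_id = parent_conn.recv()
    print(url, session_id)    # the executor URL stays reachable here
    parent_conn.send('quit')  # release the worker; it quits the browser cleanly
    proc.join()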

Related

Python - Selenium - Single thread to multiple threads

I have an automation project built with Python and Selenium which I'm trying to make run with multiple browsers in parallel.
The current workflow:
open a browser for manual login
save cookies for later use
in a loop, open additional browsers, load the saved session in each newly opened browser
The described workflow opens the browsers one by one until all required browsers are open.
My code contains several classes: Browser and Ui.
The object instantiated with Ui class contains a method which at some point executes the following code:
for asset in Inventory.assets:
    self.browsers[asset] = ui.Browser()
    # self.__open_window(asset)  # if it is uncommented, the code is working properly without multi threading part; all the browsers are opened one by one

# try 1
# threads = []
# for asset in Inventory.assets:
#     threads.append(Thread(target=self.__open_window, args=(asset,), name=asset))
# for thread in threads:
#     thread.start()

# try 2
# with concurrent.futures.ThreadPoolExecutor() as executor:
#     futures = []
#     for asset in Inventory.assets:
#         futures.append(executor.submit(self.__open_window, asset=asset))
#     for future in concurrent.futures.as_completed(futures):
#         print(future.result())
The problem appears when self.__open_window is executed within a thread. There I get a Selenium-related error, something like 'NoneType' object has no attribute 'get', when self.driver.get(url) is called from the Browser class.
def __open_window(self, asset):
    self.interface = self.browsers[asset]
    self.interface.open_browser()
In class Browser:
def open_browser(self, driver_path=""):
    # ...
    options = webdriver.ChromeOptions()
    # ...
    web_driver = webdriver.Chrome(executable_path=driver_path, options=options)
    self.driver = web_driver
    self.opened_tabs["default"] = web_driver.current_window_handle
    # ...

def get_url(self, url):
    try:
        self.driver.get(url)  # this line cause problems ...
    except Exception as e:
        print(e)
My questions are:
Why do I have this issue in a multi-threaded environment?
What should I do to make the code work properly?
Thank you.
I found the mistake: it was caused by a wrong object reference (most likely because self.interface was a single attribute shared by all threads, so the threads overwrote each other's Browser instance).
After the modification the code works well.
I updated the following lines in __open_window:
def __open_window(self, asset, browser):
    browser.interface = self.browsers[asset]
    browser.interface.open_browser()
and in the # try 1 code section:
threads.append(Thread(target=self.__open_window, args=(asset, browser, ), name=asset))
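For completeness, here is a sketch of how the corrected # try 1 section could be wired together; the per-asset browser variable and the join calls are assumptions, not the poster's original code:

threads = []
for asset in Inventory.assets:
    browser = self.browsers[asset] = ui.Browser()  # one Browser object per asset
    threads.append(Thread(target=self.__open_window, args=(asset, browser), name=asset))
for thread in threads:
    thread.start()
for thread in threads:
    thread.join()  # wait until every browser has been opened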

Using WebDriver to take screenshots with an always-on process

I have a service that takes screenshots of a given URL using the Selenium WebDriver.
It works OK: it spawns a process -> takes the screenshot -> closes the process.
The problem is that it takes too long to return.
Is there a way to keep the WebDriver process always on, waiting for requests?
Here is my code:
class WebDriver(webdriver.Chrome):
    def __init__(self, *args, **kwargs):
        logger.info('Start WebDriver instance.')
        self.start_time = datetime.now()
        self.lock = threading.Lock()
        kwargs['chrome_options'] = self.get_chrome_options()
        super().__init__(*args, **kwargs)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        logger.info(f'Quiting Webdriver instance {id(self)}, took {datetime.now() - self.start_time}')
        self.quit()

    @staticmethod
    def get_chrome_options():
        chrome_options = ChromeOptions()
        chrome_options.headless = True
        chrome_options.add_argument('--start-maximized')
        chrome_options.add_argument("--no-sandbox")  # Bypass OS security model
        chrome_options.add_argument('--disable-dev-shm-usage')  # overcome limited resource problems
        chrome_options.add_argument("--lang=en")
        chrome_options.add_argument("--disable-infobars")  # disabling infobars
        chrome_options.add_argument("--disable-extensions")  # disabling extensions
        chrome_options.add_argument("--hide-scrollbars")
        return chrome_options

    def capture_screenshot_from_html_string(self, html_str, window_size):
        with tempfile.TemporaryDirectory() as tmpdirname:
            html_filename = tmpdirname + '/template.html'
            with open(html_filename, 'w') as f:
                f.write(html_str)
            url = 'file://' + html_filename
            img_str = self.capture_screenshot(url, window_size)
        return img_str

    def capture_screenshot(self, url, window_size):
        self.lock.acquire()
        try:
            self.set_window_size(*window_size)
            self.get(url)
            self.maximize_window()
            self.set_page_load_timeout(PAGE_LOAD_TIMEOUT)
            img_str = self.get_screenshot_as_png()
        except Exception as exc:
            logger.error(f'Error capturing screenshot url: {url}; {exc}')
            img_str = None
        finally:
            self.lock.release()
        return img_str
After some research I found a solution, and I'm posting it here in case it helps others with a similar problem: use the py-object-pool library.
An object pool library creates a pool of resource-class instances and reuses them in your project; the pool is implemented with Python's built-in Queue.
Creating a new browser instance for every request is a time-consuming task that makes the client wait. If you instead keep a single browser instance and juggle browser tabs, it becomes cumbersome to maintain and debug when an issue arises.
An object pool helps in this situation: it creates a pool of resources and hands one to each client on request, keeping clients separate from one another without waiting and without creating a new instance on the spot.
Code example:
ff_browser_pool = ObjectPool(FirefoxBrowser, min_init=2)
with ff_browser_pool.get() as (browser, browser_stats):
    title = browser.get_page_title('https://www.google.co.in/')
For more information, see the link below:
https://pypi.org/project/py-object-pool/
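The FirefoxBrowser resource class is not shown above. A hypothetical sketch of what it might look like follows; check the py-object-pool documentation for the exact resource interface the library expects, since the method names here are only assumptions:

from selenium import webdriver

class FirefoxBrowser:
    """Hypothetical pooled resource wrapping a headless Firefox driver."""
    def __init__(self):
        options = webdriver.FirefoxOptions()
        options.add_argument('--headless')
        self.driver = webdriver.Firefox(options=options)

    def get_page_title(self, url):
        self.driver.get(url)
        return self.driver.title

    def close(self):
        self.driver.quit()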

Closing Selenium Browser that was Opened in a Child Process

Here's the situation:
I create a child process which opens and deals with a webdriver. The child process is finicky and might error, in which case it would close immediately, and control would be returned to the main function. In this situation, however, the browser would still be open (as the child process never completely finished running). How can I close a browser that is initialized in a child process?
Approaches I've tried so far:
1) Initializing the webdriver in the main function and passing it to the child process as an argument.
2) Passing the webdriver between the child and parent process using a queue.
The code:
import multiprocessing
from selenium import webdriver

def foo(queue):
    driver = webdriver.Chrome()
    queue.put(driver)
    # Do some other stuff
    # If finicky stuff happens, this driver.close() will not run
    driver.close()

if __name__ == '__main__':
    queue = multiprocessing.Queue()
    p = multiprocessing.Process(target=foo, name='foo', args=(queue,))
    p.start()
    p.join()  # Wait for process to finish
    # Try to close the browser if still open
    try:
        driver = queue.get()
        driver.close()
    except:
        pass
I found a solution:
In foo(), get the process ID of the WebDriver when you open a new browser and add that process ID to the queue. Then in the main function, add time.sleep(60) to wait for a minute, get the process ID from the queue, and use a try/except to try to kill that particular process ID.
If foo(), running in a separate process, hangs, the browser will still be closed in the main function after one minute.
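A minimal sketch of that approach; the names and the fixed 60-second wait are illustrative, not the original code:

import multiprocessing
import os
import signal
import time
from selenium import webdriver

def foo(queue):
    driver = webdriver.Chrome()
    queue.put(driver.service.process.pid)  # hand the chromedriver PID to the parent
    # ... finicky work that might crash before driver.quit() runs ...
    driver.quit()

if __name__ == '__main__':
    queue = multiprocessing.Queue()
    p = multiprocessing.Process(target=foo, name='foo', args=(queue,))
    p.start()
    time.sleep(60)                      # give the child a minute to finish
    try:
        pid = queue.get_nowait()
        os.kill(pid, signal.SIGTERM)    # kill chromedriver if it is still running
    except Exception:
        pass                            # queue empty or process already gone
    p.terminate()                       # make sure a hung child itself goes away
    p.join()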

Selenium webdriver + PhantomJS processes not closing

Here's just about the simplest open and close you can do with webdriver and phantom:
from selenium import webdriver
crawler = webdriver.PhantomJS()
crawler.set_window_size(1024,768)
crawler.get('https://www.google.com/')
crawler.quit()
On Windows 7, every time I run my code to test something out, new instances of the conhost.exe and phantomjs.exe processes start and never quit. Am I doing something stupid here? I figured the processes would quit when crawler.quit() ran...
Go figure. Problem resolved with a reboot.
Rebooting is not a solution to this problem. I have tried this hack on a Linux system. Try modifying the stop() function defined in service.py:
def stop(self):
    """
    Cleans up the process
    """
    if self._log:
        self._log.close()
        self._log = None
    # If it's dead don't worry
    if self.process is None:
        return
    # Tell the server to properly die in case
    try:
        if self.process:
            self.process.stdin.close()
            # self.process.kill()
            self.process.send_signal(signal.SIGTERM)
            self.process.wait()
            self.process = None
    except OSError:
        # kill may not be available under windows environment
        pass
I added the send_signal line explicitly to tell the phantomjs process to quit. Don't forget to add an import signal statement at the start of the file.

Restart a process if running longer than x amount of minutes

I have a program that creates a multiprocessing pool to handle a web-extraction job. Essentially, a list of product IDs is fed into a pool of 10 processes that handle the queue. The code is pretty simple:
import multiprocessing
import time

num_procs = 10
products = ['92765937', '20284759', '92302047', '20385473', ...etc]

def worker():
    for workeritem in iter(q.get, None):
        time.sleep(10)
        get_product_data(workeritem)
        q.task_done()
    q.task_done()

q = multiprocessing.JoinableQueue()
procs = []
for i in range(num_procs):
    procs.append(multiprocessing.Process(target=worker))
    procs[-1].daemon = True
    procs[-1].start()

for product in products:
    time.sleep(10)
    q.put(product)

q.join()

for p in procs:
    q.put(None)

q.join()

for p in procs:
    p.join()
The get_product_data() function takes the product, opens an instance of Selenium, navigates to a site, logs in, collects the details of the product, and writes them to a CSV file. The problem is that randomly (literally: it happens at different points of the site navigation or extraction process) Selenium will stop doing whatever it's doing and just sit there, doing nothing. No exceptions are thrown or anything. I've done everything I can in the get_product_data() function to prevent this, but it seems to just be a problem with Selenium (I've tried using Firefox, PhantomJS, and Chrome as its driver, and still run into the same problem no matter what).
Essentially, a process should never run for longer than, say, 10 minutes. Is there any way to kill a process and restart it with the same product ID if it has been running longer than the specified time?
This is all running on a Debian Wheezy box with Python 2.7.
You could write your code using multiprocessing.Pool and the timeout() function suggested by @VooDooNOFX. Not tested; consider it executable pseudo-code:
#!/usr/bin/env python
import signal
from contextlib import closing
from multiprocessing import Pool

class Alarm(Exception):
    pass

def alarm_handler(*args):
    raise Alarm("timeout")

def mp_get_product_data(id, timeout=10, nretries=3):
    signal.signal(signal.SIGALRM, alarm_handler)  # XXX could move it to initializer
    for i in range(nretries):
        signal.alarm(timeout)
        try:
            return id, get_product_data(id), None
        except Alarm as e:
            timeout *= 2  # retry with increased timeout
        except Exception as e:
            break
        finally:
            signal.alarm(0)  # disable alarm, no need to restore handler
    return id, None, str(e)

if __name__ == "__main__":
    with closing(Pool(num_procs)) as pool:
        for id, result, error in pool.imap_unordered(mp_get_product_data, products):
            if error is not None:  # report and/or reschedule
                print("error: {} for {}".format(error, id))
    pool.join()
You need to ask Selenium to wait an explicit amount of time, or to wait until some DOM element becomes available. Take a quick look at the Selenium docs on waits.
From the docs, here's an example that waits up to 10 seconds for the DOM element myDynamicElement to appear:
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait  # available since 2.4.0
from selenium.webdriver.support import expected_conditions as EC  # available since 2.26.0

ff = webdriver.Firefox()
ff.get("http://somedomain/url_that_delays_loading")
try:
    element = WebDriverWait(ff, 10).until(EC.presence_of_element_located((By.ID, "myDynamicElement")))
except TimeoutException as why:
    # Do something to reject this item, possibly by re-adding it to the worker queue.
    pass
finally:
    ff.quit()
If nothing is available in the given time period, a selenium.common.exceptions.TimeoutException is raised, which you can catch in a try/except block as above.
EDIT
Another option is to ask multiprocessing to time out the process after some amount of time. This is done using the built-in signal library. Here's an excellent example of doing this; however, it's still up to you to add the item back into the work queue when you detect that a process has been killed. You can do this in the def handler section of the code.
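A minimal sketch of that re-queue step, assuming the worker and JoinableQueue from the question; the helper name and the 10-minute alarm are illustrative, not taken from the linked example:

import signal

def make_timeout_handler(q, workeritem):
    def handler(signum, frame):
        q.put(workeritem)  # give the item back to the work queue
        raise RuntimeError('timed out on %s' % workeritem)
    return handler

# Inside worker(), around each item:
# signal.signal(signal.SIGALRM, make_timeout_handler(q, workeritem))
# signal.alarm(600)               # abort the attempt after 10 minutes
# try:
#     get_product_data(workeritem)
# finally:
#     signal.alarm(0)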
