Python multiprocessing a class

I am trying to multiprocess Selenium, where each process is spawned with a Selenium driver and a session (each process is connected to a different account).
I have a list of URLs to visit.
Each URL needs to be visited once by one of the accounts (no matter which one).
To avoid some nasty global variable management, I tried to initialize each process with a class object using the initializer of multiprocessing.Pool.
After that, I can't figure out how to distribute tasks to the processes, given that the function each process uses has to be a method of the class.
Here is a simplified version of what I'm trying to do:
from selenium import webdriver
import multiprocessing

account = [{'account': 1}, {'account': 2}]

class Collector():
    def __init__(self, account):
        self.account = account
        self.driver = webdriver.Chrome()

    def parse(self, item):
        self.driver.get(f"https://books.toscrape.com{item}")

if __name__ == '__main__':
    processes = 1
    pool = multiprocessing.Pool(processes, initializer=Collector, initargs=[account.pop()])
    items = ['/catalogue/a-light-in-the-attic_1000/index.html',
             '/catalogue/tipping-the-velvet_999/index.html']
    pool.map(parse(), items, chunksize=1)
    pool.close()
    pool.join()
The problem comes on the pool.map line: there is no reference to the instantiated object inside the subprocess.
Another approach would be to distribute the URLs and parse during __init__, but this would be very nasty.
Is there a way to achieve this?
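For reference, one common pattern (a sketch, not taken from the answers below) is to have the initializer store the Collector in a module-level global of each worker process and dispatch through a plain top-level function; init_worker and worker are illustrative names. With more than one process you would also need a way to hand each worker a distinct account, since initargs is the same for every worker:

from selenium import webdriver
import multiprocessing

accounts = [{'account': 1}, {'account': 2}]

class Collector():
    def __init__(self, account):
        self.account = account
        self.driver = webdriver.Chrome()

    def parse(self, item):
        self.driver.get(f"https://books.toscrape.com{item}")

# Each worker process gets its own Collector, stored in a global
# that only exists inside that process.
collector = None

def init_worker(account):
    global collector
    collector = Collector(account)

def worker(item):
    # Runs in the pool process, where `collector` was set by init_worker.
    collector.parse(item)

if __name__ == '__main__':
    items = ['/catalogue/a-light-in-the-attic_1000/index.html',
             '/catalogue/tipping-the-velvet_999/index.html']
    pool = multiprocessing.Pool(1, initializer=init_worker, initargs=[accounts.pop()])
    pool.map(worker, items, chunksize=1)
    pool.close()
    pool.join()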

Since Chrome starts its own process, there is really no need to use multiprocessing when multithreading will suffice. I would like to offer a more general solution for the case where you have N URLs to retrieve, where N might be very large, but you want to limit the number of concurrent Selenium sessions to MAX_DRIVERS, a significantly smaller number. You therefore only want to create one driver session per pool thread and reuse it as necessary. The remaining problem is calling quit on each driver when you are finished with the pool so that you don't leave any Selenium processes running behind.
The following code uses thread-local storage, which is unique to each thread, to store the current driver instance for each pool thread, and uses a class destructor to call the driver's quit method when the class instance is destroyed:
from selenium import webdriver
from multiprocessing.pool import ThreadPool
import threading

items = ['/catalogue/a-light-in-the-attic_1000/index.html',
         '/catalogue/tipping-the-velvet_999/index.html']
accounts = [{'account': 1}, {'account': 2}]
baseurl = 'https://books.toscrape.com'

threadLocal = threading.local()

class Driver:
    def __init__(self):
        options = webdriver.ChromeOptions()
        options.add_argument("--headless")
        options.add_experimental_option('excludeSwitches', ['enable-logging'])
        self.driver = webdriver.Chrome(options=options)

    def __del__(self):
        self.driver.quit()  # clean up driver when we are cleaned up
        print('The driver has been "quitted".')

    @classmethod
    def create_driver(cls):
        # Reuse this thread's driver if it already has one.
        the_driver = getattr(threadLocal, 'the_driver', None)
        if the_driver is None:
            the_driver = cls()
            threadLocal.the_driver = the_driver
        return the_driver.driver

def process(item):
    print(f'Processing {item}')
    driver = Driver.create_driver()
    driver.get(f'{baseurl}{item}')

def main():
    global threadLocal
    # We never want to create more drivers than we have URLs:
    MAX_DRIVERS = 8  # rather arbitrary
    POOL_SIZE = min(len(items), MAX_DRIVERS)
    pool = ThreadPool(POOL_SIZE)
    pool.map(process, items)
    # ensure the drivers are "quitted":
    del threadLocal
    import gc
    gc.collect()  # a little extra insurance
    pool.close()
    pool.join()

if __name__ == '__main__':
    main()

I'm not entirely certain this solves your problem.
If you have one account per URL, then you could do this:
from selenium import webdriver
from multiprocessing import Pool

items = ['/catalogue/a-light-in-the-attic_1000/index.html',
         '/catalogue/tipping-the-velvet_999/index.html']
accounts = [{'account': 1}, {'account': 2}]
baseurl = 'https://books.toscrape.com'

def process(item, account):
    print(f'Processing account {account}')
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    with webdriver.Chrome(options=options) as driver:
        driver.get(f'{baseurl}{item}')

def main():
    with Pool() as pool:
        pool.starmap(process, zip(items, accounts))

if __name__ == '__main__':
    main()
If the number of accounts doesn't match the number of URLs, you have said that it doesn't matter which account GETs which URL. In that case, you could just select the account to use at random with random.choice(), as in the sketch below.
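For example, a minimal sketch of that idea, reusing items, accounts, and process from the snippet above:

import random
from multiprocessing import Pool

def main():
    with Pool() as pool:
        # Pair every URL with a randomly chosen account; accounts may repeat.
        pool.starmap(process, ((item, random.choice(accounts)) for item in items))

if __name__ == '__main__':
    main()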

Related

Python - Selenium - Single thread to multiple threads

I have an automation project made with Python and Selenium which I'm trying to make run with multiple browsers in parallel.
The current workflow:
open a browser for manual login
save cookies for later use
in a loop, open additional browsers, loading the saved session into each newly opened browser
This workflow opens the browsers one by one until all required browsers are open.
My code contains several classes: Browser and Ui.
The object instantiated from the Ui class contains a method which at some point executes the following code:
for asset in Inventory.assets:
    self.browsers[asset] = ui.Browser()
    # self.__open_window(asset)  # if uncommented, the code works properly without the multithreading part; all the browsers are opened one by one

# try 1
# threads = []
# for asset in Inventory.assets:
#     threads.append(Thread(target=self.__open_window, args=(asset,), name=asset))
# for thread in threads:
#     thread.start()

# try 2
# with concurrent.futures.ThreadPoolExecutor() as executor:
#     futures = []
#     for asset in Inventory.assets:
#         futures.append(executor.submit(self.__open_window, asset=asset))
#     for future in concurrent.futures.as_completed(futures):
#         print(future.result())
The problem appears when self.__open_window is executed within a thread. There I get an error related to Selenium, something like 'NoneType' object has no attribute 'get', when self.driver.get(url) is called from the Browser class.
def __open_window(self, asset):
    self.interface = self.browsers[asset]
    self.interface.open_browser()
In class Browser:
def open_browser(self, driver_path=""):
    # ...
    options = webdriver.ChromeOptions()
    # ...
    web_driver = webdriver.Chrome(executable_path=driver_path, options=options)
    self.driver = web_driver
    self.opened_tabs["default"] = web_driver.current_window_handle
    # ...

def get_url(self, url):
    try:
        self.driver.get(url)  # this line causes problems ...
    except Exception as e:
        print(e)
My questions are:
Why do I have this issue in a multithreading environment?
What should I do to make the code work properly?
Thank you!
I found the mistake; it was caused by a wrong object reference.
After the modification, the code works well.
I updated the following lines in __open_window:
def __open_window(self, asset, browser):
    browser.interface = self.browsers[asset]
    browser.interface.open_browser()
and in the # try 1 code section:

threads.append(Thread(target=self.__open_window, args=(asset, browser), name=asset))

How to close multiple selenium sessions by user's wish?

I launch multiple Selenium sessions with multiprocessing and want to close them all, or just a few, when the user wants.
The schema of my code:
from multiprocessing import Process

def drop():
    ...
    driver = webdriver.Chrome(executable_path=os.path.join(path, '\\chromedriver.exe'), chrome_options=options)
    ...

for i in range(10):
    s = Process(target=drop)
    s.start()

input('press enter to stop all sessions')
### close selenium driver event
You can create a list process_list = [] and append the s variable to it on every iteration (process_list.append(s)); then, to stop all processes:

for p in process_list:
    p.terminate()  # note: terminate() kills the process; it does not call driver.quit() for you
Adding each driver instance to a list when creating it, and then iterating through the list to quit them afterwards, might solve your issue:

from multiprocessing import Process

list_drivers = []

def drop():
    ...
    driver = webdriver.Chrome(executable_path=os.path.join(path, '\\chromedriver.exe'), chrome_options=options)
    list_drivers.append(driver)
    ...

for i in range(10):
    s = Process(target=drop)
    s.start()

input('press enter to stop all sessions')
for driver in list_drivers:
    driver.quit()
### close selenium driver event

Note that this only works as written if you use threads instead of processes: with multiprocessing, each child process gets its own copy of list_drivers, so the parent's list stays empty.
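An alternative that does work across processes, offered as a sketch of my own (the Event-based signalling is not from the answers above): let each child own its driver and quit it itself when the parent sets a multiprocessing.Event.

from multiprocessing import Process, Event
from selenium import webdriver

def drop(stop_event):
    driver = webdriver.Chrome()  # each child owns its own driver
    try:
        # ... do the session's work here ...
        stop_event.wait()  # block until the parent signals shutdown
    finally:
        driver.quit()  # the child quits its own driver

if __name__ == '__main__':
    stop_event = Event()
    processes = [Process(target=drop, args=(stop_event,)) for _ in range(10)]
    for p in processes:
        p.start()
    input('press enter to stop all sessions')
    stop_event.set()  # every child sees this and quits its driver
    for p in processes:
        p.join()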

Using WebDriver to take screenshots with an always-on process

I have a service that takes screenshots of a given URL using Selenium WebDriver.
It works OK: it raises a process -> takes the screenshot -> closes the process.
The problem is that it takes too long to return.
Is there a way for the web driver process to stay always on and wait for requests?
Here is my code:
import logging
import tempfile
import threading
from datetime import datetime

from selenium import webdriver
from selenium.webdriver import ChromeOptions

logger = logging.getLogger(__name__)
PAGE_LOAD_TIMEOUT = 30  # seconds; defined elsewhere in the original service

class WebDriver(webdriver.Chrome):
    def __init__(self, *args, **kwargs):
        logger.info('Start WebDriver instance.')
        self.start_time = datetime.now()
        self.lock = threading.Lock()
        kwargs['chrome_options'] = self.get_chrome_options()
        super().__init__(*args, **kwargs)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        logger.info(f'Quitting WebDriver instance {id(self)}, took {datetime.now() - self.start_time}')
        self.quit()

    @staticmethod
    def get_chrome_options():
        chrome_options = ChromeOptions()
        chrome_options.headless = True
        chrome_options.add_argument('--start-maximized')
        chrome_options.add_argument("--no-sandbox")  # bypass OS security model
        chrome_options.add_argument('--disable-dev-shm-usage')  # overcome limited resource problems
        chrome_options.add_argument("--lang=en")
        chrome_options.add_argument("--disable-infobars")  # disabling infobars
        chrome_options.add_argument("--disable-extensions")  # disabling extensions
        chrome_options.add_argument("--hide-scrollbars")
        return chrome_options

    def capture_screenshot_from_html_string(self, html_str, window_size):
        with tempfile.TemporaryDirectory() as tmpdirname:
            html_filename = tmpdirname + '/template.html'
            with open(html_filename, 'w') as f:
                f.write(html_str)
            url = 'file://' + html_filename
            img_str = self.capture_screenshot(url, window_size)
        return img_str

    def capture_screenshot(self, url, window_size):
        self.lock.acquire()
        try:
            self.set_window_size(*window_size)
            self.get(url)
            self.maximize_window()
            self.set_page_load_timeout(PAGE_LOAD_TIMEOUT)
            img_str = self.get_screenshot_as_png()
        except Exception as exc:
            logger.error(f'Error capturing screenshot url: {url}; {exc}')
            img_str = None
        finally:
            self.lock.release()
        return img_str
After some research I found a solution, and I'm posting it in case it helps others with a similar problem.
I used the py-object-pool library.
The object pool library creates a pool of resource-class instances for reuse in your project. The pool is implemented using Python's built-in Queue.
Creating a new browser instance each time is a time-consuming task that makes the client wait.
If you instead keep one browser instance and manage everything with browser tabs, it becomes cumbersome to maintain and debug when an issue arises.
An object pool helps here: it creates a pool of resources and hands one to each client that requests it, separating the clients from one another without waiting or creating a new instance on the spot.
Code Example
ff_browser_pool = ObjectPool(FirefoxBrowser, min_init=2)

with ff_browser_pool.get() as (browser, browser_stats):
    title = browser.get_page_title('https://www.google.co.in/')
For more information, see the link below:
https://pypi.org/project/py-object-pool/
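For context, a minimal sketch of what the FirefoxBrowser resource class might look like. This is an assumption on my part: the ObjectPool import path and the exact resource contract should be verified against the py-object-pool docs, and get_page_title is a helper you would define yourself, not a library API:

from selenium import webdriver

# Assumption: import path per the py-object-pool README; verify against the docs.
from pyobjectpool import ObjectPool

class FirefoxBrowser:
    """Resource class: one long-lived headless Firefox session per pool slot."""

    def __init__(self):
        options = webdriver.FirefoxOptions()
        options.add_argument('--headless')
        self.driver = webdriver.Firefox(options=options)

    def get_page_title(self, url):
        # Illustrative helper, not part of any library API.
        self.driver.get(url)
        return self.driver.title

ff_browser_pool = ObjectPool(FirefoxBrowser, min_init=2)

with ff_browser_pool.get() as (browser, browser_stats):
    print(browser.get_page_title('https://www.google.co.in/'))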

Python Multiprocessing Manager - List Name Error?

I am trying to use a shared list that will hold information scraped with Selenium, so I can later export it or use it however I choose. For some reason it is giving me this error:
NameError: name 'scrapedinfo' is not defined
This is really strange to me because I declared the list global AND I used multiprocessing.Manager() to create it. I have double-checked my code many times, and it is not a case-sensitivity error. I also tried to pass the list through the functions as a variable, but this created other problems and did not work. Any help is greatly appreciated!
from selenium import webdriver
from multiprocessing import Pool, Manager

def browser():
    driver = webdriver.Chrome()
    return driver

def test_func(link):
    driver = browser()
    driver.get(link)

def scrape_stuff(driver):
    # scrape things
    scrapedinfo.append(...)  # scraped stuff

def multip():
    manager = Manager()
    # declare list here
    global scrapedinfo
    scrapedinfo = manager.list()
    links = ["https://stackoverflow.com/", "https://signup.microsoft.com/", "www.example.com"]
    chunks = [links[i::3] for i in range(3)]
    pool = Pool(processes=3)
    pool.map(test_func, chunks)
    print(scrapedinfo)

multip()
In Windows, multiprocessing starts a new Python process and then pickles/unpickles a limited view of the parent state for the child. Global variables that are not passed in the map call are not included, so scrapedinfo is never created in the child and you get the error.
One solution is to pass scrapedinfo in the map call. Hacked down to a quick example:
from multiprocessing import Pool, Manager

def test_func(param):
    scrapedinfo, link = param
    scrapedinfo.append("i scraped stuff from " + str(link))

def multip():
    manager = Manager()
    global scrapedinfo
    scrapedinfo = manager.list()
    links = ["https://stackoverflow.com/", "https://signup.microsoft.com/", "www.example.com"]
    chunks = [links[i::3] for i in range(3)]
    pool = Pool(processes=3)
    pool.map(test_func, list((scrapedinfo, chunk) for chunk in chunks))
    print(scrapedinfo)

if __name__ == "__main__":
    multip()
But you are doing more work than you need to with the Manager: map passes each worker's return value back to the parent process (and handles chunking). So you could do:
from multiprocessing import Pool

def test_func(link):
    return "i scraped stuff from " + link

def multip():
    links = ["https://stackoverflow.com/", "https://signup.microsoft.com/", "www.example.com"]
    pool = Pool(processes=3)
    scrapedinfo = pool.map(test_func, links)
    print(scrapedinfo)

if __name__ == "__main__":
    multip()
And avoid the extra processing of a clunky list proxy.

Restart a process if running longer than x minutes

I have a program that creates a multiprocessing pool to handle a web-extraction job. Essentially, a list of product IDs is fed to a pool of 10 processes that handle the queue. The code is pretty simple:
import multiprocessing
import time

num_procs = 10
products = ['92765937', '20284759', '92302047', '20385473', ...]

def worker():
    for workeritem in iter(q.get, None):
        time.sleep(10)
        get_product_data(workeritem)
        q.task_done()
    q.task_done()

q = multiprocessing.JoinableQueue()
procs = []
for i in range(num_procs):
    procs.append(multiprocessing.Process(target=worker))
    procs[-1].daemon = True
    procs[-1].start()

for product in products:
    time.sleep(10)
    q.put(product)

q.join()

for p in procs:
    q.put(None)

q.join()

for p in procs:
    p.join()
The get_product_data() function takes the product, opens an instance of Selenium, navigates to a site, logs in, collects the details of the product, and outputs them to a CSV file. The problem is that, randomly (literally: it happens at different points of the site's navigation or extraction process), Selenium stops doing whatever it's doing and just sits there, no longer doing its job. No exceptions are thrown. I've done everything I can in the get_product_data() function to prevent this, but it seems to just be a problem with Selenium (I've tried Firefox, PhantomJS, and Chrome as the driver and still run into the same problem).
Essentially, the process should never run for longer than, say, 10 minutes. Is there any way to kill a process and restart it with the same product ID if it has been running longer than the specified time?
This is all running on a Debian Wheezy box with Python 2.7.
You could write your code using multiprocessing.Pool and the timeout approach suggested by @VooDooNOFX. Not tested; consider it executable pseudo-code:
#!/usr/bin/env python
import signal
from contextlib import closing
from multiprocessing import Pool

# get_product_data, num_procs, and products are as defined in the question.

class Alarm(Exception):
    pass

def alarm_handler(*args):
    raise Alarm("timeout")

def mp_get_product_data(id, timeout=10, nretries=3):
    signal.signal(signal.SIGALRM, alarm_handler)  # XXX could move it to initializer
    error = None
    for i in range(nretries):
        signal.alarm(timeout)
        try:
            return id, get_product_data(id), None
        except Alarm as e:
            error = str(e)
            timeout *= 2  # retry with increased timeout
        except Exception as e:
            error = str(e)
            break
        finally:
            signal.alarm(0)  # disable alarm, no need to restore handler
    return id, None, error

if __name__ == "__main__":
    with closing(Pool(num_procs)) as pool:
        for id, result, error in pool.imap_unordered(mp_get_product_data, products):
            if error is not None:  # report and/or reschedule
                print("error: {} for {}".format(error, id))
    pool.join()
You need to ask Selenium to wait an explicit amount of time, or to wait for some DOM element to become available. Take a quick look at the Selenium docs on waits.
From the docs, here is code that waits up to 10 seconds for the DOM element myDynamicElement to appear:
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait  # available since 2.4.0
from selenium.webdriver.support import expected_conditions as EC  # available since 2.26.0

ff = webdriver.Firefox()
ff.get("http://somedomain/url_that_delays_loading")
try:
    element = WebDriverWait(ff, 10).until(EC.presence_of_element_located((By.ID, "myDynamicElement")))
except TimeoutException:
    # Do something to reject this item, possibly by re-adding it to the worker queue.
    pass
finally:
    ff.quit()
If nothing is available in the given time period, a selenium.common.exceptions.TimeoutException is raised, which you can catch in a try/except block like the one above.
EDIT
Another option is to ask multiprocessing to time out the process after some amount of time. This is done using the built-in signal library. Here's an excellent example of doing this; however, it's still up to you to add the item back into the work queue when you detect that a process has been killed. You can do this in the def handler section of the code. A rough parent-side variant is sketched below.
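As a hedged sketch of my own (not the linked example): a parent-side watchdog that runs each product in its own process, kills it when it exceeds a deadline, and re-queues the product ID for another attempt. It is shown single-worker for clarity; the same deadline logic could run in each of several watchdog threads. get_product_data is assumed to be the extraction function from the question, and TIMEOUT is illustrative:

import multiprocessing

TIMEOUT = 10 * 60  # kill any worker running longer than 10 minutes

def run_one(product):
    get_product_data(product)  # assumed: the extraction function from the question

def watchdog(products):
    work = list(products)  # pending product IDs
    while work:
        product = work.pop(0)
        p = multiprocessing.Process(target=run_one, args=(product,))
        p.start()
        p.join(TIMEOUT)        # wait at most TIMEOUT seconds
        if p.is_alive():       # still running: assume Selenium hung
            p.terminate()
            p.join()
            work.append(product)  # re-queue the same product ID for a retry
            # in practice, cap the retries per product to avoid looping forever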
