Max threads for selenium [duplicate] - python

This is the error traceback after several hours of scraping:
The process started from chrome location /usr/bin/google-chrome is no longer running, so ChromeDriver is assuming that Chrome has crashed.
This is my setup of selenium python:
#scrape.py
from selenium.common.exceptions import *
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.options import Options
def run_scrape(link):
chrome_options = Options()
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument("--headless")
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument("--lang=en")
chrome_options.add_argument("--start-maximized")
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option('useAutomationExtension', False)
chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36")
chrome_options.binary_location = "/usr/bin/google-chrome"
browser = webdriver.Chrome(executable_path=r'/usr/local/bin/chromedriver', options=chrome_options)
browser.get(<link passed here>)
try:
#scrape process
except:
#other stuffs
browser.quit()
#multiprocess.py
import time
from multiprocessing import Pool
from scrape import *
if __name__ == '__main__':
start_time = time.time()
#links = list of links to be scraped
pool = Pool(20)
results = pool.map(run_scrape, links)
pool.close()
print("Total Time Processed: "+"--- %s seconds ---" % (time.time() - start_time))
Chrome, ChromeDriver Setup, Selenium Version
ChromeDriver 79.0.3945.36 (3582db32b33893869b8c1339e8f4d9ed1816f143-refs/branch-heads/3945@{#614})
Google Chrome 79.0.3945.79
Selenium Version: 4.0.0a3
I'm wondering why Chrome is closing while the other processes keep working?

I took your code, modified it a bit to suit to my Test Environment and here is the execution results:
Code Block:
multiprocess.py:
import time
from multiprocessing import Pool
from multiprocessingPool.scrape import run_scrape
if __name__ == '__main__':
start_time = time.time()
links = ["https://selenium.dev/downloads/", "https://selenium.dev/documentation/en/"]
pool = Pool(2)
results = pool.map(run_scrape, links)
pool.close()
print("Total Time Processed: "+"--- %s seconds ---" % (time.time() - start_time))
scrape.py:
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
def run_scrape(link):
chrome_options = Options()
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument("--headless")
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument("--lang=en")
chrome_options.add_argument("--start-maximized")
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option('useAutomationExtension', False)
chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36")
chrome_options.binary_location=r'C:\Program Files (x86)\Google\Chrome\Application\chrome.exe'
browser = webdriver.Chrome(executable_path=r'C:\Utility\BrowserDrivers\chromedriver.exe', options=chrome_options)
browser.get(link)
try:
print(browser.title)
except (NoSuchElementException, TimeoutException):
print("Error")
browser.quit()
Console Output:
Downloads
The Selenium Browser Automation Project :: Documentation for Selenium
Total Time Processed: --- 10.248600006103516 seconds ---
Conclusion
It is pretty much evident your program is logically flawless and just perfect.
This usecase
As you mentioned this error surfaces after several hours of scraping, I suspect this due to the fact that WebDriver is not thread-safe. Having said that, if you can serialize access to the underlying driver instance, you can share a reference in more than one thread. This is not advisable. But you can always instantiate one WebDriver instance for each thread.
Ideally the issue of thread-safety isn't in your code but in the actual browser bindings. They all assume there will only be one command at a time (e.g. like a real user). But on the other hand you can always instantiate one WebDriver instance for each thread which will launch multiple browsing tabs/windows. Till this point it seems your program is perfect.
Now, different threads can be run on same Webdriver, but then the results of the tests would not be what you expect. The reason behind is, when you use multi-threading to run different tests on different tabs/windows a little bit of thread safety coding is required or else the actions you will perform like click() or send_keys() will go to the opened tab/window that is currently having the focus regardless of the thread you expect to be running. Which essentially means all the test will run simultaneously on the same tab/window that has focus but not on the intended tab/window.

Right now I'm using the threading module to instantiate one WebDriver per thread:
import threading
threadLocal = threading.local()
def get_driver():
browser = getattr(threadLocal, 'browser', None)
if browser is None:
chrome_options = Options()
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument("--headless")
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument("--lang=en")
chrome_options.add_argument("--start-maximized")
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option('useAutomationExtension', False)
chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36")
chrome_options.binary_location = "/usr/bin/google-chrome"
browser = webdriver.Chrome(executable_path=r'/usr/local/bin/chromedriver', options=chrome_options)
setattr(threadLocal, 'browser', browser)
return browser
and it really helps me to scrape faster than executing one driver at a time.

Related

How is my Selenium script getting detected?

My simple Python script using Selenium is not working properly. My hypothesis is that it's getting detected and flagged as a bot. The only purpose of the script is to log in to the zalando.pl website. No matter what I do, I get Error 403 ("Wystąpił błąd. Pracujemy nad jego usunięciem. Spróbuj ponownie później.").
I've tried various methods to resolve the problem. I've tried to simulate human behavior with sleeps of random length (I've tried to use WebDriverWait as well). I've also been trying to solve the problem using options passed to chromedriver, but it didn't help (I also edited the `$cdc_` string using a hex editor). Besides all of the above, I tried undetected-chromedriver, but it didn't help either. Is there any way to make my script work?
Here's the code:
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
options = webdriver.ChromeOptions()
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
driver = webdriver.Chrome(options=options, executable_path=r'/chromedriver.exe')
driver.execute_cdp_cmd('Network.setUserAgentOverride', {"userAgent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.53 Safari/537.36'})
print(driver.execute_script("return navigator.userAgent;"))
driver.get('https://www.zalando.pl/login')
time.sleep(7)
username_entry = driver.find_element(By.XPATH, '//*[@id="login.email"]')
username_entry.send_keys("login@mail.com")
time.sleep(1)
password_entry = driver.find_element(By.XPATH, '//*[@id="login.secret"]')
password_entry.send_keys("password")
time.sleep(4)
button_entry = driver.find_element(By.XPATH, '//*[@id="sso"]/div/div[2]/main/div/div[2]/div/div/div/form/button/span')
time.sleep(2)
button_entry.click()

How to scrape data in selenium without being detected as a robot in Python?

I am new to Selenium and am very confused about why this is not working. I am trying to log in to their page first because it requires an account to view their articles. I think I've done that part. However, when I then try to view the article, it tells me that I can't view it because it thinks I'm a robot.
the current code I have is
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.options import Options
CHROMEDRIVER_PATH = './chromedriver'
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("start-maximized")
chrome_options.add_argument("--disable-blink-features")
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
LOGIN_PAGE = "https://www.seekingalpha.com/login"
ACCOUNT = "ACCOUNT"
PASSWORD = "PASSWORD"
driver = webdriver.Chrome(executable_path=CHROMEDRIVER_PATH, chrome_options=chrome_options)
driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
driver.execute_cdp_cmd('Network.setUserAgentOverride', {"userAgent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.53 Safari/537.36'})
wait = WebDriverWait(driver, 30)
driver.get("https://www.seekingalpha.com/login")
wait.until(EC.element_to_be_clickable((By.NAME, "email"))).send_keys(ACCOUNT)
wait.until(EC.element_to_be_clickable((By.ID, "signInPasswordField"))).send_keys(PASSWORD)
wait.until(EC.element_to_be_clickable((By.XPATH, "//button[text()='Sign in']"))).click()
driver.get("https://seekingalpha.com/article/4414043-agenus-inc-agen-ceo-garo-armen-on-q4-2020-results-earnings-call-transcript")
text_element = driver.find_elements_by_xpath('//*')
text = text_element
for t in text:
print(t.text)
and I get
Is this happening to you frequently? Please report it on our feedback forum.
If you have an ad-blocker enabled you may be blocked from proceeding. Please disable your ad-blocker and refresh.
Reference ID: cbbe4cb0-b4c7-11eb-87a2-97a8b0029776
To continue, please prove you are not a robot
...

Getting blocked by a website with selenium and chromedriver

I'm having some trouble trying to access a web site (bet365.com) with a chrome driver and selenium (I'm quite being "blocked").
I can access the site with my ordinary chrome but when I try with chrome driver, it doesn't work.
I had this problem before and corrected it by using some options as below (python):
from selenium import webdriver
options = webdriver.ChromeOptions()
options.add_argument("start-maximized")
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
driver = webdriver.Chrome(options=options, executable_path=r'PATH_TO\chromedriver.exe')
driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
"source": """
Object.defineProperty(navigator, 'webdriver', {
get: () => undefined
})
"""
})
driver.execute_cdp_cmd("Network.enable", {})
driver.execute_cdp_cmd('Network.setUserAgentOverride', {"userAgent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.53 Safari/537.36'})
driver.get("https://www.bet365.com/")
Now, the problem came back and this code is not working anymore to bypass the protection.
Can someone help me?
In case the Selenium driven ChromeDriver initiated google-chrome Browsing Context is getting detected a potential solution would be to use the undetected-chromedriver to initialize the Chrome Browsing Context.
undetected-chromedriver is an optimized Selenium Chromedriver patch which does not trigger anti-bot services like Distill Network / Imperva / DataDome / Botprotect.io. It automatically downloads the driver binary and patches it.
Code Block:
import undetected_chromedriver as uc
from selenium import webdriver
options = webdriver.ChromeOptions()
options.add_argument("start-maximized")
driver = uc.Chrome(options=options)
driver.get('https://bet365.com')
References
You can find a couple of relevant detailed discussions in:
Undetected Chromedriver not loading correctly

Why doesn't instagram work with Selenium headless Chrome?

I'm trying to build an Instagram bot that works headless, but it doesn't seem to find the username and password fields (i.e. it raises NoSuchElementException).
I tried to run this code to troubleshoot. (which basicaly opens the ig homepage and screenshots it)
from selenium import webdriver
from time import sleep
options = webdriver.ChromeOptions()
options.headless = True
options.add_argument("--window-size=1920,1080")
browser = webdriver.Chrome(options=options)
browser.get("https://www.instagram.com")
browser.get_screenshot_as_file(f"screenshot.png")
And I got screenshots basically saying 'error, retry after several minutes' in French.
I tried finding the 'connectez-vous' button through Selenium, but every XPath I try doesn't work, and it's impossible to find it through F12 (DevTools).
The bot will be later uploaded to pythonanywhere so i can run it in the cloud (so if you think i might run into some other problems you can let me know)
What do you suggest me to do?
from selenium import webdriver
from time import sleep
options = webdriver.ChromeOptions()
#options.headless = True
options.add_argument("--window-size=1920,1080")
options.add_argument("--headless")
options.add_argument("--disable-gpu")
options.add_argument(
"user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36")
browser = webdriver.Chrome(options=options)
browser.get("https://www.instagram.com")
sleep(5)
#browser.refresh()
browser.get_screenshot_as_file(f"screenshot.png")
For headless Chrome, the user agent is set to something like HeadlessChrome, which lets Instagram detect that you are using headless Chrome.
You can prevent this by specifying a hard-coded user agent:
open a normal Chrome window, go to the Network tab in DevTools, open a request's headers, copy the user-agent value, and use it to replace the one in your code.
Headless browser detection

How to make reddit register votes when logging in through selenium?

So I'm trying to implement a program using selenium that upvotes the top post in my mostly private subreddit. So far, I've been able to implement the logging in and clicking on the upvote button part but the votes aren't actually registering on Reddit as far as I can see. I'm fairly new to python and have learned most of the basics so I'm doing this just for the sake of experimentation and curiosity. Here's my code I would really appreciate it if you could maybe try to help me out.
from selenium import webdriver
import time
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException
import os
import random
webs = input("Complete Post URL: ")
username = os.getenv("USERNAME")
userProfile = "C:\\Users\\" + username + "\\AppData\\Local\\Google\\Chrome\\User Data\\Default"
chrome_options = Options()
chrome_options.add_argument("--disable-notifications")
chrome_options.add_argument("--disable-extensions")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-javascript")
chrome_options.add_argument("--disable-rtc-smoothness-algorithm")
chrome_options.add_argument("--disable-webrtc-encryption")
chrome_options.add_argument("--disable-webrtc-hw-decoding")
chrome_options.add_argument("--incognito")
chrome_options.add_argument(
'--user-agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/86.0.4240.75 Safari/537.36"')
chrome_options.add_experimental_option("excludeSwitches",
["ignore-certificate-errors", "safebrowsing-disable-download-protection",
"safebrowsing-disable-auto-update", "disable-client-side-phishing-detection",
"enable-automation"])
x = str(random.randint(700, 1920))
y = str(random.randint(700, 1080))
chrome_options.add_argument(f"--window-size={x,y}")
driver = webdriver.Chrome('chromedriver.exe', options=chrome_options)
driver.get("https://old.reddit.com")
user = driver.find_element_by_xpath('//*[@id="login_login-main"]/input[2]')
user.send_keys(useritem)
pwd = driver.find_element_by_xpath('//*[@id="login_login-main"]/input[3]')
pwd.send_keys(passitem)
login = driver.find_element_by_xpath('//*[@id="login_login-main"]/div[4]/button')
login.click()
time.sleep(3.45)
driver.get(webs)
xp = webs.split('/')
xp = xp[-3]
try:
time.sleep(5)
upvote = driver.find_element_by_xpath(f'//*[@id="upvote-button-t3_{xp}"]/span/i')
upvote.click()
print(useritem + " Upvoted!")
time.sleep(15)
driver.close()
except TimeoutException as t:
print(t.msg)

Categories