Selenium webdriver from Flask: save_screenshot doesn't work - Python

I am trying to run this script (on a DigitalOcean server with Apache2). It runs successfully with no errors, but the screenshot doesn't show up anywhere in the server files. The following code is from __init__.py:
# Question's Flask view, shown as posted (the body's indentation was lost in the post).
#app.route('/process')
def process():
# Configure Chrome: headless, fixed window size, no sandbox, custom user agent.
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--window-size=1366x768")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36")
driver = webdriver.Chrome(chrome_options=chrome_options, executable_path='/usr/lib/chromium-browser/chromedriver_old')
driver.get("https://www.google.com")
# NOTE(review): relative file name, so it is resolved against the server
# process's working directory; under Apache that is presumably not the app
# folder. Confirm with os.getcwd() or pass an absolute path.
driver.save_screenshot("screenshot.png")

Related

While I'm scraping with Selenium, the site keeps telling me that my browser is unusual and that I have to enable JavaScript

I just started learning programming and began with scraping using Python Selenium, but when I get the URL and send input to its elements, the website keeps sending me this message: "Your browser is a bit unusual...
Try disabling ad blockers and other extensions, enabling javascript, or using a different web browser."
I tried some of the solutions provided on the site, but none of them solved my problem.
Can you explain and solve the problem with Python, please?
# Question's script: opens the GoDaddy account-creation page in Chrome with an
# overridden user agent.
import selenium
from selenium import webdriver
from time import sleep
options = webdriver.ChromeOptions()
options.add_argument("--incognito")
driver = webdriver.Chrome('chromedriver.exe', options=options)
driver.set_window_size(620, 720)
driver.delete_all_cookies()
# NOTE(review): added to `options` after the driver was already created above;
# presumably it has no effect on this session. Confirm.
options.add_experimental_option('excludeSwitches', ['enable-logging'])
driver.implicitly_wait(5)
# NOTE(review): "--headless" is likewise added after webdriver.Chrome(...) was
# called, so the browser presumably does not run headless. Confirm.
options.add_argument("--headless")
# Override the user agent through the Chrome DevTools Protocol before navigating.
driver.execute_cdp_cmd('Network.setUserAgentOverride', {"userAgent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.53 Safari/537.36'})
driver.get('https://sso.godaddy.com/v1/account/create?realm=idp&path=%2Fcontact%2Fvalidate%3FcontactType%3DphoneMobile%26app%3Dsso%26path%3Dprofile%252Fedit%26profileUpdate%3DTrue%26userInteraction%3DPROFILE_UPDATE&app=sso&auth_reason=1&iframe=false')

Selenium user agent doesn't work on heroku

I'm trying to get access to web site, but in headless mode I get this:
<html><head>
<title>Access Denied</title>
</head><body>
<h1>Access Denied</h1>
You don't have permission to access "http://www.bybit.com/fiat/trade/otc/?" on this server.<p>
Reference #18.3f62645f.1657282455.2f87631
</p></body></html>
So I set a user agent by adding this:
user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.50 Safari/537.36'
chrome_options.add_argument(f'user-agent={user_agent}')
It works well on my local machine, but when I deployed it to a Heroku server, I got the same issue.
Main part of the code:
# Main part of the question's code: builds headless Chrome options with a custom
# user agent, loads the Bybit page and prints its HTML.
# NOTE(review): the webdriver.Chrome(...) call that creates `driver` is not
# shown in this excerpt.
chrome_options = webdriver.ChromeOptions()
# The Chrome binary path is read from the GOOGLE_CHROME_BIN environment variable.
chrome_options.binary_location = os.environ.get("GOOGLE_CHROME_BIN")
chrome_options.add_argument("--headless")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--disable-features=NetworkService")
chrome_options.add_experimental_option('excludeSwitches', ['enable-logging'])
chrome_options.add_argument('--allow-running-insecure-content')
chrome_options.add_argument("window-size=800,600")
user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.50 Safari/537.36'
chrome_options.add_argument(f'user-agent={user_agent}')
bybit_url = 'https://www.bybit.com/fiat/trade/otc/?actionType=0&token=USDT&fiat=RUB&paymentMethod=75'
driver.get(bybit_url)
# Fixed 5-second pause before the page source is printed.
sleep(5)
print(driver.page_source)
The access is probably denied because the Chrome browsing context started by the Selenium-driven ChromeDriver is being detected as a bot.
To avoid the detection, add the argument --disable-blink-features=AutomationControlled as follows:
chrome_options.add_argument('--disable-blink-features=AutomationControlled')
I had exactly the same problem on Heroku trying to access bybit sites with selenium.
I used this user agent and it works for me:
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'
Among many other trial and error attempts, I reached this user agent comparing user agents used locally and on Heroku.
Locally (not headless):
# Experiment: non-headless Chrome with no custom user agent; prints the page
# HTML and the user agent the browser reports.
from selenium import webdriver
import os
options = webdriver.ChromeOptions()
options.binary_location = os.environ.get( "GOOGLE_CHROME_BIN")
# Headless mode is deliberately left disabled for this run.
#options.add_argument("--headless")
options.add_argument("--disable-dev-shm-usage")
options.add_argument("--no-sandbox")
options.add_experimental_option('excludeSwitches', ['enable-logging'])
driver = webdriver.Chrome(options=options)
driver.get("https://www.bybit.com/fiat/trade/otc/?actionType=0&token=USDT&fiat=RUB&paymentMethod=75")
print(driver.page_source)
# Ask the page itself which user agent string the browser exposes.
print(driver.execute_script("return navigator.userAgent"))
driver.quit()
Result:
(...)</iframe></html>
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36
Locally (headless):
# Experiment: headless Chrome with no custom user agent; prints the page HTML
# and the user agent the browser reports.
from selenium import webdriver
import os
options = webdriver.ChromeOptions()
options.binary_location = os.environ.get( "GOOGLE_CHROME_BIN")
options.add_argument("--headless")
options.add_argument("--disable-dev-shm-usage")
options.add_argument("--no-sandbox")
options.add_experimental_option('excludeSwitches', ['enable-logging'])
driver = webdriver.Chrome(options=options)
driver.get("https://www.bybit.com/fiat/trade/otc/?actionType=0&token=USDT&fiat=RUB&paymentMethod=75")
print(driver.page_source)
# Ask the page itself which user agent string the browser exposes.
print(driver.execute_script("return navigator.userAgent"))
driver.quit()
Result:
<html><head>
<title>Access Denied</title>
</head><body>
<h1>Access Denied</h1>
You don't have permission to access "http://www.bybit.com/fiat/trade/otc/?" on this server.<p>
Reference #18.17bd2f17.1664927640.82fbfed7
</p></body></html>
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) HeadlessChrome/106.0.5249.91 Safari/537.36
On Heroku (headless, default user agent):
# Experiment: headless Chrome with no custom user agent; only prints the user
# agent the browser reports (no page is loaded).
from selenium import webdriver
import os
options = webdriver.ChromeOptions()
options.binary_location = os.environ.get( "GOOGLE_CHROME_BIN")
options.add_argument("--headless")
options.add_argument("--disable-dev-shm-usage")
options.add_argument("--no-sandbox")
options.add_experimental_option('excludeSwitches', ['enable-logging'])
driver = webdriver.Chrome(options=options)
print(driver.execute_script("return navigator.userAgent"))
driver.quit()
Result:
Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) HeadlessChrome/106.0.5249.91 Safari/537.36
On Heroku (headless, with a custom user agent):
# Experiment: headless Chrome with an explicit user agent; prints the page HTML
# and the user agent the browser reports.
from selenium import webdriver
import os
options = webdriver.ChromeOptions()
options.binary_location = os.environ.get( "GOOGLE_CHROME_BIN")
options.add_argument("--headless")
options.add_argument("--disable-dev-shm-usage")
options.add_argument("--no-sandbox")
options.add_experimental_option('excludeSwitches', ['enable-logging'])
# User agent string containing "Chrome/106.0.0.0" rather than "HeadlessChrome".
user_agent ='Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'
options.add_argument('user-agent={0}'.format(user_agent))
driver = webdriver.Chrome(options=options)
driver.get("https://www.bybit.com/fiat/trade/otc/?actionType=0&token=USDT&fiat=RUB&paymentMethod=75")
print(driver.page_source)
# Ask the page itself which user agent string the browser exposes.
print(driver.execute_script("return navigator.userAgent"))
driver.quit()
Winning result:
(...)</iframe></html>
Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36

Headless selenium exits immediately

I have a headless web scraper. When it runs, the scraper takes a base URL, scrapes the links on that page, and then scrapes the links it got off that page.
The problem I'm having is that when I run the scraper it pretty much immediately exits. When I run the scraper normally (non headless) it works perfectly fine.
These are my selenium arguments:
# Question's driver setup: headless Chrome whose binary and chromedriver paths
# are both read from environment variables.
options = webdriver.ChromeOptions()
options.binary_location = os.environ.get('GOOGLE_CHROME_BIN')
options.add_argument('--headless')
options.add_argument('--disable-gpu')
options.add_argument('--no-sandbox')
driver = webdriver.Chrome(executable_path=os.environ.get('CHROMEDRIVER_PATH'),
options=options)
I've also tried adding these options but it gave me the same result:
options.add_argument('--disable-dev-shm-usage')
options.add_argument("--window-size=1920,1080")
options.add_argument("--start-maximized")
How can I solve this? I'm trying to deploy this scraper to heroku and none of the things I've tried above worked.
Basically, some websites won't load in headless mode unless a user agent is specified.
To fix this I added:
user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'
options.add_argument(f'user-agent={user_agent}')
This fixed the problem of my scraper exiting immediately

last problem when scraping bet365.com with selenium

After looking for information in the community, I saw in a post that the following code worked until a few days ago:
# Code quoted from an earlier post: starts Chrome with the automation switches
# removed, then overrides navigator.webdriver and the user agent via CDP.
from selenium import webdriver
options = webdriver.ChromeOptions()
options.add_argument("window-size=1920,1080")
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
browser=webdriver.Chrome(options=options,executable_path=r"chromedriver.exe")
# Register a script to be evaluated on every new document; it redefines
# navigator.webdriver so that reading it yields undefined.
browser.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
"source": """
Object.defineProperty(navigator, 'webdriver', {
get: () => undefined
})
"""
})
# Override the user agent through the Chrome DevTools Protocol before navigating.
browser.execute_cdp_cmd('Network.setUserAgentOverride',
{"userAgent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4240.198 Safari/537.36'})
browser.get('https://www.bet365.com')
After that stopped working, the following was suggested as a solution:
Open the file chromedriver.exe with Notepad++, search for "cdc_", replace it with "xyz_", and save the file. Then add this line to the ChromeDriver options: options.add_argument('--disable-blink-features=AutomationControlled')
I don't know why this doesn't work for me. I am using Chrome 88.0.4324.146 and chromedriver version 88.0.4324.96, and executing this code:
# Question's current attempt: the same setup as the earlier code plus the
# --disable-blink-features=AutomationControlled argument.
from selenium import webdriver
options = webdriver.ChromeOptions()
options.add_argument("window-size=1920,1080")
options.add_argument('--disable-blink-features=AutomationControlled')
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
# NOTE(review): this repeats the identical add_argument call made three lines earlier.
options.add_argument('--disable-blink-features=AutomationControlled')
browser=webdriver.Chrome(options=options,executable_path=r"chromedriver.exe")
# Register a script to be evaluated on every new document; it redefines
# navigator.webdriver so that reading it yields undefined.
browser.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
"source": """
Object.defineProperty(navigator, 'webdriver', {
get: () => undefined
})
"""
})
# Override the user agent through the Chrome DevTools Protocol before navigating.
browser.execute_cdp_cmd('Network.setUserAgentOverride',
{"userAgent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4240.198 Safari/537.36'})
browser.get('https://www.bet365.com')
But after executing it, the page gets stuck loading until it crashes.
# Answer's workaround: start Chrome separately with a remote debugging port,
# then attach Selenium to that running browser.
import subprocess
#other imports
# Launch Chrome through the shell with remote debugging listening on port 9222.
subprocess.Popen(
'"C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe" --remote-debugging-port=9222', shell=True)
options = webdriver.ChromeOptions()
# Point the driver at the debugging address opened by the Chrome launched above.
options.add_experimental_option("debuggerAddress", "127.0.0.1:9222")
driver = webdriver.Chrome(options=options)
driver.maximize_window()
driver.get('https://www.bet365.com')
It seems that the site detects the automation somehow. A workaround is to start Chrome with a remote debugging address and then connect Selenium to it using the code above. Change the path to chrome.exe according to your environment.
Note: Make sure you close all Chrome browser windows before running this script.

Python Selenium: identify a running browser

On my website, when a user performs a certain operation, the server starts a new Chrome WebDriver session (Python Selenium). For monitoring purposes, I need to identify which browser was opened.
# Question's setup: builds a user agent string with a value substituted into
# the AppleWebKit/Safari version fields and starts Chrome with it.
# NOTE(review): NNR is not defined in this excerpt; presumably a per-session
# identifier. Confirm against the full code.
UA = "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.{NNR} (KHTML, like Gecko) Chrome/42.0.2288.6 Safari/537.{NNR}".format(NNR=NNR)
options = webdriver.ChromeOptions()
options.add_argument('--user-agent={UA}'.format(UA=UA))
options.add_argument("--lang=it");
options.add_argument("--test-type")
# `self` implies this runs inside a method of a class that is not shown here.
self.driver = webdriver.Chrome(chrome_options=options)
I need some solution so that, when the browser is opened, it is associated with a name that is visible to the human eye. How can I give a name to a browser in Selenium?

Categories