I am using web scraping for my project, and it worked perfectly on a Windows system. After deploying it on Ubuntu, the script ran fine the first time, but on every subsequent run I receive the error
selenium.common.exceptions.WebDriverException: Message: unknown error: unable to discover open pages
Usually the script produces no output for around a minute and a half before returning the error. Any help would be appreciated!
My code:
import selenium
from bs4 import BeautifulSoup, NavigableString
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

for i in range(5):
    URLS = []
    URLS.append('https://website.com/')

    for i in range(1):
        options = webdriver.ChromeOptions()
        options.add_argument("--no-sandbox")
        options.add_argument("--headless")
        options.add_argument("start-maximized")
        options.add_argument("window-size=1900,1080")
        options.add_argument("disable-gpu")
        options.add_argument("--disable-software-rasterizer")
        options.add_argument("--disable-dev-shm-usage")
        options.add_argument('--remote-debugging-port=9222')
        options.add_argument('user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36')
        driver = webdriver.Chrome(executable_path="./drivers/chromedriver", options=options)
        driver.set_page_load_timeout(2)

        for url in URLS:
            try:
                driver.get(url)
                innerHTML = driver.page_source
                soup = BeautifulSoup(innerHTML, "html.parser")
                for i in soup.select("#ELEMENT"):
                    ELEMENT = i.text
                    print(ELEMENT)
                driver.close()
                driver.quit()
            except:
                pass
If you want to scrape a restricted or blocked website, you have to use random user agents. You can check the code snippet below. Hopefully it works for you.
import random
import selenium
from bs4 import BeautifulSoup, NavigableString
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

for i in range(5):
    URLS = []
    URLS.append('https://google.com/')

    for i in range(1):
        user_agent_list = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0) Gecko/20100101 Firefox/90.0',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 11.5; rv:90.0) Gecko/20100101 Firefox/90.0',
            'Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_5_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36',
            'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:90.0) Gecko/20100101 Firefox/90.0',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36'
        ]
        # Pick a different user agent on every iteration
        user_agent = random.choice(user_agent_list)

        browser_options = webdriver.ChromeOptions()
        browser_options.add_argument("--no-sandbox")
        browser_options.add_argument("--headless")
        browser_options.add_argument("start-maximized")
        browser_options.add_argument("window-size=1900,1080")
        browser_options.add_argument("disable-gpu")
        browser_options.add_argument("--disable-software-rasterizer")
        browser_options.add_argument("--disable-dev-shm-usage")
        browser_options.add_argument(f'user-agent={user_agent}')
        web_driver = webdriver.Chrome(options=browser_options, service_args=["--verbose", "--log-path=test.log"])

        for url in URLS:
            try:
                web_driver.get(url)
                innerHTML = web_driver.page_source
                soup = BeautifulSoup(innerHTML, "html.parser")
                for i in soup.select("body"):
                    ELEMENT = i.text
                    print(ELEMENT)
                web_driver.close()
                web_driver.quit()
            except:
                pass
Please let me know if it doesn't work.
Related
I want to scrape data from this site: "https://www.findhelp.org/care/support-network--san-francisco-ca?postal=94105". I tried using Selenium and BeautifulSoup but couldn't scrape the data because the site was blocking scrapers. I want to find a way to scrape the data on this site.
Any solution to this problem will be highly appreciated.
I tried these two approaches:
import requests
from bs4 import BeautifulSoup
url="https://www.findhelp.org/care/support-network--san-francisco-ca?postal=94105"
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246'}
reqs = requests.get(url, headers=headers)
soup = BeautifulSoup(reqs.text, 'lxml')
print(soup)
Second one:
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
#object of Options class
op = webdriver.ChromeOptions()
#add user Agent
op.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246")
#set chromedriver.exe path
driver = webdriver.Chrome(executable_path=r"C:\Users\Vinay Edula\Desktop\evva\findhelp\chromedriver.exe",options=op)
#maximize browser
driver.maximize_window()
#launch URL
driver.get("https://www.findhelp.org/search_results/94105")
time.sleep(15)
driver.quit()
I was able to get into the site by passing this user agent in the header.
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246
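For reference, a minimal sketch of that approach with requests (the URL and parser are the ones from the question; the timeout and the status/title prints are just illustrative additions for checking that the page was served):

import requests
from bs4 import BeautifulSoup

# User agent that the site accepted (from above)
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246'}

url = "https://www.findhelp.org/care/support-network--san-francisco-ca?postal=94105"
reqs = requests.get(url, headers=headers, timeout=30)
soup = BeautifulSoup(reqs.text, 'lxml')

# Quick sanity check that the request was not blocked
print(reqs.status_code)
print(soup.title)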
Hi, can anyone get this to work? I am trying to scrape sizes from an interactive dropdown selector but keep getting a timeout error.
import requests
from bs4 import BeautifulSoup
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'}
soup = BeautifulSoup(requests.get("https://www.asos.com/nike/nike-air-max-95-logo-leather-trainers-in-dark-navy-orange/prd/20750072?colourwayid=60085113", timeout=60.0).content)
print([size.text.strip() for size in soup.find(class_="colour-size select")])
It's because you forgot the headers parameter.
Try again:
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'}
soup = BeautifulSoup(requests.get("https://www.asos.com/nike/nike-air-max-95-logo-leather-trainers-in-dark-navy-orange/prd/20750072?colourwayid=60085113",
                                  timeout=60.0,
                                  headers=headers).content)
I'm trying to scrape this website:
https://www.footpatrol.com/
However, it seems the website denies my scraping attempt.
Using headers did not help.
from bs4 import BeautifulSoup
import requests
url = "https://www.footpatrol.com/"
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
r = requests.get(url, headers = headers)
data = r.text
soup = BeautifulSoup(data, 'lxml')
for a in soup.find_all():
print(a)
This leads to me getting a ConnectionError. How can I fix my code so I can scrape the site?
I'm able to get a response by changing the User Agent to:
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'}
and the following User Agent also works:
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'}
It seems that the Chrome version is the culprit in your User Agent.
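As a sketch, the request from the question should then work unchanged apart from the User-Agent (the title print is only a sanity check):

from bs4 import BeautifulSoup
import requests

url = "https://www.footpatrol.com/"
# Same request as in the question, only with a newer Chrome version in the UA
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'}

r = requests.get(url, headers=headers)
soup = BeautifulSoup(r.text, 'lxml')
print(soup.title)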
I am using selenium chromedriver in my python project.
The application is running under Docker.
When I try to access the http://mobile.de website, I get rejected with:
Unfortunately, automated access to this page was denied.
Here is my initialization code:
CHROME_DRIVER_PATH = os.path.abspath('assets/chromedriver')
chrome_options = ChromeOptions()
chrome_options.binary_location = "/usr/bin/google-chrome"
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
self.web_driver_chrome = webdriver.Chrome(executable_path=CHROME_DRIVER_PATH, options=chrome_options)
And here is my send request code:
def get_page_content(self, url):
url = "https://www.mobile.de/"
self.web_driver_chrome.get(url)
print(self.web_driver_chrome.page_source)
return self.web_driver_chrome.page_source
Is there any way I can get past this "automated access check"?
When using --headless, Chrome appends HeadlessChrome to the user-agent:
Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) HeadlessChrome/71.0.3578.98 Safari/537.36
The solution is to add an argument that sets a normal user-agent:
user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
chrome_options.add_argument('user-agent=' + user_agent)
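Put together with the initialization code from the question, a sketch could look like this (the binary and driver paths are the question's; the navigator.userAgent print is just a sanity check that the override took effect):

import os
from selenium import webdriver
from selenium.webdriver import ChromeOptions

CHROME_DRIVER_PATH = os.path.abspath('assets/chromedriver')

chrome_options = ChromeOptions()
chrome_options.binary_location = "/usr/bin/google-chrome"
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')

# Override the headless default so the UA no longer advertises HeadlessChrome
user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
chrome_options.add_argument('user-agent=' + user_agent)

driver = webdriver.Chrome(executable_path=CHROME_DRIVER_PATH, options=chrome_options)
print(driver.execute_script("return navigator.userAgent"))  # should not contain "HeadlessChrome"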
I'm using the following code to change the user-agent string, but I'm wondering whether or not this will change the user-agent string for each and every browser.get request?
ua_strings = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.1 Safari/605.1.15',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36',
    ...
]

def parse(self, response):
    profile = webdriver.FirefoxProfile()
    profile.set_preference('general.useragent.override', random.choice(ua_strings))
    options = Options()
    options.add_argument('-headless')
    browser = webdriver.Firefox(profile, firefox_options=options)
    browser.get(self.start_urls[0])
    hrefs = WebDriverWait(browser, 60).until(
        EC.visibility_of_all_elements_located((By.XPATH, '//div[@class="discoverableCard"]/a'))
    )
    pages = []
    for href in hrefs:
        pages.append(href.get_attribute('href'))
    for page in pages:
        browser.get(page)
        """ scrape page """
    browser.close()
Or will I have to browser.close() and then create new instances of browser in order to use new user-agent strings for each request?
for page in pages:
    browser = webdriver.Firefox(profile, firefox_options=options)
    browser.get(page)
    """ scrape page """
    browser.close()
Since random.choice() is called only once, up front, the user-agent string remains the same for all browser.get() requests. To ensure a random user-agent on each request, you can create a set_preferences() method and call it on every loop iteration.
def set_preferences(self):
    user_agent_string = random.choice(ua_strings)
    # print out the user-agent on each loop
    print(user_agent_string)
    profile = webdriver.FirefoxProfile()
    profile.set_preference('general.useragent.override', user_agent_string)
    options = Options()
    options.add_argument('-headless')
    browser = webdriver.Firefox(profile, firefox_options=options)
    return browser
Then your loop can be something like this:
for page in pages:
    browser = self.set_preferences()
    browser.get(page)
    """ scrape page """
    browser.close()
Hope this helps!