I am using web scraping for my project, and it worked perfectly on a Windows system. After deploying it on Ubuntu, the script ran fine the first time, but on every subsequent run I receive the error
selenium.common.exceptions.WebDriverException: Message: unknown error: unable to discover open pages
Usually the script produces no output for around a minute and a half before returning the error. Any help would be appreciated!
My code:
import selenium
from bs4 import BeautifulSoup, NavigableString
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

for i in range(5):
    URLS = []
    URLS.append('https://website.com/')

    for i in range(1):
        options = webdriver.ChromeOptions()
        options.add_argument("--no-sandbox")
        options.add_argument("--headless")
        options.add_argument("start-maximized")
        options.add_argument("window-size=1900,1080")
        options.add_argument("disable-gpu")
        options.add_argument("--disable-software-rasterizer")
        options.add_argument("--disable-dev-shm-usage")
        options.add_argument('--remote-debugging-port=9222')
        options.add_argument('user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36')
        driver = webdriver.Chrome(executable_path="./drivers/chromedriver", options=options)
        driver.set_page_load_timeout(2)

        for url in URLS:
            try:
                driver.get(url)
                innerHTML = driver.page_source
                soup = BeautifulSoup(innerHTML, "html.parser")
                for i in soup.select("#ELEMENT"):
                    ELEMENT = i.text
                    print(ELEMENT)
                driver.close()
                driver.quit()
            except:
                pass
If you want to scrape a restricted or blocked website, you have to use random user agents. You can check the code snippet below. Hopefully it works for you.
import random
import selenium
from bs4 import BeautifulSoup, NavigableString
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

for i in range(5):
    URLS = []
    URLS.append('https://google.com/')

    for i in range(1):
        user_agent_list = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0) Gecko/20100101 Firefox/90.0',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 11.5; rv:90.0) Gecko/20100101 Firefox/90.0',
            'Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_5_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36',
            'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:90.0) Gecko/20100101 Firefox/90.0',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36'
        ]
        # Pick a different user agent on every iteration
        user_agent = random.choice(user_agent_list)

        browser_options = webdriver.ChromeOptions()
        browser_options.add_argument("--no-sandbox")
        browser_options.add_argument("--headless")
        browser_options.add_argument("start-maximized")
        browser_options.add_argument("window-size=1900,1080")
        browser_options.add_argument("disable-gpu")
        browser_options.add_argument("--disable-software-rasterizer")
        browser_options.add_argument("--disable-dev-shm-usage")
        browser_options.add_argument(f'user-agent={user_agent}')
        web_driver = webdriver.Chrome(options=browser_options, service_args=["--verbose", "--log-path=test.log"])

        for url in URLS:
            try:
                web_driver.get(url)
                innerHTML = web_driver.page_source
                soup = BeautifulSoup(innerHTML, "html.parser")
                for i in soup.select("body"):
                    ELEMENT = i.text
                    print(ELEMENT)
                web_driver.close()
                web_driver.quit()
            except:
                pass
Please let me know if it doesn't work.
Related
I want to scrape data from this site: "https://www.findhelp.org/care/support-network--san-francisco-ca?postal=94105". I tried using Selenium and BeautifulSoup but couldn't scrape the data because the site was blocking scrapers. I want to find a way to scrape the data on this site.
Any solution to this problem will be highly appreciated.
I tried these two approaches:
import requests
from bs4 import BeautifulSoup
url="https://www.findhelp.org/care/support-network--san-francisco-ca?postal=94105"
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246'}
reqs = requests.get(url, headers=headers)
soup = BeautifulSoup(reqs.text, 'lxml')
print(soup)
Second one:
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
#object of Options class
op = webdriver.ChromeOptions()
#add user Agent
op.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246")
#set chromedriver.exe path
driver = webdriver.Chrome(executable_path=r"C:\Users\Vinay Edula\Desktop\evva\findhelp\chromedriver.exe",options=op)
#maximize browser
driver.maximize_window()
#launch URL
driver.get("https://www.findhelp.org/search_results/94105")
time.sleep(15)
driver.quit()
I was able to get into the site by passing this user agent in the header.
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246
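For reference, a minimal sketch of that approach with requests (the URL and parser are the ones from the question; the timeout and the status/title prints are just illustrative additions for checking that the page was served):

import requests
from bs4 import BeautifulSoup

# User agent that the site accepted (from above)
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246'}

url = "https://www.findhelp.org/care/support-network--san-francisco-ca?postal=94105"
reqs = requests.get(url, headers=headers, timeout=30)
soup = BeautifulSoup(reqs.text, 'lxml')

# Quick sanity check that the request was not blocked
print(reqs.status_code)
print(soup.title)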
Hi, can anyone get this to work? I am trying to scrape sizes from an interactive dropdown selector but keep getting a timeout error.
import requests
from bs4 import BeautifulSoup
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'}
soup = BeautifulSoup(requests.get("https://www.asos.com/nike/nike-air-max-95-logo-leather-trainers-in-dark-navy-orange/prd/20750072?colourwayid=60085113", timeout=60.0).content)
print([size.text.strip() for size in soup.find(class_="colour-size select")])
It's because you forgot the headers parameter.
Try again:
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'}
soup = BeautifulSoup(requests.get("https://www.asos.com/nike/nike-air-max-95-logo-leather-trainers-in-dark-navy-orange/prd/20750072?colourwayid=60085113",
                                  timeout=60.0,
                                  headers=headers).content)
I'm trying to scrape this website:
https://www.footpatrol.com/
However, it seems the website denies my scraping attempt.
Using headers did not help.
from bs4 import BeautifulSoup
import requests
url = "https://www.footpatrol.com/"
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
r = requests.get(url, headers = headers)
data = r.text
soup = BeautifulSoup(data, 'lxml')
for a in soup.find_all():
print(a)
This leads to me getting a ConnectionError. How can I fix my code so I can scrape the site?
I'm able to get a response by changing the User Agent to:
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'}
and the following User Agent also works:
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'}
It seems that the Chrome version is the culprit in your User Agent.
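As a sketch, the request from the question should then work unchanged apart from the User-Agent (the title print is only a sanity check):

from bs4 import BeautifulSoup
import requests

url = "https://www.footpatrol.com/"
# Same request as in the question, only with a newer Chrome version in the UA
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'}

r = requests.get(url, headers=headers)
soup = BeautifulSoup(r.text, 'lxml')
print(soup.title)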
I am using selenium chromedriver in my python project.
The application is running under Docker.
When I try to access the http://mobile.de website, I get rejected with:
Unfortunately, automated access to this page was denied.
Here is my initialization code:
CHROME_DRIVER_PATH = os.path.abspath('assets/chromedriver')
chrome_options = ChromeOptions()
chrome_options.binary_location = "/usr/bin/google-chrome"
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
self.web_driver_chrome = webdriver.Chrome(executable_path=CHROME_DRIVER_PATH, options=chrome_options)
And here is my send request code:
def get_page_content(self, url):
url = "https://www.mobile.de/"
self.web_driver_chrome.get(url)
print(self.web_driver_chrome.page_source)
return self.web_driver_chrome.page_source
Is there any way I can get past this "automated access check"?
When using --headless, Chrome appends HeadlessChrome to the user-agent:
Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) HeadlessChrome/71.0.3578.98 Safari/537.36
The solution is to add an argument that sets a normal user-agent:
user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
chrome_options.add_argument('user-agent=' + user_agent)
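Put together with the initialization code from the question, a sketch could look like this (the binary and driver paths are the question's; the navigator.userAgent print is just a sanity check that the override took effect):

import os
from selenium import webdriver
from selenium.webdriver import ChromeOptions

CHROME_DRIVER_PATH = os.path.abspath('assets/chromedriver')

chrome_options = ChromeOptions()
chrome_options.binary_location = "/usr/bin/google-chrome"
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')

# Override the headless default so the UA no longer advertises HeadlessChrome
user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
chrome_options.add_argument('user-agent=' + user_agent)

driver = webdriver.Chrome(executable_path=CHROME_DRIVER_PATH, options=chrome_options)
print(driver.execute_script("return navigator.userAgent"))  # should not contain "HeadlessChrome"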
I'm using the following code to change the user-agent string, but I'm wondering whether or not this will change the user-agent string for each and every browser.get request?
ua_strings = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.1 Safari/605.1.15',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36',
    ...
]

def parse(self, response):
    profile = webdriver.FirefoxProfile()
    profile.set_preference('general.useragent.override', random.choice(ua_strings))
    options = Options()
    options.add_argument('-headless')
    browser = webdriver.Firefox(profile, firefox_options=options)
    browser.get(self.start_urls[0])
    hrefs = WebDriverWait(browser, 60).until(
        EC.visibility_of_all_elements_located((By.XPATH, '//div[@class="discoverableCard"]/a'))
    )
    pages = []
    for href in hrefs:
        pages.append(href.get_attribute('href'))
    for page in pages:
        browser.get(page)
        """ scrape page """
    browser.close()
Or will I have to browser.close() and then create new instances of browser in order to use new user-agent strings for each request?
for page in pages:
    browser = webdriver.Firefox(profile, firefox_options=options)
    browser.get(page)
    """ scrape page """
    browser.close()
Since random.choice() is called only once, up front, the user-agent string remains the same for all browser.get() requests. To ensure a random user-agent on each request, you can create a set_preferences() method and call it on every loop iteration.
def set_preferences(self):
    user_agent_string = random.choice(ua_strings)
    # print out the user-agent on each loop
    print(user_agent_string)
    profile = webdriver.FirefoxProfile()
    profile.set_preference('general.useragent.override', user_agent_string)
    options = Options()
    options.add_argument('-headless')
    browser = webdriver.Firefox(profile, firefox_options=options)
    return browser
Then your loop can be something like this:
for page in pages:
    browser = self.set_preferences()
    browser.get(page)
    """ scrape page """
    browser.close()
Hope this helps!