I'm trying to navigate a website with Selenium.
I searched Google, which suggested that adding a user-agent would solve the problem, but it didn't.
http://coupang.com/
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
import time
# Log in to Coupang with Chrome and print the resulting page source.
from selenium.webdriver.common.by import By

options = webdriver.ChromeOptions()
# options.add_argument('headless')
# Chrome expects the window size as "width,height"; "1920x1080" is not parsed.
options.add_argument('--window-size=1920,1080')
options.add_argument('lang=ko_KR')
options.add_argument("--disable-gpu")
# The original user-agent value was missing its closing parenthesis.
options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5)")
# NOTE: accept / accept-charset / accept-encoding / accept-language are HTTP
# request headers, not Chrome command-line switches, so passing them through
# add_argument() had no effect; they are intentionally not set here.
driver = webdriver.Chrome('d:/temp/chromedriver.exe',options=options)
TEST_URL = 'https://login.coupang.com/login/login.pang?rtnUrl=https%3A%2F%2Fwww.coupang.com%2Fnp%2Fpost%2Flogin%3Fr%3Dhttps%253A%252F%252Fwww.coupang.com%252F'
driver.get(TEST_URL)
time.sleep(5)
# implicitly_wait() sets the element-lookup timeout for the session; it does
# not pause the script.
driver.implicitly_wait(3)

# find_element(By.ID, ...) replaces the removed find_element_by_id() helper.
elem_login = driver.find_element(By.ID, "login-email-input")
elem_login.clear()
elem_login.send_keys("id")
time.sleep(3)

elem_password = driver.find_element(By.ID, "login-password-input")
elem_password.clear()
elem_password.send_keys("pw")
time.sleep(3)

xpath = "/html/body/div[1]/div/div/form/div[5]/button"
driver.find_element(By.XPATH, xpath).click()
driver.implicitly_wait(3)
print(driver.page_source)
Can you try adding headers like this and tell me if it works?
# Browser-like HTTP request headers suggested for the request.
headers = {
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5)",
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "accept-charset": "cp1254,ISO-8859-9,utf-8;q=0.7,*;q=0.3",
    "accept-encoding": "gzip,deflate,sdch",
    "accept-language": "tr,tr-TR,en-US,en;q=0.8",
}
It is not entirely clear under which circumstances you are facing Access Denied. However, I was able to access the webpage http://coupang.com/ as follows:
# Open the Coupang home page with automation switches removed and print its source.
options = webdriver.ChromeOptions()
options.add_argument("start-maximized")
# add_experimental_option() stores a single value per option name, so a second
# "excludeSwitches" call would replace the first; pass both switches together.
options.add_experimental_option("excludeSwitches", ["enable-logging", "enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
options.add_argument('lang=ko_KR')
driver = webdriver.Chrome(options=options, executable_path=r'C:\WebDrivers\chromedriver.exe')
driver.get('https://www.coupang.com/')
print(driver.page_source)
Console Output:
<!--[if lte IE 9]>
<div id="browserSupportWrap">
<div class="bs-wrap">
<p class="bs-message">고객님의 브라우저에서는 쿠팡이 정상 동작하지 않습니다.<br />
인터넷 익스플로러 업데이트, 크롬 또는 파이어폭스 브라우저를 설치하세요.</p>
<ul class="bs-browser-download">
<li class="ie">인터넷 익스플로러<br /> <em>업데이트하기</em></li>
<li class="chrome">크롬<br /> <em>설치하기</em></li>
<li class="firefox">파이어폭스<br /> <em> 설치하기</em></li>
</ul>
</div>
</div>
<![endif]-->
<div id="container" class="renewal home srp-sync srp-sync-brand">
.
</script>
<!-- Facebook Pixel Code -->
<script>
!function(f,b,e,v,n,t,s){if(f.fbq)return;n=f.fbq=function(){n.callMethod?n.callMethod.apply(n,arguments):n.queue.push(arguments)};if(!f._fbq)f._fbq=n;
n.push=n;n.loaded=!0;n.version='2.0';n.queue=[];t=b.createElement(e);t.async=!0;
t.src=v;s=b.getElementsByTagName(e)[0];s.parentNode.insertBefore(t,s)}(window,
document,'script','https://connect.facebook.net/en_US/fbevents.js');
fbq('init', '652323801535981');
fbq('track', 'PageView');
</script>
<noscript><img height="1" width="1" style="display:none" src="https://www.facebook.com/tr?id=652323801535981&ev=PageView&noscript=1"/></noscript>
<!-- End Facebook Pixel Code -->
<script type="text/javascript" src="//asset2.coupangcdn.com/customjs/criteo/5.6.1/ld.min.js" async="true"></script>
<noscript><img src="https://www.coupang.com/akam/11/pixel_3401c526?a=dD1kMDI3YTFiY2NmYTZiMDg3ZDE3ZWRkNzc3MDI5ZDhhNzNiYzM4ZDkxJmpzPW9mZg==" style="visibility: hidden; position: absolute; left: -999px; top: -999px;" /></noscript>
<iframe height="0" width="0" title="Criteo DIS iframe" style="display: none;"></iframe></body></html>
Browser Snapshot:
Related
I am very new to web scraping and trying to scrape gif urls from a website. For example, from gifer.com, search gifs for "smile" and then download urls for all gifs listed.
Below is an example of the source from which I want to extract src element for the video (https://i.gifer.com/ON0.mp4 in this case).
<div class="page-media-swipe desktop">
<div class="container">
<div class="swipe-left">
<span class="icon-arrow-left-2 icon" style="color: rgb(255, 255, 255); font-size: 44px;"></span>
</div>
<div class="media desktop" style="width: 367.462px;">
<div style="padding-top: 122.462%;">
<div class="media-container1">
<div class="media-container2" style="width: 367.462px;">
<div>
<video poster="https://i.gifer.com/fetch/w300-preview/d0/d0e6e89a42c43d31b5913e232d87af7b.gif" class="full-media" loop="" autoplay="" playsinline="">
<source src="https://i.gifer.com/ON0.mp4" type="video/mp4">
</video>
</div>
</div>
</div>
</div>
</div>
<div class="swipe-right">
<span class="icon-arrow-right-2 icon" style="color: rgb(255, 255, 255); font-size: 44px;">
</span>
</div>
</div>
</div>
There are thousands of such results, and I was advised to use Python and Selenium. However, my knowledge of Selenium and Python is limited.
I tried the code below, but I am not able to make much headway.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
# Load the gifer "smile" search page headlessly and list the media containers.
options = Options()
options.add_argument("--headless")
options.add_argument("--window-size=1920,1200")
# The options object must be handed to the driver; otherwise the headless and
# window-size settings above are never applied.
driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=options,
)
try:
    driver.get("https://gifer.com/en/gifs/smile")
    imgResults = driver.find_elements(By.CLASS_NAME, "media-container2")
    print(len(imgResults))
    for result in imgResults:
        print(result)
finally:
    # Always shut the browser down, even if loading or lookup fails.
    driver.quit()
The above returns 4 elements:
<selenium.webdriver.remote.webelement.WebElement (session="fac424650675a90b2a8dee91efdc01f4", element="16e771ca-37d8-45a0-8200-0f03da0b7d14")>
<selenium.webdriver.remote.webelement.WebElement (session="fac424650675a90b2a8dee91efdc01f4", element="8c9abdcb-bc9d-47da-9958-109e722b3ae9")>
<selenium.webdriver.remote.webelement.WebElement (session="fac424650675a90b2a8dee91efdc01f4", element="d9640144-4ba1-414b-aa4f-5141387335ef")>
<selenium.webdriver.remote.webelement.WebElement (session="fac424650675a90b2a8dee91efdc01f4", element="9626db84-1da9-42ad-b314-56222a5e933b")>
What I don't understand now is how to grab the src link of the source element for each video.
I was wrong, no need to load a new page to get the mp4 link:
# Each result anchor's href ends with the gif code; the mp4 lives at
# https://i.gifer.com/<code>.mp4, so no extra page load is required.
for img in driver.find_elements(By.CSS_SELECTOR, "figure a"):
    code = img.get_attribute('href').split('/')[-1]
    link = f'https://i.gifer.com/{code}.mp4'
    print(link)
output
https://i.gifer.com/fzvh.mp4
https://i.gifer.com/7F5y.mp4
https://i.gifer.com/6qOR.mp4
https://i.gifer.com/3JT.mp4
...
You can obtain the list of links in one line
# Collect the mp4 URL for every result anchor in a single expression.
links = [
    f"https://i.gifer.com/{anchor.get_attribute('href').split('/')[-1]}.mp4"
    for anchor in driver.find_elements(By.CSS_SELECTOR, "figure a")
]
I have the following HTML selectable menu (where you cannot click the second option until you click the #id=category menu) I am trying to navigate with Chrome headless browser using Selenium:
<div id="category" data-filters="Reports,Announcements" class="filter-form active">
<span aria-hidden="true" class="filter-label">Category</span>
<button aria-haspopup="dialog" aria-expanded="true" aria-controls="categoryContent" data-initial-name="Category" class="filter-values" aria-label="Category">Category</button>
<div class="styled-select-icon arrow" aria-expanded="true"></div>
<button aria-label="Clear Category filter" class="styled-select-icon cross"></button>
<div id="categoryContent" role="dialog" class="filter-form-labels filter-form-labels-wide">
<strong class="small-only">Category<button aria-label="Close filter" class="close-btn close-filter-form">x</button></strong>
<div class="inner">
<div>
<input type="checkbox" id="Reports">
<label for="Reports" data-filtergroup="category" data-value="Reports">Reports</label>
</div>
<div>
<input type="checkbox" id="Announcements">
<label for="Announcements" data-filtergroup="category" data-value="Announcements">Announcements</label>
</div>
</div>
</div>
*Edit: here is my driver instantiation:
# Headless Chrome driver used to drive the filter menu.
path_driver = 'chromedriver'
chrome_options = ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument('--disable-gpu')
# The Chrome switch is "--user-agent"; "User-Agent=" is an HTTP header name and
# is not recognised as a command-line switch.
chrome_options.add_argument("--user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.79 Safari/537.36")
# Chrome expects the window size as "width,height".
chrome_options.add_argument('--window-size=1920,1080')
driver = webdriver.Chrome(executable_path=path_driver,options=chrome_options)
I believe I am able to select the menu with id="category" using the following line of code:
# Open the Category filter menu; XPath attribute tests use "@", not "#".
driver.execute_script(
    "arguments[0].click();",
    WebDriverWait(self.driver, 20).until(
        EC.element_to_be_clickable((By.XPATH, "//div[@id='category']"))
    ),
)
but I cannot figure out how to select the input Reports:
# Click the "Reports" checkbox input; XPath attribute tests use "@", not "#".
driver.execute_script(
    "arguments[0].click();",
    WebDriverWait(self.driver, 20).until(
        EC.element_to_be_clickable((By.XPATH, "//input[@id='Reports']"))
    ),
)
when I try i just get a timeout error with a blank error message:
selenium.common.exceptions.TimeoutException: Message:
If you set the window size in headless mode,
try clicking on the label element instead of the input:
# Click the label bound to the "Reports" checkbox; presence (not clickability)
# is awaited because the click is dispatched through JavaScript.
driver.execute_script(
    "arguments[0].click();",
    WebDriverWait(self.driver, 20).until(
        EC.presence_of_element_located((By.XPATH, "//label[@for='Reports']"))
    ),
)
Please note: this question remains open, because the suggested "answer" still gives the same output; it doesn't explain why JS isn't running on that page or why Selenium can't extract the content.
I'm trying to read page source of: http://147.235.97.36/ (Hp printer) which is rendered by JS.
So I wrote:
# Load the page, then print the HTML that Selenium reports for it.
driver.get(url)
# wait_for_page is defined elsewhere; presumably it blocks until the page has finished loading — verify.
wait_for_page(driver)
source = driver.page_source
print(source)
but in the printed source I see:
<p>JavaScript is required to access this website.</p>
<p>Please enable JavaScript or use a browser that supports JavaScript.</p>
and some of the content isn't there, so I changed my code to:
# Load the page, then print the innerHTML of the <html> element as read via JavaScript.
driver.get(url)
# wait_for_page is defined elsewhere; presumably it blocks until the page has finished loading — verify.
wait_for_page(driver)
source = driver.execute_script("return document.getElementsByTagName('html')[0].innerHTML")
print(source)
Still same output, can you help me understand what's the problem here?
Here is my init_driver function:
def init_driver():
    """Create a headless Chrome driver with image loading disabled.

    Returns:
        The configured driver, or ``None`` if Chrome could not be started
        (the failure is reported through ``log_warning``).
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Run in background
    if os.name == 'nt':
        chrome_options.add_argument('--disable-gpu')  # Windows workaround
    prefs = {
        "profile.default_content_settings.images": 2,
        "profile.managed_default_content_settings.images": 2,
    }  # Disable loading of images
    chrome_options.add_experimental_option("prefs", prefs)
    chrome_options.add_argument('--ignore-ssl-errors=yes')
    chrome_options.add_argument('--ignore-certificate-errors')
    chrome_options.add_argument("--window-size=1920,1080")  # Standard window size
    # The page load strategy is a WebDriver option, not a Chrome command-line
    # switch, so it is set on the options object instead of via add_argument().
    chrome_options.page_load_strategy = "normal"

    driver = None
    try:
        driver = webdriver.Chrome(options=chrome_options, service=Service('./chromedriver'))
        driver.set_page_load_timeout(REQUEST_TIMEOUT)
    except Exception as e:
        # Best effort: report the failure and return whatever was created.
        log_warning(str(e))
    return driver
You can add a few arguments to avoid getting detected and print the page source as follows:
Code Block:
# Open the printer's web UI with automation markers removed and print the live DOM.
options = Options()
options.add_argument("start-maximized")
# add_experimental_option() stores a single value per option name, so a second
# "excludeSwitches" call would replace the first; pass both switches together.
options.add_experimental_option("excludeSwitches", ["enable-automation", "enable-logging"])
options.add_experimental_option('useAutomationExtension', False)
options.add_argument('--disable-blink-features=AutomationControlled')
s = Service('C:\\BrowserDrivers\\chromedriver.exe')
driver = webdriver.Chrome(service=s, options=options)
driver.get("http://147.235.97.36/")
print(driver.execute_script("return document.getElementsByTagName('html')[0].innerHTML"))
Console Output:
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<link href="/framework/Unified.css" rel="stylesheet" type="text/css">
<script type="text/javascript">
frameWorkObj = {};
frameWorkObj.pkg = "ews";
</script>
<script src="/framework/Unified.js" type="text/javascript"></script>
</head>
<body class="theme-gray">
<iframe src="/framework/cookie/client/cookie.html" style="display: none;"></iframe>
<div id="pgm-overall-container">
<div id="pgm-left-pane-bkground"></div>
<div id="pgm-banner"></div>
<div id="pgm-search-div" class="gui-hidden"></div>
<div id="pgm-top-pane"></div>
<div id="pgm-container-div">
<div id="pgm-left-pane"></div>
<div id="pgm-container" class="clear-fix">
<div id="pgm-title-div" class="gui-hidden"></div>
<div id="contentPane" class="contentPane"></div>
</div>
</div>
<div id="pgm-footer"></div>
</div> <!-- #pgm-overall-container -->
<div id="pgm-theatre-staging-div"></div>
<script type="text/javascript">
// frame buster
if(top != self)
top.location.replace(self.location.href);
</script>
<noscript>
<div id="pgm-no-js-text">
<p>JavaScript is required to access this website.</p>
<p>Please enable JavaScript or use a browser that supports JavaScript.</p>
</div>
</noscript>
<div id="ui-datepicker-div" style="display: none;" tabindex="0"></div></body>
Hello, I would like to change the value of "50 Profiles / Page" to "500 Profiles / Page", but the problem is that there is no "Select" tag in the HTML.
I tried the following, but it didn't work:
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Open the profile listing and try to click the "500 Profiles / Page" option.
url = 'https://www.personality-database.com/profile?pid=1&sort=hot'
driver = webdriver.Chrome(ChromeDriverManager().install())
driver.maximize_window()
driver.implicitly_wait(30)
driver.get(url)
# XPath attribute tests are written with "@" (e.g. @id), not "#".
option_xpath = '//*[@id="root"]/div/section/main/div[1]/div[2]/div/div[5]/ul/li[10]/div/div[1]/span[2][text()="500 Profiles / Page"]'
WebDriverWait(driver, 20).until(
    EC.element_to_be_clickable((By.XPATH, option_xpath))
).click()
Here is the HTML code:
<li class="rc-pagination-options">
<div class="rc-select rc-pagination-options-size-changer rc-select-single rc-select-show-arrow">
<span class="rc-select-arrow" unselectable="on" aria-hidden="true">
<span class="rc-select-arrow-icon"></span></span>
<div class="rc-select-dropdown rc-select-dropdown-placement-topLeft rc-select-dropdown-hidden">
<div role="listbox" id="rc_select_0_list">
<div aria-label="20 Profiles / Page" role="option" id="rc_select_0_list_0"
aria-selected="false">20</div>
</div>
<div class="rc-virtual-list" style="position: relative;">
<div class="rc-virtual-list-holder">
<div class="rc-virtual-list-holder-inner"
style="display: flex; flex-direction: column;">
<div aria-selected="false" class="rc-select-item rc-select-item-option"
title="20 Profiles / Page">
<div class="rc-select-item-option-content">20 Profiles / Page</div><span
class="rc-select-item-option-state" unselectable="on" aria-hidden="true"
style="user-select: none;"><span
class="rc-select-item-option-state-icon"></span></span>
</div>
<div aria-selected="false" class="rc-select-item rc-select-item-option"
title="500 Profiles / Page">
<div class="rc-select-item-option-content">500 Profiles / Page</div><span
class="rc-select-item-option-state" unselectable="on" aria-hidden="true"
style="user-select: none;"><span
class="rc-select-item-option-state-icon"></span></span>
</div>
...
</li>
First we need to close the pop-ups, and then click on the pagination options.
Also, using both implicit and explicit waits together is not recommended.
Try the following solution:
# Dismiss the overlays, open the page-size dropdown and pick "500 Profiles / Page".
from selenium.common.exceptions import WebDriverException

driver.get("https://www.personality-database.com/profile?pid=1&sort=hot")
wait = WebDriverWait(driver,30)
try:
    # Close the footer ad
    wait.until(EC.element_to_be_clickable((By.XPATH,"//span[@id='ezmob-wrapper']/div/center/span/div/div/span"))).click()
    # Scroll a little so that the cookie pop-up appears, then close it
    driver.execute_script("window.scrollBy(0,50);")
    wait.until(EC.element_to_be_clickable((By.XPATH,"//button[@id='rcc-confirm-button']"))).click()
except WebDriverException:
    # The overlays are optional; only Selenium failures (timeout, intercepted
    # click, ...) are tolerated here, not arbitrary errors.
    print("no adds")
# Click on the drop-down option
pagination = wait.until(EC.element_to_be_clickable((By.XPATH,"//li[@class='rc-pagination-options']")))
pagination.click()
# Click on the 500 profiles entry
option = wait.until(EC.element_to_be_clickable((By.XPATH,"//div[@class='rc-virtual-list-holder-inner']//div[text()='500 Profiles / Page']")))
option.click()
First xpath to click dropdown:
//div[@class='rc-select rc-pagination-options-size-changer rc-select-single rc-select-show-arrow']
Second xpath to click the option for 500 pages:
//div[@class='rc-select-item-option-content']/self::div[text()='500 Profiles / Page']
Here is a cheatsheet for relative xpaths https://devhints.io/xpath
Please be aware that browsers use XPath 1.0 and Selenium also supports only XPath 1.0,
so some functions such as 'ends-with' won't work.
I am looking to scrape prices for different products from Metro's online grocery store. To do this, I need to set a particular store as a "favourite" so that Metro knows which products to show. I'm currently using Selenium to automate this part and return the cookies after selecting a particular store. However, I am still getting 403 errors when passing the cookies to a Request despite the fact that I can access other pages on Metro's website.
import requests
import time
from user_agent import generate_user_agent
from selenium import webdriver
from selenium_stealth import stealth
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
# Module-level setup: a generated Chrome user-agent header for requests and
# the Chrome options used when the browser is created.
user_agent = generate_user_agent(navigator="chrome")
header = {"User-Agent": user_agent}
options = webdriver.ChromeOptions()
options.add_argument("start-maximized")
# Drop the automation switch and automation extension from the launched browser.
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
def getMetroCookies(store_url):
    """Select a store in a real browser and return the resulting cookies.

    Opens ``store_url`` in Chrome, clicks the cookie-banner button, clicks the
    store button, and collects the browser's cookies.

    Args:
        store_url: URL of the store page to open.

    Returns:
        dict mapping cookie name to cookie value.
    """
    cookie_button_xpath = "/html/body/div[4]/div/div[3]/button"
    store_button_xpath = "/html/body/div[1]/div[2]/div[1]/div[2]/div[3]/div/div/div/div[1]/div/div[3]/button"

    browser = webdriver.Chrome(options=options, executable_path="C:/Users/XXXX/Documents/chrome_driver/chromedriver.exe")
    try:
        browser.delete_all_cookies()
        stealth(
            browser,
            languages=["en-US", "en"],
            vendor="Google Inc.",
            platform="Win32",
            webgl_vendor="Intel Inc.",
            renderer="Intel Iris OpenGL Engine",
            fix_hairline=True,
        )
        browser.get(store_url)
        time.sleep(1.5)

        # find_element(By.XPATH, ...) replaces the removed find_element_by_xpath().
        cookie_button = browser.find_element(By.XPATH, cookie_button_xpath)
        cookie_button.click()
        # Wait for the cookie banner to go away before touching the store button.
        WebDriverWait(browser, 10).until(
            EC.invisibility_of_element_located((By.XPATH, cookie_button_xpath))
        )

        store_button = browser.find_element(By.XPATH, store_button_xpath)
        time.sleep(1)
        store_button.click()
        time.sleep(3)

        return {cookie['name']: cookie['value'] for cookie in browser.get_cookies()}
    finally:
        # quit() ends the whole driver session (close() only closes the window),
        # and the finally clause guarantees it also runs when a step above fails.
        browser.quit()
# Fetch the store cookies via Selenium, then request the search page with them.
store_url = "https://www.metro.ca/en/find-a-grocery/164"
cookies = getMetroCookies(store_url)
base_url = "https://www.metro.ca/en/online-grocery/search?filter="
search_item = "chicken"
search_url = base_url+search_item
# Without a timeout, requests can block indefinitely on an unresponsive server.
page = requests.get(search_url, headers=header, cookies=cookies, timeout=30)
content = BeautifulSoup(page.text, 'html.parser')
This gives me a 403 error along with the following page content.
<!DOCTYPE html>
<!--[if lt IE 7]> <html class="no-js ie6 oldie" lang="en-US"> <![endif]-->
<!--[if IE 7]> <html class="no-js ie7 oldie" lang="en-US"> <![endif]-->
<!--[if IE 8]> <html class="no-js ie8 oldie" lang="en-US"> <![endif]-->
<!--[if gt IE 8]><!--> <html class="no-js" lang="en-US"> <!--<![endif]-->
<head>
<title>Attention Required! | Cloudflare</title>
<meta id="captcha-bypass" name="captcha-bypass"/>
<meta charset="utf-8"/>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<meta content="IE=Edge,chrome=1" http-equiv="X-UA-Compatible"/>
<meta content="noindex, nofollow" name="robots"/>
<meta content="width=device-width,initial-scale=1" name="viewport"/>
<link href="/cdn-cgi/styles/cf.errors.css" id="cf_styles-css" media="screen,projection" rel="stylesheet" type="text/css"/>
<!--[if lt IE 9]><link rel="stylesheet" id='cf_styles-ie-css' href="/cdn-cgi/styles/cf.errors.ie.css" type="text/css" media="screen,projection" /><![endif]-->
<style type="text/css">body{margin:0;padding:0}</style>
<!--[if gte IE 10]><!-->
<script>
if (!navigator.cookieEnabled) {
window.addEventListener('DOMContentLoaded', function () {
var cookieEl = document.getElementById('cookie-alert');
cookieEl.style.display = 'block';
})
}
</script>
<!--<![endif]-->
<script type="text/javascript">
//<![CDATA[
(function(){
window._cf_chl_opt={
cvId: "2",
cType: "interactive",
cNounce: "94024",
cRay: "6657c0090c70ecee",
cHash: "f2ab1c66a7c7fb9",
cFPWv: "g",
cTTimeMs: "4000",
cLt: "n",
cRq: {
ru: "aHR0cHM6Ly93d3cubWV0cm8uY2EvZW4vb25saW5lLWdyb2Nlcnkvc2VhcmNoP2ZpbHRlcj1jaGlja2Vu",
ra: "TW96aWxsYS81LjAgKFdpbmRvd3MgTlQgMTAuMDsgV09XNjQpIEFwcGxlV2ViS2l0LzUzNy4zNiAoS0hUTUwsIGxpa2UgR2Vja28pIENocm9tZS81NC4wLjI4ODIuNzQgU2FmYXJpLzUzNy4zNg==",
rm: "R0VU",
d: "EI8002UISMNZV4/wX/5oFZrkU66iZFrjnbrNYKgh3Ttb0AlT4tTYpyyzbKdGR4wfseBSZjcF8rJrwqdQEMKdIRBqLQjf0JlIowEseVWSf0dY03uEBGR+076Co1cm3pAeU83GN1kzFNq/sMe832Ng4oWK/pCpJ6XdIvbGWpk1l8Qtrwbi/hVtj3R1BXapeIgGrJRGlUjcsa72BbNFXOb97CsKqFb+6xMTSO9D/nTxFlouAqHyvbrkTG+CeGvImNQTqu9AVSsZiibNCRQ9C/IlNzCwn0tEvnJ6dZ6WA5RaS4riPmOdbpVGDcS2hIOjIfeGK4/Xj0dho0VkraSq+NPcFTfs18YuqtvQq/h7+V7uST5whKYXu1DM5F1TwPLbzM3KpB/KeYlad+JgxDcOaz1k0H/t52rfMhz8PYAjNvn7SwUXSJMRDeQavS6428g8IWtveqSUj4gnn6d4wGdTTNRpqnUm+m9SJARft2IjidMpvvBtUUzZe6srQs4JPZ9XzjfH+X/kMWgQT3X2pZVDrZZC9Od7P+sqyXPKoFNuZRPrWP15XogncIKTjt5MJLQUV42MGcaGlQ5w1PAvLNGGyNeMFG8wCfhuc/vLzodD+DP3bgIi7tjx8d5zhP3jMPAsUPxAxcJpZkBtuMBuKDNQO50dYHD2wwyOhx9HMcqHWCssMWN4qUzYKOth1KNlg0zlA/qzry1csYQqILH1F1b9O5QypPa2OA5gGmJNhar8svffekU9CXsqgtHDphJgEwsqrP1qSZzQ6wq1s5McDp6pPKijdPGbBrK4q2pxbJaVHu0lRn58gStP6HGEY8BLV/kEpygG27T4Vq4dp4uWLZDKw2oxk8ezrOIgv/lq7yXkZmhZs1GzHd4XWVXJvZ5dTI3rT1zrXMOTpInw4RWXULnazZn3HofZYOm0mUJvsofwzjaG88A=",
t: "MTYyNDcyNDI5Mi4wMTcwMDA=",
m: "SuNqM4NyxmnA1WU+nYefP0zkF5LxO+2HK+JlYjzu4dw=",
i1: "Z/V7+yIdblkqF9PRfarDwA==",
i2: "iMe97FeUtyqejNZ6Ziyc8w==",
zh: "/vdKLh0CrKHrnBUka1HcvI1mkhoFozUewI640Q15E4c=",
uh: "wSvBDgWWw4CCletn46YSZpWn4A/qjMkCb4uV9eAjmfA=",
hh: "56bTGUAA35o0NPPIwaihW3gLWiRsmO2PeArMwpTuU9E=",
}
};
}());
//]]>
</script>
<style type="text/css">
#cf-wrapper #spinner {width:69px; margin: auto;}
#cf-wrapper #cf-please-wait{text-align:center}
.attribution {margin-top: 32px;}
.bubbles { background-color: #f58220; width:20px; height: 20px; margin:2px; border-radius:100%; display:inline-block; }
#cf-wrapper #challenge-form { padding-top:25px; padding-bottom:25px; }
#cf-hcaptcha-container { text-align:center;}
#cf-hcaptcha-container iframe { display: inline-block;}
@keyframes fader { 0% {opacity: 0.2;} 50% {opacity: 1.0;} 100% {opacity: 0.2;} }
#cf-wrapper #cf-bubbles { width:69px; }
@-webkit-keyframes fader { 0% {opacity: 0.2;} 50% {opacity: 1.0;} 100% {opacity: 0.2;} }
#cf-bubbles > .bubbles { animation: fader 1.6s infinite;}
#cf-bubbles > .bubbles:nth-child(2) { animation-delay: .2s;}
#cf-bubbles > .bubbles:nth-child(3) { animation-delay: .4s;}
</style>
</head>
<body>
<div id="cf-wrapper">
<div class="cf-alert cf-alert-error cf-cookie-error" data-translate="enable_cookies" id="cookie-alert">Please enable cookies.</div>
<div class="cf-error-details-wrapper" id="cf-error-details">
<div class="cf-wrapper cf-header cf-error-overview">
<h1 data-translate="challenge_headline">One more step</h1>
<h2 class="cf-subheadline"><span data-translate="complete_sec_check">Please complete the security check to access</span> www.metro.ca</h2>
</div>
<div class="cf-section cf-highlight cf-captcha-container">
<div class="cf-wrapper">
<div class="cf-columns two">
<div class="cf-column">
<div class="cf-highlight-inverse cf-form-stacked">
<form action="/en/online-grocery/search?filter=chicken&__cf_chl_captcha_tk__=0641319015a45358b1db60468c92bf88af4a70ea-1624724292-0-ATxUvClOko_GDrF_ejLwzZX-kuPRpoh1BFTlbPpgnM7UZS0tt0LcTa6u0ksaDrdsCuFkwxbyL7QYwbUeX6srjGPdlhXjLsQNqAH5sr4WHG8JX55aU2kRJzjzY9HulNoXyr6MuhmU1HzLv1ZvLss4X5hP-lABtnHTc5waDyQNzn3zxVHYetOu-uA7COqv76by9yx8dhQAWX0pT8cgjYQ2QwRLhrAw49GqhCux2EluSfziYo-Zqncf4uDyMe0Pb7Hb1csz2l9E_L26erOLQTrM_U2c1sYY0T-4ofJdQNEVLFA7e1FkGspeuGaFRRNmcXhCNPB7YKEiHlkROpAr2nxQeepJuefHBMdzbixJRXE5glhNCX9XXJ5nbpo8OzLY7pnMrJgaW6_YucjLh0fJs4c0bfBHAHZLWQeGxvcG7_AeM3zY6MIXngvnXg64GyrpxmYfADy_znyKmVlTCvVwdc8VEBZo27I4iGoqhJWaG0E1Q0Dw9a6dTU7bOWCSpoaxSNUmNkuwL5VsBAk3paSDIwYaewFLHijU-PUdeGw9hcLFsNbD95qUGlVEHZsdUMg176NYJ1VyZho1MMbNj8bVVC2kDKyZOu1IqcMe0TTqVwV5p9j_zZU6ODLXhn_d2VFULBMQTs9eIZUIz3j6uMZdEYV2o53P421SCx-MPPD5rALfYHdTRmSBDCLeW7gUG5-UvnWh87p87HJH__7plEmoJhFkW8crBpUeKBhwt7JQR_huvqOW" class="challenge-form interactive-form" enctype="application/x-www-form-urlencoded" id="challenge-form" method="POST">
<div id="cf-please-wait">
<div id="spinner">
<div id="cf-bubbles">
<div class="bubbles"></div>
<div class="bubbles"></div>
<div class="bubbles"></div>
</div>
</div>
<p data-translate="please_wait" id="cf-spinner-please-wait">Please stand by, while we are checking your browser...</p>
<p data-translate="redirecting" id="cf-spinner-redirecting" style="display:none">Redirecting...</p>
</div>
<input name="r" type="hidden" value="86274ebf891ca5903cedef6f5476291f7a3f2375-1624724292-0-AYyBXOiOkLO6sEJbZAhxnborgsqm+9Myz1E+TgVNFE0OKQcJs14P/RNNa9jSf5uTx9Eo4AxksOkzMWys+5Roo/xz2LZWFQybup/QSTYAEX6Oz5WVB05OtClBu7NY+EMGVabeM1OM2Q3cc1qgrnOH4h4UWw/tFTEYmY0tOXDYpe93zmxREYOxBU/vxCsLtda3YAAodT9qhQyO7oiTEgWMNC595Rjao12av3f+TtLrX41QyH/qiSfKJYRQf616Yvk7IEzTwc/n8ZvMc8wnGm5j+9lM0bzc6kRGoCfHVj1r0eJJxEV9aF15A+pKYuIzupkw/QOT8rUZE6UtL3yGB9UYVwqmcvtvIIO4ILPVnQV8fXxnvnXpvCVXKr0PgxPF4p8Drl1Vb95PdVn7ZvQ0jr6xGiqhbPFu2//9mwUnSjBQt8SXfei/Zq/Z0uL0TD/513/bBF1Jp/QojGEJjVGfs1Wo4L+usEpn9O0Z6gWaZXPfQgqTwiO9uboO+Z9V8pdeK3egZMneaXfCjhwgrNzmTilR90jQlGbsMOXhUokOQaxqhJ/khmBgnu5UfJ3OFxG5e2zQylxXkK88T38DE7DysMBuXE3wv10Pf4Dl6lEPMYbXqUB1Vp/hT+ShzNvaG1wpRQD6XA1WIzKNVINAg9QIffi30ojuKxldogRE0rpTAzZfgzRN8kiXFsxwQfMfTYMvdtoJsEbBP6CrvsNNNOmzN9exuM5WSbj+UXSa4/ZSlkHp0SVEJZOccYYYT6C9kAV0srfDmysEkDpfYQcap8AhFh7Ub8pYA9CedTD31+ghxXqlBphJj6zAQQfJyawwyFv4dwctjYJxduR3p6yG/7fyhTh8/B7U47sR30cP2mRA2sUMRLAYrLp2bd8yiz+jxsuoD6JxikOSYLTOl9e862isXFOg4RSspNB1RqCtb/154pnoP3bRghEl6vTSpj6dSH8GUxBjSQPWxbZuSwSMGTHHPxevAZFSpct5SNv05aU6rFvwPcna2h6UwjqcZOenMDr53xh2NVjHVWIcsDMXDReVncZyb28PIBqmBOdx4Fui/4KNXJuM1kuk5SMnN9zc1H9ZKgJnpXKIuvGFqS+Ifb56RCH/XWaoVPOG6tMwbitv8I0BkDvCSWBFXIgHw2tDuS3i/CxrELTCK6QURZZdFgZQKITtC/FvsxnDvPmnaON0dxzhJufdiBGiRCJAsLNJUkE1RfJCg8pApT+REE6IvrKb1r/1DIjETBWN0ntGE/J8fzZXOXaJDmmX2ZxWfBQCJ66RisEmJTzwRU9ModQMXfUeXxYx30IZN4H1BML6G/qzRiwN4mgO1aTeG0ic1Pv8NGdLlWP66gxvlxVTdNTuo1GTR8zyBs0AIwP3ohZrUH2KBH/r/NCVmKbxW7jswpl2kK9dzcPQi48TgeOyV8080BDzWkDOZj1agmycaGFobNAMdFhZhCSfYg+6+Y6rHba2CXKi2IioAGLh9/iMOvTGlMRZsw/dSd//ihW+otU033+sCxNjv/xK7RyAicZMk1MVDCDYbaEwzy6MAluXTpSSto5MHUDBWb+qwDlQYVqbkU5TO5ivbbBWpq1+8YFeq5zcrBfU9r+8ttj3qR8MpLcIAF18q9Ll1rE02opU/J6cCMNoFRmBecQZLmcSFoDWmS5n0nca51/KYQdJJDEpq63RKfc7KrizwX0lHfM+vwW4P3zYlGjXRdCjrNIf9Oae8nZpcB6itAjhzeu4n+gx24EHQmdNeg8AJ3B519bjqCA+aYooSfUzgUrNF+YDQBbI7Nq/sErOM6RanUuFaoMS3jnNCS4tP3TWdjJraHEY53wpBg+oqXpsJzdhfesM/KjNpxBxX9OT9v4vXyi8xzDPJB0EiZ8I6OihO7odnVW+gUdFLr7aMCtPsx5LbTFwLvE8E
STtdCfXWKSGB0GmQdJe5KmGsGQ1pxQEiVW0KUw+PCzvudBYngQF4N+UQcthlmt2pRt73ULhy2abmRa+JHOLWdvOgOQASxb8DW1k/htFRdj6FFmLygYJx+NBMD0kcQO32768SuU1S18wh2Vi3b3LXZrjpc/tPfbvADi2BVyMiEfs3cKtLwZjK2mrEONy5xq1BAL0UzLJCZCpVDc8IoxIQ3LpTxOAoQ9sw92LQdfvq/CyhMF8sAhMxQamvsWklrv5seJlNWvoNlvgfeaNxI/ugceoW9IwiZCb26d5ySpiySIgANeZwV//k5eGECYr8gLB37o+dGblgHjr+onK4UG2nHLAkIbhXBI1ZAlfE4f6YyruB2Z/35lxayZkRE/YYXJrYtpYJRU/ssl7S0VGY8SPh7aRdx8N9sw+F3XKQ63Y2pxO1KAm/Xf1CElhz86alEXlAdA24LZRz8cVcuHvk9mKM2j/YmUlYX+1uF2Zul+101PVpuvCypZtAa0nhlGTiB+st00ohFe6HmhK6d2T4UWISX6JiubywIJ0oLEF4hzecd1hB0/2Vdpl5Z9y/jhuOxPWceYGhriP3JYP9cS+MFbC36wOkF7hYpsdg9NEgFIDLFxzSYeEFkPeIuE13M1hwZHjjW8Zf6REdiPnQrDZHAKRDldWwzwBrs36guuJ4AiNju+Mx8Lr8wB6Krcd1+HriLm4uUFVM2DLeuusRkrSojUWkdWc2dpBrkLZ0tQw7wa6ZXVRt1nsWr5/ApEuzcC1+BaCGNdl1UzNd3NnGlPDYtYPFNPsuyUJIWjUTcB0rk/CfFP6JLoROVSP2l3WFVbktqw3m+mcwa6bw7Aew7YU4N/O2yJ8ab8a13/tV01Dfi61AIdKB2APWbVNRGxinXj++7fTKmqLB4B7usJC9EYqYbqq7ntAnjV9b1jI3iut/E6qDPZ58j9021JY4k2dfKY3Ry6GbIPAhvd/aKcN5Y6x79KItMsijXvAhBSILkbOwGQXccjo7lEIeh8Z1M+e3X0j2B811qcNjCvJDeMYb57+7jVkCzuL3ICADL1IjHGftYzjBPhPwl2UiZ4qD7tqc7Q2/Ol7BgYsIuddbNV72tof1/akffCEltbezCynu7P0hoDFCjDJAPmv4hGLFZZrLCU69jxLGYKU/ol6l8EEQA=="/>
<input name="cf_captcha_kind" type="hidden" value="h"/>
<input name="vc" type="hidden" value="cb7d9f733e82b2a322f24468dd51d0a0"/>
<noscript class="cf-captcha-info" id="cf-captcha-bookmark">
<h1 data-translate="turn_on_js" style="color:#bd2426;">Please turn JavaScript on and reload the page.</h1>
</noscript>
<div class="cookie-warning" data-translate="turn_on_cookies" id="no-cookie-warning" style="display:none">
<p data-translate="turn_on_cookies" style="color:#bd2426;">Please enable Cookies and reload the page.</p>
</div>
<script type="text/javascript">
//<![CDATA[
var a = function() {try{return !!window.addEventListener} catch(e) {return !1} },
b = function(b, c) {a() ? document.addEventListener("DOMContentLoaded", b, c) : document.attachEvent("onreadystatechange", b)};
b(function(){
var cookiesEnabled=(navigator.cookieEnabled)? true : false;
if(!cookiesEnabled){
var q = document.getElementById('no-cookie-warning');q.style.display = 'block';
}
});
//]]>
</script>
<div id="trk_captcha_js" style="background-image:url('/cdn-cgi/images/trace/captcha/nojs/h/transparent.gif?ray=6657c0090c70ecee')"></div>
</form>
<script type="text/javascript">
//<![CDATA[
(function(){
var isIE = /(MSIE|Trident\/|Edge\/)/i.test(window.navigator.userAgent);
var trkjs = isIE ? new Image() : document.createElement('img');
trkjs.setAttribute("src", "/cdn-cgi/images/trace/captcha/js/transparent.gif?ray=6657c0090c70ecee");
trkjs.id = "trk_captcha_js";
trkjs.setAttribute("alt", "");
document.body.appendChild(trkjs);
var cpo=document.createElement('script');
cpo.type='text/javascript';
cpo.src="/cdn-cgi/challenge-platform/h/g/orchestrate/captcha/v1?ray=6657c0090c70ecee";
document.getElementsByTagName('head')[0].appendChild(cpo);
}());
//]]>
</script>
</div>
</div>
<div class="cf-column">
<div class="cf-screenshot-container">
<span class="cf-no-screenshot"></span>
</div>
</div>
</div>
</div>
</div>
<div class="cf-section cf-wrapper">
<div class="cf-columns two">
<div class="cf-column">
<h2 data-translate="why_captcha_headline">Why do I have to complete a CAPTCHA?</h2>
<p data-translate="why_captcha_detail">Completing the CAPTCHA proves you are a human and gives you temporary access to the web property.</p>
</div>
<div class="cf-column">
<h2 data-translate="resolve_captcha_headline">What can I do to prevent this in the future?</h2>
<p data-translate="resolve_captcha_antivirus">If you are on a personal connection, like at home, you can run an anti-virus scan on your device to make sure it is not infected with malware.</p>
<p data-translate="resolve_captcha_network">If you are at an office or shared network, you can ask the network administrator to run a scan across the network looking for misconfigured or infected devices.</p>
<p data-translate="resolve_captcha_privacy_pass"> Another way to prevent getting this page in the future is to use Privacy Pass. You may need to download version 2.0 now from the Chrome Web Store.</p>
</div>
</div>
</div>
<div class="cf-error-footer cf-wrapper w-240 lg:w-full py-10 sm:py-4 sm:px-8 mx-auto text-center sm:text-left border-solid border-0 border-t border-gray-300">
<p class="text-13">
<span class="cf-footer-item sm:block sm:mb-1">Cloudflare Ray ID: <strong class="font-semibold">6657c0090c70ecee</strong></span>
<span class="cf-footer-separator sm:hidden">•</span>
<span class="cf-footer-item sm:block sm:mb-1"><span>Your IP</span>: 2607:fa49:3801:a800:6901:b6b5:6c3a:ec5</span>
<span class="cf-footer-separator sm:hidden">•</span>
<span class="cf-footer-item sm:block sm:mb-1"><span>Performance & security by</span> Cloudflare</span>
</p>
</div><!-- /.error-footer -->
</div>
</div>
<script type="text/javascript">
window._cf_translation = {};
</script>
</body>
</html>
My guess is that I'm doing something wrong when extracting the cookies as I am able to access pretty much any part of Metro's website using requests, but I'm pretty new to this so I'm not entirely sure. Any help would be much appreciated!
The website uses Cloudflare services so that it will prevent the request without browser interaction. When you send a request without browser interaction (JavaScript), it will activate a captcha to check whether you are a bot or not. You can use selenium to scrape the information from the website.
from selenium import webdriver
from bs4 import BeautifulSoup
import requests
# Search metro.ca for "chicken" entirely inside the browser and print the result page.
from selenium.webdriver.common.by import By

link = 'https://www.metro.ca/en'
chrome_driver = 'C:/Users/XXXX/Documents/chrome_driver/chromedriver.exe'
driver = webdriver.Chrome(executable_path=chrome_driver)
try:
    # implicitly_wait() sets the element-lookup timeout for the whole session,
    # so it only needs to be called once.
    driver.implicitly_wait(10)
    driver.get(link)

    # find_element(By.CSS_SELECTOR, ...) replaces the removed
    # find_element_by_css_selector() helper.
    search = driver.find_element(By.CSS_SELECTOR, '#header--search--input')
    search.send_keys("chicken")
    submitButton = driver.find_element(By.CSS_SELECTOR, "#header--search--button")
    submitButton.click()

    content = BeautifulSoup(driver.page_source, 'html.parser')
    print(content)
finally:
    # Always shut the browser down, even if a lookup fails.
    driver.quit()
Using requests
from selenium import webdriver
from bs4 import BeautifulSoup
import requests
# Visit the site once in a real browser and flatten its cookies into a
# "name=value; name=value;" string.
link = 'https://www.metro.ca/en'
chrome_driver = 'C:/Users/XXXX/Documents/chrome_driver/chromedriver.exe'
driver = webdriver.Chrome(executable_path=chrome_driver)
driver.implicitly_wait(10)
driver.get(link)
cookie = ' '.join(
    f"{entry['name']}={entry['value']};" for entry in driver.get_cookies()
)
def using_request():
    """Request the metro.ca search page for "chicken" and print the parsed HTML.

    Sends browser-like headers together with the module-level ``cookie``
    string in the ``Cookie`` header.
    """
    header = {
        'Host': 'www.metro.ca',
        'Connection': 'close',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Client-Version': 'web version 2.0',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
        'Origin': 'https://www.metro.ca/en',
        'Referer': 'https://www.metro.ca/en',
        'Accept-Encoding': 'gzip, deflate',
        'Cookie': f"{cookie}",
    }
    search_item = "chicken"
    base_url = f"https://www.metro.ca/en/search?filter={search_item}&freeText=true"
    # Without a timeout, requests can block indefinitely on an unresponsive server.
    page = requests.get(base_url, headers=header, timeout=30)
    content = BeautifulSoup(page.text, 'html.parser')
    print(content)


using_request()