I am looking to scrape prices for different products from Metro's online grocery store. To do this, I need to set a particular store as a "favourite" so that Metro knows which products to show. I'm currently using Selenium to automate this part and return the cookies after selecting a particular store. However, I am still getting 403 errors when passing the cookies to a Request despite the fact that I can access other pages on Metro's website.
import requests
import time
from user_agent import generate_user_agent
from selenium import webdriver
from selenium_stealth import stealth
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
# Random Chrome-like User-Agent string; it is sent with the plain `requests`
# call, not with the Selenium browser configured below.
# NOTE(review): the Selenium browser therefore presents a different User-Agent
# than `header` does — confirm whether the site ties cookies to the User-Agent.
user_agent = generate_user_agent(navigator="chrome")
header = {"User-Agent": user_agent}
# Chrome options used by getMetroCookies() when it starts the browser.
options = webdriver.ChromeOptions()
options.add_argument("start-maximized")
# Remove the "enable-automation" switch and the automation extension.
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
def getMetroCookies(store_url):
    """Select a store in a real Chrome session and return the browser cookies.

    Opens ``store_url``, dismisses the cookie banner, clicks the store
    button and collects every cookie the browser holds afterwards.

    Args:
        store_url: URL of the store page to open.

    Returns:
        dict mapping cookie name to cookie value.

    Raises:
        selenium.common.exceptions.TimeoutException: if the cookie banner or
            the store button does not become clickable within 10 seconds.
    """
    cookie_button_xpath = "/html/body/div[4]/div/div[3]/button"
    store_button_xpath = "/html/body/div[1]/div[2]/div[1]/div[2]/div[3]/div/div/div/div[1]/div/div[3]/button"

    browser = webdriver.Chrome(options=options, executable_path="C:/Users/XXXX/Documents/chrome_driver/chromedriver.exe")
    try:
        browser.delete_all_cookies()
        stealth(
            browser,
            languages=["en-US", "en"],
            vendor="Google Inc.",
            platform="Win32",
            webgl_vendor="Intel Inc.",
            renderer="Intel Iris OpenGL Engine",
            fix_hairline=True,
        )
        browser.get(store_url)

        # Explicit waits replace the fixed sleeps: they return as soon as the
        # element is usable and fail loudly if it never appears.
        wait = WebDriverWait(browser, 10)
        cookie_button = wait.until(
            EC.element_to_be_clickable((By.XPATH, cookie_button_xpath))
        )
        cookie_button.click()
        wait.until(
            EC.invisibility_of_element_located((By.XPATH, cookie_button_xpath))
        )

        store_button = wait.until(
            EC.element_to_be_clickable((By.XPATH, store_button_xpath))
        )
        store_button.click()
        # No observable condition is known for "store saved", so keep the
        # original grace period before reading the cookies.
        time.sleep(3)

        return {cookie['name']: cookie['value'] for cookie in browser.get_cookies()}
    finally:
        # quit() (not close()) ends the session and the chromedriver process,
        # and the finally clause guarantees it even when a step above fails.
        browser.quit()
store_url = "https://www.metro.ca/en/find-a-grocery/164"
cookies = getMetroCookies(store_url)
base_url = "https://www.metro.ca/en/online-grocery/search?filter="
search_item = "chicken"
# Percent-encode the term so spaces, "&", "#", etc. cannot corrupt the query
# string; for a plain word such as "chicken" the URL is unchanged.
search_url = base_url + requests.utils.quote(search_item, safe="")
# NOTE(review): `header` carries a randomly generated User-Agent, not the one
# the Selenium browser used to obtain `cookies`. Cloudflare clearance cookies
# are presumably bound to the User-Agent that earned them — confirm and align
# the two if the 403 persists.
# The timeout keeps the script from hanging forever if the server never answers.
page = requests.get(search_url, headers=header, cookies=cookies, timeout=30)
content = BeautifulSoup(page.text, 'html.parser')
This gives me a 403 error along with the following page content.
<!DOCTYPE html>
<!--[if lt IE 7]> <html class="no-js ie6 oldie" lang="en-US"> <![endif]-->
<!--[if IE 7]> <html class="no-js ie7 oldie" lang="en-US"> <![endif]-->
<!--[if IE 8]> <html class="no-js ie8 oldie" lang="en-US"> <![endif]-->
<!--[if gt IE 8]><!--> <html class="no-js" lang="en-US"> <!--<![endif]-->
<head>
<title>Attention Required! | Cloudflare</title>
<meta id="captcha-bypass" name="captcha-bypass"/>
<meta charset="utf-8"/>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<meta content="IE=Edge,chrome=1" http-equiv="X-UA-Compatible"/>
<meta content="noindex, nofollow" name="robots"/>
<meta content="width=device-width,initial-scale=1" name="viewport"/>
<link href="/cdn-cgi/styles/cf.errors.css" id="cf_styles-css" media="screen,projection" rel="stylesheet" type="text/css"/>
<!--[if lt IE 9]><link rel="stylesheet" id='cf_styles-ie-css' href="/cdn-cgi/styles/cf.errors.ie.css" type="text/css" media="screen,projection" /><![endif]-->
<style type="text/css">body{margin:0;padding:0}</style>
<!--[if gte IE 10]><!-->
<script>
if (!navigator.cookieEnabled) {
window.addEventListener('DOMContentLoaded', function () {
var cookieEl = document.getElementById('cookie-alert');
cookieEl.style.display = 'block';
})
}
</script>
<!--<![endif]-->
<script type="text/javascript">
//<![CDATA[
(function(){
window._cf_chl_opt={
cvId: "2",
cType: "interactive",
cNounce: "94024",
cRay: "6657c0090c70ecee",
cHash: "f2ab1c66a7c7fb9",
cFPWv: "g",
cTTimeMs: "4000",
cLt: "n",
cRq: {
ru: "aHR0cHM6Ly93d3cubWV0cm8uY2EvZW4vb25saW5lLWdyb2Nlcnkvc2VhcmNoP2ZpbHRlcj1jaGlja2Vu",
ra: "TW96aWxsYS81LjAgKFdpbmRvd3MgTlQgMTAuMDsgV09XNjQpIEFwcGxlV2ViS2l0LzUzNy4zNiAoS0hUTUwsIGxpa2UgR2Vja28pIENocm9tZS81NC4wLjI4ODIuNzQgU2FmYXJpLzUzNy4zNg==",
rm: "R0VU",
d: "EI8002UISMNZV4/wX/5oFZrkU66iZFrjnbrNYKgh3Ttb0AlT4tTYpyyzbKdGR4wfseBSZjcF8rJrwqdQEMKdIRBqLQjf0JlIowEseVWSf0dY03uEBGR+076Co1cm3pAeU83GN1kzFNq/sMe832Ng4oWK/pCpJ6XdIvbGWpk1l8Qtrwbi/hVtj3R1BXapeIgGrJRGlUjcsa72BbNFXOb97CsKqFb+6xMTSO9D/nTxFlouAqHyvbrkTG+CeGvImNQTqu9AVSsZiibNCRQ9C/IlNzCwn0tEvnJ6dZ6WA5RaS4riPmOdbpVGDcS2hIOjIfeGK4/Xj0dho0VkraSq+NPcFTfs18YuqtvQq/h7+V7uST5whKYXu1DM5F1TwPLbzM3KpB/KeYlad+JgxDcOaz1k0H/t52rfMhz8PYAjNvn7SwUXSJMRDeQavS6428g8IWtveqSUj4gnn6d4wGdTTNRpqnUm+m9SJARft2IjidMpvvBtUUzZe6srQs4JPZ9XzjfH+X/kMWgQT3X2pZVDrZZC9Od7P+sqyXPKoFNuZRPrWP15XogncIKTjt5MJLQUV42MGcaGlQ5w1PAvLNGGyNeMFG8wCfhuc/vLzodD+DP3bgIi7tjx8d5zhP3jMPAsUPxAxcJpZkBtuMBuKDNQO50dYHD2wwyOhx9HMcqHWCssMWN4qUzYKOth1KNlg0zlA/qzry1csYQqILH1F1b9O5QypPa2OA5gGmJNhar8svffekU9CXsqgtHDphJgEwsqrP1qSZzQ6wq1s5McDp6pPKijdPGbBrK4q2pxbJaVHu0lRn58gStP6HGEY8BLV/kEpygG27T4Vq4dp4uWLZDKw2oxk8ezrOIgv/lq7yXkZmhZs1GzHd4XWVXJvZ5dTI3rT1zrXMOTpInw4RWXULnazZn3HofZYOm0mUJvsofwzjaG88A=",
t: "MTYyNDcyNDI5Mi4wMTcwMDA=",
m: "SuNqM4NyxmnA1WU+nYefP0zkF5LxO+2HK+JlYjzu4dw=",
i1: "Z/V7+yIdblkqF9PRfarDwA==",
i2: "iMe97FeUtyqejNZ6Ziyc8w==",
zh: "/vdKLh0CrKHrnBUka1HcvI1mkhoFozUewI640Q15E4c=",
uh: "wSvBDgWWw4CCletn46YSZpWn4A/qjMkCb4uV9eAjmfA=",
hh: "56bTGUAA35o0NPPIwaihW3gLWiRsmO2PeArMwpTuU9E=",
}
};
}());
//]]>
</script>
<style type="text/css">
#cf-wrapper #spinner {width:69px; margin: auto;}
#cf-wrapper #cf-please-wait{text-align:center}
.attribution {margin-top: 32px;}
.bubbles { background-color: #f58220; width:20px; height: 20px; margin:2px; border-radius:100%; display:inline-block; }
#cf-wrapper #challenge-form { padding-top:25px; padding-bottom:25px; }
#cf-hcaptcha-container { text-align:center;}
#cf-hcaptcha-container iframe { display: inline-block;}
@keyframes fader { 0% {opacity: 0.2;} 50% {opacity: 1.0;} 100% {opacity: 0.2;} }
#cf-wrapper #cf-bubbles { width:69px; }
@-webkit-keyframes fader { 0% {opacity: 0.2;} 50% {opacity: 1.0;} 100% {opacity: 0.2;} }
#cf-bubbles > .bubbles { animation: fader 1.6s infinite;}
#cf-bubbles > .bubbles:nth-child(2) { animation-delay: .2s;}
#cf-bubbles > .bubbles:nth-child(3) { animation-delay: .4s;}
</style>
</head>
<body>
<div id="cf-wrapper">
<div class="cf-alert cf-alert-error cf-cookie-error" data-translate="enable_cookies" id="cookie-alert">Please enable cookies.</div>
<div class="cf-error-details-wrapper" id="cf-error-details">
<div class="cf-wrapper cf-header cf-error-overview">
<h1 data-translate="challenge_headline">One more step</h1>
<h2 class="cf-subheadline"><span data-translate="complete_sec_check">Please complete the security check to access</span> www.metro.ca</h2>
</div>
<div class="cf-section cf-highlight cf-captcha-container">
<div class="cf-wrapper">
<div class="cf-columns two">
<div class="cf-column">
<div class="cf-highlight-inverse cf-form-stacked">
<form action="/en/online-grocery/search?filter=chicken&__cf_chl_captcha_tk__=0641319015a45358b1db60468c92bf88af4a70ea-1624724292-0-ATxUvClOko_GDrF_ejLwzZX-kuPRpoh1BFTlbPpgnM7UZS0tt0LcTa6u0ksaDrdsCuFkwxbyL7QYwbUeX6srjGPdlhXjLsQNqAH5sr4WHG8JX55aU2kRJzjzY9HulNoXyr6MuhmU1HzLv1ZvLss4X5hP-lABtnHTc5waDyQNzn3zxVHYetOu-uA7COqv76by9yx8dhQAWX0pT8cgjYQ2QwRLhrAw49GqhCux2EluSfziYo-Zqncf4uDyMe0Pb7Hb1csz2l9E_L26erOLQTrM_U2c1sYY0T-4ofJdQNEVLFA7e1FkGspeuGaFRRNmcXhCNPB7YKEiHlkROpAr2nxQeepJuefHBMdzbixJRXE5glhNCX9XXJ5nbpo8OzLY7pnMrJgaW6_YucjLh0fJs4c0bfBHAHZLWQeGxvcG7_AeM3zY6MIXngvnXg64GyrpxmYfADy_znyKmVlTCvVwdc8VEBZo27I4iGoqhJWaG0E1Q0Dw9a6dTU7bOWCSpoaxSNUmNkuwL5VsBAk3paSDIwYaewFLHijU-PUdeGw9hcLFsNbD95qUGlVEHZsdUMg176NYJ1VyZho1MMbNj8bVVC2kDKyZOu1IqcMe0TTqVwV5p9j_zZU6ODLXhn_d2VFULBMQTs9eIZUIz3j6uMZdEYV2o53P421SCx-MPPD5rALfYHdTRmSBDCLeW7gUG5-UvnWh87p87HJH__7plEmoJhFkW8crBpUeKBhwt7JQR_huvqOW" class="challenge-form interactive-form" enctype="application/x-www-form-urlencoded" id="challenge-form" method="POST">
<div id="cf-please-wait">
<div id="spinner">
<div id="cf-bubbles">
<div class="bubbles"></div>
<div class="bubbles"></div>
<div class="bubbles"></div>
</div>
</div>
<p data-translate="please_wait" id="cf-spinner-please-wait">Please stand by, while we are checking your browser...</p>
<p data-translate="redirecting" id="cf-spinner-redirecting" style="display:none">Redirecting...</p>
</div>
<input name="r" type="hidden" value="86274ebf891ca5903cedef6f5476291f7a3f2375-1624724292-0-AYyBXOiOkLO6sEJbZAhxnborgsqm+9Myz1E+TgVNFE0OKQcJs14P/RNNa9jSf5uTx9Eo4AxksOkzMWys+5Roo/xz2LZWFQybup/QSTYAEX6Oz5WVB05OtClBu7NY+EMGVabeM1OM2Q3cc1qgrnOH4h4UWw/tFTEYmY0tOXDYpe93zmxREYOxBU/vxCsLtda3YAAodT9qhQyO7oiTEgWMNC595Rjao12av3f+TtLrX41QyH/qiSfKJYRQf616Yvk7IEzTwc/n8ZvMc8wnGm5j+9lM0bzc6kRGoCfHVj1r0eJJxEV9aF15A+pKYuIzupkw/QOT8rUZE6UtL3yGB9UYVwqmcvtvIIO4ILPVnQV8fXxnvnXpvCVXKr0PgxPF4p8Drl1Vb95PdVn7ZvQ0jr6xGiqhbPFu2//9mwUnSjBQt8SXfei/Zq/Z0uL0TD/513/bBF1Jp/QojGEJjVGfs1Wo4L+usEpn9O0Z6gWaZXPfQgqTwiO9uboO+Z9V8pdeK3egZMneaXfCjhwgrNzmTilR90jQlGbsMOXhUokOQaxqhJ/khmBgnu5UfJ3OFxG5e2zQylxXkK88T38DE7DysMBuXE3wv10Pf4Dl6lEPMYbXqUB1Vp/hT+ShzNvaG1wpRQD6XA1WIzKNVINAg9QIffi30ojuKxldogRE0rpTAzZfgzRN8kiXFsxwQfMfTYMvdtoJsEbBP6CrvsNNNOmzN9exuM5WSbj+UXSa4/ZSlkHp0SVEJZOccYYYT6C9kAV0srfDmysEkDpfYQcap8AhFh7Ub8pYA9CedTD31+ghxXqlBphJj6zAQQfJyawwyFv4dwctjYJxduR3p6yG/7fyhTh8/B7U47sR30cP2mRA2sUMRLAYrLp2bd8yiz+jxsuoD6JxikOSYLTOl9e862isXFOg4RSspNB1RqCtb/154pnoP3bRghEl6vTSpj6dSH8GUxBjSQPWxbZuSwSMGTHHPxevAZFSpct5SNv05aU6rFvwPcna2h6UwjqcZOenMDr53xh2NVjHVWIcsDMXDReVncZyb28PIBqmBOdx4Fui/4KNXJuM1kuk5SMnN9zc1H9ZKgJnpXKIuvGFqS+Ifb56RCH/XWaoVPOG6tMwbitv8I0BkDvCSWBFXIgHw2tDuS3i/CxrELTCK6QURZZdFgZQKITtC/FvsxnDvPmnaON0dxzhJufdiBGiRCJAsLNJUkE1RfJCg8pApT+REE6IvrKb1r/1DIjETBWN0ntGE/J8fzZXOXaJDmmX2ZxWfBQCJ66RisEmJTzwRU9ModQMXfUeXxYx30IZN4H1BML6G/qzRiwN4mgO1aTeG0ic1Pv8NGdLlWP66gxvlxVTdNTuo1GTR8zyBs0AIwP3ohZrUH2KBH/r/NCVmKbxW7jswpl2kK9dzcPQi48TgeOyV8080BDzWkDOZj1agmycaGFobNAMdFhZhCSfYg+6+Y6rHba2CXKi2IioAGLh9/iMOvTGlMRZsw/dSd//ihW+otU033+sCxNjv/xK7RyAicZMk1MVDCDYbaEwzy6MAluXTpSSto5MHUDBWb+qwDlQYVqbkU5TO5ivbbBWpq1+8YFeq5zcrBfU9r+8ttj3qR8MpLcIAF18q9Ll1rE02opU/J6cCMNoFRmBecQZLmcSFoDWmS5n0nca51/KYQdJJDEpq63RKfc7KrizwX0lHfM+vwW4P3zYlGjXRdCjrNIf9Oae8nZpcB6itAjhzeu4n+gx24EHQmdNeg8AJ3B519bjqCA+aYooSfUzgUrNF+YDQBbI7Nq/sErOM6RanUuFaoMS3jnNCS4tP3TWdjJraHEY53wpBg+oqXpsJzdhfesM/KjNpxBxX9OT9v4vXyi8xzDPJB0EiZ8I6OihO7odnVW+gUdFLr7aMCtPsx5LbTFwLvE8E
STtdCfXWKSGB0GmQdJe5KmGsGQ1pxQEiVW0KUw+PCzvudBYngQF4N+UQcthlmt2pRt73ULhy2abmRa+JHOLWdvOgOQASxb8DW1k/htFRdj6FFmLygYJx+NBMD0kcQO32768SuU1S18wh2Vi3b3LXZrjpc/tPfbvADi2BVyMiEfs3cKtLwZjK2mrEONy5xq1BAL0UzLJCZCpVDc8IoxIQ3LpTxOAoQ9sw92LQdfvq/CyhMF8sAhMxQamvsWklrv5seJlNWvoNlvgfeaNxI/ugceoW9IwiZCb26d5ySpiySIgANeZwV//k5eGECYr8gLB37o+dGblgHjr+onK4UG2nHLAkIbhXBI1ZAlfE4f6YyruB2Z/35lxayZkRE/YYXJrYtpYJRU/ssl7S0VGY8SPh7aRdx8N9sw+F3XKQ63Y2pxO1KAm/Xf1CElhz86alEXlAdA24LZRz8cVcuHvk9mKM2j/YmUlYX+1uF2Zul+101PVpuvCypZtAa0nhlGTiB+st00ohFe6HmhK6d2T4UWISX6JiubywIJ0oLEF4hzecd1hB0/2Vdpl5Z9y/jhuOxPWceYGhriP3JYP9cS+MFbC36wOkF7hYpsdg9NEgFIDLFxzSYeEFkPeIuE13M1hwZHjjW8Zf6REdiPnQrDZHAKRDldWwzwBrs36guuJ4AiNju+Mx8Lr8wB6Krcd1+HriLm4uUFVM2DLeuusRkrSojUWkdWc2dpBrkLZ0tQw7wa6ZXVRt1nsWr5/ApEuzcC1+BaCGNdl1UzNd3NnGlPDYtYPFNPsuyUJIWjUTcB0rk/CfFP6JLoROVSP2l3WFVbktqw3m+mcwa6bw7Aew7YU4N/O2yJ8ab8a13/tV01Dfi61AIdKB2APWbVNRGxinXj++7fTKmqLB4B7usJC9EYqYbqq7ntAnjV9b1jI3iut/E6qDPZ58j9021JY4k2dfKY3Ry6GbIPAhvd/aKcN5Y6x79KItMsijXvAhBSILkbOwGQXccjo7lEIeh8Z1M+e3X0j2B811qcNjCvJDeMYb57+7jVkCzuL3ICADL1IjHGftYzjBPhPwl2UiZ4qD7tqc7Q2/Ol7BgYsIuddbNV72tof1/akffCEltbezCynu7P0hoDFCjDJAPmv4hGLFZZrLCU69jxLGYKU/ol6l8EEQA=="/>
<input name="cf_captcha_kind" type="hidden" value="h"/>
<input name="vc" type="hidden" value="cb7d9f733e82b2a322f24468dd51d0a0"/>
<noscript class="cf-captcha-info" id="cf-captcha-bookmark">
<h1 data-translate="turn_on_js" style="color:#bd2426;">Please turn JavaScript on and reload the page.</h1>
</noscript>
<div class="cookie-warning" data-translate="turn_on_cookies" id="no-cookie-warning" style="display:none">
<p data-translate="turn_on_cookies" style="color:#bd2426;">Please enable Cookies and reload the page.</p>
</div>
<script type="text/javascript">
//<![CDATA[
var a = function() {try{return !!window.addEventListener} catch(e) {return !1} },
b = function(b, c) {a() ? document.addEventListener("DOMContentLoaded", b, c) : document.attachEvent("onreadystatechange", b)};
b(function(){
var cookiesEnabled=(navigator.cookieEnabled)? true : false;
if(!cookiesEnabled){
var q = document.getElementById('no-cookie-warning');q.style.display = 'block';
}
});
//]]>
</script>
<div id="trk_captcha_js" style="background-image:url('/cdn-cgi/images/trace/captcha/nojs/h/transparent.gif?ray=6657c0090c70ecee')"></div>
</form>
<script type="text/javascript">
//<![CDATA[
(function(){
var isIE = /(MSIE|Trident\/|Edge\/)/i.test(window.navigator.userAgent);
var trkjs = isIE ? new Image() : document.createElement('img');
trkjs.setAttribute("src", "/cdn-cgi/images/trace/captcha/js/transparent.gif?ray=6657c0090c70ecee");
trkjs.id = "trk_captcha_js";
trkjs.setAttribute("alt", "");
document.body.appendChild(trkjs);
var cpo=document.createElement('script');
cpo.type='text/javascript';
cpo.src="/cdn-cgi/challenge-platform/h/g/orchestrate/captcha/v1?ray=6657c0090c70ecee";
document.getElementsByTagName('head')[0].appendChild(cpo);
}());
//]]>
</script>
</div>
</div>
<div class="cf-column">
<div class="cf-screenshot-container">
<span class="cf-no-screenshot"></span>
</div>
</div>
</div>
</div>
</div>
<div class="cf-section cf-wrapper">
<div class="cf-columns two">
<div class="cf-column">
<h2 data-translate="why_captcha_headline">Why do I have to complete a CAPTCHA?</h2>
<p data-translate="why_captcha_detail">Completing the CAPTCHA proves you are a human and gives you temporary access to the web property.</p>
</div>
<div class="cf-column">
<h2 data-translate="resolve_captcha_headline">What can I do to prevent this in the future?</h2>
<p data-translate="resolve_captcha_antivirus">If you are on a personal connection, like at home, you can run an anti-virus scan on your device to make sure it is not infected with malware.</p>
<p data-translate="resolve_captcha_network">If you are at an office or shared network, you can ask the network administrator to run a scan across the network looking for misconfigured or infected devices.</p>
<p data-translate="resolve_captcha_privacy_pass"> Another way to prevent getting this page in the future is to use Privacy Pass. You may need to download version 2.0 now from the Chrome Web Store.</p>
</div>
</div>
</div>
<div class="cf-error-footer cf-wrapper w-240 lg:w-full py-10 sm:py-4 sm:px-8 mx-auto text-center sm:text-left border-solid border-0 border-t border-gray-300">
<p class="text-13">
<span class="cf-footer-item sm:block sm:mb-1">Cloudflare Ray ID: <strong class="font-semibold">6657c0090c70ecee</strong></span>
<span class="cf-footer-separator sm:hidden">•</span>
<span class="cf-footer-item sm:block sm:mb-1"><span>Your IP</span>: 2607:fa49:3801:a800:6901:b6b5:6c3a:ec5</span>
<span class="cf-footer-separator sm:hidden">•</span>
<span class="cf-footer-item sm:block sm:mb-1"><span>Performance & security by</span> Cloudflare</span>
</p>
</div><!-- /.error-footer -->
</div>
</div>
<script type="text/javascript">
window._cf_translation = {};
</script>
</body>
</html>
My guess is that I'm doing something wrong when extracting the cookies as I am able to access pretty much any part of Metro's website using requests, but I'm pretty new to this so I'm not entirely sure. Any help would be much appreciated!
The website is protected by Cloudflare, which blocks requests that do not come from a real browser. When a request arrives without browser interaction (i.e. without executing JavaScript), Cloudflare serves a CAPTCHA challenge to check whether the client is a bot. You can use Selenium to scrape the information from the website instead.
from selenium import webdriver
from bs4 import BeautifulSoup
import requests
# Search metro.ca for "chicken" in a real Chrome session and print the
# parsed result page.
link = 'https://www.metro.ca/en'
chrome_driver = 'C:/Users/XXXX/Documents/chrome_driver/chromedriver.exe'
driver = webdriver.Chrome(executable_path=chrome_driver)
try:
    # Element lookups below retry for up to 10 seconds before failing.
    driver.implicitly_wait(10)
    driver.get(link)
    # Browser cookies flattened into a "name=value; name=value;" string.
    cookie = ' '.join(f"{c['name']}={c['value']};" for c in driver.get_cookies())
    search = driver.find_element_by_css_selector('#header--search--input')
    search.send_keys("chicken")
    submitButton = driver.find_element_by_css_selector("#header--search--button")
    submitButton.click()
    # implicitly_wait only (re)sets the element-lookup timeout; it does not
    # pause the script.
    driver.implicitly_wait(10)
    content = BeautifulSoup(driver.page_source, 'html.parser')
finally:
    # Always end the session so no browser/chromedriver process is left behind.
    driver.quit()
print(content)
Using requests
from selenium import webdriver
from bs4 import BeautifulSoup
import requests
# Visit metro.ca in a real Chrome session only to collect its cookies.
link = 'https://www.metro.ca/en'
chrome_driver = 'C:/Users/XXXX/Documents/chrome_driver/chromedriver.exe'
driver = webdriver.Chrome(executable_path=chrome_driver)
try:
    driver.implicitly_wait(10)
    driver.get(link)
    # Browser cookies flattened into a "name=value; name=value;" string that
    # can be sent verbatim as a Cookie header.
    cookie = ' '.join(f"{c['name']}={c['value']};" for c in driver.get_cookies())
finally:
    # The browser is not needed once the cookies are captured; quitting here
    # also guarantees the chromedriver process is released on failure.
    driver.quit()
def using_request():
    """Fetch the "chicken" search page with requests and print the parsed HTML.

    The module-level ``cookie`` string is sent as the Cookie header together
    with a fixed set of browser-like headers. Nothing is returned.
    """
    search_item = "chicken"
    target_url = f"https://www.metro.ca/en/search?filter={search_item}&freeText=true"
    request_headers = {
        'Host': 'www.metro.ca',
        'Connection': 'close',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Client-Version': 'web version 2.0',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
        'Origin': 'https://www.metro.ca/en',
        'Referer': 'https://www.metro.ca/en',
        'Accept-Encoding': 'gzip, deflate',
        'Cookie': str(cookie),
    }
    response = requests.get(target_url, headers=request_headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    print(soup)


using_request()
Related
Please note: this question remains open, because the suggested "answer" still gives the same output; it doesn't explain why JS isn't running on that page or why Selenium can't extract it.
I'm trying to read page source of: http://147.235.97.36/ (Hp printer) which is rendered by JS.
So I wrote:
# Load the page, wait using the wait_for_page helper (defined outside this
# snippet), then print the HTML Selenium reports for the page.
driver.get(url)
wait_for_page(driver)
source = driver.page_source
print(source)
but in the printed source I see:
<p>JavaScript is required to access this website.</p>
<p>Please enable JavaScript or use a browser that supports JavaScript.</p>
and some of the content isn't there, so I changed my code to:
# Same flow as before, but the markup is read through JavaScript: the
# innerHTML of the first <html> element instead of driver.page_source.
driver.get(url)
wait_for_page(driver)
source = driver.execute_script("return document.getElementsByTagName('html')[0].innerHTML")
print(source)
Still same output, can you help me understand what's the problem here?
Here is my init_driver function:
def init_driver():
    """Create a headless Chrome WebDriver with image loading disabled.

    Returns:
        The Chrome driver, or ``None`` when creating it raised. Any exception
        raised while creating the driver or setting its page-load timeout is
        logged through ``log_warning`` instead of being propagated.
    """
    # --Initialize Driver--#
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Run in Background
    if os.name == 'nt':
        chrome_options.add_argument('--disable-gpu')  # Windows workaround
    prefs = {
        "profile.default_content_settings.images": 2,
        "profile.managed_default_content_settings.images": 2,
    }  # Disable Loading of Images
    chrome_options.add_experimental_option("prefs", prefs)
    chrome_options.add_argument('--ignore-ssl-errors=yes')
    chrome_options.add_argument('--ignore-certificate-errors')
    chrome_options.add_argument("--window-size=1920,1080")  # Standard Window Size
    # The page load strategy is a WebDriver capability, not a Chrome
    # command-line switch, so it is set on the options object.
    chrome_options.page_load_strategy = "normal"
    driver = None
    try:
        driver = webdriver.Chrome(options=chrome_options, service=Service('./chromedriver'))
        driver.set_page_load_timeout(REQUEST_TIMEOUT)
    except Exception as e:
        # Deliberate best-effort: report the failure and let the caller
        # handle a missing driver.
        log_warning(str(e))
    return driver
You can add a few arguments to avoid getting detected and print the page source as follows:
Code Block:
# Start Chrome with the usual automation markers removed and print the
# rendered HTML of the printer's web page.
options = Options()
options.add_argument("start-maximized")
# add_experimental_option keeps one value per option name, so both switches
# must be passed in a single list; two separate "excludeSwitches" calls would
# leave only the last one in effect.
options.add_experimental_option("excludeSwitches", ["enable-automation", "enable-logging"])
options.add_experimental_option('useAutomationExtension', False)
options.add_argument('--disable-blink-features=AutomationControlled')
s = Service('C:\\BrowserDrivers\\chromedriver.exe')
driver = webdriver.Chrome(service=s, options=options)
try:
    driver.get("http://147.235.97.36/")
    print(driver.execute_script("return document.getElementsByTagName('html')[0].innerHTML"))
finally:
    # Always end the session so the browser and chromedriver are released.
    driver.quit()
Console Output:
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<link href="/framework/Unified.css" rel="stylesheet" type="text/css">
<script type="text/javascript">
frameWorkObj = {};
frameWorkObj.pkg = "ews";
</script>
<script src="/framework/Unified.js" type="text/javascript"></script>
</head>
<body class="theme-gray">
<iframe src="/framework/cookie/client/cookie.html" style="display: none;"></iframe>
<div id="pgm-overall-container">
<div id="pgm-left-pane-bkground"></div>
<div id="pgm-banner"></div>
<div id="pgm-search-div" class="gui-hidden"></div>
<div id="pgm-top-pane"></div>
<div id="pgm-container-div">
<div id="pgm-left-pane"></div>
<div id="pgm-container" class="clear-fix">
<div id="pgm-title-div" class="gui-hidden"></div>
<div id="contentPane" class="contentPane"></div>
</div>
</div>
<div id="pgm-footer"></div>
</div> <!-- #pgm-overall-container -->
<div id="pgm-theatre-staging-div"></div>
<script type="text/javascript">
// frame buster
if(top != self)
top.location.replace(self.location.href);
</script>
<noscript>
<div id="pgm-no-js-text">
<p>JavaScript is required to access this website.</p>
<p>Please enable JavaScript or use a browser that supports JavaScript.</p>
</div>
</noscript>
<div id="ui-datepicker-div" style="display: none;" tabindex="0"></div></body>
I'm facing some issues while getting the content from the IRI having some special characters. I've been strictly working with requests module.
Following are some of the URLs which are causing trouble
https://cwur.org/2018-19/King's-College-London.php
https://cwur.org/2018-19/University-of-Wisconsin–Madison.php
import requests
# Fetch the page with requests' default headers (no custom User-Agent).
res = requests.get('https://cwur.org/2018-19/University-of-São-Paulo.php')
# Bare expression: shows the body in an interactive session, does nothing in a script.
res.text
To get a 200 response, pass a User-Agent in the headers.
import requests
# Send a browser-like User-Agent with the request.
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11'}
res = requests.get('https://cwur.org/2018-19/University-of-São-Paulo.php', headers=headers)
print(res.status_code)
# Separator line between the status code and the response body.
print("---" * 10)
print(res.text)
Output:
200
------------------------------
<html lang="en">
<head>
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=device-width, initial-scale=1">
<!-- The above 3 meta tags *must* come first in the head; any other head content must come *after* these tags -->
<meta name="description" content="The Center for World University Rankings (CWUR) is a leading consulting organization and publisher of the largest academic ranking of global universities.">
<meta name="keywords" content="ranking, rankings, university, universities, college, colleges, 2021, 2020, 2019, 2018, 2017, 2016, 2015, 2014, 2013, 2012, world, top, best, global, Ranking universitario mundial, Classement mondial des universités , Weltweites Universitätsranking, Zentrum für weltweite Universitätsrankings , ××ר×× ×××× ××רס××××ת ××¢××××, ××ר×× ×××ר×× ×××× ××רס××××ת ××¢××××, ì¸ê³ ëíìì, ãä¸çã®å¤§å¦ããã, ä¸ç大å¸æåä¸å¿, ì¸ê³ëíëí¹ì¼í°,ä¸ç大å¦ã©ã³ãã³ã°ã»ã³ã¿ã¼, Ranking mundial universitário, РейÑинг ÑнивеÑÑиÑеÑов миÑа , ÑазÑабоÑки ÑейÑинга ÑнивеÑÑиÑеÑов миÑа, ÙرÙز ,تصÙÙ٠اÙجاÙعات اÙعاÙÙÙØ© ,تصÙÙÙ, اÙجاÙعات, جاÙعات, اÙعاÙÙ, تصÙÙ٠اÙجاÙعات, ÙرÙز تصÙÙ٠اÙجاÙعات اÙعاÙÙÙØ©, Ranking de universidades del mundo, subject, subjects, journal, journals, ranking by subjects, country ranking, country rankings">
<link rel="icon" type="image/png" href="../../favicon.png" />
<!-- Bootstrap core CSS -->
<link href="../../dist/css/bootstrap.min.css" rel="stylesheet">
<!-- IE10 viewport hack for Surface/desktop Windows 8 bug -->
<link href="../../assets/css/ie10-viewport-bug-workaround.css" rel="stylesheet">
<!-- Custom styles for this template -->
<link href="../../starter-template.css" rel="stylesheet">
<!-- HTML5 shim and Respond.js for IE8 support of HTML5 elements and media queries -->
<!--[if lt IE 9]>
<script src="https://oss.maxcdn.com/html5shiv/3.7.2/html5shiv.min.js"></script>
<script src="https://oss.maxcdn.com/respond/1.4.2/respond.min.js"></script>
<![endif]-->
<style type="text/css">
/* CSS used here will be applied after bootstrap.css */
.navbar-custom {
color: #FFFFFF;
background-color: #222222;
border-color: #222222;
}
</style>
<title> University of São Paulo Ranking | CWUR World University Rankings 2018-2019</title>
</head>
<body>
<nav class="navbar navbar-inverse navbar-fixed-top">
<div class="container">
<div class="navbar-header">
<button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#navbar" aria-expanded="false" aria-controls="navbar">
<span class="sr-only">Toggle navigation</span>
<span class="icon-bar"></span>
<span class="icon-bar"></span>
<span class="icon-bar"></span>
</button>
<img src="../images/logo_944_400.png" height="50">
</div>
<div id="navbar" class="navbar-collapse collapse">
<ul class="nav navbar-nav">
<li>About</li>
<li class="dropdown">
World University Rankings <span class="caret"></span>
<ul class="dropdown-menu">
<li class="dropdown-header">World University Rankings</li>
<li>2020-21</li>
<li>2019-20</li>
<li>2018-19</li>
<li>2017</li>
<li>2016</li>
<li>2015</li>
<li>2014</li>
<li>2013</li>
<li>2012</li>
<li role="separator" class="divider"></li>
<li class="dropdown-header">University Rankings by Country</li>
<li>2018-19</li>
<li>2017</li>
<li>2016</li>
<li>2015</li>
<li>2014</li>
<li role="separator" class="divider"></li>
<li>Rankings by Subject</li>
</ul>
</li>
<li class="dropdown">
Methodology <span class="caret"></span>
<ul class="dropdown-menu">
<li>World University Rankings</li>
<li>Subject Rankings</li>
</ul>
</li>
<li>Media</li>
</ul>
</div>
</div>
</nav>
<div class="container">
<div class="page-header">
<h4> University of São Paulo Ranking - CWUR World University Rankings 2018-2019</h4>
<!-- Go to www.addthis.com/dashboard to customize your tools -->
<div class="addthis_toolbox addthis_default_style addthis_32x32_style"> <a class="addthis_button_preferred_1"></a> <a class="addthis_button_preferred_2"></a> <a class="addthis_button_preferred_3"></a> <a class="addthis_button_preferred_4"></a><a class="addthis_button_compact"></a></div> </div>
<div class="row">
<div class="col-md-8">
<table class="table table-bordered table-hover">
<tr><td><b>Institution Name</b></td><td>University of São Paulo </td></tr>
<tr><td><b>Native Name</b></td><td>Universidade de São Paulo </td></tr>
<tr><td><b>Location</b></td><td>Brazil</td></tr>
<tr><td><b>World Rank</b></td><td>77</td></tr>
<tr><td><b>National Rank</b></td><td>1</td></tr>
<tr><td><b>Quality of Education Rank</b></td><td>583</td></tr>
<tr><td><b>Alumni Employment Rank</b></td><td>256</td></tr>
<tr><td><b>Quality of Faculty Rank</b></td><td>109</td></tr>
<tr><td><b>Research Output Rank</b></td><td>4</td></tr>
<tr><td><b>Quality Publications Rank</b></td><td>60</td></tr>
<tr><td><b>Influence Rank</b></td><td>162</td></tr>
<tr><td><b>Citations Rank</b></td><td>139</td></tr>
<tr><td><b>Overall Score</b></td><td>82.6</td></tr>
<tr><td><b>Domain</b></td><td>usp.br</td></tr>
</table>
</div>
<div class="col-md-4">
<div class="table-responsive">
<table class="table table-bordered table-hover">
<tr><td>Top 2000 Universities (2020-21)</td></tr>
<tr><td>Top 2000 Universities (2019-20)</td></tr>
<tr><td>Top 1000 Universities (2018-19)</td></tr>
<tr><td>Ranking by Country (2018-2019)</td></tr>
<tr><td>Top 1000 Universities (2017)</td></tr>
<tr><td>Ranking by Country (2017)</td></tr>
<tr><td>Rankings by Subject</td></tr>
<tr><td>Top 1000 Universities (2016)</td></tr>
<tr><td>Ranking by Country (2016)</td></tr>
<tr><td>Top 1000 Universities (2015)</td></tr>
<tr><td>Ranking by Country (2015)</td></tr>
<tr><td>Top 1000 Universities (2014)</td></tr>
<tr><td>Ranking by Country (2014)</td></tr>
</table>
</div>
</div>
</div>
<p>Copyright © 2012-2020 Center for World University Rankings</p>
</div>
<!-- Bootstrap core JavaScript
================================================== -->
<!-- Placed at the end of the document so the pages load faster -->
<script src="https://ajax.googleapis.com/ajax/libs/jquery/1.11.3/jquery.min.js"></script>
<script>window.jQuery || document.write('<script src="../../assets/js/vendor/jquery.min.js"><\/script>')</script>
<script src="../../dist/js/bootstrap.min.js"></script>
<!-- IE10 viewport hack for Surface/desktop Windows 8 bug -->
<script src="../../assets/js/ie10-viewport-bug-workaround.js"></script>
<!-- Go to www.addthis.com/dashboard to customize your tools -->
<script type="text/javascript" src="//s7.addthis.com/js/300/addthis_widget.js#pubid=ra-5316b43f5ee1fc57"></script>
</body>
</html>
Update:
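If the URL contains mis-encoded Unicode characters (UTF-8 bytes that were read as ISO-8859-1), you can re-encode and decode it to recover the proper string: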
import requests
# Send a browser-like User-Agent with the request.
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11'}
# The escapes \xc3\xa3 are the two UTF-8 bytes of "ã", held here as two
# separate characters.
url = "https://cwur.org/2018-19/University-of-S\xc3\xa3o-Paulo.php"
# Encoding as ISO-8859-1 turns each character back into its single byte;
# decode() then reads those bytes as UTF-8 (its default), yielding "ã".
new_url = url.encode("iso-8859-1").decode()
res = requests.get(new_url, headers=headers)
print(res.status_code)
# Separator line between the status code and the response body.
print("---" * 10)
print(res.text)
I recommend trying to store the data you receive from the .get() method in a dictionary and then using pprint module to display in a neat manner:
import requests
from pprint import pprint
url = 'https://cwur.org/2018-19/University-of-Wisconsin–Madison.php'
res = requests.get(url)
# printing the status code is also helpful to see if the API call was successful
print("Status code:", res.status_code)
try:
    r_dict = res.json()
except ValueError:
    # The body is not JSON (e.g. an HTML page): show the raw text instead of
    # crashing. requests' JSON decode errors are ValueError subclasses.
    r_dict = None
    print(res.text)
else:
    pprint(r_dict)
If you get a status code of 200, then the API call was successful. More documentation on the other status codes is available here: link
Hope this helps you to find the problem with your link.
I was planning on creating a basic web scraper for the site Sneakersnstuff.com however my efforts were stopped early due to an error. When requesting to the url https://www.sneakersnstuff.com/, rather than displaying the html of the website, or even the entrance captcha, I am redirected to a cloudflare page with the error message "enable cookies". Both my code and the response are shown below
import requests
import cfscrape
# Plain requests session; cfscrape is imported above but not used here.
session = requests.session()
response = session.get('https://www.sneakersnstuff.com/')
# Prints only the response headers, not the body.
print(response.headers)
<!DOCTYPE html>
<!--[if lt IE 7]> <html class="no-js ie6 oldie" lang="en-US"> <![endif]-->
<!--[if IE 7]> <html class="no-js ie7 oldie" lang="en-US"> <![endif]-->
<!--[if IE 8]> <html class="no-js ie8 oldie" lang="en-US"> <![endif]-->
<!--[if gt IE 8]><!-->
<html class="no-js" lang="en-US">
<!--<![endif]-->
<head>
<title>Access denied | www.sneakersnstuff.com used Cloudflare to restrict access</title>
<meta charset="UTF-8" />
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<meta http-equiv="X-UA-Compatible" content="IE=Edge,chrome=1" />
<meta name="robots" content="noindex, nofollow" />
<meta name="viewport" content="width=device-width,initial-scale=1,maximum-scale=1" />
<link rel="stylesheet" id="cf_styles-css" href="/cdn-cgi/styles/cf.errors.css" type="text/css"
media="screen,projection" />
<!--[if lt IE 9]><link rel="stylesheet" id='cf_styles-ie-css' href="/cdn-cgi/styles/cf.errors.ie.css" type="text/css" media="screen,projection" /><![endif]-->
<style type="text/css">
body {
margin: 0;
padding: 0
}
</style>
<!--[if gte IE 10]><!-->
<script type="text/javascript" src="/cdn-cgi/scripts/zepto.min.js"></script>
<!--<![endif]-->
<!--[if gte IE 10]><!-->
<script type="text/javascript" src="/cdn-cgi/scripts/cf.common.js"></script>
<!--<![endif]-->
</head>
<body>
<div id="cf-wrapper">
<div class="cf-alert cf-alert-error cf-cookie-error" id="cookie-alert" data-translate="enable_cookies">Please
enable cookies.</div>
<div id="cf-error-details" class="cf-error-details-wrapper">
<div class="cf-wrapper cf-header cf-error-overview">
<h1>
<span class="cf-error-type" data-translate="error">Error</span>
<span class="cf-error-code">1020</span>
<small class="heading-ray-id">Ray ID: 578133293d83e0d6 • 2020-03-22 16:13:25 UTC</small>
</h1>
<h2 class="cf-subheadline">Access denied</h2>
</div><!-- /.header -->
<section></section><!-- spacer -->
<div class="cf-section cf-wrapper">
<div class="cf-columns two">
<div class="cf-column">
<h2 data-translate="what_happened">What happened?</h2>
<p>This website is using a security service to protect itself from online attacks.</p>
</div>
</div>
</div><!-- /.section -->
<div class="cf-error-footer cf-wrapper">
<p>
<span class="cf-footer-item">Cloudflare Ray ID: <strong>578133293d83e0d6</strong></span>
<span class="cf-footer-separator">•</span>
<span class="cf-footer-item"><span>Your IP</span>: 96.241.108.243</span>
<span class="cf-footer-separator">•</span>
<span class="cf-footer-item"><span>Performance & security by</span> <a
href="https://www.cloudflare.com/5xx-error-landing?utm_source=error_footer" id="brand_link"
target="_blank">Cloudflare</a></span>
</p>
</div><!-- /.error-footer -->
</div><!-- /#cf-error-details -->
</div><!-- /#cf-wrapper -->
<script type="text/javascript">
window._cf_translation = {};
</script>
</body>
</html>
I have attempted using a library recommended by many, called cfscrape, to no avail.
Adding Browser/User-Agent Filtering to cloudscraper did the trick for me.
import cloudscraper
from bs4 import BeautifulSoup
# Browser / User-Agent filtering: with these settings cloudscraper picks
# only desktop Firefox User-Agents on Windows.
browser_filter = {'browser': 'firefox', 'platform': 'windows', 'mobile': False}
scraper = cloudscraper.create_scraper(browser=browser_filter)

# Fetch the page through the scraper and keep the raw response body.
response = scraper.get("https://www.sneakersnstuff.com/")
html = response.content

# Parse the body with the built-in HTML parser and dump the tree.
soup = BeautifulSoup(html, 'html.parser')
print(soup)
import cloudscraper
from bs4 import BeautifulSoup
# Same request, but with a default scraper (no browser / User-Agent filter).
scraper = cloudscraper.create_scraper()
response = scraper.get("https://www.sneakersnstuff.com/")
html = response.content

# Parse the response body and print the resulting tree.
soup = BeautifulSoup(html, 'html.parser')
print(soup)
Output:
cloudscraper.exceptions.CloudflareReCaptchaProvider: Cloudflare reCaptcha detected, unfortunately you haven't loaded an anti reCaptcha provider correctly via the 'recaptcha' parameter.
Next Step ?
3rd Party reCaptcha Solvers
Description
cloudscraper currently supports the following 3rd party reCaptcha solvers, should you require them.
anticaptcha
deathbycaptcha
2captcha
9kw
return_response
I'm trying to log in to a JSP form with Selenium WebDriver in Python. I'm trying to log in by posting the parameters, but I cannot reach the form or anything beyond the body tag for that matter. What am I doing wrong? Below is my code, and below that is the page source, since it is a non-public web page:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
def my_method():
    """Open the login page in PhantomJS and locate the password input.

    The WebDriver session is always shut down, even if loading the page
    or locating the element raises.
    """
    driver = webdriver.PhantomJS()
    try:
        driver.get("https://<URL>.se:20443/snl/login.jsp")
        # XPath tests attributes with '@'; "[#id=...]" is not valid XPath.
        password = driver.find_element_by_xpath("//input[@id='j_password']")
    finally:
        # quit() ends the whole session rather than just closing the window.
        driver.quit()
my_method()
Page source:
<!DOCTYPE html>
<!-- WARNING : modifying login.jsp may affect the login layout for Mobile, Embedded and Desktop Version. -->
<html>
<head>
<!-- Import header for script/style ... -->
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<meta http-equiv="Cache-Control" content="no-cache"/>
<meta http-equiv="Expires" content="-1" />
<meta http-equiv="Content-type" content="text/html;charset=UTF-8">
<link type="image/x-icon" rel="shortcut icon" href="resources/images/favicon.ico">
<link type="text/css" rel="stylesheet" href="resources/style/login.css">
<style type="text/css">
body{
background: url(resources/images/gradient_body_login.png) repeat-x;
background-color: #d3d3d3;
}
#container{
background: url(resources/images/background_body.png) no-repeat top;
width:1090px;
min-height:770px;
height:auto;
}
#container{
margin:auto;
margin-top:0;
}
#login input[type="submit"],#login input[type="button"]{
margin:10px 5px 10px 0;
}
#mask{
opacity: 0.5;
filter: alpha(opacity = 50);
}
</style>
<script type="text/javascript" src="resources/script/live.js"></script>
<script type="text/javascript">
<!-- Against XFS attack -->
if(top != self)
{top.location=self.location;}
<!-- Against XFS attack -->
function Start(){
DisplayContent();
GiveFocus("j_username");
<!-- Check login failed -->
var vars = getUrlParameters();
var loginLabel = document.getElementById("failureIndication");
var authFailed = vars["authfailed"];
if (authFailed === "true")
loginLabel.innerHTML = "Login Failed";
<!-- Check login failed -->
}
function DisplayContent() {
var mainFrame = document.getElementById("hasJavascript");
mainFrame.style.display = 'block';
}
function GiveFocus(id){
document.getElementById(id).focus();
}
function setSubmitUrl(form){
var hash = getUrlHash();
if((hash != null) && (hash != "")) {
form.action = "j_spring_security_check#" + hash;
}else {
form.action = "j_spring_security_check";
}
return true;
}
</script>
<!-- Sample of custom logo -->
<style type="text/css">
h2 {
background: url("resources/large.png") no-repeat 20px 0 transparent;
line-height: 20px;
padding-left: 170px;
background-size: 75px 33px !important;
background-position: 35px 0px !important;
}
body {
font-family:Verdana;font-size:12px;color:#444;margin:0;padding:0;width:100%;height:100%;
background-color: #5BBF19 !important;
background-attachment: fixed !important;
background-repeat: no-repeat !important}
}
#mask {
display: none !important;
}
#login{
border:4px solid #008800 !important
}
</style>
<title>
Portal Login
</title>
</head>
<body OnLoad="Start();">
<noscript>
<div class="noJavascriptBox">
Your web browser must have JavaScript enabled
in order for this application to display correctly.
</div>
</noscript>
<div id="hasJavascript" class="hidden contentContainer">
<div id="container">
<div id="mask"></div>
<div id="login">
<h2>Portal Live Login</h2>
<form id="login_form" method="POST" onSubmit="return setSubmitUrl(this);">
<label for="j_username">Username:</label>
<input type="text" id="j_username" name="j_username" autocapitalize="off" autocorrect="off"/>
<label for="j_password">Password:</label>
<input type="password" id="j_password" name="j_password" autocapitalize="off" autocorrect="off"/>
<label id="failureIndication"> </label>
<input type="submit" value="OK"/>
</form>
</div>
</div>
</div>
</body>
I think you need to wait for the page to load or, to be more specific, wait for the visibility of the password field:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Explicit wait: poll for up to 10 seconds until the password field is
# visible, instead of looking it up immediately after the page request.
wait = WebDriverWait(driver, 10)
password_locator = (By.ID, "j_password")
password = wait.until(EC.visibility_of_element_located(password_locator))

# The field is visible at this point, so it can receive keystrokes.
password.send_keys("password")
I solved the problem by passing a service argument to ignore ssl errors (service_args=['--ignore-ssl-errors=true']) like this:
# Start PhantomJS with --ignore-ssl-errors=true so that SSL errors are ignored
# when the page is loaded; dcap holds the desired capabilities defined elsewhere.
driver = webdriver.PhantomJS(desired_capabilities=dcap,service_args=['--ignore-ssl-errors=true'])
Then it worked!
I have video surveillance software on Windows that offers remote access through the browser, with user/password authentication, on the PC's IP. (I can also view it remotely since I have a public IP.)
I check the code in the browser UI and want to take these span values (liveCameraCount and totalCameraCount):
I use the code below, which runs but prints:
None
None
ON
While it should be
2
2
ON
(I've also tried adding a time.sleep() to let the page load, but without success.)
from bs4 import BeautifulSoup
import urllib
import urllib2
import base64
url = "http://my.pc.ip.address:port"
username = "myuser"
password = "mypass"
handle = urllib2.Request(url)
authheader = "Basic %s" % base64.encodestring('%s:%s' % (username,password))
data = urllib.urlopen(url)
soup = BeautifulSoup(data, "html.parser")
cameras = soup.findAll('span')
for span in cameras:
print span.string
I'm also trying to log in automatically, without being prompted every time:
Enter username [for my.pc.ip.address:port]:------
Enter password for username at [for my.pc.ip.address:port]:-----
EDIT 1:
OK. that's strange.
If I press F12 I can see the values inside, as in the image posted, but if I press Ctrl + U I see this code (without the values). I don't understand:
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<script>
var isMobile = navigator.userAgent.match(/(iPhone|iPod|Android|BlackBerry)/);
if(isMobile){
window.location = '/mobile/';
}
</script>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<title>Sighthound</title>
<link rel="stylesheet" href="/css/sighthound-desktop.css" />
<style type="text/css">
#liveTime{
margin-left: 6px;
}
.views {
margin-right: 6px;
}
</style>
<!-- Favicons and touch icons -->
<!-- For retina-display iPads -->
<link href="/img/apple-touch-icon-xlarge.png" rel="apple-touch-icon- precomposed" sizes="144x144" type="image/png"/>
<!-- For retina-display iPhones -->
<link href="/img/apple-touch-icon-large.png" rel="apple-touch-icon-precomposed" sizes="114x114" type="image/png"/>
<!-- For iPad 1 -->
<link href="/img/apple-touch-icon-medium.png" rel="apple-touch-icon-precomposed" sizes="72x72" type="image/png"/>
<!-- For iPhone 3G, iPod Touch and Android -->
<link href="/img/apple-touch-icon-small.png" rel="apple-touch-icon-precomposed" type="image/png"/>
<!-- For Nokia -->
<link href="/img/apple-touch-icon-small.png" rel="shortcut icon"/>
<!-- For everything else -->
<link href="/img/favicon.png" rel="shortcut icon" type="image/png"/>
<link href="/img/favicon.ico" rel="shortcut icon" type="image/x-icon"/>
</head>
<body>
<script type="text/javascript"> if (!window.console) console = {log: function() {}}; </script>
<script src="/js/jquery-2.0.3.min.js"></script>
<script src="/js/handlebars.js"></script>
<script src="/js/underscore-min.js"></script>
<script src="/js/handlebars-extras.js"></script>
<script src="/js/xmlrpc.js"></script>
<script src="/js/sighthoundxmlrpc.js"></script>
<script src="/js/sighthound.js"></script>
<script src="/js/purl.js"></script>
<script src="/js/moment.min.js"></script>
<script src="/js/camera_display.js"></script>
<script src="/js/all_cameras_control.js"></script>
<script src="/js/common.js"></script>
<div class="container">
<div class="header">
<div class="logo"><img src="img/logo.png" height="80" /></div><!-- End of Logo -->
<div class="pageNav buttonBar">
Cameras
Clips
</div><!-- End of Page Nav -->
<div class="on-off buttonBar">
<a id="allOffButton" href="#" class="button">Off</a>
<a id="allOnButton" href="#" class="button">On</a>
</div><!-- End of Buttons -->
<div class="cameras">
Cameras<br />
<strong> <span id="liveCameraCount"></span> / <span id="totalCameraCount"></span> <span class="expressive">ON</span></strong>
</div><!-- Camera -->
</div><!-- End of Header -->
<div class="content">
<div class="contextMenu">
<div id="liveTime" class="date"></div><!-- End of Date -->
<!--<div class="fullscreen buttonBar">
<img src="img/iconFullscreen.png" width="18" />
</div>--><!-- End of Context Fullscreen -->
<div class="views buttonBar">
<!--<a id="view1up" href="#" class="button viewButton"<img src="img/icon1upBlue.png" /></a>-->
<a id="view2up" href="#" class="button viewButton"><img src="img/icon2upBlue.png" /></a>
<a id="view3up" href="#" class="button viewButton"><img src="img/icon3upBlue.png" /></a>
<a id="view4up" href="#" class="button active viewButton"><img src="img/icon4upWhite.png" /></a>
</div><!-- End of Context Views -->
</div><!-- End of Context -->
<div id="cameraGrid"></div>
<script id="cameraGridTemplate" type="text/x-handlebars-template">
{{#everyNth cameras {(columns)}}}
{{#if isModZeroNotFirst}}
</div>
{{/if}}
{{#if isModZero}}
<div class="videos">
{{/if}}
<div class="video-{(columns)}up cameraVideo">
<a href="live.html?camera={{name}}&live={{live}}&cameraindex={{index}}">
<div class="videoImgContainer"
data-camera_index="{{index}}">
<!-- Image stream content is built by camera_display.js -->
</div>
<div class="cameraTitle">
<span>{{name}}<span>
</div>
</a>
</div>
{{#if isLast}}
</div>
{{/if}}
{{/everyNth}}
</script>
</div><!-- End of Content -->
</div><!-- End of Container -->
<script src="/js/index.js" type="text/javascript"></script>
</body>
</html>
You could try swapping string for text as follows:
data = """<strong>
<span id="liveCameraCount">2</span>
" / "
<span id="totalCameraCount">2</span>
<span class="expressive">ON</span>
</strong>
</div>
"""
from bs4 import BeautifulSoup
soup = BeautifulSoup(data, "html.parser")
for span in soup.findAll('span'):
print span.text
This would display the following for your small example:
2
2
ON
For your full HTML, the first two <span>s are empty, so empty strings will be printed.