This question already has an answer here:
HTML tag appears empty when parsing it with BeautifulSoup but has content when opened in browser
(1 answer)
Closed 2 years ago.
I am trying to scrape a table from a website:
After importing the url
print(soup.prettify())
<!DOCTYPE html>
<html lang="en">
<head>
<meta content="noindex" name="robots"/>
<meta charset="utf-8"/>
<meta content="width=device-width,initial-scale=1,shrink-to-fit=no" name="viewport"/>
<link href="https://d9mzsvqupf0ma.cloudfront.net/0367505b9e/static/react/favicon.ico" rel="shortcut icon"/>
<title>
Reonomy
</title>
<script src="/static/react/env.js?1592498512097">
</script>
<script onerror='console.error("Error loading Google Maps. Please check your firewall, proxy, or ad blocker settings.")' src="//maps.googleapis.com/maps/api/js?v=3&libraries=places,drawing,geometry&client=gme-scryerinc">
</script>
<script type="text/javascript">
!function(){if(void 0!==window.env&&"production"===window.env.REACT_APP_ENVIRONMENT){var i=window.analytics=window.analytics||[];if(!i.initialize)if(i.invoked)window.console&&console.error&&console.error("Segment snippet included twice.");else{i.invoked=!0,i.methods=["trackSubmit","trackClick","trackLink","trackForm","pageview","identify","reset","group","track","ready","alias","debug","page","once","off","on"],i.factory=function(t){return function(){var e=Array.prototype.slice.call(arguments);return e.unshift(t),i.push(e),i}};for(var e=0;e<i.methods.length;e++){var t=i.methods[e];i[t]=i.factory(t)}i.load=function(e,t){var n=document.createElement("script");n.type="text/javascript",n.async=!0,n.src="https://cdn.segment.com/analytics.js/v1/"+e+"/analytics.min.js";var o=document.getElementsByTagName("script")[0];o.parentNode.insertBefore(n,o),i._loadOptions=t},i.SNIPPET_VERSION="4.1.0",i.load("Jb0xYxcgY3BJTcGWoAmtUP9qwhM9V2pp")}}}()
</script>
<link href="https://d9mzsvqupf0ma.cloudfront.net/0367505b9e/static/react/static/css/main.4f4bf592.chunk.css" rel="stylesheet"/>
</head>
<body>
<noscript>
You need to enable JavaScript to run this app.
</noscript>
<div id="root">
</div>
<script>
!function(d){function e(e){for(var t,r,n=e[0],c=e[1],o=e[2],a=0,f=[];a<n.length;a++)r=n[a],Object.prototype.hasOwnProperty.call(s,r)&&s[r]&&f.push(s[r][0]),s[r]=0;for(t in c)Object.prototype.hasOwnProperty.call(c,t)&&(d[t]=c[t]);for(h&&h(e);f.length;)f.shift()();return i.push.apply(i,o||[]),u()}function u(){for(var e,t=0;t<i.length;t++){for(var r=i[t],n=!0,c=1;c<r.length;c++){var o=r[c];0!==s[o]&&(n=!1)}n&&(i.splice(t--,1),e=p(p.s=r[0]))}return e}var r={},l={5:0},s={5:0},i=[];function p(e){if(r[e])return r[e].exports;var t=r[e]={i:e,l:!1,exports:{}};return d[e].call(t.exports,t,t.exports,p),t.l=!0,t.exports}p.e=function(i){var e=[];l[i]?e.push(l[i]):0!==l[i]&&{20:1,21:1,24:1,25:1}[i]&&e.push(l[i]=new Promise(function(e,n){for(var t="static/css/"+({}[i]||i)+"."+{0:"31d6cfe0",1:"31d6cfe0",2:"31d6cfe0",3:"31d6cfe0",7:"31d6cfe0",8:"31d6cfe0",9:"31d6cfe0",10:"31d6cfe0",11:"31d6cfe0",12:"31d6cfe0",13:"31d6cfe0",14:"31d6cfe0",15:"31d6cfe0",16:"31d6cfe0",17:"31d6cfe0",18:"31d6cfe0",19:"31d6cfe0",20:"7bbd82a1",21:"989321a7",22:"31d6cfe0",23:"31d6cfe0",24:"d608a43c",25:"36cb7054",26:"31d6cfe0",27:"31d6cfe0",28:"31d6cfe0",29:"31d6cfe0",30:"31d6cfe0",31:"31d6cfe0",32:"31d6cfe0"}[i]+".chunk.css",c=p.p+t,r=document.getElementsByTagName("link"),o=0;o<r.length;o++){var a=(d=r[o]).getAttribute("data-href")||d.getAttribute("href");if("stylesheet"===d.rel&&(a===t||a===c))return e()}var f=document.getElementsByTagName("style");for(o=0;o<f.length;o++){var d;if((a=(d=f[o]).getAttribute("data-href"))===t||a===c)return e()}var u=document.createElement("link");u.rel="stylesheet",u.type="text/css",u.onload=e,u.onerror=function(e){var t=e&&e.target&&e.target.src||c,r=new Error("Loading CSS chunk "+i+" failed.\n("+t+")");r.code="CSS_CHUNK_LOAD_FAILED",r.request=t,delete l[i],u.parentNode.removeChild(u),n(r)},u.href=c,document.getElementsByTagName("head")[0].appendChild(u)}).then(function(){l[i]=0}));var r=s[i];if(0!==r)if(r)e.push(r[2]);else{var t=new 
Promise(function(e,t){r=s[i]=[e,t]});e.push(r[2]=t);var n,c=document.createElement("script");c.charset="utf-8",c.timeout=120,p.nc&&c.setAttribute("nonce",p.nc),c.src=p.p+"static/js/"+({}[i]||i)+"."+{0:"ca0cfe7f",1:"1f775947",2:"f3aa526c",3:"8e92118a",7:"8821eefa",8:"e17401b1",9:"6e4ba317",10:"24f1a107",11:"96c5e7b8",12:"7a6ef661",13:"e539811a",14:"37c1ffc4",15:"dc8d4356",16:"2d61de04",17:"23eefbbb",18:"51a9cf50",19:"7f8a5cf4",20:"c409a0e9",21:"00e0dc95",22:"de275a36",23:"114fe889",24:"a1c29240",25:"b1426e77",26:"2eaf037b",27:"cf150351",28:"ac391d82",29:"b2c0bc67",30:"4b510904",31:"5a5b63b1",32:"f8a3d31f"}[i]+".chunk.js";var o=new Error;n=function(e){c.onerror=c.onload=null,clearTimeout(a);var t=s[i];if(0!==t){if(t){var r=e&&("load"===e.type?"missing":e.type),n=e&&e.target&&e.target.src;o.message="Loading chunk "+i+" failed.\n("+r+": "+n+")",o.name="ChunkLoadError",o.type=r,o.request=n,t[1](o)}s[i]=void 0}};var a=setTimeout(function(){n({type:"timeout",target:c})},12e4);c.onerror=c.onload=n,document.head.appendChild(c)}return Promise.all(e)},p.m=d,p.c=r,p.d=function(e,t,r){p.o(e,t)||Object.defineProperty(e,t,{enumerable:!0,get:r})},p.r=function(e){"undefined"!=typeof Symbol&&Symbol.toStringTag&&Object.defineProperty(e,Symbol.toStringTag,{value:"Module"}),Object.defineProperty(e,"__esModule",{value:!0})},p.t=function(t,e){if(1&e&&(t=p(t)),8&e)return t;if(4&e&&"object"==typeof t&&t&&t.__esModule)return t;var r=Object.create(null);if(p.r(r),Object.defineProperty(r,"default",{enumerable:!0,value:t}),2&e&&"string"!=typeof t)for(var n in t)p.d(r,n,function(e){return t[e]}.bind(null,n));return r},p.n=function(e){var t=e&&e.__esModule?function(){return e.default}:function(){return e};return p.d(t,"a",t),t},p.o=function(e,t){return Object.prototype.hasOwnProperty.call(e,t)},p.p="https://d9mzsvqupf0ma.cloudfront.net/0367505b9e/static/react/",p.oe=function(e){throw console.error(e),e};var 
t=this.webpackJsonpfrontend=this.webpackJsonpfrontend||[],n=t.push.bind(t);t.push=e,t=t.slice();for(var c=0;c<t.length;c++)e(t[c]);var h=n;u()}([])
</script>
<script src="https://d9mzsvqupf0ma.cloudfront.net/0367505b9e/static/react/static/js/6.41e506b7.chunk.js">
</script>
<script src="https://d9mzsvqupf0ma.cloudfront.net/0367505b9e/static/react/static/js/main.e68cecb8.chunk.js">
</script>
</body>
</html>
When I inspect the website, I see that my table is there between tags:
Still when I use :
print(soup.find_all('td'))
It returns me an empty list. Can someone point out what I did wrong ?
BeautifulSoup doesn't evaluate JavaScript.
It looks like all those tables are being generated by JavaScript. You could use dryscrape to evaluate the page before passing it on to BeautifulSoup.
I have been using the selenium webdriver with python in an attempt to try and login to this website Login Page Here
To do this I did the following in python:
from selenium import webdriver
import bs4 as bs
# Launch a real Chrome browser via Selenium so JavaScript runs client-side.
driver = webdriver.Chrome()
# Navigate to the login page; its content is rendered by JavaScript after load.
driver.get('https://app.chatra.io/')
I then go on to make an attempt at parsing using Beautiful Soup:
# Serialize the browser's current DOM and hand it to BeautifulSoup.
html = driver.execute_script('return document.documentElement.outerHTML')
soup = bs.BeautifulSoup(html, 'html.parser')
# NOTE: prettify is referenced without parentheses here, so this prints the
# bound method object rather than the document — it should be soup.prettify().
print(soup.prettify)
The main issue is that the page never fully loads. When I load the page in a browser on my own, all is fine. However when the selenium webdriver tries to load it, it just seemingly stops halfway.
Any idea why? Any ideas on how to fix it or where to look to learn?
First of all, the issue is also reproducible for me in the latest Chrome (with chromedriver 2.34 - also currently latest) - not yet sure what's happening at the moment. Workaround: Firefox worked for me perfectly.
And, I would add an extra step in between driver.get() and HTML parsing - an explicit wait to let the page properly load until the desired condition would be true:
import bs4 as bs
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Firefox is used here because the page failed to finish loading under
# ChromeDriver (see the note above); it loaded fine in Firefox.
driver = webdriver.Firefox()
driver.get('https://app.chatra.io/')
# Explicit wait: block up to 10 seconds until the sign-in email field is
# visible, so the JavaScript-rendered content exists before grabbing the HTML.
wait = WebDriverWait(driver, 10)
wait.until(EC.visibility_of_element_located((By.ID, "signin-email")))
# Pull the fully rendered DOM out of the browser and parse it.
html = driver.execute_script('return document.documentElement.outerHTML')
soup = bs.BeautifulSoup(html, 'html.parser')
print(soup.prettify())  # prettify() is a method and must be called
Note that you also needed to call prettify() - it's a method.
There are several aspects to the issue you are facing as below :
Since you are trying to use BeautifulSoup: if you fetch the page with urlopen from urllib.request, the error says it all:
urllib.error.HTTPError: HTTP Error 403: Forbidden
Which means urllib.request is getting detected and HTTP Error 403: Forbidden is raised. Hence using webdriver from selenium makes sense.
Next, when you use ChromeDriver and Chrome, the website initially opens and renders. But soon ChromeDriver, being a WebDriver, is detected, and ChromeDriver is unable to parse the <head> & <body> tags. You see only the minimal markup:
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" class="supports cssfilters flexwrap chrome webkit win hover web"></html>
Finally, when you take help of GeckoDriver and Firefox Quantum the Website opens and renders properly as follows :
Code Block :
from selenium import webdriver
from bs4 import BeautifulSoup as soup
# GeckoDriver/Firefox renders this site fully (ChromeDriver was detected by the
# site and only produced a minimal page — see the explanation above).
driver = webdriver.Firefox(executable_path=r'C:\Utility\BrowserDrivers\geckodriver.exe')
driver.get('https://app.chatra.io/')
# Serialize the rendered DOM from the live browser and parse it.
html = driver.execute_script('return document.documentElement.outerHTML')
pagesoup = soup(html, "html.parser")
print(pagesoup)
Console Output :
<html class="supports cssfilters flexwrap firefox gecko win hover web"><head>
<link class="" href="https://app.chatra.io/b281cc6b75916e26b334b5a05913e3eb18fd3a4d.css?meteor_css_resource=true&_g_app_v_=51" rel="stylesheet" type="text/css"/>
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width, initial-scale=1, maximum-scale=1, minimum-scale=1, user-scalable=no, viewport-fit=cover" name="viewport"/>
.
.
.
<em>··· Chatra</em>
.
.
.
</div></body></html>
Adding prettify to the soup extraction :
Code Block :
from selenium import webdriver
from bs4 import BeautifulSoup as soup
driver = webdriver.Firefox(executable_path=r'C:\Utility\BrowserDrivers\geckodriver.exe')
driver.get('https://app.chatra.io/')
html = driver.execute_script('return document.documentElement.outerHTML')
pagesoup = soup(html, "html.parser")
# NOTE: prettify is not called (no parentheses), so this prints
# "<bound method Tag.prettify of ...>" as shown in the console output below;
# use pagesoup.prettify() to get the pretty-printed markup.
print(pagesoup.prettify)
Console Output :
<bound method Tag.prettify of <html class="supports cssfilters flexwrap firefox gecko win hover web"><head>
<link class="" href="https://app.chatra.io/b281cc6b75916e26b334b5a05913e3eb18fd3a4d.css?meteor_css_resource=true&_g_app_v_=51" rel="stylesheet" type="text/css"/>
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width, initial-scale=1, maximum-scale=1, minimum-scale=1, user-scalable=no, viewport-fit=cover" name="viewport"/>
.
.
.
<em>··· Chatra</em>
.
.
.
</div></body></html>>
Even you can use Selenium's page_source method as follows :
Code Block :
from selenium import webdriver
driver = webdriver.Firefox(executable_path=r'C:\Utility\BrowserDrivers\geckodriver.exe')
driver.get('https://app.chatra.io/')
# page_source returns the browser's current DOM serialized to HTML,
# so no execute_script() call is needed here.
print(driver.page_source)
Console Output :
<html class="supports cssfilters flexwrap firefox gecko win hover web">
<head>
<link rel="stylesheet" type="text/css" class="" href="https://app.chatra.io/b281cc6b75916e26b334b5a05913e3eb18fd3a4d.css?meteor_css_resource=true&_g_app_v_=51">
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1, minimum-scale=1, user-scalable=no, viewport-fit=cover">
<!-- platform specific stuff -->
<meta name="msapplication-tap-highlight" content="no">
<meta name="apple-mobile-web-app-capable" content="yes">
<!-- favicon -->
<link rel="shortcut icon" href="/static/favicon.ico">
<!-- win8 tile -->
<meta name="msapplication-TileImage" content="/static/win-tile.png">
<meta name="msapplication-TileColor" content="#ffffff">
<meta name="application-name" content="Chatra">
<!-- apple touch icon -->
<!--<link rel="apple-touch-icon" sizes="256x256" href="/static/?????.png">-->
<title>··· Chatra</title>
<style>
body {
background: #f6f5f7
}
</style>
<style type="text/css"></style>
</head>
<body>
<script async="" src="https://www.google-analytics.com/analytics.js"></script>
<script type="text/javascript" src="/meteor_runtime_config.js"></script>
<script type="text/javascript" src="https://app.chatra.io/9153feecdc706adbf2c71253473a6aa62c803e45.js?meteor_js_resource=true&_g_app_v_=51"></script>
<div class="body body-layout">
<div class="body-layout__main main-layout">
<aside class="main-layout__left-sidebar">
<div class="left-sidebar-layout">
</div>
</aside>
<div class="main-layout__content">
<div class="content-layout">
<main class="content-layout__main is-no-fades js-popover-boundry js-main">
<div class="center loading loading--light">
<div class="content-padding nothing">
<em>··· Chatra</em>
</div>
</div>
</main>
</div>
</div>
</div>
</div>
</body>
</html>
Hi, I was using the following code to scrape craigslist.
import pandas as pd
import requests
# IPython/Jupyter magic — only valid inside a notebook, not a plain .py script.
%pylab inline
url_base = 'http://houston.craigslist.org/search/apa'
params = dict(bedrooms=2)
# requests encodes the params dict into the URL query string for us.
rsp = requests.get(url_base, params=params)
print(rsp.text[:500])
from bs4 import BeautifulSoup as bs4
html = bs4(rsp.text, 'html.parser')
print(html.prettify()[:1000])
everything works fine till above and the output is :-
<!DOCTYPE html>
<html class="no-js">
<head>
<title>
houston apartments / housing rentals - craigslist
</title>
<meta content="houston apartments / housing rentals - craigslist"
name="description">
<meta content="IE=Edge" http-equiv="X-UA-Compatible"/>
<link href="https://houston.craigslist.org/search/apa" rel="canonical">
<link href="https://houston.craigslist.org/search/apa?
format=rss&min_bedrooms=2" rel="alternate" title="RSS feed for
craigslist | houston apartments / housing rentals - craigslist "
type="application/rss+xml">
<link href="https://houston.craigslist.org/search/apa?
s=120&min_bedrooms=2" rel="next">
<meta content="width=device-width,initial-scale=1" name="viewport">
<link href="//www.craigslist.org/styles/cl.css?
v=a14d0c65f7978c2bbc0d780a3ea7b7be" media="all" rel="stylesheet"
type="text/css">
<link href="//www.craigslist.org/styles/search.css?v=27e1d4246df60da5ffd1146d59a8107e" media="all" rel="stylesheet" type="
It clearly shows that the list is not empty and there are items which I can use. Then I use the following code:
# Searches for <p class="row"> — but (per the answer below) craigslist now puts
# class "result-info" on these <p> tags, so this match comes back empty.
apts = html.find_all('p', attrs={'class': 'row'})
print(len(apts))
The above print(len(apts)) outputs 0.
Can anyone please help me correct this code? I believe craigslist has changed its HTML, but I don't know how to account for that here.
Thanks
There is no <p> tag with 'row' class instead <p> has 'result-info' class.
import requests
url_base = 'http://houston.craigslist.org/search/apa'
params = dict(bedrooms=2)
# Fetch the search results; params are encoded into the query string.
rsp = requests.get(url_base, params=params)
print(rsp.text[:500])
from bs4 import BeautifulSoup as bs4
html = bs4(rsp.text, 'html.parser')
print(html.prettify()[:1000])
# The listing <p> tags carry class "result-info" (not "row"), so this matches.
apts = html.find_all('p', attrs={'class': 'result-info'})
print(len(apts))
I'm a big fan of stackoverflow and typically find solutions to my problems through this website. However, the following problem has bothered me for so long that it forced me to create an account here and ask directly:
I'm trying to scrape this link: https://permid.org/1-21475776041 What I want is the rows "TRCS Asset Class" and "Currency".
For starters, I'm using this code:
# Python 2 code: urllib2 and the print statement do not exist in Python 3
# (use urllib.request and print(...) there).
from bs4 import BeautifulSoup
import urllib2
url = 'https://permid.org/1-21475776041'
req = urllib2.urlopen(url)
raw = req.read()
# No parser specified — BeautifulSoup will pick a default and warn; pass
# 'html.parser' (or 'lxml') explicitly for reproducible behavior.
soup = BeautifulSoup(raw)
print soup.prettify()
The html code returned (see below) is different from what you can see in your browser upon clicking the link:
<!DOCTYPE html>
<!--[if lt IE 7]> <html ng-app="tmsMdaasApp" class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]-->
<!--[if IE 7]> <html ng-app="tmsMdaasApp" class="no-js lt-ie9 lt-ie8"> <![endif]-->
<!--[if IE 8]> <html ng-app="tmsMdaasApp" class="no-js lt-ie9"> <![endif]-->
<!--[if gt IE 8]><!-->
<html class="no-js" ng-app="tmsMdaasApp">
<!--<![endif]-->
<head>
<meta content="text/html; charset=utf-8" http-equiv="content-type"/>
<meta charset="utf-8"/>
<meta content="ie=edge" http-equiv="x-ua-compatible"/>
<meta content="max-age=0,no-cache" http-equiv="Cache-Control"/>
<base href="/"/>
<title ng-bind="PageTitle">
Thomson Reuters | PermID
</title>
<meta content="" name="description"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<meta content="#ff8000" name="theme-color"/>
<!-- Place favicon.ico and apple-touch-icon.png in the root directory -->
<link href="app/vendor.daf96efe.css" rel="stylesheet"/>
<link href="app/app.1405210f.css" rel="stylesheet"/>
<link href="favicon.ico" rel="icon"/>
<!-- Typekit -->
<script src="//use.typekit.net/gnw2rmh.js">
</script>
<script>
try{Typekit.load({async:true});}catch(e){}
</script>
<!-- // Typekit -->
<!-- Google Tag Manager Data Layer -->
<!--<script>
analyticsEvent = function() {};
analyticsSocial = function() {};
analyticsForm = function() {};
dataLayer = [];
</script>-->
<!-- // Google Tag Manager Data Layer -->
</head>
<body class="theme-grey" id="top" ng-esc="">
<!--[if lt IE 7]>
<p class="browserupgrade">You are using an <strong>outdated</strong> browser. Please upgrade your browser to improve your experience.</p>
<![endif]-->
<!-- Add your site or application content here -->
<navbar class="tms-navbar">
</navbar>
<div id="body" role="main" ui-view="">
</div>
<div id="footer-wrapper" ng-show="!params.elementsToHide">
<footer id="main-footer">
</footer>
</div>
<!--[if lt IE 9]>
<script src="bower_components/es5-shim/es5-shim.js"></script>
<script src="bower_components/json3/lib/json3.min.js"></script>
<![endif]-->
<script src="app/vendor.8cc12370.js">
</script>
<script src="app/app.6e5f6ce8.js">
</script>
</body>
</html>
Does anyone know what I'm missing here and how I could get it to work?
Thanks, Teemu Risikko — a comment (albeit not the solution) on the website you linked got me on the right path.
In case someone else is bumping into the same problem, here is my solution: I'm getting the data via requests and not via traditional "scraping" (e.g. BeautifulSoup or lxml).
Navigate to the website using Google Chrome.
Right-click on the website and select "Inspect".
On the top navigation bar select "Network".
Limit network monitor to "XHR".
One of the entries (marked with an arrow) shows the link that can be used with the requests library.
import requests
# Call the JSON API endpoint that the page itself fetches via XHR (found in the
# browser's Network tab), instead of scraping the JavaScript-rendered HTML.
url = 'https://permid.org/api/mdaas/getEntityById/21475776041'
# YOUR_ACCESS_TOKEN is a placeholder — substitute a real Open PermID API token.
headers = {'X-AG-Access-Token': YOUR_ACCESS_TOKEN}
r = requests.get(url, headers=headers)
# Decode the JSON response body into a Python dict.
r.json()
Which gets me this:
{u'Asset Class': [u'Units'],
u'Asset Class URL': [u'https://permid.org/1-302043'],
u'Currency': [u'CAD'],
u'Currency URL': [u'https://permid.org/1-500140'],
u'Exchange': [u'TOR'],
u'IsQuoteOf.mdaas': [{u'Is Quote Of': [u'Convertible Debentures Income Units'],
u'URL': [u'https://permid.org/1-21475768667'],
u'quoteOfInstrument': [u'21475768667'],
u'quoteOfInstrument URL': [u'https://permid.org/1-21475768667']}],
u'Mic': [u'XTSE'],
u'PERM ID': [u'21475776041'],
u'Quote Name': [u'CONVERTIBLE DEBENTURES INCOME UNT'],
u'Quote Type': [u'equity'],
u'RIC': [u'OCV_u.TO'],
u'Ticker': [u'OCV.UN'],
u'entityType': [u'Quote']}
Using the default user-agent with a lot of pages will give you a different looking page because it is using an outdated user-agent. This is what your output is telling you.
Reference on Changing user-agents
Thought this may be your problem, it does not exactly answer the question about getting dynamically applied changes on a webpage. To get the dynamically changed data you need to emulate the javascript requests that the page is making on load. If you make the requests that the javascript is making you will get the data that the javascript is getting.