I currently have a Python script that logs the given user in to a web page and then navigates to a different page on the site. What I'm aiming for is to get the raw HTML of the final page after it has finished loading. I have tried different variations of the Render class, but it throws a "frame is not an attribute of Render" error. As I said, my main goal is just to get the HTML of the page. What am I doing wrong?
# Page that is loaded first by the script below.
url = "https://firstwebpage.com/"
url3 = "https://finaldestinationpage.com" #the page that I want the HTML from
# Placeholder credentials that onLoadFinished writes into the login form.
username = "username"
password = "password"
import sys, signal
from PyQt4.QtCore import *
from PyQt4.QtGui import *
from PyQt4.QtWebKit import *
class Render(QWebPage):
    """Web page that loads ``url`` and stores its main frame once loaded.

    The ``frame`` attribute is assigned only by the ``loadFinished``
    handler, so it does not exist until that signal has been delivered.
    """

    def __init__(self, app, url):
        """Connect the load handler, start loading ``url`` and call ``app.exec_()``."""
        super(Render, self).__init__()
        self.loadFinished.connect(self._loadFinished)
        target = QUrl(url)
        self.mainFrame().load(target)
        # The event loop is entered from inside the constructor.
        app.exec_()

    def _loadFinished(self, result):
        """Remember the main frame; ``result`` (the signal's bool) is unused."""
        self.frame = self.mainFrame()
def JSEval(code):
    """Evaluate the JavaScript source ``code`` in the main frame of the
    module-level ``webpage`` and return whatever the evaluation yields."""
    frame = webpage.mainFrame()
    return frame.evaluateJavaScript(code)
def onLoadStarted():
    """Print the main frame's current URL when ``webpage`` starts loading."""
    current_url = webpage.mainFrame().url().toString()
    print("Loading started: %s" % current_url)
def onLoadFinished(result):
    """Handle ``loadFinished`` of the module-level ``webpage``.

    Prints the current URL, then (if the load succeeded) fills in and
    submits the form named ``loginForm`` through JavaScript. When the
    frame's URL equals ``url3`` it creates a ``Render`` for that URL and
    prints the HTML of its frame.
    """
    loaded_url = webpage.mainFrame().url().toString()
    print("Loading finished: %s" % loaded_url)
    if not result:
        print("Request failed")
        return

    JSEval("_form = document.getElementsByName('loginForm')[0];")
    fill_username = "_form.username.value='%s';" % username
    fill_password = "_form.password.value='%s';" % password
    JSEval(fill_username + fill_password + "_form.submit();")
    print("Login data sent")

    # The URL is read again here, after the form has been submitted.
    if webpage.mainFrame().url().toString() == url3:
        r = Render(app, url3)
        html = r.frame.toHtml() #Here is where the "frame" error comes in
        print(html)
# Application setup: a single QWebPage, displayed inside a QWebView.
app = QApplication(sys.argv)
# Use the default SIGINT action while the Qt event loop is running.
signal.signal(signal.SIGINT, signal.SIG_DFL)

webpage = QWebPage()
webpage.loadFinished.connect(onLoadFinished)
webpage.loadStarted.connect(onLoadStarted)

main_frame = webpage.mainFrame()
main_frame.load(QUrl(url)) #where user is initally logged in
main_frame.load(QUrl(url3))

web = QWebView()
web.setPage(webpage)
web.show()

exit_code = app.exec_()
sys.exit(exit_code)
In your Render class, you set the frame attribute in your handler for loadFinished, but you try to access it immediately after creating your Render instance. At that point the page has most likely not finished loading, so the signal hasn't fired yet and r.frame does not exist yet.
If you want to get the html content of the page, you can only do it after the page has finished loading, so it would be better to do it in your signal handler.
Oh, and you shouldn't call app.exec_() in Render.__init__.
Related
I would like to get the DOM of a website after js execution.
I would also like to get all the content of the iframes in the website, similarly to what I have in Google Chrome's Inspect Element feature.
This is my code:
import sys
from PyQt4 import QtGui, QtCore, QtWebKit
class Sp():
    """Load a page in a QWebView and print the current frame's markup once
    loading has finished."""

    def save(self):
        """Print the inner XML of the current frame's document element,
        encoded as UTF-8."""
        print("call")
        root = self.webView.page().currentFrame().documentElement()
        data = root.toInnerXml()
        print(data.encode('utf-8'))
        print('finished')

    def main(self):
        """Create the view, start loading the page and hook ``save`` up to
        the view's ``loadFinished(bool)`` signal."""
        view = QtWebKit.QWebView()
        self.webView = view
        page_url = QtCore.QUrl("http://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe_scrolling")
        view.load(page_url)
        QtCore.QObject.connect(view, QtCore.SIGNAL("loadFinished(bool)"), self.save)
# Entry point: create the application, start the scraper, run the event loop.
app = QtGui.QApplication(sys.argv)
scraper = Sp()
scraper.main()
exit_code = app.exec_()
sys.exit(exit_code)
This gives me the HTML of the website, but not the HTML inside the iframes. Is there any way that I could get the HTML of the iframes?
This is a very hard problem to solve in general.
The main difficulty is that there is no way to know in advance how many frames each page has. And in addition to that, each child-frame may have its own set of frames, the number of which is also unknown. In theory, there could be an infinite number of nested frames, and the page will never finish loading (which seems no exaggeration for sites that have a lot of ads).
Anyway, below is a version of your script which gets the top-level QWebFrame object of each frame as it loads, and shows how you can access some of the things you are interested in. As you will see from the output, there are a lot of "junk" frames inserted by ads and such like that you will somehow need to filter out.
import sys, signal
from PyQt4 import QtGui, QtCore, QtWebKit
class Sp():
    """Report basic details of every frame of a page as each frame finishes
    loading."""

    def save(self, ok, frame=None):
        """Print the URL, metadata and root tag of a frame that finished loading.

        ok    -- success flag delivered by the frame's ``loadFinished`` signal.
        frame -- the child frame that finished loading; ``None`` means the
                 main frame of the view's page.
        """
        if frame is None:
            print('main-frame')
            frame = self.webView.page().mainFrame()
        else:
            print('child-frame')
        if not ok:
            # A failed load has no meaningful document to inspect.
            print('LOAD FAILED: %s' % frame.requestedUrl().toString())
            print()
            return
        print('URL: %s' % frame.baseUrl().toString())
        print('METADATA: %s' % frame.metaData())
        print('TAG: %s' % frame.documentElement().tagName())
        print()

    def handleFrameCreated(self, frame):
        """Watch a newly created child frame so it is reported when loaded."""
        # Forward the real load result instead of always claiming success.
        frame.loadFinished.connect(lambda ok: self.save(ok, frame=frame))

    def main(self):
        """Create the view, connect the frame handlers and start loading."""
        self.webView = QtWebKit.QWebView()
        page = self.webView.page()
        page.frameCreated.connect(self.handleFrameCreated)
        # Connected to the main frame (not the view) so that it fires once.
        page.mainFrame().loadFinished.connect(self.save)
        self.webView.load(QtCore.QUrl("http://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe_scrolling"))
# Use the default SIGINT action so the process can be interrupted from the
# terminal while the Qt event loop is running.
signal.signal(signal.SIGINT, signal.SIG_DFL)
print('Press Ctrl+C to quit\n')
app = QtGui.QApplication(sys.argv)
s = Sp()
s.main()
sys.exit(app.exec_())
NB: it is important that you connect to the loadFinished signal of the main frame rather than the web-view. If you connect to the latter, it will be called multiple times if the page contains more than one frame.
I would like to scrape two websites in java for links using PyQt4.QtWebKit to render the pages and then get the desired links. The code works fine with one page or url, but stops (but continues running until force quit) after printing the links of the first website. It seems the scope stays in the event loop of the render class. How can I get the program to change scope and continue with the for loop and rendering the second website? Using exit() in _loadFinished method just quits the program after the first iteration. Maybe the python app has to close and reopen to render the next page, which is impossible because the app is opened/reopened outside of the program?
import sys
from PyQt4.QtGui import *
from PyQt4.QtCore import *
from PyQt4.QtWebKit import *
from PyQt4 import QtGui
from lxml import html
class Render(QWebPage):
    """Load ``url`` and print up to four link targets found in the rendered
    HTML."""

    def __init__(self, url):
        """Start loading ``url``; the result is handled in ``_loadFinished``."""
        # Initialise the Qt base class before touching the instance.
        QWebPage.__init__(self)
        self.frame = None  # assigned once loading has finished
        self.loadFinished.connect(self._loadFinished)
        self.mainFrame().load(QUrl(url))

    def _loadFinished(self, result):
        """Parse the loaded page and print the first four matching hrefs.

        ``result`` is the bool delivered by ``loadFinished``; it is not used.
        """
        self.frame = self.mainFrame()
        markup = str(self.frame.toHtml())
        tree = html.fromstring(markup)
        # '@href' selects the attribute value; '#href' is not valid XPath.
        archive_links = tree.xpath('//div/div/a/@href')[0:4]
        print(archive_links)
urls = ['http://pycoders.com/archive/', 'http://www.pythonjobshq.com']


def main(urls):
    """Create the application, build a ``Render`` for every URL in ``urls``,
    then run the event loop and exit with its return code."""
    app = QtGui.QApplication(sys.argv)
    for page_url in urls:
        renderer = Render(page_url)
    # The pages can be rendered in parallel (e.g. s = Render(urls[1])), but
    # rendering more than a handful of pages at the same time is a bad idea.
    exit_code = app.exec_()
    sys.exit(exit_code)


if __name__ == '__main__':
    main(urls)
Thankful for any help!
I've been working on a really simple script that gets the question titles from the python tag in stackoverflow and shows them in a QTextBrowser.
The application works as expected (at least at the beginning), but the window doesn't show until it finishes loading the web page, and the refresh button freezes the program until the page has loaded too. Is there a way to fix this problem? Here's the full code:
#!/usr/bin/env python
from PyQt4.QtGui import *
import requests
from bs4 import BeautifulSoup
import lxml
from threading import Thread
class Form(QWidget):
    """Window that lists the question titles from the Stack Overflow
    "python" tag page in a text browser, with a Refresh button."""

    def __init__(self, parent=None):
        """Build the widgets and wire the Refresh button to ``get``."""
        super(Form, self).__init__(parent)
        self.url = "http://www.stackoverflow.com/questions/tagged/python"
        self.browser = QTextBrowser()
        self.connectionlabel = QLabel()
        self.refreshBtn = QPushButton("Refresh")
        self.refreshBtn.clicked.connect(self.get)
        layout = QGridLayout()
        layout.addWidget(self.connectionlabel, 0, 0)
        layout.addWidget(self.refreshBtn, 0, 1)
        layout.addWidget(self.browser, 1, 0, 1, 2)
        self.setLayout(layout)
        self.setWindowTitle("StackOverflow: Python")

    def get(self):
        """Download the page and show its numbered question titles.

        The HTTP request blocks the calling thread until it completes, fails
        or times out. Only request failures are reported as "Couldn't
        connect."; any other error propagates instead of being hidden.
        """
        self.browser.clear()
        self.connectionlabel.setText("Connecting.....")
        try:
            # A timeout keeps a stalled server from blocking forever.
            response = requests.get(self.url, timeout=10)
            # Treat HTTP error statuses as failed connections as well.
            response.raise_for_status()
        except requests.RequestException:
            self.connectionlabel.setText("Couldn't connect.")
            return
        self.connectionlabel.setText("Connected.")
        soup = BeautifulSoup(response.content, 'lxml')
        questions = soup.find_all("a", {"class": "question-hyperlink"})
        for number, questionTitle in enumerate(questions, 1):
            self.browser.append("\n" + str(number) + ". " + questionTitle.text)
if __name__ == '__main__':
    import sys
    app = QApplication(sys.argv)
    screen = Form()
    screen.show()
    # Thread's first positional parameter is ``group`` (which must be None),
    # so the callable has to be passed by keyword as ``target``.
    t = Thread(target=screen.get)
    # ``daemon`` (not ``deamon``): do not keep the process alive for this thread.
    t.daemon = True
    # NOTE(review): get() updates widgets from this worker thread; Qt expects
    # widgets to be used from the GUI thread only -- consider a QThread that
    # emits a signal carrying the results instead.
    t.start()
    sys.exit(app.exec_())
The UI freezes whenever blocking code runs in the GUI thread, because the event loop cannot process events until that code returns. To avoid this, use multiprocessing or threading and call the blocking code in a separate process/thread. You can also use PyQt's QThread.
The code below works fine except one thing, it does not follow the sign-up link.
However if I go to my actual browser and in console type:
document.getElementById("link-signup").click()
It will redirect me to the desired page. I was thinking that the problem occurred because I didn't enable some feature in the settings, but I'm not sure.
Thanks for any help.
#! /usr/bin/env python2.7
from PyQt4.QtCore import *
from PyQt4.QtGui import *
from PyQt4.QtWebKit import *
import sys
class GrabberSettings(QWebPage):
    """QWebPage whose settings have automatic image loading switched off."""

    def __init__(self):
        super(GrabberSettings, self).__init__()
        page_settings = self.settings()
        page_settings.setAttribute(QWebSettings.AutoLoadImages, False)
class Grabber(QWebView):
def __init__(self):
QWebView.__init__(self)
self.setPage(GrabberSettings())
self.loadFinished.connect(self._loadComplete)
self.doc = self.page().mainFrame().documentElement()
def _loadComplete(self):
print "Done"
link = self.doc.findFirst('a[link-signup]')
if link:
print "link found"
link.evaluateJavaScript('click()')
if __name__ == "__main__":
    # Show a Grabber view pointed at the Google accounts page.
    app = QApplication(sys.argv)
    view = Grabber()
    view.load(QUrl('https://accounts.google.com'))
    view.show()
    app.exec_()
I think the click() failure may have something to do with how the google page uses javascript to transform the original A element after it loads. If you wrap your evaluateJavaScript() call in an alert(), you can see that the click method is null
link.evaluateJavaScript('this.click')
Calling "click" on a link is not supported in every browser engine; it is only reliable when the element is a button.
You have a couple alternatives...
(#1) Just navigate to the href of the link
def _loadComplete(self):
    """Follow the sign-up link by loading its ``href`` directly."""
    frame = self.page().currentFrame()
    link = frame.documentElement().findFirst('#link-signup')
    if not link or link.isNull():
        return
    href = link.attribute('href')
    self.load(QUrl.fromEncoded(href.toAscii()))
(#2) Simulate a click on the web view
def _loadComplete(self):
page = self.page()
doc = page.currentFrame().documentElement()
link = doc.findFirst('#link-signup')
if link and not link.isNull():
pos = link.geometry().center()
self._doMouseClick(page, pos)
else:
print "Link not found"
@staticmethod
def _doMouseClick(obj, pos):
    """Send a left-button press followed by a release at ``pos`` to ``obj``.

    Declared as a static method so that ``self._doMouseClick(page, pos)``
    passes exactly ``page`` and ``pos``.
    """
    # mouse down, then mouse up
    for event_type in (QEvent.MouseButtonPress, QEvent.MouseButtonRelease):
        evt = QMouseEvent(event_type, pos,
                          Qt.LeftButton, Qt.LeftButton, Qt.NoModifier)
        QApplication.sendEvent(obj, evt)
(#3) Make the link clickable via javascript
def _loadComplete(self):
    """Dispatch a synthetic JavaScript ``click`` mouse event on the sign-up
    link, if the current frame contains one."""
    frame = self.page().currentFrame()
    link = frame.documentElement().findFirst('#link-signup')
    if not link or link.isNull():
        return
    # Script text assembled line by line; the leading and trailing empty
    # entries reproduce the newlines at both ends of the script.
    script = "\n".join((
        "",
        "var e = document.createEvent('MouseEvents');",
        "e.initEvent('click', true, true);",
        "this.dispatchEvent(e);",
        "",
    ))
    link.evaluateJavaScript(script)
I'm using a PyQt QWebView to visit some web pages I have stored in a dictionary. The code is something like this:
def loadLink(self, url):
    """Start loading ``url`` in the UI's web view.

    url -- address of the page to load, as accepted by ``QUrl``.
    """
    # Load the ``url`` argument; no network manager or request object is
    # needed because the view performs the request itself.
    self.ui.webView.load(QUrl(url))
def readUnreadLinks(self):
print "Links to read: " + str(len(self.unreadLinks))
for link in self.unreadLinks:
print "link-> " + str(link)
self.loadLink(link)
The problem is that it doesn't wait until a page has finished loading before it starts loading the next one. I want to load a web page, wait until it has finished loading, and then load the next one.
Thanks, this is driving me crazy :)
You might want to use the loadFinished signal of your QWebView control to detect when a page has finished loading and then trigger loading of the next one. Please see if the example below works for you:
import sys
from PyQt4 import QtGui, QtCore, QtWebKit
class MainForm(QtGui.QMainWindow):
def __init__(self, parent=None):
super(MainForm, self).__init__(parent)
self.pages = ['http://www.google.com', 'http://www.citrix.com', 'http://yahoo.com', 'http://reddit.com']
self.index = 0
self.view = QtWebKit.QWebView()
self.view.connect(self.view, QtCore.SIGNAL('loadFinished(bool)'), self.loadFinished)
self.setCentralWidget(self.view)
self.view.load(QtCore.QUrl(self.pages[self.index]))
def loadFinished(self, ok):
self.index += 1
if self.index < len(self.pages) :
self.view.load(QtCore.QUrl(self.pages[self.index]))
else:
print 'done'
def main():
    """Create the application, show the main form and run the event loop."""
    app = QtGui.QApplication(sys.argv)
    window = MainForm()
    window.show()
    app.exec_()


if __name__ == '__main__':
    main()
hope this helps, regards