How to scrape several websites with pyqt4, scope change? - python

I would like to scrape two JavaScript-driven websites for links, using PyQt4.QtWebKit to render the pages and then get the desired links. The code works fine with one page or URL, but with two it stalls after printing the links of the first website (the process keeps running until it is force-quit). It seems the scope stays in the event loop of the Render class. How can I get the program to change scope, continue with the for loop, and render the second website? Using exit() in the _loadFinished method just quits the program after the first iteration. Maybe the application has to close and reopen to render the next page, which is impossible because the app is opened/reopened outside of the program?
import sys
from PyQt4.QtGui import *
from PyQt4.QtCore import *
from PyQt4.QtWebKit import *
from PyQt4 import QtGui
from lxml import html
class Render(QWebPage):
    """Web page that loads *url* and prints the first links it finds.

    The constructor only starts the load. The links are printed from
    ``_loadFinished``, which runs when the page emits ``loadFinished``.
    """

    def __init__(self, url):
        self.frame = None
        QWebPage.__init__(self)
        self.loadFinished.connect(self._loadFinished)
        self.mainFrame().load(QUrl(url))

    def _loadFinished(self, result):
        """Parse the rendered HTML and print up to four link targets.

        ``result`` is the value delivered by the ``loadFinished`` signal.
        """
        self.frame = self.mainFrame()
        rendered = str(self.frame.toHtml())
        tree = html.fromstring(rendered)
        # XPath selects attributes with "@"; the former "#href" step is not
        # valid XPath syntax, so the hrefs could never be returned.
        archive_links = tree.xpath('//div/div/a/@href')[0:4]
        print(archive_links)
urls = ['http://pycoders.com/archive/', 'http://www.pythonjobshq.com']


def main(urls):
    """Render every URL in *urls* one after another, then exit.

    Each ``Render`` page is created only after the previous one has emitted
    ``loadFinished``; once no URLs are left the event loop is asked to quit,
    so the process terminates instead of idling in ``app.exec_()``.
    """
    app = QtGui.QApplication(sys.argv)
    pending = list(urls)
    if not pending:
        # Nothing to load: do not enter an event loop nobody would quit.
        return

    # Keep a reference to every page. Rebinding a single name would drop the
    # only reference to the previous page while it may still be in use.
    pages = []

    def load_next(_ok=True):
        """Start the next pending page, or quit when all are done."""
        if not pending:
            app.quit()
            return
        page = Render(pending.pop(0))
        # Connected after Render's own handler, so the links of this page
        # are printed before the next page is started.
        page.loadFinished.connect(load_next)
        pages.append(page)

    # The pages could also be rendered in parallel, but rendering more than a
    # handful of pages at the same time is a bad idea.
    load_next()
    sys.exit(app.exec_())


if __name__ == '__main__':
    main(urls)
Thankful for any help!

Related

Proxies when using PyQt to render HTML

I'm looking to scrape JavaScript-driven pages using this code, which has appeared on a number of past threads (c.f. this, this, and others on offsite threads):
import sys
from PyQt5.QtCore import QEventLoop
from PyQt5.QtWidgets import QApplication
from PyQt5.QtWebEngineWidgets import QWebEngineView
def render(source_html):
    """Return the HTML produced by loading *source_html* in a QWebEngineView.

    The call blocks, pumping Qt events, until the page's ``toHtml`` callback
    has delivered the rendered markup.
    """

    class Render(QWebEngineView):
        """View that loads an HTML string and captures the rendered result."""

        def __init__(self, html):
            self.html = None
            # Qt allows a single application object per process; reuse an
            # existing one so render() can be called more than once.
            self.app = QApplication.instance() or QApplication(sys.argv)
            QWebEngineView.__init__(self)
            self.loadFinished.connect(self._loadFinished)
            self.setHtml(html)
            # Process events until _callable has stored the rendered HTML.
            while self.html is None:
                self.app.processEvents(
                    QEventLoop.ExcludeUserInputEvents
                    | QEventLoop.ExcludeSocketNotifiers
                    | QEventLoop.WaitForMoreEvents
                )
            self.app.quit()

        def _callable(self, data):
            """Receive the HTML passed back by ``page().toHtml``."""
            self.html = data

        def _loadFinished(self, result):
            """Request the page HTML; it is delivered to ``_callable``."""
            self.page().toHtml(self._callable)

    return Render(source_html).html
It's working fine.
My question is whether I need to use a proxy for this portion of the code (assuming that I generally want to be using a proxy for all network activity).
I'm using urllib.request and a proxy to access the site in question, and then passing the html from there on to PyQt5 to do the JavaScript mambo. Does that second leg of the journey involve a network connection that should be proxy-fied? If so, how should I change this code - haven't touched PyQt until today and am feeling a bit over my head.
Using Python 3.5 and Windows 7.
Many thanks.

How to get the html dom of a webpage and its frames

I would like to get the DOM of a website after js execution.
I would also like to get all the content of the iframes in the website, similarly to what I have in Google Chrome's Inspect Element feature.
This is my code:
import sys
from PyQt4 import QtGui, QtCore, QtWebKit
class Sp():
    """Loads a fixed page in a QWebView and dumps its markup when loaded."""

    def save(self):
        """Print the inner XML of the current frame's document element."""
        print ("call")
        current_frame = self.webView.page().currentFrame()
        inner_xml = current_frame.documentElement().toInnerXml()
        print(inner_xml.encode('utf-8'))
        print ('finished')

    def main(self):
        """Create the view, start the load, and hook ``save`` to loadFinished."""
        self.webView = QtWebKit.QWebView()
        target = QtCore.QUrl("http://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe_scrolling")
        self.webView.load(target)
        QtCore.QObject.connect(
            self.webView,
            QtCore.SIGNAL("loadFinished(bool)"),
            self.save,
        )
# Script entry: create the application object, build the scraper and start
# the page load, then hand control to the Qt event loop.
app = QtGui.QApplication(sys.argv)
s = Sp()
s.main()
# The process exit code is whatever app.exec_() returns.
sys.exit(app.exec_())
This gives me the html of the website, but not the html inside the iframes. Is there any way that I could get the HTML of the iframes.
This is a very hard problem to solve in general.
The main difficulty is that there is no way to know in advance how many frames each page has. And in addition to that, each child-frame may have its own set of frames, the number of which is also unknown. In theory, there could be an infinite number of nested frames, and the page will never finish loading (which seems no exaggeration for sites that have a lot of ads).
Anyway, below is a version of your script which gets the top-level QWebFrame object of each frame as it loads, and shows how you can access some of the things you are interested in. As you will see from the output, there are a lot of "junk" frames inserted by ads and such like that you will somehow need to filter out.
import sys, signal
from PyQt4 import QtGui, QtCore, QtWebKit
class Sp():
    """Reports details of the main frame and of every child frame as it loads."""

    def save(self, ok, frame=None):
        """Print the URL, metadata and root tag of *frame*.

        When *frame* is omitted the page's main frame is reported instead.
        """
        if frame is not None:
            print('child-frame')
        else:
            print ('main-frame')
            frame = self.webView.page().mainFrame()
        base_url = frame.baseUrl().toString()
        print('URL: %s' % base_url)
        meta = frame.metaData()
        print('METADATA: %s' % meta)
        root_tag = frame.documentElement().tagName()
        print('TAG: %s' % root_tag)
        print()

    def handleFrameCreated(self, frame):
        """Report *frame* through ``save`` once it has finished loading."""
        def report_child():
            self.save(True, frame=frame)

        frame.loadFinished.connect(report_child)

    def main(self):
        """Create the view, wire up the frame signals, and start the load."""
        self.webView = QtWebKit.QWebView()
        page = self.webView.page()
        page.frameCreated.connect(self.handleFrameCreated)
        page.mainFrame().loadFinished.connect(self.save)
        target = QtCore.QUrl("http://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe_scrolling")
        self.webView.load(target)
# Restore the default SIGINT handler so the quit hint printed below works
# while the Qt event loop is running.
signal.signal(signal.SIGINT, signal.SIG_DFL)
print('Press Crtl+C to quit\n')
# Create the application, start the scraper, and enter the event loop.
app = QtGui.QApplication(sys.argv)
s = Sp()
s.main()
sys.exit(app.exec_())
NB: it is important that you connect to the loadFinished signal of the main frame rather than the web-view. If you connect to the latter, it will be called multiple times if the page contains more than one frame.

WebScraping, PyQt

I've been working on a really simple script that gets the question titles from the python tag in stackoverflow and shows them in a QTextBrowser.
The application works as expected (at least at the beginning), but the window doesn't show until it finishes loading the webpage, and the refresh button freezes the program until the page has loaded too. Is there a way to fix this problem? Here's the full code:
#!/usr/bin/env python
from PyQt4.QtGui import *
import requests
from bs4 import BeautifulSoup
import lxml
from threading import Thread
class Form(QWidget):
    """Window listing the question titles of Stack Overflow's python tag."""

    def __init__(self, parent=None):
        super(Form, self).__init__(parent)
        self.url = "http://www.stackoverflow.com/questions/tagged/python"
        self.browser = QTextBrowser()
        self.connectionlabel = QLabel()
        self.refreshBtn = QPushButton("Refresh")
        self.refreshBtn.clicked.connect(self.get)
        layout = QGridLayout()
        layout.addWidget(self.connectionlabel, 0, 0)
        layout.addWidget(self.refreshBtn, 0, 1)
        layout.addWidget(self.browser, 1, 0, 1, 2)
        self.setLayout(layout)
        self.setWindowTitle("StackOverflow: Python")

    def get(self):
        """Download the question list and show the numbered titles.

        The status label reads "Couldn't connect." only when the HTTP request
        itself fails; other errors are no longer reported as connection
        problems.
        """
        self.browser.clear()
        self.connectionlabel.setText("Connecting.....")
        # Keep the try body to the one call that can fail to connect.
        try:
            response = requests.get(self.url)
        except requests.RequestException:
            self.connectionlabel.setText("Couldn't connect.")
            return
        soup = BeautifulSoup(response.content, 'lxml')
        self.connectionlabel.setText("Connected.")
        questions = soup.find_all("a", {"class": "question-hyperlink"})
        for number, questionTitle in enumerate(questions, 1):
            try:
                self.browser.append("\n" + str(number) + ". " + questionTitle.text)
            except Exception:
                # Best effort, as before: skip a title that cannot be shown,
                # but no longer swallow KeyboardInterrupt/SystemExit.
                continue
if __name__ == '__main__':
    import sys
    app = QApplication(sys.argv)
    screen = Form()
    screen.show()
    # Thread's first positional parameter is ``group``, which must be None,
    # so the callable has to be passed as ``target``.
    t = Thread(target=screen.get)
    # The flag is spelled ``daemon``; "deamon" only set an unused attribute,
    # leaving the worker able to keep the process alive after the GUI exits.
    t.daemon = True
    # NOTE(review): get() updates widgets from this worker thread. Qt expects
    # widgets to be used from the GUI thread, so consider fetching in the
    # worker and passing the result back through a signal (e.g. QThread).
    t.start()
    sys.exit(app.exec_())
The UI will always freeze until the code execution is complete. In order to avoid this, use multiprocessing or threading and call the blocking code in a separate process/thread. You can also use PyQt's QThread.

QWebView get response

I have a python code with PySide that has a QWebView that shows google maps.
I just want to get the response each time that I do any request using the QWebView widget.
I have searched info but there is no reference about getting a response with PySide. If you need me to paste some code I will but I just have a simple QWebView widget.
EDIT: You asked me for the code:
from PySide.QtCore import *
from PySide.QtGui import *
import sys
import pyside3
class MainDialog(QMainWindow, pyside3.Ui_MainWindow):
    """Main window that mixes QMainWindow with ``pyside3.Ui_MainWindow``."""

    def __init__(self, parent=None):
        """Set up the UI, connect the button handlers, and fill the web view."""
        super(MainDialog,self).__init__(parent)
        self.setupUi(self)
        token_fb=""
        #self.Connect_buttom.clicked.connect(self.get_fb_token)
        # NOTE(review): get_fb_token is not defined in the visible snippet.
        self.Connect_buttom.clicked.connect(lambda: self.get_fb_token(self.FB_username.text(), self.FB_password.text()))
        #self.connect(self.Connect_buttom, SIGNAL("clicked()"), self.get_fb_token)
        #Change between locate and hunt
        self.MapsButton.clicked.connect(lambda: self.select_page_index(0))
        self.HuntButton.clicked.connect(lambda: self.select_page_index(1))
        ###########################
        # NOTE(review): URL is not defined in the visible snippet; setHtml()
        # returns nothing, so there is no "response" to read from this call.
        self.webView.setHtml(URL)

    def select_page_index(self, index): # To change between frames
        """Make *index* the current index of ``self.Container``."""
        self.Container.setCurrentIndex(index)
I need the response from: self.webView.setHtml(URL) because depending on the response my app has to do one thing or other.
Function QWebView.setHtml() has no response in the sense that it doesn't return anything.
Maybe you want to listen to all links that are clicked and do something custom with it.
# Delegate every link activation to the application and route it to a handler.
web_view = QtWebKit.QWebView()
web_view.page().setLinkDelegationPolicy(QtWebKit.QWebPage.DelegateAllLinks)
# your_handler is a placeholder for the caller's own slot.
web_view.linkClicked.connect(your_handler)
Or maybe you want to do something when loading has finished. This is done by:
# Run a handler each time the view emits loadFinished.
web_view = QtWebKit.QWebView()
# your_handler is a placeholder for the caller's own slot.
web_view.loadFinished.connect(your_handler)

Retrieve HTML from webpage in PyQt

I currently have a Python script running that logs the given user in to the web page and navigates to a different page on the site after logging in. What I'm aiming for is to get the raw HTML of the final page after it has finished loading. I have tried different variations of the Render class, but it seems to throw a "frame is not an attribute of Render" error. Like I said, my main goal is just to get the HTML of the page. What am I doing incorrectly?
# Page loaded first; the load-finished handler submits the login form there.
url = "https://firstwebpage.com/"
url3 = "https://finaldestinationpage.com" #the page that I want the HTML from
# Credentials written into the login form by the load-finished handler.
username = "username"
password = "password"
import sys, signal
from PyQt4.QtCore import *
from PyQt4.QtGui import *
from PyQt4.QtWebKit import *
class Render(QWebPage):
    """Web page that loads *url* and blocks until the load has finished.

    After construction ``frame`` holds the page's main frame. The *app*
    argument is accepted for backward compatibility but is not used.
    """

    def __init__(self, app, url):
        QWebPage.__init__(self)
        # Defined up front so the attribute always exists, even if it is read
        # before the load-finished handler has run.
        self.frame = None
        # Wait on a local event loop instead of calling app.exec_() here:
        # re-entering the application's main loop from a constructor does not
        # wait for this page, and nothing ever ended it.
        self._loop = QEventLoop()
        self.loadFinished.connect(self._loadFinished)
        self.mainFrame().load(QUrl(url))
        self._loop.exec_()

    def _loadFinished(self, result):
        """Store the main frame and release the constructor's wait."""
        self.frame = self.mainFrame()
        self._loop.quit()
def JSEval(code):
    """Evaluate *code* in the main frame of ``webpage`` and return the result."""
    main_frame = webpage.mainFrame()
    outcome = main_frame.evaluateJavaScript(code)
    return outcome
def onLoadStarted():
    """Print the URL of the main frame of ``webpage`` when a load starts."""
    current_url = webpage.mainFrame().url().toString()
    print("Loading started: %s" % current_url)
def onLoadFinished(result):
    """Handle a finished load: submit the login form, print the final HTML.

    ``result`` is the success flag from ``loadFinished``. On failure only a
    message is printed. Otherwise the login form is filled in and submitted,
    and if the page is on ``url3`` its HTML is printed.
    """
    print("Loading finished: %s" % webpage.mainFrame().url().toString())
    if not result:
        print("Request failed")
        return
    JSEval("_form = document.getElementsByName('loginForm')[0];")
    JSEval("_form.username.value='%s';" % username
           + "_form.password.value='%s';" % password
           + "_form.submit();")
    print("Login data sent")
    if webpage.mainFrame().url().toString() == url3:
        # This handler runs after the page has finished loading, so its HTML
        # is read here directly. The previous code built a second Render page
        # and read ``r.frame`` before that page had set it.
        html = webpage.mainFrame().toHtml()
        print(html)
# Script entry: create the application and the page, connect the load
# handlers, start loading, show the page in a view, and run the event loop.
app = QApplication(sys.argv)
signal.signal(signal.SIGINT, signal.SIG_DFL)
webpage = QWebPage()
webpage.connect(webpage, SIGNAL("loadFinished(bool)"), onLoadFinished)
webpage.connect(webpage, SIGNAL("loadStarted()"), onLoadStarted)
webpage.mainFrame().load(QUrl(url)) #where user is initally logged in
# NOTE(review): this second load() is issued immediately after the first on
# the same frame; presumably it replaces the pending load of `url` before the
# login can happen. Confirm, and consider loading url3 only after the login.
webpage.mainFrame().load(QUrl(url3))
web = QWebView()
web.setPage(webpage)
web.show()
sys.exit(app.exec_())
In your Render class, you set the frame attribute in your handler for loadFinished, but when you use it you try to access it immediately after you have created your Render instance. By that time, chances are very high that the page hasn't been loaded yet and therefore the signal hasn't fired, so r.frame will not yet exist.
If you want to get the html content of the page, you can only do it after the page has finished loading, so it would be better to do it in your signal handler.
Oh, and you shouldn't call app.exec_() in Render.__init__.

Categories