I'm using PyQt to fully load a page (including JS) and get its contents using Beautiful Soup. It works fine on the first iteration, but after that it crashes. I don't have much knowledge of Python, and even less of PyQt, so any help is very welcome.
Class borrowed from here.
from PyQt4.QtCore import QUrl, SIGNAL
from PyQt4.QtGui import QApplication
from PyQt4.QtWebKit import QWebPage
from bs4 import BeautifulSoup
from bs4.dammit import UnicodeDammit
import sys
import signal
# Question code: loads `url` in a QWebPage and blocks inside the constructor
# until the page has finished loading, then exposes the parsed page as `self.soup`.
class Render(QWebPage):
def __init__(self, url):
# A new QApplication is created for every Render instance; the accompanying
# answer identifies this as the cause of the crash on the second URL.
self.app = QApplication(sys.argv)
QWebPage.__init__(self)
self.html = None
# Install the default SIGINT action.
signal.signal(signal.SIGINT, signal.SIG_DFL)
self.connect(self, SIGNAL('loadFinished(bool)'), self._finished_loading)
self.mainFrame().load(QUrl(url))
# Blocks here until _finished_loading() calls self.app.quit().
self.app.exec_()
def _finished_loading(self, result):
# Slot for loadFinished(bool): snapshot the frame's HTML, parse it, then
# stop the event loop so the constructor can return.
self.html = self.mainFrame().toHtml()
self.soup = BeautifulSoup(UnicodeDammit(self.html).unicode_markup)
self.app.quit()
###################################################################
# Driver: render each URL in turn; every iteration constructs a fresh Render
# (and therefore a fresh QApplication).
l = ["http://www.google.com/?q=a", "http://www.google.com/?q=b", "http://www.google.com/?q=c"]
for page in l:
soup = Render(page).soup
print("# soup done: " + page)
The example crashes because the Render class attempts to create a new QApplication and event-loop for every url it tries to load.
Instead, only one QApplication should be created, and the QWebPage subclass should load a new url after each page has been processed, rather than using a for-loop.
Here's a re-write of the example which should do what you want:
import sys, signal
from bs4 import BeautifulSoup
from bs4.dammit import UnicodeDammit
from PyQt4 import QtCore, QtGui, QtWebKit
class WebPage(QtWebKit.QWebPage):
    """Load a queue of URLs one after another, passing each page to a callback.

    A single page object is reused for every URL: the next load is started
    from the load-finished handler rather than from an outer for-loop.
    """

    def __init__(self):
        super(WebPage, self).__init__()
        self.mainFrame().loadFinished.connect(self.handleLoadFinished)

    def process(self, items):
        """Start working through *items*, an iterable of ``(url, func)`` pairs."""
        self._items = iter(items)
        self.fetchNext()

    def fetchNext(self):
        """Begin loading the next queued URL.

        Returns True when a load was started and False once the queue is
        exhausted.
        """
        try:
            pair = next(self._items)
        except StopIteration:
            return False
        self._url, self._func = pair
        self.mainFrame().load(QtCore.QUrl(self._url))
        return True

    def handleLoadFinished(self):
        """Hand the loaded HTML to the current callback, then move on or quit."""
        html = self.mainFrame().toHtml()
        self._func(self._url, html)
        if self.fetchNext():
            return
        print('# processing complete')
        QtGui.qApp.quit()
def funcA(url, html):
    """Example page handler: report *url*; *html* is where parsing would begin."""
    print('# processing:', url)
    # soup = BeautifulSoup(UnicodeDammit(html).unicode_markup)
    # do stuff with soup...
def funcB(url, html):
    """Second example page handler: report *url*; *html* is left unparsed."""
    print('# processing:', url)
    # soup = BeautifulSoup(UnicodeDammit(html).unicode_markup)
    # do stuff with soup...
if __name__ == '__main__':

    # Each entry pairs a URL with the handler that receives its HTML.
    items = [
        ('http://stackoverflow.com', funcA),
        ('http://google.com', funcB),
    ]

    # Use the default SIGINT action, as the banner below promises Ctrl+C quits.
    signal.signal(signal.SIGINT, signal.SIG_DFL)
    print('Press Ctrl+C to quit\n')

    # One QApplication serves every page load.
    app = QtGui.QApplication(sys.argv)
    webpage = WebPage()
    webpage.process(items)
    sys.exit(app.exec_())
Related
My problem is summed in title. When I call method setHtml on instance of QtWebPageRenderer, SIGILL signal is emitted and my application goes down.
I'm aware that this issue is caused by bad Qt5 dynamic library but I installed it with:
sudo pip install PyQt5 --only-binary PyQt5
sudo pip install PyQtWebEngine --only-binary PyQtWebEngine
so I thought I will get correct precompiled library. When I tried to install PyQt5 without --only-binary, I always ended with some strange compilation error. Something like qmake is not in PATH even though it is and I'm able to call qmake from shell.
So my question is, how to make PyQt5 running on Fedora 31 without any SIGILLs.
EDIT:
Following code can replicate the issue. That information about SIGILL is little inaccurate because first signal is actually SIGTRAP, after I hit continue in gdb, I got SIGILL. This hints that Qt is actually trying to say something to me, although in not very intuitive way.
After some playing around with it, I found that without thread, its ok. Does this mean that Qt forces user to use QThread and not python threads? Or it means that I can't call methods of Qt objects outside of thread where event loop is running?
import signal
import sys
import threading
from PyQt5 import QtWidgets
from PyQt5 import QtCore
from PyQt5.QtWebEngineWidgets import QWebEnginePage
# Question code: page object that dumps the loaded document's HTML once
# loading has finished.
class WebView(QWebEnginePage):
def __init__(self):
QWebEnginePage.__init__(self)
self.loadFinished.connect(self.on_load_finish)
def print_result(self, data):
# Callback handed to toHtml(): prints the HTML and writes it, UTF-8 encoded,
# to temp.html.
print("-" * 30)
print(data)
with open("temp.html", "wb") as hndl:
hndl.write(data.encode("utf-8"))
def on_load_finish(self):
# The HTML is delivered to the print_result callback.
self.toHtml(self.print_result)
# Question code: plain Python thread that triggers the page load.
class Runner(threading.Thread):
def __init__(self, web_view):
self.web_view = web_view
threading.Thread.__init__(self)
self.daemon = True
def run(self):
# run() executes in this worker thread while web_view is created by main();
# the accompanying answer points out that touching it from here is not safe.
self.web_view.load(QtCore.QUrl("https://www.worldometers.info/coronavirus/"))
# Entry point: creates the application and the page, then starts the Runner
# thread before entering the Qt event loop.
def main():
signal.signal(signal.SIGINT, signal.SIG_DFL)
app = QtWidgets.QApplication(sys.argv)
web_view = WebView()
runner = Runner(web_view)
runner.start()
app.exec_()
if __name__ == "__main__":
main()
You have to keep several restrictions in mind:
A QObject is not thread-safe so when creating "web_view" in the main thread then it is not safe to modify it in the secondary thread
Since the QWebEnginePage tasks run asynchronously then you need a Qt eventloop.
So if you want to use python's Thread class then you must implement both conditions:
import signal
import sys
import threading
from PyQt5 import QtWidgets
from PyQt5 import QtCore
from PyQt5.QtWebEngineWidgets import QWebEnginePage
class WebView(QWebEnginePage):
    """Web page that prints and saves its HTML as soon as a load completes."""

    def __init__(self):
        super().__init__()
        self.loadFinished.connect(self.on_load_finish)

    def print_result(self, data):
        """Receive the page HTML: echo it to stdout and write it to temp.html."""
        separator = "-" * 30
        print(separator)
        print(data)
        with open("temp.html", "wb") as hndl:
            # The file is opened in binary mode, so encode explicitly.
            hndl.write(data.encode("utf-8"))

    def on_load_finish(self):
        """Ask for the HTML; print_result is called with the text."""
        self.toHtml(self.print_result)
class Runner(threading.Thread):
    """Daemon thread that creates its own WebView and its own Qt event loop."""

    def __init__(self):
        super().__init__()
        self.daemon = True

    def run(self):
        # Both the event loop and the page are created here, inside the
        # worker thread, so the page is only used by the thread that made it.
        loop = QtCore.QEventLoop()
        web_view = WebView()
        target = QtCore.QUrl("https://www.worldometers.info/coronavirus/")
        web_view.load(target)
        loop.exec_()
def main():
    """Create the QApplication, start the Runner thread and enter the event loop."""
    signal.signal(signal.SIGINT, signal.SIG_DFL)
    app = QtWidgets.QApplication(sys.argv)

    # The Runner builds its own WebView, so nothing is passed in.
    runner = Runner()
    runner.start()

    app.exec_()


if __name__ == "__main__":
    main()
In reality QThread and threading.Thread() are native thread handlers of the OS, so in practical terms it can be said that QThread is a threading.Thread() + QObject with an eventloop running on the secondary thread.
On the other hand, if your objective is to call a function from a thread to which it does not belong, then you should use asynchronous methods as pointed out in this answer.
In this case the simplest is to use pyqtSlot + QMetaObject:
import signal
import sys
import threading
from PyQt5 import QtWidgets
from PyQt5 import QtCore
from PyQt5.QtWebEngineWidgets import QWebEnginePage
class WebView(QWebEnginePage):
    """Web page that prints and saves its HTML once a load has finished.

    ``load`` is re-declared as a Qt slot so that it can be invoked by name
    through ``QMetaObject.invokeMethod`` from another thread.
    """

    def __init__(self):
        QWebEnginePage.__init__(self)
        self.loadFinished.connect(self.on_load_finish)

    def print_result(self, data):
        """Callback for toHtml(): print *data* and store it in temp.html."""
        print("-" * 30)
        print(data)
        with open("temp.html", "wb") as hndl:
            hndl.write(data.encode("utf-8"))

    def on_load_finish(self):
        """Request the page HTML; it is delivered to print_result."""
        self.toHtml(self.print_result)

    # The decorator must be active (not commented out): it registers load()
    # as a slot taking a QUrl, which is what invokeMethod(..., "load", ...)
    # looks up by name.
    @QtCore.pyqtSlot(QtCore.QUrl)
    def load(self, url):
        """Slot wrapper that forwards *url* to QWebEnginePage.load."""
        QWebEnginePage.load(self, url)
class Runner(threading.Thread):
    """Daemon thread that asks *web_view* to load a page via a queued call."""

    def __init__(self, web_view):
        super().__init__()
        self.web_view = web_view
        self.daemon = True

    def run(self):
        target = QtCore.QUrl("https://www.worldometers.info/coronavirus/")
        # Invoke the "load" slot by name with a queued connection instead of
        # calling web_view.load() directly from this thread.
        url_arg = QtCore.Q_ARG(QtCore.QUrl, target)
        QtCore.QMetaObject.invokeMethod(
            self.web_view, "load", QtCore.Qt.QueuedConnection, url_arg
        )
def main():
    """Create the application and page, start the Runner, run the event loop."""
    signal.signal(signal.SIGINT, signal.SIG_DFL)
    app = QtWidgets.QApplication(sys.argv)

    # The page is created here and only handed to the worker thread.
    web_view = WebView()
    Runner(web_view).start()

    app.exec_()


if __name__ == "__main__":
    main()
Or functools.partial() + QTimer
from functools import partial
import signal
import sys
import threading
from PyQt5 import QtWidgets
from PyQt5 import QtCore
from PyQt5.QtWebEngineWidgets import QWebEnginePage
class WebView(QWebEnginePage):
    """QWebEnginePage that reports its HTML after every finished load."""

    def __init__(self):
        super().__init__()
        self.loadFinished.connect(self.on_load_finish)

    def print_result(self, data):
        """Print the HTML in *data* and save a UTF-8 copy to temp.html."""
        print("-" * 30)
        print(data)
        with open("temp.html", "wb") as out_file:
            out_file.write(data.encode("utf-8"))

    def on_load_finish(self):
        """Load-finished slot: fetch the HTML through the print_result callback."""
        self.toHtml(self.print_result)
class Runner(threading.Thread):
    """Daemon thread that schedules the page load through a single-shot timer."""

    def __init__(self, web_view):
        super().__init__()
        self.web_view = web_view
        self.daemon = True

    def run(self):
        page_url = QtCore.QUrl("https://www.worldometers.info/coronavirus/")
        # Bind the URL now and hand the call to a zero-delay single-shot
        # timer rather than calling web_view.load() directly from here.
        deferred_load = partial(self.web_view.load, page_url)
        QtCore.QTimer.singleShot(0, deferred_load)
def main():
    """Set up Qt, start the Runner thread for the page, then run the event loop."""
    signal.signal(signal.SIGINT, signal.SIG_DFL)
    app = QtWidgets.QApplication(sys.argv)

    web_view = WebView()
    worker = Runner(web_view)
    worker.start()

    app.exec_()


if __name__ == "__main__":
    main()
I'm working on writing a Python script to scrape a webpage after it has run its JavaScript. I realized I needed the JS to run because using Requests wasn't returning any data. I found what seemed to be my solution here, but I am still having some problems.
First of all that tutorial uses PyQt4, I have installed and tried multiple versions of PyQt 4 and 5 from the project interpreter and still can't find a solution. Here is the relevant code:
import PyQt5.QtWebEngineWidgets
import PyQt5.QtCore
import PyQt5.QtWebEngine
import PyQt5.QtWebEngineCore
# Question code: reported to fail with 'Unresolved Reference' errors for
# QWebpage, QApplication and QUrl.
# NOTE(review): the base class is spelled `QWebpage` here but `QWebPage` in
# __init__ below.
class Render(QWebpage):
def __init__(self, url):
self.app = QApplication(sys.argv)
QWebPage.__init__(self)
# NOTE(review): this connects `_loadFinished`, while the method defined
# below is named `_load_finished`.
self.loadFinished.connect(self._loadFinished)
self.mainFrame().load(QUrl(url))
self.app.exec_()
def _load_finished(self, result):
# Keeps a reference to the main frame and stops the event loop.
self.frame = self.mainFrame()
self.app.quit()
The QWebpage, QApplication, and QUrl calls all have 'Unresolved Reference' errors, the four PyQt5 import statements also all have 'Unused Import Statement' indications. I have tried for several hours to resolve these issues, uninstalling and reinstalling PyQt several times and searching the internet
Any advice would be awesome, Thanks!
Your imports are incorrect, in python there are many ways to do it: in your case you could be like this:
1.from package import class
import sys
from PyQt5.QtCore import QUrl
from PyQt5.QtWebKitWidgets import QWebPage
from PyQt5.QtWidgets import QApplication
# Take this class for granted.Just use result of rendering.
class Render(QWebPage):
    """Load *url* in a QWebPage and block until the page has finished loading.

    After construction ``self.frame`` holds the page's main frame, from
    which the rendered HTML can be read with ``toHtml()``.
    """

    def __init__(self, url):
        # Reuse an already-running QApplication instead of creating a second
        # one, so Render can be instantiated more than once per process.
        app = QApplication.instance()
        if app is None:
            app = QApplication(sys.argv)
        self.app = app
        QWebPage.__init__(self)
        self.loadFinished.connect(self._loadFinished)
        self.mainFrame().load(QUrl(url))
        # Blocks until _loadFinished() quits the event loop.
        self.app.exec_()

    def _loadFinished(self, result):
        """Slot for loadFinished: remember the main frame and stop the loop."""
        self.frame = self.mainFrame()
        self.app.quit()
# Render the page (the constructor returns once loading has finished), then
# read the HTML from the frame that Render stored and print it.
url = 'http://pycoders.com/archive/'
r = Render(url)
result = r.frame.toHtml()
print(result)
2. import package; then you must refer to each element as package.class:
import sys
from PyQt5 import QtWebKitWidgets, QtCore, QtWidgets
class Render(QtWebKitWidgets.QWebPage):
    """Load *url* in a QWebPage and block until the page has finished loading.

    After construction ``self.frame`` holds the page's main frame, from
    which the rendered HTML can be read with ``toHtml()``.
    """

    def __init__(self, url):
        # Reuse an already-running QApplication instead of creating a second
        # one, so Render can be instantiated more than once per process.
        app = QtWidgets.QApplication.instance()
        if app is None:
            app = QtWidgets.QApplication(sys.argv)
        self.app = app
        QtWebKitWidgets.QWebPage.__init__(self)
        self.loadFinished.connect(self._loadFinished)
        self.mainFrame().load(QtCore.QUrl(url))
        # Blocks until _loadFinished() quits the event loop.
        self.app.exec_()

    def _loadFinished(self, result):
        """Slot for loadFinished: remember the main frame and stop the loop."""
        self.frame = self.mainFrame()
        self.app.quit()
# Render the page (the constructor returns once loading has finished), then
# read the HTML from the frame that Render stored and print it.
url = 'http://pycoders.com/archive/'
r = Render(url)
result = r.frame.toHtml()
print(result)
If you are using pycharm there is a very simple way that pycharm imports the packages correctly for you, for this you must place the dot above the word that generates the error and execute Ctrl+M
Note:If you are using windows you will not be able to use these modules since Qt and therefore PyQt, use chromium, and it seems that they have a problem with Windows.
I'm doing Sentdex's PyQt4 YouTube tutorial right here. I'm trying to follow along but use PyQt5 instead. It's a simple web scraping app. I followed along with Sentdex's tutorial and I got here:
Now I'm trying to write the same application with PyQt5 and this is what I have:
import os
import sys
from PyQt5.QtWidgets import QApplication
from PyQt5.QtCore import QUrl, QEventLoop
from PyQt5.QtWebEngineWidgets import QWebEnginePage
from bs4 import BeautifulSoup
import requests
# Question code: loads `url` in a QWebEnginePage and blocks inside the
# constructor until loadFinished fires.
class Client(QWebEnginePage):
def __init__(self, url):
self.app = QApplication(sys.argv)
QWebEnginePage.__init__(self)
self.loadFinished.connect(self._loadFinished)
self.load(QUrl(url))
self.app.exec_()
def _loadFinished(self):
# Only stops the event loop; the page HTML is not captured here.
self.app.quit()
# Driver: render the test page, then look for the <p class="jstest"> element.
url = 'https://pythonprogramming.net/parsememcparseface/'
client_response = Client(url)
#I think the issue is here at LINE 26
# The reported AttributeError ('Client' object has no attribute 'mainFrame')
# is raised by the next line.
source = client_response.mainFrame().toHtml()
soup = BeautifulSoup(source, "html.parser")
js_test = soup.find('p', class_='jstest')
print(js_test.text)
When I run this, I get the message:
source = client_response.mainFrame().toHtml()
AttributeError: 'Client' object has no attribute 'mainFrame'
I've tried a few different solutions but none work. Any help would be appreciated.
EDIT
Logging QUrl(url) on line 15 returns this value:
PyQt5.QtCore.QUrl('https://pythonprogramming.net/parsememcparseface/')
When I try source = client_response.load(QUrl(url)) for line 26, I end up with the message:
File "test3.py", line 28, in <module>
soup = BeautifulSoup(source, "html.parser")
File "/Users/MYNAME/.venv/qtproject/lib/python3.6/site-packages/bs4/__init__.py", line 192, in __init__
elif len(markup) <= 256 and (
TypeError: object of type 'NoneType' has no len()
When I try source = client_response.url() I get:
soup = BeautifulSoup(source, "html.parser")
File "/Users/MYNAME/.venv/qtproject/lib/python3.6/site-packages/bs4/__init__.py", line 192, in __init__
elif len(markup) <= 256 and (
TypeError: object of type 'QUrl' has no len()
You must call QWebEnginePage::toHtml() inside the definition of the class. QWebEnginePage::toHtml() takes a callback function or a lambda as a parameter, and this callback must in turn take a parameter of 'str' type (this is the parameter that contains the page's HTML). Here is sample code below.
import bs4 as bs
import sys
import urllib.request
from PyQt5.QtWebEngineWidgets import QWebEnginePage
from PyQt5.QtWidgets import QApplication
from PyQt5.QtCore import QUrl
class Page(QWebEnginePage):
    """Load *url*, block until its HTML is available, and expose it as ``html``.

    The constructor runs the Qt event loop and only returns after
    ``Callable`` has stored the page's HTML in ``self.html``.
    """

    def __init__(self, url):
        # Reuse an already-running QApplication instead of creating a second
        # one, so Page can be instantiated more than once per process.
        app = QApplication.instance()
        if app is None:
            app = QApplication(sys.argv)
        self.app = app
        QWebEnginePage.__init__(self)
        self.html = ''
        self.loadFinished.connect(self._on_load_finished)
        self.load(QUrl(url))
        self.app.exec_()

    def _on_load_finished(self):
        """Slot for loadFinished: request the HTML via the Callable callback."""
        # toHtml() hands the markup to the callback; its return value is not
        # the HTML, so it must not be assigned to self.html.
        self.toHtml(self.Callable)
        print('Load finished')

    def Callable(self, html_str):
        """Receive the page HTML, store it and stop the event loop."""
        self.html = html_str
        self.app.quit()
def main():
    """Render the test page and print the text of its ``jstest`` paragraph."""
    page = Page('https://pythonprogramming.net/parsememcparseface/')
    soup = bs.BeautifulSoup(page.html, 'html.parser')
    js_test = soup.find('p', class_='jstest')
    # print() call form: the snippet targets Python 3 (it imports
    # urllib.request), where the print statement is a syntax error.
    print(js_test.text)


if __name__ == '__main__':
    main()
Never too late...
I got the same issue and found description of it here: http://pyqt.sourceforge.net/Docs/PyQt5/gotchas.html#crashes-on-exit
I followed the advice of putting the QApplication in a global variable (I know it is dirty... and I will be punished for that) and it works "fine". I can loop without any crash.
Hope this will help.
I want to send http GET request using PyQt.
Despite my researches, I haven't found any examples of that simple manipulation in python.
I've ended up with some code (that I have modified according to the hints given by Bakuriu in the comments), but it doesn't work. Let's say I want to make a get request to facebook webpage, and print the answer, which should be the HTML content of the page.
from PyQt4 import QtCore, QtNetwork, QtCore, QtGui
from PyQt4.QtCore import *
import sys
from functools import partial
# Question code: slot connected to the reply's `finished` signal; reads the
# whole body of the module-level replyObject and prints it.
def printContent():
answerAsText = QString(replyObject.readAll())
print answerAsText
# Question code: issue a GET request and print the reply when it finishes.
app = QtCore.QCoreApplication(sys.argv)
url = QtCore.QUrl("http://www.facebook.com")
request = QtNetwork.QNetworkRequest()
request.setUrl(url)
manager = QtNetwork.QNetworkAccessManager()
# get() returns the reply object immediately; printContent runs when its
# `finished` signal fires inside the event loop started below.
replyObject = manager.get(request)
replyObject.finished.connect(printContent)
sys.exit(app.exec_())
This doesn't raise any error, it just doesn't print anything.
I don't know where the problem is :
Is my request wrong ?
Or is it the way I handle the reply object afterwards ?
Why doesn't it work ? Could somebody please show me a functioning code ?
We need to create a QApplication or QCoreApplication, because we are using the signal and slot mechanism. Notice also that the response has to be decoded from a QByteArray.
Here is a working example:
#!/usr/bin/python
from PyQt5 import QtCore, QtGui, QtNetwork
import sys
class Example:
    """Issue one HTTP GET through QNetworkAccessManager and print the result."""

    def __init__(self):
        self.doRequest()

    def doRequest(self):
        """Send the GET request; handleResponse runs when it finishes."""
        url = "http://webcode.me"
        request = QtNetwork.QNetworkRequest(QtCore.QUrl(url))

        # Kept on self so the manager is still referenced after this returns.
        self.nam = QtNetwork.QNetworkAccessManager()
        self.nam.finished.connect(self.handleResponse)
        self.nam.get(request)

    def handleResponse(self, reply):
        """Print the reply body (or the error) and quit the application."""
        error = reply.error()
        if error != QtNetwork.QNetworkReply.NoError:
            print("Error occured: ", error)
            print(reply.errorString())
        else:
            payload = reply.readAll()
            print(str(payload, 'utf-8'))
        QtCore.QCoreApplication.quit()
if __name__ == '__main__':
    # An application object must exist before the request is made, because
    # the response arrives through the signal/slot mechanism.
    app = QtCore.QCoreApplication([])
    ex = Example()
    sys.exit(app.exec_())
If you run this application, you will get the HTML code of a very simple web page.
Looks like the URL example is bad. Try www.google.com. I've adapted your code for PyQt5. It works.
from PyQt5.QtCore import QUrl
from PyQt5.QtWidgets import QApplication, QMainWindow
from PyQt5.QtNetwork import QNetworkAccessManager, QNetworkRequest
import sys
def printContent():
    """Print the body of the module-level ``replyObject`` decoded as UTF-8."""
    raw = bytes(replyObject.readAll())
    answerAsText = raw.decode("utf-8")
    print(answerAsText)
class mainClass():
    """Namespace that installs an exception hook when the class is defined."""

    def my_exception_hook(exctype, value, traceback):
        """Print the exception, run the interpreter's original hook, exit 1.

        Takes the standard ``sys.excepthook`` arguments: exception type,
        exception instance and traceback object.
        """
        print(exctype, value, traceback)
        # sys.__excepthook__ is the interpreter's original hook; the
        # previously used sys._excepthook was never defined and raised
        # AttributeError inside the hook.
        sys.__excepthook__(exctype, value, traceback)
        sys.exit(1)

    # Runs once, while the class body executes.
    sys.excepthook = my_exception_hook
if __name__ == '__main__':
    app = QApplication(sys.argv)

    # Build the GET request.
    url = QUrl("http://www.google.com")
    request = QNetworkRequest()
    request.setUrl(url)

    # printContent reads replyObject as a global once `finished` fires.
    manager = QNetworkAccessManager()
    replyObject = manager.get(request)
    replyObject.finished.connect(printContent)

    sys.exit(app.exec_())
I've been working on a really simple script that gets the question titles from the python tag in stackoverflow and shows them in a QTextBrowser.
The application works as expected (at least at the beginning), but the window doesn't show until it finishes loading the webpage, and the refresh button freezes the program until it loads too. Is there a way to fix this problem? Here's the full code:
#! usr/bin/env python
from PyQt4.QtGui import *
import requests
from bs4 import BeautifulSoup
import lxml
from threading import Thread
# Question code: widget with a status label, a Refresh button and a text
# browser listing question titles scraped from self.url.
class Form(QWidget):
def __init__(self, parent=None):
super(Form, self).__init__(parent)
self.url = "http://www.stackoverflow.com/questions/tagged/python"
self.browser = QTextBrowser()
self.connectionlabel = QLabel()
self.refreshBtn = QPushButton("Refresh")
# Clicking Refresh calls get() directly.
self.refreshBtn.clicked.connect(self.get)
layout = QGridLayout()
layout.addWidget(self.connectionlabel, 0, 0)
layout.addWidget(self.refreshBtn, 0, 1)
layout.addWidget(self.browser, 1, 0,1,2)
self.setLayout(layout)
self.setWindowTitle("StackOverflow: Python")
def get(self):
# Downloads self.url, parses it and appends each question title to the
# browser; the label is updated to reflect the connection state.
self.browser.clear()
self.connectionlabel.setText("Connecting.....")
try:
# Synchronous download: get() does not continue until requests.get() returns.
response = requests.get(self.url)
soup = BeautifulSoup(response.content, 'lxml')
self.connectionlabel.setText("Connected.")
questions = soup.find_all("a", {"class": "question-hyperlink"})
for i, questionTitle in enumerate(questions):
try:
self.browser.append("\n"+str(i+1)+". "+questionTitle.text)
except:
# Any failure appending a single title is silently ignored.
pass
except:
# Bare except: every error in the block above is reported as a connection failure.
self.connectionlabel.setText("Couldn't connect.")
# Entry point: show the form, then try to run the first fetch on a thread.
if __name__ == '__main__':
import sys
app = QApplication(sys.argv)
screen = Form()
screen.show()
# NOTE(review): screen.get is passed positionally rather than as `target=`
# - confirm this is what threading.Thread expects.
t = Thread(screen.get)
# NOTE(review): attribute is spelled `deamon`; the Runner classes elsewhere
# in this document set `daemon` - verify.
t.deamon = True
t.start()
sys.exit(app.exec_())
The UI will always freeze until the code execution is complete. In order to avoid this use multiprocessing or threading and call the blocking code in a separate process/thread. You can also use PyQT's QThread.