I'm doing Sentdex's PyQt4 YouTube tutorial right here. I'm trying to follow along but use PyQt5 instead. It's a simple web scraping app. I followed along with Sentdex's tutorial and I got here:
Now I'm trying to write the same application with PyQt5 and this is what I have:
import os
import sys
from PyQt5.QtWidgets import QApplication
from PyQt5.QtCore import QUrl, QEventLoop
from PyQt5.QtWebEngineWidgets import QWebEnginePage
from bs4 import BeautifulSoup
import requests
class Client(QWebEnginePage):
def __init__(self, url):
self.app = QApplication(sys.argv)
QWebEnginePage.__init__(self)
self.loadFinished.connect(self._loadFinished)
self.load(QUrl(url))
self.app.exec_()
def _loadFinished(self):
self.app.quit()
url = 'https://pythonprogramming.net/parsememcparseface/'
client_response = Client(url)
#I think the issue is here at LINE 26
source = client_response.mainFrame().toHtml()
soup = BeautifulSoup(source, "html.parser")
js_test = soup.find('p', class_='jstest')
print(js_test.text)
When I run this, I get the message:
source = client_response.mainFrame().toHtml()
AttributeError: 'Client' object has no attribute 'mainFrame'
I've tried a few different solutions but none work. Any help would be appreciated.
EDIT
Logging QUrl(url) on line 15 returns this value:
PyQt5.QtCore.QUrl('https://pythonprogramming.net/parsememcparseface/')
When I try source = client_response.load(QUrl(url)) for line 26, I end up with the message:
File "test3.py", line 28, in <module>
soup = BeautifulSoup(source, "html.parser")
File "/Users/MYNAME/.venv/qtproject/lib/python3.6/site-packages/bs4/__init__.py", line 192, in __init__
elif len(markup) <= 256 and (
TypeError: object of type 'NoneType' has no len()
When I try source = client_response.url() I get:
soup = BeautifulSoup(source, "html.parser")
File "/Users/MYNAME/.venv/qtproject/lib/python3.6/site-packages/bs4/__init__.py", line 192, in __init__
elif len(markup) <= 256 and (
TypeError: object of type 'QUrl' has no len()
you must call the QWebEnginePage::toHtml() inside the definition of the class. QWebEnginePage::toHtml() takes a pointer function or a lambda as a parameter, and this pointer function must in turn take a parameter of 'str' type (this is the parameter that contains the page's html). Here is sample code below.
import bs4 as bs
import sys
import urllib.request
from PyQt5.QtWebEngineWidgets import QWebEnginePage
from PyQt5.QtWidgets import QApplication
from PyQt5.QtCore import QUrl
class Page(QWebEnginePage):
def __init__(self, url):
self.app = QApplication(sys.argv)
QWebEnginePage.__init__(self)
self.html = ''
self.loadFinished.connect(self._on_load_finished)
self.load(QUrl(url))
self.app.exec_()
def _on_load_finished(self):
self.html = self.toHtml(self.Callable)
print('Load finished')
def Callable(self, html_str):
self.html = html_str
self.app.quit()
def main():
page = Page('https://pythonprogramming.net/parsememcparseface/')
soup = bs.BeautifulSoup(page.html, 'html.parser')
js_test = soup.find('p', class_='jstest')
print js_test.text
if __name__ == '__main__': main()
Never too late...
I got the same issue and found description of it here: http://pyqt.sourceforge.net/Docs/PyQt5/gotchas.html#crashes-on-exit
I followed the advice of puting the QApplication in a global variable (I know it is dirty... and I will be punished for that) and it works "fine". I can loop without any crash.
Hope this will help.
Related
working on writing a Python script to scrape a webpage after it has run its JavaScript. I realized I needed the JS to run because using the Requests wasn't returning any data. I found what seemed to be my solution here but I am having some problems still.
First of all that tutorial uses PyQt4, I have installed and tried multiple versions of PyQt 4 and 5 from the project interpreter and still can't find a solution. Here is the relevant code:
import PyQt5.QtWebEngineWidgets
import PyQt5.QtCore
import PyQt5.QtWebEngine
import PyQt5.QtWebEngineCore
class Render(QWebpage):
def __init__(self, url):
self.app = QApplication(sys.argv)
QWebPage.__init__(self)
self.loadFinished.connect(self._loadFinished)
self.mainFrame().load(QUrl(url))
self.app.exec_()
def _load_finished(self, result):
self.frame = self.mainFrame()
self.app.quit()
The QWebpage, QApplication, and QUrl calls all have 'Unresolved Reference' errors, the four PyQt5 import statements also all have 'Unused Import Statement' indications. I have tried for several hours to resolve these issues, uninstalling and reinstalling PyQt several times and searching the internet
Any advice would be awesome, Thanks!
Your imports are incorrect, in python there are many ways to do it: in your case you could be like this:
1.from package import class
import sys
from PyQt5.QtCore import QUrl
from PyQt5.QtWebKitWidgets import QWebPage
from PyQt5.QtWidgets import QApplication
# Take this class for granted.Just use result of rendering.
class Render(QWebPage):
def __init__(self, url):
self.app = QApplication(sys.argv)
QWebPage.__init__(self)
self.loadFinished.connect(self._loadFinished)
self.mainFrame().load(QUrl(url))
self.app.exec_()
def _loadFinished(self, result):
self.frame = self.mainFrame()
self.app.quit()
url = 'http://pycoders.com/archive/'
r = Render(url)
result = r.frame.toHtml()
print(result)
import package, then you should use each element as package.class:
import sys
from PyQt5 import QtWebKitWidgets, QtCore, QtWidgets
class Render(QtWebKitWidgets.QWebPage):
def __init__(self, url):
self.app = QtWidgets.QApplication(sys.argv)
QtWebKitWidgets.QWebPage.__init__(self)
self.loadFinished.connect(self._loadFinished)
self.mainFrame().load(QtCore.QUrl(url))
self.app.exec_()
def _loadFinished(self, result):
self.frame = self.mainFrame()
self.app.quit()
url = 'http://pycoders.com/archive/'
r = Render(url)
result = r.frame.toHtml()
print(result)
If you are using pycharm there is a very simple way that pycharm imports the packages correctly for you, for this you must place the dot above the word that generates the error and execute Ctrl+M
Note:If you are using windows you will not be able to use these modules since Qt and therefore PyQt, use chromium, and it seems that they have a problem with Windows.
I want to send http GET request using PyQt.
Despite my researches, I haven't found any examples of that simple manipulation in python.
I've ended up with some code (that I have modified according to the hints given by Bakuriu in the comments), but it doesn't work. Let's say I want to make a get request to facebook webpage, and print the answer, which should be the HTML content of the page.
from PyQt4 import QtCore, QtNetwork, QtCore, QtGui
from PyQt4.QtCore import *
import sys
from functools import partial
def printContent():
answerAsText = QString(replyObject.readAll())
print answerAsText
app = QtCore.QCoreApplication(sys.argv)
url = QtCore.QUrl("http://www.facebook.com")
request = QtNetwork.QNetworkRequest()
request.setUrl(url)
manager = QtNetwork.QNetworkAccessManager()
replyObject = manager.get(request)
replyObject.finished.connect(printContent)
sys.exit(app.exec_())
This doesn't raise any error, it just doesn't print anything.
I don't know where the problem is :
Is my request wrong ?
Or is it the way I handle the reply object afterwards ?
Why doesn't it work ? Could somebody please show me a functioning code ?
We need to create a QApplication or QtCoreApplication, because we are using the signal and slot mechanism. Notice also that the response has to be decoded from a QByteArray.
Here is a working example:
#!/usr/bin/python
from PyQt5 import QtCore, QtGui, QtNetwork
import sys
class Example:
def __init__(self):
self.doRequest()
def doRequest(self):
url = "http://webcode.me"
req = QtNetwork.QNetworkRequest(QtCore.QUrl(url))
self.nam = QtNetwork.QNetworkAccessManager()
self.nam.finished.connect(self.handleResponse)
self.nam.get(req)
def handleResponse(self, reply):
er = reply.error()
if er == QtNetwork.QNetworkReply.NoError:
bytes_string = reply.readAll()
print(str(bytes_string, 'utf-8'))
else:
print("Error occured: ", er)
print(reply.errorString())
QtCore.QCoreApplication.quit()
if __name__ == '__main__':
app = QtCore.QCoreApplication([])
ex = Example()
sys.exit(app.exec_())
If you run this application, you will get the HTML code of a very simple web page.
Looks like the URL example is bad. Try www.google.com. I've adapted your code for PyQt5. It works.
from PyQt5.QtCore import QUrl
from PyQt5.QtWidgets import QApplication, QMainWindow
from PyQt5.QtNetwork import QNetworkAccessManager, QNetworkRequest
import sys
def printContent():
answerAsText = bytes(replyObject.readAll()).decode("utf-8")
print(answerAsText)
class mainClass():
def my_exception_hook(exctype, value, traceback):
print(exctype, value, traceback)
sys._excepthook(exctype, value, traceback)
sys.exit(1)
sys.excepthook = my_exception_hook
if __name__ == '__main__':
app = QApplication(sys.argv)
url = QUrl("http://www.google.com")
request = QNetworkRequest()
request.setUrl(url)
manager = QNetworkAccessManager()
replyObject = manager.get(request)
replyObject.finished.connect(printContent)
sys.exit(app.exec_())
I would like to scrape two websites in java for links using PyQt4.QtWebKit to render the pages and then get the desired links. The code works fine with one page or url, but stops (but continues running until force quit) after printing the links of the first website. It seems the scope stays in the event loop of the render class. How can I get the program to change scope and continue with the for loop and rendering the second website? Using exit() in _loadFinished method just quits the program after the first iteration. Maybe the python app has to close and reopen to render the next page, which is impossible because the app is opened/reopened outside of the program?
import sys
from PyQt4.QtGui import *
from PyQt4.QtCore import *
from PyQt4.QtWebKit import *
from PyQt4 import QtGui
from lxml import html
class Render(QWebPage):
def __init__(self, url):
self.frame = None
QWebPage.__init__(self)
self.loadFinished.connect(self._loadFinished)
self.mainFrame().load(QUrl(url))
def _loadFinished(self, result):
self.frame = self.mainFrame()
result = self.frame.toHtml()
formatted_result = str(result)
tree = html.fromstring(formatted_result)
archive_links = tree.xpath('//div/div/a/#href')[0:4]
print(archive_links)
urls = ['http://pycoders.com/archive/', 'http://www.pythonjobshq.com']
def main(urls):
app = QtGui.QApplication(sys.argv)
for url in urls:
r = Render(url)
#s = Render(urls[1]) #The pages can be rendered parallel, but rendering more than a handful of pages a the same time is a bad idea
sys.exit(app.exec_())
if __name__ == '__main__':
main(urls)
Thankful for any help!
I've been working on a really simple script that gets the question titles from the python tag in stackoverflow and shows them in a QTextBrowser.
The application works as expected(at least at the beggining) but the Window doesn't show untill it finishes loading the webpage and the refresh button freezes the program until it loads too is there a way to fix this problem? Here's The full code:
#! usr/bin/env python
from PyQt4.QtGui import *
import requests
from bs4 import BeautifulSoup
import lxml
from threading import Thread
class Form(QWidget):
def __init__(self, parent=None):
super(Form, self).__init__(parent)
self.url = "http://www.stackoverflow.com/questions/tagged/python"
self.browser = QTextBrowser()
self.connectionlabel = QLabel()
self.refreshBtn = QPushButton("Refresh")
self.refreshBtn.clicked.connect(self.get)
layout = QGridLayout()
layout.addWidget(self.connectionlabel, 0, 0)
layout.addWidget(self.refreshBtn, 0, 1)
layout.addWidget(self.browser, 1, 0,1,2)
self.setLayout(layout)
self.setWindowTitle("StackOverflow: Python")
def get(self):
self.browser.clear()
self.connectionlabel.setText("Connecting.....")
try:
response = requests.get(self.url)
soup = BeautifulSoup(response.content, 'lxml')
self.connectionlabel.setText("Connected.")
questions = soup.find_all("a", {"class": "question-hyperlink"})
for i, questionTitle in enumerate(questions):
try:
self.browser.append("\n"+str(i+1)+". "+questionTitle.text)
except:
pass
except:
self.connectionlabel.setText("Couldn't connect.")
if __name__ == '__main__':
import sys
app = QApplication(sys.argv)
screen = Form()
screen.show()
t = Thread(screen.get)
t.deamon = True
t.start()
sys.exit(app.exec_())
The UI will always freeze until the code execution is complete. In order to avoid this use multiprocessing or threading and call the blocking code in a separate process/thread. You can also use PyQT's QThread.
I'm using PyQt to fully load a page(including JS) and get it contents using Beautiful Soup. Works fine at the first iteration, but after, it crashes. I don't have a big knowledge in Python, and even less in PyQt, so any help is very welcome.
Class borrowed from here.
from PyQt4.QtCore import QUrl, SIGNAL
from PyQt4.QtGui import QApplication
from PyQt4.QtWebKit import QWebPage
from bs4 import BeautifulSoup
from bs4.dammit import UnicodeDammit
import sys
import signal
class Render(QWebPage):
def __init__(self, url):
self.app = QApplication(sys.argv)
QWebPage.__init__(self)
self.html = None
signal.signal(signal.SIGINT, signal.SIG_DFL)
self.connect(self, SIGNAL('loadFinished(bool)'), self._finished_loading)
self.mainFrame().load(QUrl(url))
self.app.exec_()
def _finished_loading(self, result):
self.html = self.mainFrame().toHtml()
self.soup = BeautifulSoup(UnicodeDammit(self.html).unicode_markup)
self.app.quit()
###################################################################
l = ["http://www.google.com/?q=a", "http://www.google.com/?q=b", "http://www.google.com/?q=c"]
for page in l:
soup = Render(page).soup
print("# soup done: " + page)
The example crashes because the RenderPage class attempts to create a new QApplication and event-loop for every url it tries to load.
Instead, only one QApplication should be created, and the QWebPage subclass should load a new url after each page has been processed, rather than using a for-loop.
Here's a re-write of the example which should do what you want:
import sys, signal
from bs4 import BeautifulSoup
from bs4.dammit import UnicodeDammit
from PyQt4 import QtCore, QtGui, QtWebKit
class WebPage(QtWebKit.QWebPage):
def __init__(self):
QtWebKit.QWebPage.__init__(self)
self.mainFrame().loadFinished.connect(self.handleLoadFinished)
def process(self, items):
self._items = iter(items)
self.fetchNext()
def fetchNext(self):
try:
self._url, self._func = next(self._items)
self.mainFrame().load(QtCore.QUrl(self._url))
except StopIteration:
return False
return True
def handleLoadFinished(self):
self._func(self._url, self.mainFrame().toHtml())
if not self.fetchNext():
print('# processing complete')
QtGui.qApp.quit()
def funcA(url, html):
print('# processing:', url)
# soup = BeautifulSoup(UnicodeDammit(html).unicode_markup)
# do stuff with soup...
def funcB(url, html):
print('# processing:', url)
# soup = BeautifulSoup(UnicodeDammit(html).unicode_markup)
# do stuff with soup...
if __name__ == '__main__':
items = [
('http://stackoverflow.com', funcA),
('http://google.com', funcB),
]
signal.signal(signal.SIGINT, signal.SIG_DFL)
print('Press Ctrl+C to quit\n')
app = QtGui.QApplication(sys.argv)
webpage = WebPage()
webpage.process(items)
sys.exit(app.exec_())