PyQt5 on Pycharm, Modules not used - python

I am working on writing a Python script to scrape a webpage after it has run its JavaScript. I realized I needed the JS to run because using Requests alone wasn't returning any data. I found what seemed to be my solution here, but I am still having some problems.
First of all that tutorial uses PyQt4, I have installed and tried multiple versions of PyQt 4 and 5 from the project interpreter and still can't find a solution. Here is the relevant code:
import PyQt5.QtWebEngineWidgets
import PyQt5.QtCore
import PyQt5.QtWebEngine
import PyQt5.QtWebEngineCore
# Question's exhibit: a page object meant to load `url`, run the Qt event loop
# until loading has finished, and keep the main frame for later inspection.
# NOTE(review): indentation was lost in this paste; lines are left as found.
# NOTE(review): the base class is spelled `QWebpage` here but `QWebPage` in the
# explicit __init__ call below -- these are two different identifiers.
# NOTE(review): `sys` is used below but is not imported in the lines shown.
class Render(QWebpage):
def __init__(self, url):
# The application object is created before the base-class initialiser runs.
self.app = QApplication(sys.argv)
QWebPage.__init__(self)
# NOTE(review): this connects `self._loadFinished`, but the handler defined
# below is named `_load_finished`.
self.loadFinished.connect(self._loadFinished)
self.mainFrame().load(QUrl(url))
# Enter the event loop; the handler below is what calls quit().
self.app.exec_()
def _load_finished(self, result):
# Keep a handle on the main frame, then ask the application to quit.
self.frame = self.mainFrame()
self.app.quit()
The QWebpage, QApplication, and QUrl calls all have 'Unresolved Reference' errors, and the four PyQt5 import statements all have 'Unused Import Statement' indications. I have tried for several hours to resolve these issues, uninstalling and reinstalling PyQt several times and searching the internet.
Any advice would be awesome, Thanks!

Your imports are incorrect. In Python there are several ways to import names; in your case you could do one of the following:
1.from package import class
import sys
from PyQt5.QtCore import QUrl
from PyQt5.QtWebKitWidgets import QWebPage
from PyQt5.QtWidgets import QApplication
# Take this class for granted.Just use result of rendering.
class Render(QWebPage):
    """Load ``url`` in a QWebPage and block until loading has finished.

    The constructor creates the QApplication, starts the load and runs the
    event loop; ``_loadFinished`` stores the main frame in ``self.frame``
    and stops the loop, so ``frame`` is available once construction returns.
    """

    def __init__(self, url):
        # Create the application object before initialising the page itself.
        self.app = QApplication(sys.argv)
        super().__init__()
        self.loadFinished.connect(self._loadFinished)
        self.mainFrame().load(QUrl(url))
        # Run the event loop here; _loadFinished() is what ends it.
        self.app.exec_()

    def _loadFinished(self, result):
        """Slot for ``loadFinished``: keep the main frame and quit the loop."""
        self.frame = self.mainFrame()
        self.app.quit()
# Render one page and print its HTML. Constructing Render runs the event loop
# until its _loadFinished handler has stored `frame` and called quit().
url = 'http://pycoders.com/archive/'
r = Render(url)
result = r.frame.toHtml()
print(result)
2. import package, then refer to each element as package.class:
import sys
from PyQt5 import QtWebKitWidgets, QtCore, QtWidgets
class Render(QtWebKitWidgets.QWebPage):
    """Load ``url`` in a QWebPage and block until loading has finished.

    Same behaviour as the ``from ... import`` variant, written with
    package-qualified names. ``self.frame`` is set by ``_loadFinished``
    before the constructor returns.
    """

    def __init__(self, url):
        # Create the application object before initialising the page itself.
        self.app = QtWidgets.QApplication(sys.argv)
        super().__init__()
        self.loadFinished.connect(self._loadFinished)
        self.mainFrame().load(QtCore.QUrl(url))
        # Run the event loop here; _loadFinished() is what ends it.
        self.app.exec_()

    def _loadFinished(self, result):
        """Slot for ``loadFinished``: keep the main frame and quit the loop."""
        self.frame = self.mainFrame()
        self.app.quit()
# Render one page and print its HTML. Constructing Render runs the event loop
# until its _loadFinished handler has stored `frame` and called quit().
url = 'http://pycoders.com/archive/'
r = Render(url)
result = r.frame.toHtml()
print(result)
If you are using PyCharm, there is a very simple way to have it import the packages correctly for you: place the cursor on the word that generates the error and press Ctrl+M.
Note: If you are using Windows you will not be able to use these modules, since Qt (and therefore PyQt) uses Chromium, and it seems to have a problem on Windows.

Related

view.showMaximised() not working in PyQt5

I am making a web browser using PyQt5. I am using the following code:
import PyQt5
from PyQt5.QtCore import QUrl
from PyQt5.QtWidgets import QApplication, QWidget
from PyQt5.QtWebKitWidgets import QWebView , QWebPage
from PyQt5.QtWebKit import QWebSettings
from PyQt5.QtNetwork import *
import sys
from optparse import OptionParser
# Question's exhibit: a QWebView subclass that keeps the window title in sync
# with the page title and accepts a plain string in load().
# NOTE(review): indentation was lost in this paste; lines are left as found.
class Browser(QWebView):
def __init__(self):
# QWebView
# NOTE(review): `__init__` returns None, so `self.view` ends up bound to None;
# the call still initialises the base class.
self.view = QWebView.__init__(self)
#self.view.setPage(MyBrowser())
self.setWindowTitle('Loading...')
self.titleChanged.connect(self.adjustTitle)
#super(Browser).connect(self.ui.webView,QtCore.SIGNAL("titleChanged (const QString&)"), self.adjustTitle)
def load(self,url):
# Wrap the given string in a QUrl and hand it to setUrl().
self.setUrl(QUrl(url))
def adjustTitle(self):
# Copy the current page title into the window title.
self.setWindowTitle(self.title())
# Create the application, show the browser maximized, start loading the page,
# then enter the event loop.
app = QApplication(sys.argv)
view = Browser()
view.showMaximized()
view.load("https://duckduckgo.com")
app.exec_()
However, this is what I get:
Can someone please tell me where I am going wrong? Note that it is not a problem of the website. I have tried it with Wikipedia, Stack Overflow and Google. I am using PyQt5 version 5.10.1.
In case you want fullscreen, you have to use:
# Partial variant of the question's Browser class that switches to full screen
# at the end of the constructor.
# NOTE(review): indentation was lost in this paste; lines are left as found.
# NOTE(review): this fragment connects `self.adjustTitle` but does not define
# it; presumably the question's load()/adjustTitle() methods are meant to
# follow -- confirm.
class Browser(QWebView):
def __init__(self):
# QWebView
# NOTE(review): `__init__` returns None, so `self.view` ends up bound to None.
self.view = QWebView.__init__(self)
#self.view.setPage(MyBrowser())
self.setWindowTitle('Loading...')
self.titleChanged.connect(self.adjustTitle)
# Show the widget in full-screen mode as soon as it is constructed.
self.showFullScreen()
class Browser(QWebView):
    """QWebView with a string-URL ``load`` helper and a title-sync slot."""

    def __init__(self):
        super().__init__()

    def load(self, url):
        """Navigate to ``url`` (a string) by wrapping it in a ``QUrl``."""
        self.setUrl(QUrl(url))

    def adjustTitle(self):
        """Copy the current page title into the window title."""
        self.setWindowTitle(self.title())
if __name__ == '__main__':
    app = QApplication(sys.argv)
    window = Browser()
    window.setWindowTitle('Loading...')
    # Update the window title whenever the page title changes.
    window.titleChanged.connect(window.adjustTitle)
    window.load("https://duckduckgo.com")
    window.showMaximized()
    # Hand the event loop's return value to the interpreter as exit status.
    sys.exit(app.exec_())
The program does not know the real sizes of your device so it creates a maximum on its assumed geometry.
You should provide the actual geometry by resize then call showMaximized. So that your geometry will be reachable by the program and a true maximized window will be displayed.
# Fragment, presumably placed inside a method of the window class (it uses
# `self`): set an explicit size first, then request the maximized state.
self.resize(998, 878)
self.showMaximized()

Can't web scrape with PyQt5 more than once [duplicate]

This question already has an answer here:
Scrape multiple urls using QWebPage
(1 answer)
Closed 4 years ago.
I am attempting to web scrape using the PyQT5 QWebEngineView. Here is the code that I got from another response on StackOverflow:
from PyQt5.QtWidgets import QApplication
from PyQt5.QtCore import QUrl, QEventLoop
from PyQt5.QtWebEngineWidgets import QWebEngineView
import sys
# Question's exhibit: load `url` in a QWebEngineView and return the page HTML.
# NOTE(review): indentation was lost in this paste; lines are left as found.
# NOTE(review): Render.__init__ constructs a new QApplication and calls quit()
# on it, and render() instantiates Render on every call -- so each call to
# render() creates another QApplication.
def render(url):
class Render(QWebEngineView):
def __init__(self, t_url):
# `html` stays None until the toHtml() callback stores the page source.
self.html = None
self.app = QApplication(sys.argv)
QWebEngineView.__init__(self)
self.loadFinished.connect(self._loadfinished)
self.load(QUrl(t_url))
# Process events by hand until the callback has filled in `html`.
while self.html is None:
self.app.processEvents(QEventLoop.ExcludeUserInputEvents | QEventLoop.ExcludeSocketNotifiers | QEventLoop.WaitForMoreEvents)
self.app.quit()
def _callable(self, data):
# Callback passed to toHtml(): receives the page source.
self.html = data
def _loadfinished(self, result):
# Once loading has finished, request the HTML; it arrives via _callable.
self.page().toHtml(self._callable)
return Render(url).html
Then if I put the line:
print(render('http://quotes.toscrape.com/random'))
it works as expected. But if I add a second line to that so it reads:
print(render('http://quotes.toscrape.com/random'))
print(render('http://quotes.toscrape.com/tableful/'))
it gives me the error "Process finished with exit code -1073741819 (0xC0000005)" after printing out the first render correctly.
I have narrowed the error down to the line that says self.load(QUrl(t_url))
You're initializing QApplication more than once. Only one instance should exist, globally. If you need to get the current instance and do not have a handle to it, you can use QApplication.instance(). QApplication.quit() is meant to be called right before sys.exit; in fact, you should almost never use one without the other.
In short, you're telling Qt you're exiting the application, and then trying to run more Qt code. It's an easy fix, however...
Solution
You can do one of three things:
Store the app in a global variable and reference it from there:
# The single application object, created once at module level.
APP = QApplication(sys.argv)
# ... Many lines elided


class SomeClass(QWidget):
    """Example widget that reaches the application through the global ``APP``."""

    def some_method(self):
        """Process events on ``APP`` using the flags from the question's loop."""
        wait_flags = (
            QEventLoop.ExcludeUserInputEvents
            | QEventLoop.ExcludeSocketNotifiers
            | QEventLoop.WaitForMoreEvents
        )
        APP.processEvents(wait_flags)
Pass the app as a handle to the class.
def render(app, url):
    """Signature sketch: the caller passes the shared application in as ``app``.

    The body is elided in this example; only the parameter list is shown.
    """
    ...
Create a global instance, and use QApplication.instance().
# The single application object, created once at module level.
APP = QApplication(sys.argv)
# ... Many lines elided


class SomeClass(QWidget):
    """Example widget that looks the application up via ``QApplication.instance()``."""

    def some_method(self):
        """Fetch the existing application object and process events on it."""
        app = QApplication.instance()
        wait_flags = (
            QEventLoop.ExcludeUserInputEvents
            | QEventLoop.ExcludeSocketNotifiers
            | QEventLoop.WaitForMoreEvents
        )
        app.processEvents(wait_flags)
Do what's most convenient for you.

Proxies when using PyQt to render HTML

I'm looking to scrape JavaScript-driven pages using this code, which has appeared on a number of past threads (c.f. this, this, and others on offsite threads):
import sys
from PyQt5.QtCore import QEventLoop
from PyQt5.QtWidgets import QApplication
from PyQt5.QtWebEngineWidgets import QWebEngineView
def render(source_html):
    """Load ``source_html`` into a QWebEngineView and return the page's HTML.

    The HTML is set on a temporary view, events are processed until the
    ``toHtml`` callback has delivered the page source, and that source is
    returned.

    Unlike the original snippet, this does not construct a new QApplication
    on every call and does not call ``quit()``: the existing instance is
    reused when there is one, so the function can be called repeatedly.
    """
    # Reuse the application object if one already exists; create it otherwise.
    app = QApplication.instance() or QApplication(sys.argv)
    # Hold a reference on the function object so the application created here
    # stays alive between calls instead of going away with the temporary view.
    render._app = app

    class Render(QWebEngineView):
        """Temporary view that captures its own HTML once loading finishes."""

        def __init__(self, html):
            # `self.html` stays None until _callable() receives the source.
            self.html = None
            self.app = app
            super().__init__()
            self.loadFinished.connect(self._loadFinished)
            self.setHtml(html)
            wait_flags = (
                QEventLoop.ExcludeUserInputEvents
                | QEventLoop.ExcludeSocketNotifiers
                | QEventLoop.WaitForMoreEvents
            )
            # Process events by hand until the callback has stored the HTML.
            while self.html is None:
                self.app.processEvents(wait_flags)

        def _callable(self, data):
            """Callback for ``toHtml``: store the delivered page source."""
            self.html = data

        def _loadFinished(self, result):
            """Slot for ``loadFinished``: request the HTML asynchronously."""
            self.page().toHtml(self._callable)

    return Render(source_html).html
It's working fine.
My question is whether I need to use a proxy for this portion of the code (assuming that I generally want to be using a proxy for all network activity).
I'm using urllib.request and a proxy to access the site in question, and then passing the html from there on to PyQt5 to do the JavaScript mambo. Does that second leg of the journey involve a network connection that should be proxy-fied? If so, how should I change this code - haven't touched PyQt until today and am feeling a bit over my head.
Using Python 3.5 and Windows 7.
Many thanks.

How to scrape several websites with pyqt4, scope change?

I would like to scrape two websites in java for links using PyQt4.QtWebKit to render the pages and then get the desired links. The code works fine with one page or url, but stops (but continues running until force quit) after printing the links of the first website. It seems the scope stays in the event loop of the render class. How can I get the program to change scope and continue with the for loop and rendering the second website? Using exit() in _loadFinished method just quits the program after the first iteration. Maybe the python app has to close and reopen to render the next page, which is impossible because the app is opened/reopened outside of the program?
import sys
from PyQt4.QtGui import *
from PyQt4.QtCore import *
from PyQt4.QtWebKit import *
from PyQt4 import QtGui
from lxml import html
class Render(QWebPage):
    """Load ``url`` and, once loaded, print up to four link targets from it.

    Construction only starts the load; the links are extracted and printed
    from ``_loadFinished`` when the page reports that loading has finished.
    """

    def __init__(self, url):
        # `frame` stays None until the page has finished loading.
        self.frame = None
        QWebPage.__init__(self)
        self.loadFinished.connect(self._loadFinished)
        self.mainFrame().load(QUrl(url))

    def _loadFinished(self, result):
        """Slot for ``loadFinished``: parse the page and print the links."""
        self.frame = self.mainFrame()
        page_source = str(self.frame.toHtml())
        tree = html.fromstring(page_source)
        # '@href' selects the attribute; the previous '#href' is not valid XPath.
        archive_links = tree.xpath('//div/div/a/@href')[0:4]
        print(archive_links)
# Question's exhibit: create one Render per URL, then run the event loop.
# NOTE(review): indentation was lost in this paste; lines are left as found,
# so whether the last two lines of main() sit inside the `for` loop cannot be
# told from here.
urls = ['http://pycoders.com/archive/', 'http://www.pythonjobshq.com']
def main(urls):
app = QtGui.QApplication(sys.argv)
for url in urls:
# `r` is rebound on every iteration, so only the latest Render keeps this name.
r = Render(url)
#s = Render(urls[1]) #The pages can be rendered parallel, but rendering more than a handful of pages at the same time is a bad idea
# Run the event loop and exit the interpreter with its return value.
sys.exit(app.exec_())
if __name__ == '__main__':
main(urls)
Thankful for any help!

Python/PyQt reopening QWebView shows blank page

I have this problem with QWebView not showing anything after I rerun QApplication. This small snippet displays the problem:
import sys
from PyQt4 import QtGui, QtWebKit, QtCore
# Question's exhibit: repeatedly create a view, show it, and run the event loop.
# NOTE(review): indentation was lost in this paste; presumably the four lines
# after `while True:` form the loop body -- confirm.
app = QtGui.QApplication(sys.argv)
while True:
# A fresh view is created for each pass.
browser = QtWebKit.QWebView()
browser.setUrl(QtCore.QUrl('https://www.google.ca/#q=pyqt'))
browser.show()
# exec_() is called again for every new view.
app.exec_()
Upon running, the google search page for pyqt is shown, but once I close the widget, next one pops up as blank instead of the same search page. I was wondering what I'm doing wrong here?
I do not know why the page stays blank, but I'm certain you can easily achieve the same functionality without calling QApplication.exec_() multiple times.
An example achieving the same:
from PySide import QtGui, QtCore, QtWebKit
class MyBrowser(QtWebKit.QWebView):
    """QWebView that announces its own closing through a ``closing`` signal."""

    # Emitted from closeEvent(); carries no arguments.
    closing = QtCore.Signal()

    def __init__(self):
        super().__init__()

    def closeEvent(self, event):
        """Emit ``closing`` when the view receives a close event."""
        self.closing.emit()
class MyApp(QtCore.QObject):
    """Owns the current browser window and opens a new one when it closes."""

    def __init__(self):
        super().__init__()

    def setup(self):
        """Create, wire up and show a browser on the search page.

        The new browser's ``closing`` signal is connected back to this
        method, so closing the window runs ``setup`` again.
        """
        self.browser = MyBrowser()
        self.browser.closing.connect(self.setup)
        self.browser.setUrl(QtCore.QUrl('https://www.google.ca/#q=pyqt'))
        self.browser.show()
# One application object and a single exec_() call; new browser windows are
# opened by MyApp.setup() rather than by running the event loop again.
app = QtGui.QApplication([])
a = MyApp()
a.setup()
app.exec_()

Categories