So I've been working on a async python scraper with http requests based on modules. So for that I've been using asks and importlib, and I'd like to make a simple little GUI that updates with the status codes of requests. And I did.
Everything is great, but I seem to have an issue as the requests are sent sucessfully, the GUI shows up, but it only shows up once all the requests are sent, and not dynamically while the requests are being sent
I've been playing around with tutorials and Qtimer, but in all the tutorials and help threads I've seen such as:
https://www.riverbankcomputing.com/pipermail/pyqt/2013-July/033053.html
PyQt5: Updating Label?
I have tried to implement the code to my situation, but the only thing I've manage to do is for the GUI to show up at the same time the requests are sent, but it stays frozen (not responding) until all the requests are finished
import trio
from asks import Session
import importlib
from PyQt5.QtWidgets import QLabel, QMainWindow, QApplication, QWidget, QVBoxLayout
from PyQt5 import QtCore
import qdarkstyle
app = QApplication(sys.argv)
app.setStyleSheet(qdarkstyle.load_stylesheet_pyqt5())
module = importlib.import_module("get_module")
good = 0
bad = 0
total = 0
class Menu(QMainWindow):
def __init__(self):
global good, bad, total
super().__init__()
self.setWindowTitle("Status Codes")
self.central_widget = QWidget()
self.setCentralWidget(self.central_widget)
lay = QVBoxLayout(self.central_widget)
self.resize(500, 350)
ok = QLabel("200: <font color='green'>0</font>")
ok.setAlignment(QtCore.Qt.AlignHCenter)
bad = QLabel("400: <font color='yellow'>0</font>")
bad.setAlignment(QtCore.Qt.AlignHCenter)
total = QLabel("Total: <font color='#00FF00'>0</font>")
total.setAlignment(QtCore.Qt.AlignHCenter)
r_total, r_good, r_bad = self.check()
QtCore.QTimer.singleShot(1000, lambda: self.updateLabels(r_total, r_good, r_bad))
lay.addWidget(ok)
lay.addWidget(bad)
lay.addWidget(total)
self.show()
def check(self):
async def worker1(s):
global ok
global bad
global total
if module.method.lower() == "get":
r = await s.get(module.request(), params=module.settings())
elif module.method.lower() == "post":
r = await s.post(module.request(), data=module.settings())
if any(x in r.status_code for x in module.error):
print("BAD -- " + module.request())
r_total += 1
r_invalid += 1
else:
print("GOOD -- " + module.request())
r_total += 1
r_valid += 1
print(r.text)
async def worker2(s):
global ok
global bad
global total
if module.method.lower() == "get":
r = await s.get(module.request(), params=module.settings())
elif module.method.lower() == "post":
r = await s.post(module.request(), data=module.settings())
if any(x in r.status_code for x in module.error):
print("BAD -- " + module.request())
r_total += 1
r_invalid += 1
else:
print("GOOD -- " + module.request())
r_total += 1
r_valid += 1
print(r.text)
async def example():
s = Session(connections=module.connections)
for i in range(10):
async with trio.open_nursery() as nursery:
nursery.start_soon(worker1, s)
nursery.start_soon(worker2, s)
trio.run(example)
print("Total:", r_total)
print("Total good:", r_valid)
print("Total bad:", r_invalid)
return r_total, r_valid, r_invalid
def updateLabels(self, r_total, r_card, r_invalid):
good.setText("200: <font color='green'>%s</font>" % (r_valid))
bad.setText("400: <font color='#00FF00'>%s</font>" % (r_invalid))
total.setText("Total: <font color='#F40D30'>%s</font>" % (r.total))
if __name__ == '__main__':
ex = Menu()
sys.exit(app.exec_())
Now what I would like it to do is that the GUI shows up and that dynamically (or every 1 second) the 200, 400 and total labels show how many requests have been made, and how many returned 200 and 400.
However instead it'll show them (it does show the total, total 200 and total 400) but only once all the requests are finished and not dynamically on the side
Asynchronous tasks block the thread of the GUI, so those tasks must be executed in another thread and send it through signals to the GUI, also that allows us to separate the business logic and the GUI having a cleaner code:
get_module.py
# coding: utf8
#======================= IMPORT AREA =======================
#here's where you import any module needed for the website
from random import randint
#===========================================================
#====================== SETTINGS AREA ======================
#here's where you declare the settings of the website such
#as method, error key, success key, custom settings, etc...
name = "GET Test Config"
method = 'GET' #Method is either GET, POST or CUSTOM
error = ['400', '401', '402', '403', '404', '405', '406', '407', '408', '409', '410', '411', '412', '413', '414', '415', '416', '417', '418', '421', '422', '423', '424', '425', '426', '428', '429', '431', '451','500', '501', '502', '503', '504', '505', '506', '507', '508', '510', '511']
connections = 5
#===========================================================
#====================== DEFINITON AREA =====================
#here's where the definitions are made.
#There's 2 defs:
#def request(): Which returns the url (with or without modifications)
#def settings(): returns the data to send in the GET request
#====== SETTINGS AREA ======
def request():
url = "https://httpbin.org/anything"
return url
def settings():
data = {'example':'example'}
return (data)
#===========================================================
main.py
import importlib
import multio
import qdarkstyle
import trio
from PyQt5 import QtCore, QtWidgets
from asks import Session
module = importlib.import_module("get_module")
class TaskWorker(QtCore.QObject):
totalChanged = QtCore.pyqtSignal(int)
validChanged = QtCore.pyqtSignal(int)
invalidChanged = QtCore.pyqtSignal(int)
def __init__(self, parent=None):
super(TaskWorker, self).__init__(parent)
self._total = 0
self._valid = 0
self._invalid = 0
#QtCore.pyqtProperty(int, notify=totalChanged)
def total(self):
return self._total
#total.setter
def total(self, value):
if self._total == value:
return
self._total = value
self.totalChanged.emit(self._total)
#QtCore.pyqtProperty(int, notify=validChanged)
def valid(self):
return self._valid
#valid.setter
def valid(self, value):
if self._valid == value:
return
self._valid = value
self.validChanged.emit(self._valid)
#QtCore.pyqtProperty(int, notify=invalidChanged)
def invalid(self):
return self._invalid
#invalid.setter
def invalid(self, value):
if self._invalid == value:
return
self._invalid = value
self.invalidChanged.emit(self._invalid)
#QtCore.pyqtSlot()
def check(self):
async def worker1(s):
if module.method.lower() == "get":
r = await s.get(module.request(), params=module.settings())
elif module.method.lower() == "post":
r = await s.post(module.request(), data=module.settings())
# if any(x in r.status_code for x in module.error):
if str(r.status_code) in module.error:
print("BAD -- " + module.request())
self.total += 1
self.invalid += 1
else:
print("GOOD -- " + module.request())
self.total += 1
self.valid += 1
print(r.text)
async def worker2(s):
if module.method.lower() == "get":
r = await s.get(module.request(), params=module.settings())
elif module.method.lower() == "post":
r = await s.post(module.request(), data=module.settings())
# if any(x in r.status_code for x in module.error):
if str(r.status_code) in module.error:
print("BAD -- " + module.request())
self.total += 1
self.invalid += 1
else:
print("GOOD -- " + module.request())
self.total += 1
self.valid += 1
print(r.text)
async def example():
s = Session(connections=module.connections)
for i in range(40):
async with trio.open_nursery() as nursery:
nursery.start_soon(worker1, s)
nursery.start_soon(worker2, s)
multio.init("trio")
trio.run(example)
class Menu(QtWidgets.QMainWindow):
def __init__(self):
super().__init__()
self.setWindowTitle("Status Codes")
self.central_widget = QtWidgets.QWidget()
self.setCentralWidget(self.central_widget)
lay = QtWidgets.QVBoxLayout(self.central_widget)
self.resize(500, 350)
self.ok = QtWidgets.QLabel(
"200: <font color='green'>0</font>",
alignment=QtCore.Qt.AlignHCenter,
)
self.bad = QtWidgets.QLabel(
"400: <font color='yellow'>0</font>",
alignment=QtCore.Qt.AlignHCenter,
)
self.total = QtWidgets.QLabel(
"Total: <font color='#00FF00'>0</font>",
alignment=QtCore.Qt.AlignHCenter,
)
lay.addWidget(self.ok)
lay.addWidget(self.bad)
lay.addWidget(self.total)
thread = QtCore.QThread(self)
thread.start()
self.worker = TaskWorker()
self.worker.moveToThread(thread)
self.worker.totalChanged.connect(self.updateTotal)
self.worker.validChanged.connect(self.updateValid)
self.worker.invalidChanged.connect(self.updateInvalid)
QtCore.QTimer.singleShot(0, self.worker.check)
#QtCore.pyqtSlot(int)
def updateTotal(self, total):
self.total.setText("Total: <font color='#F40D30'>%s</font>" % (total))
#QtCore.pyqtSlot(int)
def updateValid(self, valid):
self.ok.setText("200: <font color='green'>%s</font>" % (valid))
#QtCore.pyqtSlot(int)
def updateInvalid(self, invalid):
self.bad.setText("400: <font color='#00FF00'>%s</font>" % (invalid))
if __name__ == "__main__":
import sys
app = QtWidgets.QApplication(sys.argv)
app.setStyleSheet(qdarkstyle.load_stylesheet_pyqt5())
ex = Menu()
ex.show()
sys.exit(app.exec_())
Related
I'm trying to run a few slow processes but, I need to keep updated the QDialog to show the progress (maybe I put a progress bar too).
So I decide to use QThread, but on the first try, it doesn't work as I expected.
In my example code:
1- I'm using a simple ping to my default gateway
2- I'm pinging to my dns resolver
As you can see on imagem below, the information is showed according the thread is finalizing, but it is a mess to me.
Is possible to respect the threads order to show the informations?
Thanks.
Follow my example code:
# -*- coding: utf8 -*-
from PyQt5.QtGui import *
from PyQt5.QtWidgets import *
from PyQt5.QtCore import *
from ping3 import ping, verbose_ping
import socket
import dns.resolver
class ExternalTests(QThread):
data_collected = pyqtSignal(object)
def __init__(self, title, arg=None):
QThread.__init__(self)
self.title = title
self.arg = arg
def run(self):
resp = ping(self.arg)
self.data_collected.emit('%s: %s' % (self.title, resp))
class MainMenu(QMenu):
def __init__(self, parent=None):
QMenu.__init__(self)
self.setStyleSheet("background-color: #3a80cd; color: rgb(255,255,255); selection-color: white; selection-background-color: #489de4;")
# Diagnostics
self.check_internet = QAction("Diagnosys")
self.check_internet.setIcon(QIcon(QPixmap("..\\img\\lupa.png")))
self.check_internet.triggered.connect(self.diagnosticNetwork)
self.addAction(self.check_internet)
self.addSeparator()
# To quit the app
self.quit = QAction("Quit")
self.quit.triggered.connect(app.quit)
self.addAction(self.quit)
def diagnosticNetwork(self):
self.check_internet_dialog = QDialog()
self.check_internet_dialog.setWindowTitle("Check external connections")
self.check_internet_dialog.setWindowModality(Qt.ApplicationModal)
self.check_internet_dialog.setGeometry(150, 100, 700, 500)
# text box
self.textbox = QTextBrowser(self.check_internet_dialog)
self.textbox.move(20, 20)
self.textbox.resize(660,400)
self.textbox.setFont(QFont("Courier New", 12))
self.textbox.setStyleSheet("background-color: black;")
#button copy
btn_copy = QPushButton("Copy", self.check_internet_dialog)
btn_copy.setIcon(QIcon(QPixmap("..\\img\\copy.png")))
btn_copy.move(520,450)
btn_copy.clicked.connect(self.dialogClickCopy)
#button close
btn_copy = QPushButton("Close", self.check_internet_dialog)
btn_copy.setIcon(QIcon(QPixmap("..\\img\\close.png")))
btn_copy.move(605,450)
btn_copy.clicked.connect(self.dialogClickClose)
# tests
self.textbox.setTextColor(QColor("white"))
self.textbox.append("Diagnosys")
self.textbox.append("--------------------------------------------------")
self.textbox.setTextColor(QColor("cyan"))
self.threads = []
#QCoreApplication.processEvents()
''' ping default gateway '''
ping_default_gw = ExternalTests("default gatewat is reacheble", "192.168.0.1")
ping_default_gw.data_collected.connect(self.onDataReady)
self.threads.append(ping_default_gw)
ping_default_gw.start()
''' ping dns resolver '''
ping_dns_resolvers = dns.resolver.Resolver().nameservers
for dns_resolver in ping_dns_resolvers:
ping_dns_resolver = ExternalTests("dns resolver is reacheble %s" % dns_resolver, dns_resolver)
ping_dns_resolver.data_collected.connect(self.onDataReady)
self.threads.append(ping_dns_resolver)
ping_dns_resolver.start()
self.check_internet_dialog.exec_()
def onDataReady(self, data):
print(data)
if data:
self.textbox.append(data)
else:
self.textbox.append("error")
def dialogClickCopy(self):
pass
def dialogClickClose(self):
self.check_internet_dialog.close()
class SystemTrayIcon(QSystemTrayIcon):
def __init__(self, menu, parent=None):
QSystemTrayIcon.__init__(self)
self.setIcon(QIcon("..\\img\\icon.png"))
self.setVisible(True)
self.setContextMenu(menu)
if __name__ == "__main__":
import sys
app = QApplication([])
app.setQuitOnLastWindowClosed(False)
app.setApplicationName('pkimonitor')
app.setApplicationVersion('0.1')
app.setWindowIcon(QIcon("..\\img\\icon.png"))
menu = MainMenu()
widget = QWidget()
trayIcon = SystemTrayIcon(menu, widget)
trayIcon.show()
sys.exit(app.exec_())
I tried to create a scheme to organize by "run position" and it works. Follow my code.
In 'diagnosticNetwork':
self.data_list = []
self.data_hold = []
In 'onDataReady':
if len(self.data_list) > 0:
if self.data_list[-1][0] + 1 == data[0]:
self.data_list.append(data)
if data[2]:
self.textbox.append("%s: %s" % (data[1], 'OK'))
else:
self.textbox.append("%s: %s" % (data[1], 'NOK'))
elif self.data_list[-1][0] < data[0]:
self.data_hold.append(data)
else:
self.data_list.append(data)
if data[2]:
self.textbox.append("%s: %s" % (data[1], 'OK'))
else:
self.textbox.append("%s: %s" % (data[1], 'NOK'))
if len(self.data_hold) > 0:
hold_sorted = self.data_hold[:]
hold_sorted.sort()
for line_hold in hold_sorted:
if self.data_list[-1][0] + 1 == line_hold[0]:
if line_hold[2]:
self.textbox.append("%s: %s" % (line_hold[1], 'OK'))
else:
self.textbox.append("%s: %s" % (line_hold[1], 'NOK'))
self.data_list.append(line_hold)
del self.data_hold[0]
I worked with two lists, data_list and data_hold. The results that I received from the threads, I filled out the main list according to the order that I pre-established, if the result that entered was not the next one in the sequence, it goes to data_hold, then scan this list to fill the rest of my textbox.
Thank you for all help!
I have created a desktop application by PYQT5 and python 3.7 to download a video by clicking the download button and save it locally in the PC.
The code will fetch the video link from (lineEdit.text()) which is labeled "URL" and save it in the local directory in (lineEdit_2.text()) which is labeled "SAVE AS". If the download stops for any reason, it will be resumed again by press the start download button. In addition, the ProgressBar start from 1% until 100% along with downloading the video. Everything working smoothly.
The question is, once the video stops in the middle for any reason then it resumes the downloading again but the ProgressBar should start from where it stopped but it is not. For example, if it stops in 50% then should be resumed from 50% and continue. However, it starts from 0% (from the beginning).
```def curl_progress(self,total, existing, totalfrac,fracmb):
global frac,tsize,size,save_location
try:
frac= float(existing)/float(total)
self.progressBar.setValue(totalfrac)
QApplication.processEvents()
except (ZeroDivisionError, RuntimeError, TypeError, NameError):
frac = 0
self.textBrowser.append("Downloaded %d/%d %d%%" % (existing, total, totalfrac))
if frac ==1.0:
self.textBrowser.append("")
size = os.path.getsize(save_location)
tsize= (size /1024 /1024)
QMessageBox.information(self,"Download Completed", "The Download is Finished and the size is %03.2f MB" %(tsize,))
self.textBrowser.append('Size of file is %03.2f MB' %(tsize,))
self.progressBar.setValue(0)
self.lineEdit.setText('')
self.lineEdit_2.setText('')
QMessageBox.close(self)
else:
self.textBrowser.append("Downloaded %d/%d %d%%" % (existing, total, totalfrac))
def curl_limit_rate(self,rate_limit):
global tsize,size,save_location
url= self.lineEdit.text()
save_location = self.lineEdit_2.text()
if len(url) == 0 and len(save_location) == 0:
QMessageBox.information(self, "Error", "Please put the links")
return
if len(url) > 0 and len(save_location) == 0:
QMessageBox.information(self, "Error", "Please put the location")
return
if len(url) == 0 and len(save_location) > 0:
QMessageBox.information(self, "Error", "Please put the link")
return
if len(url) > 0 and len(save_location) > 0:
c = pycurl.Curl()
c.setopt(pycurl.CAINFO, certifi.where())
c.setopt(c.URL,url)
c.setopt(c.MAX_RECV_SPEED_LARGE, rate_limit)
if os.path.exists(save_location):
file_id = open(save_location, "ab")
c.setopt(c.RESUME_FROM, os.path.getsize(save_location))
else:
file_id = open(save_location, "wb")
c.setopt(c.WRITEDATA, file_id)
c.setopt(c.NOPROGRESS, 0)
c.setopt(c.PROGRESSFUNCTION, self.curl_progress)
c.perform()
c.close()
else:
QMessageBox.information(self, "Error", "Unknown error!")```
The picture
Many thanks in advance,
Before pointing out the solution, I must point out that you should not run pycurl in the main thread since it is blocking, instead you must execute it in another thread and send the information to the main thread so that it can be shown.
Going to the point, the idea is that when you calculate the percentage it is using the following formula:
progress = 100 * (bytes_downloaded + size_of_resume_file) / (total_bytes + size_of_resume_file)
Considering the above the solution is:
import os
import certifi
import pycurl
from PyQt5 import QtCore, QtWidgets
class Downloader(QtCore.QObject):
started = QtCore.pyqtSignal()
finished = QtCore.pyqtSignal()
progressChanged = QtCore.pyqtSignal(int)
error = QtCore.pyqtSignal(int, str)
bytesChanged = QtCore.pyqtSignal(int, int)
#QtCore.pyqtSlot(str, str)
def download(self, url, save_location):
pass
class PycURLDownloader(Downloader):
def __init__(self, parent=None):
super().__init__(parent)
self._resume_size = 0
self._c = pycurl.Curl()
self._flag_stop = 0
def download(self, url, save_location):
self._flag_stop = 0
exist_path = os.path.exists(save_location)
self.started.emit()
with open(save_location, "ab" if exist_path else "wb") as file_id:
self._c.setopt(pycurl.CAINFO, certifi.where())
self._c.setopt(pycurl.URL, url)
self._c.setopt(pycurl.MAX_RECV_SPEED_LARGE, 1024)
if exist_path:
self._c.setopt(pycurl.RESUME_FROM, os.path.getsize(save_location))
self._resume_size = os.path.getsize(save_location)
else:
self._resume_size = 0
self._c.setopt(pycurl.WRITEDATA, file_id)
self._c.setopt(pycurl.NOPROGRESS, 0)
self._c.setopt(pycurl.PROGRESSFUNCTION, self._progress_callaback)
try:
self._c.perform()
except pycurl.error as e:
self.error.emit(*e.args)
else:
self.finished.emit()
self._c.close()
#QtCore.pyqtSlot()
def stop(self):
self._flag_stop = 1
def _progress_callaback(self, total, existing, totalfrac, fracmb):
frac = 0
if existing > 0 and total > 0:
frac = int(
100 * (existing + self._resume_size) / (total + self._resume_size)
)
self.bytesChanged.emit(existing, total)
self.progressChanged.emit(frac)
if QtCore.QThread.currentThread().isInterruptionRequested():
return 1
class Widget(QtWidgets.QWidget):
def __init__(self, parent=None):
super().__init__(parent)
self.url_lineedit = QtWidgets.QLineEdit()
self.save_location_lineedit = QtWidgets.QLineEdit()
browse_button = QtWidgets.QPushButton(self.tr("Browse"))
self.download_progressbar = QtWidgets.QProgressBar(minimum=0, maximum=100)
self.download_log_browser = QtWidgets.QTextBrowser()
self.start_download_button = QtWidgets.QPushButton(self.tr("Start Download"))
widget = QtWidgets.QWidget()
widget.setContentsMargins(0, 0, 0, 0)
hlay = QtWidgets.QHBoxLayout(widget)
hlay.addWidget(self.save_location_lineedit)
hlay.addWidget(browse_button)
hlay.setContentsMargins(0, 0, 0, 0)
flay = QtWidgets.QFormLayout()
flay.addRow("URL", self.url_lineedit)
flay.addRow("Save as", widget)
flay.addRow("", self.download_progressbar)
flay.addRow("", QtWidgets.QLabel(self.tr("Packets output in Bytes")))
flay.addRow("", self.download_log_browser)
hlay2 = QtWidgets.QHBoxLayout()
hlay2.addStretch()
hlay2.addWidget(self.start_download_button)
hlay2.addStretch()
vlay = QtWidgets.QVBoxLayout(self)
vlay.addLayout(flay)
vlay.addLayout(hlay2)
self.start_download_button.clicked.connect(self.start_download)
browse_button.clicked.connect(self.select_save_location)
self._thread = QtCore.QThread(self)
self._thread.start()
self._downloader = PycURLDownloader()
self._downloader.moveToThread(self._thread)
self._downloader.progressChanged.connect(self.download_progressbar.setValue)
self._downloader.bytesChanged.connect(self.on_bytesChanged)
self._downloader.started.connect(self.on_started)
self._downloader.finished.connect(self.on_finished)
self.url_lineedit.setText("http://techslides.com/demos/sample-videos/small.mp4")
#QtCore.pyqtSlot()
def start_download(self):
url = self.url_lineedit.text()
save_location = self.save_location_lineedit.text()
if not url:
QtWidgets.QMessageBox.information(self, "Error", "Please put the links")
return
elif not save_location:
QtWidgets.QMessageBox.information(self, "Error", "Please put the location")
return
wrapper = partial(self._downloader.download, url, save_location)
QtCore.QTimer.singleShot(0, wrapper)
#QtCore.pyqtSlot()
def select_save_location(self):
filename, _ = QtWidgets.QFileDialog.getSaveFileName(self, "Select")
if filename:
self.save_location_lineedit.setText(filename)
#QtCore.pyqtSlot(int, str)
def on_error(self, t, msg):
QtWidgets.QMessageBox.information(self, "Error", msg)
#QtCore.pyqtSlot(int, int)
def on_bytesChanged(self, existing, total):
self.download_log_browser.append(
"Downloaded %d/%d %d%%" % (existing, total, 100 * existing / total)
)
#QtCore.pyqtSlot()
def on_started(self):
self.start_download_button.setEnabled(False)
#QtCore.pyqtSlot()
def on_finished(self):
self.start_download_button.setEnabled(True)
def closeEvent(self, event):
self._thread.requestInterruption()
self._thread.quit()
self._thread.wait()
super().closeEvent(event)
if __name__ == "__main__":
from functools import partial
import sys
app = QtWidgets.QApplication(sys.argv)
w = Widget()
w.resize(640, 480)
w.show()
ret = app.exec_()
sys.exit(ret)
I'm using Qt's QWebPage to render a page that uses javascript to update its content dynamically - so a library that just downloads a static version of the page (such as urllib2) won't work.
My problem is, when I render a second page, about 99% of the time the program just crashes. At other times, it will work three times before crashing. I've also gotten a few segfaults, but it is all very random.
My guess is the object I'm using to render isn't getting deleted properly, so trying to reuse it is possibly causing some problems for myself. I've looked all over and no one really seems to be having this same issue.
Here's the code I'm using. The program downloads web pages from steam's community market so I can create a database of all the items. I need to call the getItemsFromPage function multiple times to get all of the items, as they are broken up into pages (showing results 1-10 out of X amount).
import csv
import re
import sys
from string import replace
from bs4 import BeautifulSoup
from PyQt4.QtGui import *
from PyQt4.QtCore import *
from PyQt4.QtWebKit import *
class Item:
__slots__ = ("name", "count", "price", "game")
def __repr__(self):
return self.name + "(" + str(self.count) + ")"
def __str__(self):
return self.name + ", " + str(self.count) + ", $" + str(self.price)
class Render(QWebPage):
def __init__(self, url):
self.app = QApplication(sys.argv)
QWebPage.__init__(self)
self.loadFinished.connect(self._loadFinished)
self.mainFrame().load(QUrl(url))
self.app.exec_()
def _loadFinished(self, result):
self.frame = self.mainFrame()
self.app.quit()
self.deleteLater()
def getItemsFromPage(appid, page=1):
r = Render("http://steamcommunity.com/market/search?q=appid:" + str(appid) + "#p" + str(page))
soup = BeautifulSoup(str(r.frame.toHtml().toUtf8()))
itemLst = soup.find_all("div", "market_listing_row market_recent_listing_row")
items = []
for k in itemLst:
i = Item()
i.name = k.find("span", "market_listing_item_name").string
i.count = int(replace(k.find("span", "market_listing_num_listings_qty").string, ",", ""))
i.price = float(re.search(r'\$([0-9]+\.[0-9]+)', str(k)).group(1))
i.game = appid
items.append(i)
return items
if __name__ == "__main__":
print "Updating market items to dota2.csv ..."
i = 1
with open("dota2.csv", "w") as f:
writer = csv.writer(f)
r = None
while True:
print "Page " + str(i)
items = getItemsFromPage(570)
if len(items) == 0:
print "No items found, stopping..."
break
for k in items:
writer.writerow((k.name, k.count, k.price, k.game))
i += 1
print "Done."
Calling getItemsFromPage once works fine. Subsequent calls give me my problem. The output of the program is typically
Updating market items to dota2.csv ...
Page 1
Page 2
and then it crashes. It should go on for over 700 pages.
The problem with your program is that you are attempting to create a new QApplication with every url you fetch.
Instead, only one QApplication and one WebPage should be created. The WebPage can use its loadFinished signal to create an internal loop by fetching a new url after each one has been processed. Custom html processing can be added by connecting a user-defined slot to a signal which emits the html text and the url when they become available. The scripts below (for PyQt5 and PyQt4) show how to implement this.
Here are some examples which show how to use the WebPage class:
Usage:
def my_html_processor(html, url):
print('loaded: [%d chars] %s' % (len(html), url))
import sys
app = QApplication(sys.argv)
webpage = WebPage(verbose=False)
webpage.htmlReady.connect(my_html_processor)
# example 1: process list of urls
urls = ['https://en.wikipedia.org/wiki/Special:Random'] * 3
print('Processing list of urls...')
webpage.process(urls)
# example 2: process one url continuously
#
# import signal, itertools
# signal.signal(signal.SIGINT, signal.SIG_DFL)
#
# print('Processing url continuously...')
# print('Press Ctrl+C to quit')
#
# url = 'https://en.wikipedia.org/wiki/Special:Random'
# webpage.process(itertools.repeat(url))
sys.exit(app.exec_())
PyQt5 WebPage:
from PyQt5.QtCore import pyqtSignal, QUrl
from PyQt5.QtWidgets import QApplication
from PyQt5.QtWebEngineWidgets import QWebEnginePage
class WebPage(QWebEnginePage):
htmlReady = pyqtSignal(str, str)
def __init__(self, verbose=False):
super().__init__()
self._verbose = verbose
self.loadFinished.connect(self.handleLoadFinished)
def process(self, urls):
self._urls = iter(urls)
self.fetchNext()
def fetchNext(self):
try:
url = next(self._urls)
except StopIteration:
return False
else:
self.load(QUrl(url))
return True
def processCurrentPage(self, html):
self.htmlReady.emit(html, self.url().toString())
if not self.fetchNext():
QApplication.instance().quit()
def handleLoadFinished(self):
self.toHtml(self.processCurrentPage)
def javaScriptConsoleMessage(self, *args, **kwargs):
if self._verbose:
super().javaScriptConsoleMessage(*args, **kwargs)
PyQt4 WebPage:
from PyQt4.QtCore import pyqtSignal, QUrl
from PyQt4.QtGui import QApplication
from PyQt4.QtWebKit import QWebPage
class WebPage(QWebPage):
htmlReady = pyqtSignal(str, str)
def __init__(self, verbose=False):
super(WebPage, self).__init__()
self._verbose = verbose
self.mainFrame().loadFinished.connect(self.handleLoadFinished)
def start(self, urls):
self._urls = iter(urls)
self.fetchNext()
def fetchNext(self):
try:
url = next(self._urls)
except StopIteration:
return False
else:
self.mainFrame().load(QUrl(url))
return True
def processCurrentPage(self):
self.htmlReady.emit(
self.mainFrame().toHtml(), self.mainFrame().url().toString())
print('loaded: [%d bytes] %s' % (self.bytesReceived(), url))
def handleLoadFinished(self):
self.processCurrentPage()
if not self.fetchNext():
QApplication.instance().quit()
def javaScriptConsoleMessage(self, *args, **kwargs):
if self._verbose:
super(WebPage, self).javaScriptConsoleMessage(*args, **kwargs)
I want to edit all client QTextBrowser logs when any client sends some message to server.
my intended procedure is this:
[client]enter -> [client]chatUI.handleEnter -> (RFC)[server]exposed_send -> [server]broadcast -> (RFC)[clients]update.emit() -> [clients]listen -> log changed
When I run this code, other clients logs are not changed and only the client that give input to server has an updated log.
How I can solve this to update all clients properly?
chat_server.py
import rpyc
import random
import string
from threading import RLock
users = dict()
callbacks = dict()
user_num = 0
lock = RLock()
buf = dict()
class chatService(rpyc.Service):
def on_connect(self):
global user_num
with lock:
user_num = user_num+1
print ("connect user: %d" % user_num)
def on_disconnect(self):
global user_num
with lock:
user_num = user_num-1
print ("disconnect user: %d" % user_num)
def exposed_accept(self, idt, callback):
with lock:
global users
global callbacks
if not isinstance(idt, str) or len(idt) != 6:
return False
elif idt in users:
return -1
else:
pw = ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(32))
users[idt] = pw
callbacks[idt] = rpyc.async(callback)
return pw
def exposed_send(self, target, msg, idt, pw):
print ('here')
name = self.identify(idt, pw)
if name == False:
print ('here2')
return False
else:
print ('here3')
global callbacks
if target == None:
self.broadcast("[%s] %s" % (name, msg))
elif msg.target in callbacks:
self.send("[%s] %s" %(name, msg), target)
else:
return False
def exposed_order(self, msg, idt, pw):
pass
def identify(self, idt, pw):
global users
if users[idt] == pw:
return idt
else:
return False
def broadcast(self, msg):
with lock:
print("bloadcast calls")
global callbacks
global buf
for user, callback in callbacks.items():
if user not in buf or buf[user] == None:
buf[user] = (msg,)
else:
buf[user] = buf[user] + (msg,)
callback()
def send(self, msg, target):
global callbacks
global buf
if user not in buf or buf[user] == None:
buf[target] = (msg,)
else:
buf[target] = buf[target] + (msg,)
callbacks[target]()
def exposed_get_buf(self, user):
global buf
temp = buf[user]
buf[user] = None
return temp
if __name__ == '__main__':
from rpyc.utils.server import ThreadedServer
t = ThreadedServer(chatService, port = 3743)
t.start()
chat_client.py
from chatUI import *
import rpyc
import random
import string
if __name__ == '__main__':
service = rpyc.connect('floating.tk', 3743)
app, chat = UIReady(service)
while True:
idt = ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(6))
pw = service.root.accept(idt, chat.update.update.emit)
if pw != False and pw != -1:
break
chat.idt = idt
chat.pw = pw
sys.exit(app.exec_())
chatUI.py
import sys
from PyQt5.QtWidgets import *
from PyQt5.QtCore import *
class Updater(QObject):
update = pyqtSignal()
class Chat(QWidget):
log = None
enter = None
def __init__(self, service) :
super().__init__()
self.service = service
self.idt = None
self.pw = None
self.initUI()
self.update = Updater()
self.update.update.connect(self.listen)
def initUI(self):
logLabel = QLabel('chat log')
enterLabel = QLabel('enter')
self.log = QTextBrowser()
self.enter = QLineEdit()
self.enter.returnPressed.connect(self.handleEnter)
layout = QGridLayout()
layout.addWidget(logLabel, 0, 0)
layout.addWidget(self.log, 0, 1, 5, 1)
layout.addWidget(enterLabel, 6, 0)
layout.addWidget(self.enter, 6, 1)
self.setLayout(layout)
self.setWindowTitle('chat')
self.resize(600, 600)
self.show()
def handleEnter(self):
msg = self.enter.text()
self.enter.setText("")
self.service.root.send(None, msg, self.idt, self.pw)
print('get enter')
def listen(self):
msg = self.service.root.get_buf(self.idt)
for m in msg:
self.log.append(m)
def UIReady(service):
app = QApplication(sys.argv)
chat = Chat(service)
return app, chat
I'm trying to scrape a large website of government records which requires a "snowball" method, i.e., starting at the main search page and then following each link that the scraper finds to the next page.
I've been able to load the main page using PyQt this SiteScraper tutorial.
import sys
from PySide.QtGui import *
from PySide.QtCore import *
from PySide.QtWebKit import *
from BeautifulSoup import BeautifulSoup
class Render(QWebPage):
def __init__(self, url):
self.app = QApplication(sys.argv)
QWebPage.__init__(self)
self.loadFinished.connect(self._loadFinished)
self.mainFrame().load(QUrl(url))
self.app.exec_()
def _loadFinished(self, result):
self.frame = self.mainFrame()
self.app.quit()
def main():
baseUrl = 'http://www.thesite.gov'
url = 'http://www.thesite.gov/search'
r = Render(url)
html = r.frame.toHtml()
# use BeautifulSoup to cycle through each regulation
soup = BeautifulSoup(html)
regs = soup.find('div',{'class':'x-grid3-body'}).findAll('a')
# cycle through list and call up each page separately
for reg in regs:
link = baseUrl + reg['href']
link = str(link)
# use Qt to load each regulation page
r = Render(link)
html = r.frame.toHtml() # get actual rendered web page
The problem is I get this error when I try to render a new webpage:
RuntimeError: A QApplication instance already exists.
I get it that the function is trying to call another QApplication instance. But how do I navigate to a new page with the same instance?
class Render(QWebPage):
def __init__(self, app, url):
QWebPage.__init__(self)
self.loadFinished.connect(self._loadFinished)
self.mainFrame().load(QUrl(url))
app.exec_()
def _loadFinished(self, result):
self.frame = self.mainFrame()
def main():
app = QApplication(sys.argv)
baseUrl = 'http://www.thesite.gov'
url = 'http://www.thesite.gov/search'
r = Render(app, url)
html = r.frame.toHtml()
I had the same problem (needing to load multiple pages with QWebPage) but I couldn't get any of these answers to work for me. Here's what did work, the key is to use a QEventLoop and connect loadFinished to loop.quit:
from PySide import QtCore, QtGui, QtWebKit
import sys
def loadPage(url):
page = QtWebKit.QWebPage()
loop = QtCore.QEventLoop() # Create event loop
page.mainFrame().loadFinished.connect(loop.quit) # Connect loadFinished to loop quit
page.mainFrame().load(url)
loop.exec_() # Run event loop, it will end on loadFinished
return page.mainFrame().toHtml()
app = QtGui.QApplication(sys.argv)
urls = ['https://google.com', 'http://reddit.com', 'http://wikipedia.org']
for url in urls:
print '-----------------------------------------------------'
print 'Loading ' + url
html = loadPage(url)
print html
app.exit()
Posting a simplified example here compared to OP's to demonstrate the essential problem and solution.
You're crazy man! QT has a much better DOM than beautifulsoup.
Replace:
soup = BeautifulSoup(html)
With
page = QWebPage()
page.settings().setAttribute(QWebSettings.AutoLoadImages, False)
page.settings().setAttribute(QWebSettings.PluginsEnabled, False)
page.mainFrame().setHtml(html)
dom = page.mainFrame().documentElement()
Then you can simply scrape data like so:
li = dom.findFirst("body div#content div#special ul > li")
if not li.isNull():
class = li.attribute("class")
text = li.toPlainText()
Finally you should use QWebView instead of QWebPage. You can set it up to act like a server which can be controlled with a socket. This is what I do:
class QTimerWithPause(QTimer):
def __init__(self, parent = None):
super(QTimerWithPause, self).__init__ (parent)
self.startTime = 0
self.interval = 0
return
def start(self, interval):
from time import time
self.interval = interval
self.startTime = time()
super(QTimerWithPause, self).start(interval)
return
def pause(self):
from time import time
if self.isActive ():
self.stop()
elapsedTime = self.startTime - time()
self.startTime -= elapsedTime
# time() returns float secs, interval is int msec
self.interval -= int(elapsedTime*1000)+1
return
def resume(self):
if not self.isActive():
self.start(self.interval)
return
class CrawlerWebServer(QWebView):
TIMEOUT = 60
STUPID = r"(bing|yahoo|google)"
def __init__(self, host="0.0.0.0", port=50007, parent=None, enableImages=True, enablePlugins=True):
# Constructor
super(CrawlerWebServer, self).__init__(parent)
self.command = None
self.isLoading = True
self.isConnected = False
self.url = QUrl("http://mast3rpee.tk/")
self.timeout = QTimerWithPause(self)
self.socket = QTcpServer(self)
# 1: Settings
self.settings().enablePersistentStorage()
self.settings().setAttribute(QWebSettings.AutoLoadImages, enableImages)
self.settings().setAttribute(QWebSettings.PluginsEnabled, enablePlugins)
self.settings().setAttribute(QWebSettings.DeveloperExtrasEnabled, True)
# 2: Server
if args.verbosity > 0: print "Starting server..."
self.socket.setProxy(QNetworkProxy(QNetworkProxy.NoProxy))
self.socket.listen(QHostAddress(host), int(port))
self.connect(self.socket, SIGNAL("newConnection()"), self._connect)
if args.verbosity > 1:
print " Waiting for connection(" + host + ":" + str(port) + ")..."
# 3: Default page
self._load(10*1000, self._loadFinished)
return
def __del__(self):
try:
self.conn.close()
self.socket.close()
except:
pass
return
def _sendAuth(self):
self.conn.write("Welcome to WebCrawler server (http://mast3rpee.tk)\r\n\rLicenced under GPL\r\n\r\n")
def _connect(self):
self.disconnect(self.socket, SIGNAL("newConnection()"), self._connect)
self.conn = self.socket.nextPendingConnection()
self.conn.nextBlockSize = 0
self.connect(self.conn, SIGNAL("readyRead()"), self.io)
self.connect(self.conn, SIGNAL("disconnected()"), self.close)
self.connect(self.conn, SIGNAL("error()"), self.close)
self._sendAuth()
if args.verbosity > 1:
print " Connection by:", self.conn.peerAddress().toString()
self.isConnected = True
if self.isLoading == False:
self.conn.write("\r\nEnter command:")
return
def io(self):
if self.isLoading: return None
if args.verbosity > 0:
print "Reading command..."
data = self.conn.read(1024).strip(" \r\n\t")
if not data: return None
elif self.command is not None:
r = self.command(data)
self.command = None
return r
return self._getCommand(data)
def _getCommand(self, d):
from re import search
d = unicode(d, errors="ignore")
if search(r"(help|HELP)", d) is not None:
self.conn.write("URL | JS | WAIT | QUIT\r\n\r\nEnter Command:")
elif search(r"(url|URL)", d) is not None:
self.command = self._print
self.conn.write("Enter address:")
elif search(r"(js|JS|javascript|JAVASCRIPT)", d) is not None:
self.command = self._js
self.conn.write("Enter javascript to execte:")
elif search(r"(wait|WAIT)", d) is not None:
self.loadFinished.connect(self._loadFinishedPrint)
self.loadFinished.connect(self._loadFinished)
elif search(r"(quit|QUIT|exit|EXIT)", d) is not None:
self.close()
else:
self.conn.write("Invalid command!\r\n\r\nEnter Command:")
return
def _print(self, d):
u = d[:250]
self.out(u)
return True
def _js(self, d):
try:
self.page().mainFrame().evaluateJavaScript(d)
except:
pass
self.conn.write("Enter Javascript:")
return True
def _stop(self):
from time import sleep
if self.isLoading == False: return
if args.verbosity > 0:
print " Stopping..."
self.timeout.stop()
self.stop()
def _load(self, timeout, after):
# Loads a page into frame / sets up timeout
self.timeout.timeout.connect(self._stop)
self.timeout.start(timeout)
self.loadFinished.connect(after)
self.load(self.url)
return
def _loadDone(self, disconnect = None):
from re import search
from time import sleep
self.timeout.timeout.disconnect(self._stop)
self.timeout.stop()
if disconnect is not None:
self.loadFinished.disconnect(disconnect)
# Stick a while on the page
if search(CrawlerWebServer.STUPID, self.url.toString(QUrl.RemovePath)) is not None:
sleep(5)
else:
sleep(1)
return
def _loadError(self):
from time import sleep, time
if not self.timeout.isActive(): return True
if args.verbosity > 0: print " Error retrying..."
# 1: Pause timeout
self.timeout.pause()
# 2: Check for internet connection
while self.page().networkAccessManager().networkAccessible() == QNetworkAccessManager.NotAccessible: sleep(1)
# 3: Wait then try again
sleep(2)
self.reload()
self.timeout.resume()
return False
def go(self, url, after = None):
# Go to a specific address
global args
if after is None:
after = self._loadFinished
if args.verbosity > 0:
print "Loading url..."
self.url = QUrl(url)
self.isLoading = True
if args.verbosity > 1:
print " ", self.url.toString()
self._load(CrawlerWebServer.TIMEOUT * 1000, after)
return
def out(self, url):
# Print html of a a specific url
self.go(url, self._loadFinishedPrint)
return
def createWindow(self, windowType):
# Load links in the same web-view.
return self
def _loadFinished(self, ok):
# Default LoadFinished
from time import sleep
from re import search
if self.isLoading == False: return
if ok == False:
if not self._loadError(): return
self._loadDone(self._loadFinished)
if args.verbosity > 1:
print " Done"
if self.isConnected == True:
self.conn.write("\r\nEnter command:")
self.isLoading = False
return
def _loadFinishedPrint(self, ok):
# Print the evaluated HTML to stdout
if self.isLoading == False: return
if ok == False:
if not self._loadError(): return
self._loadDone(self._loadFinishedPrint)
if args.verbosity > 1:
print " Done"
h = unicode( self.page().mainFrame().toHtml(), errors="ignore" )
if args.verbosity > 2:
print "------------------\n" + h + "\n--------------------"
self.conn.write(h)
self.conn.write("\r\nEnter command:")
self.isLoading = False
return
def contextMenuEvent(self, event):
# Context Menu
menu = self.page().createStandardContextMenu()
menu.addSeparator()
action = menu.addAction('ReLoad')
#action.triggered.connect
def refresh():
self.load(self.url)
menu.exec_(QCursor.pos())
class CrawlerWebClient(object):
def __init__(self, host, port):
import socket
global args
# CONNECT TO SERVER
self.socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
self.socket.connect((host, port))
o = self.read()
if args.verbosity > 2:
print "\n------------------------------\n" + o + "\n------------------------------\n"
return
def __del__(self):
try: self.socket.close()
except: pass
def read(self):
from re import search
r = ""
while True:
out = self.socket.recv(64*1024).strip("\r\n")
if out.startswith(r"Enter"):
break
if out.endswith(r"Enter command:"):
r += out[:-14]
break
r += out
return r
def command(self, command):
global args
if args.verbosity > 2:
print " Command: [" + command + "]\n------------------------------"
self.socket.sendall(unicode(command))
r = self.read()
if args.verbosity > 2:
print r, "\n------------------------------\n"
return r
OK then. If you really need JavaScript. (Can you get the answer from JSON at all? That would probably be easier still with simplejson or json.) The answer is don't make more than one QApplication. You're not allowed to. Make main make a QApplication and then use the QWebPage without bothering to call QApplication.exec_(). If that doesn't work, run it all in another QThread.
I am not familiar with PyQt, but as an option, you could write your script without using a class. That way, you can more easily re-use that application instance.
Hope it helps.