WebScraping, PyQt - python

I've been working on a really simple script that gets the question titles from the python tag on Stack Overflow and shows them in a QTextBrowser.
The application works as expected (at least at the beginning), but the window doesn't show until the script finishes loading the webpage, and the Refresh button also freezes the program until the page has loaded. Is there a way to fix this problem? Here's the full code:
#! usr/bin/env python
from PyQt4.QtGui import *
import requests
from bs4 import BeautifulSoup
import lxml
from threading import Thread
class Form(QWidget):
    """Window that lists question titles from the Stack Overflow python tag.

    The page at ``self.url`` is downloaded by :meth:`get`, which is also the
    slot of the "Refresh" button.
    """

    # Seconds to wait for the server before reporting a connection failure.
    REQUEST_TIMEOUT = 10

    def __init__(self, parent=None):
        super(Form, self).__init__(parent)
        self.url = "http://www.stackoverflow.com/questions/tagged/python"
        self.browser = QTextBrowser()
        self.connectionlabel = QLabel()
        self.refreshBtn = QPushButton("Refresh")
        self.refreshBtn.clicked.connect(self.get)

        # Status label and button share the first row; the browser spans
        # both columns of the second row.
        layout = QGridLayout()
        layout.addWidget(self.connectionlabel, 0, 0)
        layout.addWidget(self.refreshBtn, 0, 1)
        layout.addWidget(self.browser, 1, 0, 1, 2)
        self.setLayout(layout)
        self.setWindowTitle("StackOverflow: Python")

    def get(self):
        """Download the tag page and list its question titles in the browser.

        The call blocks until the HTTP request has completed or timed out.
        Only request failures are reported as "Couldn't connect."; parsing
        errors are no longer hidden behind that message.
        """
        self.browser.clear()
        self.connectionlabel.setText("Connecting.....")
        try:
            response = requests.get(self.url, timeout=self.REQUEST_TIMEOUT)
            # Treat HTTP error statuses as a failed connection as well.
            response.raise_for_status()
        except requests.RequestException:
            self.connectionlabel.setText("Couldn't connect.")
            return

        soup = BeautifulSoup(response.content, 'lxml')
        self.connectionlabel.setText("Connected.")
        questions = soup.find_all("a", {"class": "question-hyperlink"})
        for number, question in enumerate(questions, 1):
            try:
                self.browser.append("\n" + str(number) + ". " + question.text)
            except UnicodeError:
                # NOTE(review): the original swallowed every exception here,
                # presumably to skip titles that fail to encode -- confirm.
                continue
if __name__ == '__main__':
    import sys

    app = QApplication(sys.argv)
    screen = Form()
    screen.show()
    # The first positional parameter of Thread is `group`, so the callable
    # has to be passed with the `target=` keyword.
    # NOTE(review): get() updates widgets from this worker thread; Qt expects
    # widgets to be touched from the GUI thread only, so a signal-based
    # hand-off (e.g. QThread) would be the safer design -- confirm.
    t = Thread(target=screen.get)
    # "deamon" was a typo that only created an unused attribute.
    t.daemon = True
    t.start()
    sys.exit(app.exec_())

The UI will always freeze until the blocking code has finished executing, because that code runs on the same thread as the GUI event loop. To avoid this, call the blocking code in a separate thread or process (using threading or multiprocessing). You can also use PyQt's QThread.

Related

How to set an image on the background in Kivy (Python) before the content is loaded?

I am writing a weather application in Kivy. On startup it parses a weather page and sets a label to the temperature value. Also at startup, before the parsing begins, I set an image as the background of the entire screen. But when I run the application the image is not shown; it is loaded only after the parsing has finished. How do I get the image displayed before the parsing begins?
from kivy.app import App
from kivy.uix.relativelayout import RelativeLayout
from kivy.graphics import Rectangle
from kivy.core.window import Window
from kivy.uix.label import Label
from kivy.lang import Builder
import requests
from bs4 import BeautifulSoup
from time import sleep
Builder.load_file('my.kv')
class MainWidget(RelativeLayout):
    """Root layout: paints a background picture and shows the temperature."""

    intWeather = 0

    def __init__(self, **kwargs):
        super(MainWidget, self).__init__(**kwargs)
        with self.canvas.before:
            Rectangle(source='winter.jpg', pos=self.pos, size=Window.size)
        # The temperature is fetched synchronously, so construction does not
        # finish before the HTTP request has returned.
        temperature = self.set_weather()
        self.ids['weather'].text = temperature + " °C"
        self.intWeather = int(temperature)

    def set_weather(self):
        """Return the first three characters of the first temperature entry."""
        url = 'https://pogoda33.ru/%D0%BF%D0%BE%D0%B3%D0%BE%D0%B4%D0%B0-%D1%83%D1%81%D1%82%D1%8C-%D1%82%D1%8B%D0%BC/14-%D0%B4%D0%BD%D0%B5%D0%B9'
        page = requests.get(url)
        soup = BeautifulSoup(page.text, 'html5lib')
        readings = soup.find_all('div', class_='forecast-weather-temperature')
        first_reading = readings[0]
        return first_reading.text[0:3]
class myApp(App):
    """Kivy application whose root widget is a MainWidget."""

    def build(self):
        """Create the root widget and hand it to Kivy."""
        root = MainWidget()
        return root
if __name__ == "__main__":
    # Instantiate the application, then start it through run().
    application = myApp()
    application.run()
my.kv file
<MainWidget>:
Label:
id: weather
font_size:150
pos_hint:{'center_x':.5, 'center_y':.8}
Your background image is not showing up as soon as you would like because the __init__() method of the MainWidget is using the main thread to access the web and retrieve the weather. The kivy App also uses the main thread to update the GUI, so it cannot do its update until you release the main thread. As a general rule, you should not do any long running processing on the main thread of a kivy application. So, to fix this, do your web access in another thread:
from threading import Thread

from kivy.clock import mainthread


class MainWidget(RelativeLayout):
    """Root layout that shows the background at once and the weather later.

    The web request runs in a worker thread so that construction returns
    immediately; the label is filled in by :meth:`show_weather`.
    """

    intWeather = 0

    def __init__(self, **kwargs):
        super(MainWidget, self).__init__(**kwargs)
        with self.canvas.before:
            Rectangle(source='winter.jpg', pos=self.pos, size=Window.size)
        # run web access in another thread
        Thread(target=self.set_weather, daemon=True).start()

    # The decorator is what moves the call onto Kivy's main thread; as a
    # plain comment ("#mainthread") it had no effect.
    @mainthread
    def show_weather(self, temp):
        """Display *temp* in the label and store it as an integer."""
        # this gets run on the main thread
        self.ids['weather'].text = temp + " °C"
        self.intWeather = int(temp)

    def set_weather(self):
        """Fetch the forecast page and pass the temperature to show_weather.

        Runs in the worker thread started by ``__init__``.
        """
        url = 'https://pogoda33.ru/%D0%BF%D0%BE%D0%B3%D0%BE%D0%B4%D0%B0-%D1%83%D1%81%D1%82%D1%8C-%D1%82%D1%8B%D0%BC/14-%D0%B4%D0%BD%D0%B5%D0%B9'
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html5lib')
        weather = soup.find_all('div', class_='forecast-weather-temperature')
        temp = weather[0].text[0:3]
        self.show_weather(temp)

Automate webbrowser

I'm trying to create a dedicated web browser for my work with some buttons which can automate tasks for me. The script I found is shown below, but I'm struggling with the piece that would enter text in the search field and then click the button. (This is a test piece, of course.)
Does anyone know how to automate this, using the pyqt5 example found?
Something like:
self.browser.getObjectbyName("ObjName").setText("newtext")
I've been googling for hours but have not found any example code yet.
from PyQt5.QtCore import *
from PyQt5.QtWidgets import *
from PyQt5.QtGui import *
from PyQt5.QtWebEngineWidgets import *
import sys
class MainWindow(QMainWindow):
    """Main window whose central widget is a web view opened on Google."""

    def __init__(self, *args, **kwargs):
        super(MainWindow, self).__init__(*args, **kwargs)
        start_page = QUrl("http://www.google.com")
        view = QWebEngineView()
        view.setUrl(start_page)
        self.browser = view
        self.setCentralWidget(view)
        # The window makes itself visible as part of construction.
        self.show()
# Create the Qt application before any widget is built.
app = QApplication(sys.argv)
window = MainWindow()
# Hand control to the Qt event loop.
app.exec_()
There are several tools in QtWebEngine to perform automated tasks on web pages:
Execute js script using runJavaScript() method.
Use QtWebChannel to exchange information and perform tasks between python and the web page.
Implement userscripts through QWebEngineScript.
In this case I will show a demo of how to do an automated search using the first option:
from PyQt5.QtCore import QUrl
from PyQt5.QtWidgets import QApplication, QMainWindow
from PyQt5.QtWebEngineWidgets import QWebEngineView
import sys
# JavaScript passed to runJavaScript(): it writes "StackOverflow" into the
# selected input element and submits the selected form.
# NOTE(review): the CSS selectors presumably mirror Google's generated markup
# at the time of writing and may stop matching -- confirm before relying on them.
script = """
var input = document.querySelector("body > div.L3eUgb > div.o3j99.ikrT4e.om7nvf > form > div:nth-child(1) > div.A8SBwf > div.RNNXgb > div > div.a4bIc > input")
input.value = "StackOverflow"
var form = document.querySelector("body > div.L3eUgb > div.o3j99.ikrT4e.om7nvf > form")
form.submit()
"""
class MainWindow(QMainWindow):
    """Browser window that runs ``script`` once Google's start page loads."""

    def __init__(self, *args, **kwargs):
        super(MainWindow, self).__init__(*args, **kwargs)
        view = QWebEngineView()
        self.browser = view
        # Connect before requesting the page so the first load is reported.
        view.loadFinished.connect(self.handle_load_finished)
        view.setUrl(QUrl("http://www.google.com"))
        self.setCentralWidget(view)

    def handle_load_finished(self, ok):
        """Inject the module-level ``script`` after a successful home-page load.

        Nothing happens when the load failed or when the view is showing any
        URL other than ``https://www.google.com/``.
        """
        if not ok:
            return
        if self.browser.url() != QUrl("https://www.google.com/"):
            return
        self.browser.page().runJavaScript(script)
# Build the application and the window, make the window visible, then run
# the Qt event loop.
app = QApplication(sys.argv)
window = MainWindow()
window.show()
app.exec_()

How to scrape several websites with pyqt4, scope change?

I would like to scrape two websites in java for links using PyQt4.QtWebKit to render the pages and then get the desired links. The code works fine with one page or url, but stops (but continues running until force quit) after printing the links of the first website. It seems the scope stays in the event loop of the render class. How can I get the program to change scope and continue with the for loop and rendering the second website? Using exit() in _loadFinished method just quits the program after the first iteration. Maybe the python app has to close and reopen to render the next page, which is impossible because the app is opened/reopened outside of the program?
import sys
from PyQt4.QtGui import *
from PyQt4.QtCore import *
from PyQt4.QtWebKit import *
from PyQt4 import QtGui
from lxml import html
class Render(QWebPage):
    """Web page that loads *url* and prints up to four link targets from it.

    Loading starts in the constructor; the links are printed by
    :meth:`_loadFinished` when the ``loadFinished`` signal fires.
    """

    def __init__(self, url):
        QWebPage.__init__(self)
        self.frame = None
        self.loadFinished.connect(self._loadFinished)
        self.mainFrame().load(QUrl(url))

    def _loadFinished(self, result):
        """Parse the rendered HTML and print the first four ``href`` values.

        *result* is the flag delivered by ``loadFinished``; it is not
        inspected here.
        """
        self.frame = self.mainFrame()
        markup = str(self.frame.toHtml())
        tree = html.fromstring(markup)
        # "@href" selects the attribute; "/#href" is not a valid XPath step.
        archive_links = tree.xpath('//div/div/a/@href')[0:4]
        print(archive_links)
# Sites whose links should be extracted.
urls = [
    'http://pycoders.com/archive/',
    'http://www.pythonjobshq.com',
]
def main(urls):
    """Render every URL in *urls* and exit once all pages have finished.

    The pages are loaded concurrently inside one QApplication event loop.
    The process exits with the event loop's return code.
    """
    app = QtGui.QApplication(sys.argv)
    if not urls:
        # No page would ever report completion, so do not enter the loop.
        sys.exit(0)

    # Keep every page referenced: rebinding a single variable dropped the
    # only reference to the previous page while its load was still pending.
    pages = [Render(url) for url in urls]
    pending = set(range(len(pages)))

    def _mark_done(index):
        # discard() tolerates a page reporting completion more than once.
        pending.discard(index)
        if not pending:
            app.quit()

    for index, page in enumerate(pages):
        # Connected after Render's own handler, so the links are printed
        # before the application is asked to quit.
        page.loadFinished.connect(lambda _ok, index=index: _mark_done(index))

    # NOTE(review): rendering many pages in parallel is discouraged by the
    # original comment; for long lists load them one after another instead.
    sys.exit(app.exec_())
# Run the scraper only when this file is executed as a script.
if __name__ == "__main__":
    main(urls)
Thankful for any help!

PyQt Class not working for the second usage

I'm using PyQt to fully load a page (including JS) and get its contents using Beautiful Soup. It works fine on the first iteration, but crashes after that. I don't have much knowledge of Python, and even less of PyQt, so any help is very welcome.
Class borrowed from here.
from PyQt4.QtCore import QUrl, SIGNAL
from PyQt4.QtGui import QApplication
from PyQt4.QtWebKit import QWebPage
from bs4 import BeautifulSoup
from bs4.dammit import UnicodeDammit
import sys
import signal
class Render(QWebPage):
    """Load *url* inside a private QApplication and keep the parsed result.

    The constructor runs the event loop until the page has loaded. Once it
    returns, ``html`` holds the frame markup and ``soup`` the BeautifulSoup
    tree built from it.
    """

    def __init__(self, url):
        self.app = QApplication(sys.argv)
        QWebPage.__init__(self)
        self.html = None
        # Use the default SIGINT handler while the Qt loop is running.
        signal.signal(signal.SIGINT, signal.SIG_DFL)
        self.connect(self, SIGNAL('loadFinished(bool)'), self._finished_loading)
        target = QUrl(url)
        self.mainFrame().load(target)
        self.app.exec_()

    def _finished_loading(self, result):
        """Store the markup, parse it, and stop the event loop."""
        markup = self.mainFrame().toHtml()
        self.html = markup
        decoded = UnicodeDammit(markup).unicode_markup
        self.soup = BeautifulSoup(decoded)
        self.app.quit()
###################################################################
# Pages to render; each one gets its own Render instance.
l = [
    "http://www.google.com/?q=a",
    "http://www.google.com/?q=b",
    "http://www.google.com/?q=c",
]
for page in l:
    # Only the parsed tree is kept; the Render object itself is not stored.
    soup = Render(page).soup
    print("# soup done: " + page)
The example crashes because the Render class attempts to create a new QApplication and event-loop for every url it tries to load.
Instead, only one QApplication should be created, and the QWebPage subclass should load a new url after each page has been processed, rather than using a for-loop.
Here's a re-write of the example which should do what you want:
import sys, signal
from bs4 import BeautifulSoup
from bs4.dammit import UnicodeDammit
from PyQt4 import QtCore, QtGui, QtWebKit
class WebPage(QtWebKit.QWebPage):
    """Page that works through a queue of ``(url, callback)`` pairs in turn."""

    def __init__(self):
        super(WebPage, self).__init__()
        frame = self.mainFrame()
        frame.loadFinished.connect(self.handleLoadFinished)

    def process(self, items):
        """Start processing *items*, an iterable of ``(url, callback)`` pairs."""
        self._items = iter(items)
        self.fetchNext()

    def fetchNext(self):
        """Begin loading the next queued URL.

        Returns True when a load was started and False when the queue is
        exhausted.
        """
        try:
            self._url, self._func = next(self._items)
        except StopIteration:
            return False
        self.mainFrame().load(QtCore.QUrl(self._url))
        return True

    def handleLoadFinished(self):
        """Pass the loaded page to its callback, then advance or quit."""
        page_html = self.mainFrame().toHtml()
        self._func(self._url, page_html)
        more_pending = self.fetchNext()
        if more_pending:
            return
        print('# processing complete')
        QtGui.qApp.quit()
def funcA(url, html):
    """Example callback: report which URL is being processed.

    *html* is the page markup; this placeholder does not use it.
    """
    print('# processing:', url)
    # A real handler would parse the markup at this point, for example:
    #   soup = BeautifulSoup(UnicodeDammit(html).unicode_markup)
    #   ... work with soup ...
def funcB(url, html):
    """Second example callback: report which URL is being processed.

    *html* is the page markup; this placeholder does not use it.
    """
    print('# processing:', url)
    # A real handler would parse the markup at this point, for example:
    #   soup = BeautifulSoup(UnicodeDammit(html).unicode_markup)
    #   ... work with soup ...
if __name__ == '__main__':
    # Each entry pairs a URL with the callback that receives its HTML.
    items = [
        ('http://stackoverflow.com', funcA),
        ('http://google.com', funcB),
    ]

    # Install the default SIGINT handler before the Qt loop starts.
    signal.signal(signal.SIGINT, signal.SIG_DFL)
    print('Press Ctrl+C to quit\n')

    app = QtGui.QApplication(sys.argv)
    webpage = WebPage()
    webpage.process(items)
    exit_code = app.exec_()
    sys.exit(exit_code)

Wait until a webpage finished loading to load the next one in a list

I'm using a PyQt webView to visit some webpages whose URLs I have stored in a dictionary. The code is something like this:
def loadLink(self, url):
    """Ask the form's web view to load *url*.

    The unused QNetworkAccessManager and QNetworkRequest locals were
    dropped, and the view now receives the ``url`` argument instead of the
    undefined name ``visitar``.
    """
    self.ui.webView.load(QUrl(url))
def readUnreadLinks(self):
    """Print how many links are pending, then request each of them in turn.

    Uses single-argument ``print(...)`` calls, which produce the same output
    as the former Python 2 print statements and also run on Python 3.
    """
    print("Links to read: " + str(len(self.unreadLinks)))
    for link in self.unreadLinks:
        print("link-> " + str(link))
        self.loadLink(link)
the problem is it doesn't wait until finished loading the web page and starts loading the next one. I want to load a webpage, wait until it finished loading and then load the next one.
Thanks, this is driving me crazy :)
You might want to use the loadFinished signal of your QWebView control to detect when a page has finished loading and to trigger loading of the next one. Please see whether the example below works for you:
import sys
from PyQt4 import QtGui, QtCore, QtWebKit
class MainForm(QtGui.QMainWindow):
    """Window that loads a fixed list of pages one after another.

    Each ``loadFinished`` signal from the view triggers the next page in
    ``self.pages``; ``self.index`` is the position of the page in progress.
    """

    def __init__(self, parent=None):
        super(MainForm, self).__init__(parent)
        self.pages = ['http://www.google.com', 'http://www.citrix.com', 'http://yahoo.com', 'http://reddit.com']
        self.index = 0
        self.view = QtWebKit.QWebView()
        self.view.connect(self.view, QtCore.SIGNAL('loadFinished(bool)'), self.loadFinished)
        self.setCentralWidget(self.view)
        # Start the chain with the first page.
        self.view.load(QtCore.QUrl(self.pages[self.index]))

    def loadFinished(self, ok):
        """Advance to the next page, or report completion after the last one.

        *ok* is the flag delivered by the signal; it is not inspected, so the
        chain continues after a failed load as well.
        """
        self.index += 1
        if self.index < len(self.pages):
            self.view.load(QtCore.QUrl(self.pages[self.index]))
        else:
            # print() with one argument behaves the same on Python 2 and 3.
            print('done')
def main():
    """Create the Qt application, show the form and run the event loop."""
    application = QtGui.QApplication(sys.argv)
    window = MainForm()
    window.show()
    application.exec_()
# Start the demo only when this file is executed as a script.
if __name__ == "__main__":
    main()
hope this helps, regards

Categories