from selenium import webdriver
from selenium.webdriver.common.by import By
from multiprocessing import Process
from multiprocessing import Pool
import time
import json
def json_read(file):
with open(file, "r") as f:
return json.loads(f.read())
class item:
def __init__(self, name, level, location, guild, unit_price, volume, total_cost, last_seen):
self.name = name
self.level = level
self.location = location
self.guild = guild
self.unit_price = unit_price
self.volume = volume
self.total_cost = total_cost
self.last_seen = last_seen
class web_page_elements:
def __init__(self, item=None, max_price=None, depth=None):
self.items = []
self.depth = depth
self.max_price = max_price
self.url = "https://eu.tamrieltradecentre.com/pc/Trade/"
self.item = item
self.current_page = 1
self.op = op = webdriver.ChromeOptions()
self.op.add_argument('headless')
self.driver = webdriver.Chrome('chromedriver.exe') # options=op
def search(self):
self.driver.get(self.url)
print("on")
input_field = self.driver.find_element(By.XPATH, "/html/body/div[2]/table/tbody/tr[3]/td[2]/section/div/div[3]/div/form/div[2]/div[2]/div[1]/input[2]")
input_field.send_keys(self.item)
self.driver.find_element(By.XPATH, "/html/body/div[2]/table/tbody/tr[3]/td[2]/section/div/div[3]/div/form/div[2]/div[2]/div[1]/span/button").click()
self.driver.switch_to.window(self.driver.window_handles[-1])
def data_parse(self, data):
raw_data = data.text.split('\n')
return item(raw_data[0], raw_data[1], raw_data[3], raw_data[4], float(raw_data[5].replace(',', '')), int(raw_data[7]), raw_data[9].split()[0], raw_data[9].split(' ')[1])
def next_page(self):
if self.current_page == 1:
self.driver.get(self.driver.current_url + f"&page={self.current_page+1}")
self.current_page = self.current_page + 1
else:
self.driver.get(self.driver.current_url.replace(f"&page={self.current_page}", f"&page={self.current_page+1}"))
self.current_page = self.current_page + 1
def data_pull(self):
# gathers tabl row as elements(objects)
rows = self.driver.find_elements(By.XPATH, '//tr[#class="cursor-pointer"]')
data = [self.data_parse(item) for item in rows]
for item in data:
print(item.__dict__)
def quit(self):
self.driver.close()
self.driver.stop_client()
self.driver.quit()
def run(self):
self.search()
self.data_pull()
time.sleep(2)
self.next_page()
self.data_pull()
tasks = [Process(target=obj.run()) for obj in objs]
for task in tasks:
print(task)
task.start()
if __name__ == '__main__':
main()
this is the entire code as essentially it pulls items of a website will all the relevant data an there is multiple items. currently I create an instance for each item inputted in an external file, and then I then wish to take each object in the objs list and call the run class method on each object so that the run simultaneously instead of one at a time for run time purposes however i am unsure that you are able to do this in python
tasks = [Process(target=obj.run()) for obj in objs]
for task in tasks:
print(task)
task.start()
if __name__ == '__main__':
main()
this is currently how i was trying to do it however when i run the code they still appear to run after the one before has completed and i was hoping if anyone has a solution for this.
Related
TL;DR - the consumer processes finish but do not join, no errors are raised and the script runs infinitely, stuck in limbo on the join statment?
I am aiming to speed up a data retrieval process, however I do not know how many 'tasks' (pieces of data to retrieve) there might be. So I made a modified version of the poison pill method so that the task recognizes when it is no longer retrieving information, and triggers the poison pill if statement.
I have posted a proof, which is a working example of my poison pill method, and a full script, which as the name implies is the full script. (both should be able to run as is)
proof:
import multiprocessing
class Task:
def __init__(self, number):
self.number = number
def __call__(self):
"""Find officer and company data and combine and save it"""
try:
# 'gather some data!'
self.result = self.number*2
print(self.number)
# 'fake' finding no data
if self.result >= 8:
raise NameError
except NameError:
# become poison pill once latest is done
self.result = None
def output(self):
return self.result
class Consumer(multiprocessing.Process):
"""Handle process and re-queue complete tasks"""
def __init__(self, waiting_queue, complete_queue):
multiprocessing.Process.__init__(self)
self.waiting_queue = waiting_queue
self.complete_queue = complete_queue
def run(self):
"""process tasks until queue is empty"""
proc_name = self.name
while True:
current_task = self.waiting_queue.get()
current_task()
if current_task.output() is None:
print('{}: Exiting, poison pill reached'.format(proc_name))
self.waiting_queue.task_done()
break
self.waiting_queue.task_done()
self.complete_queue.put(current_task)
print('{}: complete'.format(proc_name))
class Shepard:
"""Handle life cycle of Consumers, Queues and Tasks"""
def __init__(self):
pass
def __call__(self, start_point):
# initialize queues
todo = multiprocessing.JoinableQueue()
finished = multiprocessing.JoinableQueue()
# start consumers
num_consumers = multiprocessing.cpu_count() * 2
consumers = [Consumer(todo, finished) for i in range(num_consumers)]
for q in consumers:
q.start()
# decide on (max) end limit (make much longer than suspected amount of data to be gathered
start = int(start_point)
max_record_range = 100
end = start + max_record_range
# Enqueue jobs
for i in range(start, end):
todo.put(Task(i))
print('Processes joining')
# wait for processes to join
for p in consumers:
p.join()
print('Processes joined')
# process results - UNFINISHED
pass
# return results - UNFINISHED
return 'results!'
if __name__ == '__main__':
# load start points:
start_points = {'cat1': 1, 'cat2': 3, 'cat3': 4}
master = Shepard()
cat1 = master(start_points['cat1'])
print('cat1 done')
cat2 = master(start_points['cat2'])
print('cat2 done')
cat3 = master(start_points['cat3'])
So here is the full script:
import time
import requests
import sys
import json
import pandas as pd
import multiprocessing
import queue
class CompaniesHouseRequest:
"""Retreive information from Companies House"""
def __init__(self, company, catagory_url=''):
"""Example URL: '/officers'"""
self.company = str(company)
self.catagory_url = str(catagory_url)
def retrieve(self, key='Rn7RLDV9Tw9v4ShDCotjDtJFBgp1Lr4d-9GRYZMo'):
"""retrieve data from Companies House"""
call = 'https://api.companieshouse.gov.uk/company/' + self.company + self.catagory_url
retrieve_complete = False
while retrieve_complete is False:
resp = requests.get(call, auth=requests.auth.HTTPBasicAuth(key, ''))
code = resp.status_code
if code == 404:
print(resp.status_code)
raise NameError('Company not found')
elif code == 200:
try:
self.data = json.loads(resp.content.decode('UTF8'))
retrieve_complete = True
except json.decoder.JSONDecodeError:
print('Decode Error in Officers!')
else:
print("Error:", sys.exc_info()[0])
print('Retrying')
time.sleep(5)
return self.data
class Company:
"""Retrieve and hold company details"""
def __init__(self, company_number):
self.company_number = company_number
def __call__(self):
"""Create request and process data"""
# make request
req = CompaniesHouseRequest(self.company_number)
data = req.retrieve()
# extract data
try:
line = [self.company_number,
data['company_name'],
data['registered_office_address'].get('premises', ''),
data['registered_office_address'].get('address_line_1', ''),
data['registered_office_address'].get('address_line_2', ''),
data['registered_office_address'].get('country', ''),
data['registered_office_address'].get('locality', ''),
data['registered_office_address'].get('postal_code', ''),
data['registered_office_address'].get('region', '')]
except KeyError:
line = ['' for i in range(0, 9)]
# save as pandas dataframe
return pd.DataFrame([line], columns=['company_number', 'company_name', 'company_address_premises',
'company_address_line_1', 'company_address_line_2',
'company_address_country', 'company_address_locality',
'company_address_postcode', 'company_address_region'])
def name_splitter(name):
split = name.split(', ')
if len(split) > 2:
return [split[2], split[1], split[0]]
else:
return ['', split[1], split[0]]
class Officers:
"""Retrieve and hold officers details"""
def __init__(self, company_number):
self.company_number = company_number
def __call__(self):
"""Create request and process data"""
# make request
req = CompaniesHouseRequest(self.company_number, '/officers')
data = req.retrieve()
# extract data
for officer in data['items']:
if officer['officer_role'] == 'director':
name = name_splitter(officer['name'])
line = [name[0],
name[1],
name[2],
officer.get('occupation'),
officer.get('country_of_residence'),
officer.get('nationality'),
officer.get('appointed_on', ''),
officer['address'].get('premises', ''),
officer['address'].get('address_line_1', ''),
officer['address'].get('address_line_2', ''),
officer['address'].get('country', ''),
officer['address'].get('locality', ''),
officer['address'].get('postal_code', ''),
officer['address'].get('region', '')]
break
director_count = sum(map(lambda x: x['officer_role'] == 'director', data['items']))
if director_count > 1:
line += [True]
elif director_count == 1:
line += [False]
else:
line = ['no directors'] * 3 + [''] * 12
return pd.DataFrame([line], columns=['title', 'first_name', 'surname', 'occupation', 'country_of_residence',
'nationality', 'appointed_on',
'address_premises', 'address_line_1', 'address_line_2',
'address_country', 'address_locality', 'address_postcode',
'address_region', 'multi_director'])
class Task:
def __init__(self, prefix, company_number):
self.prefix = prefix
self.company_number = company_number
def __call__(self):
"""Find officer and company data and combine and save it"""
comp_id = self.prefix + str(self.company_number)
print(comp_id)
try:
# initialise company class
comp = Company(comp_id)
# initialise officer class
off = Officers(comp_id)
# retrieve and concatonate
self.result = pd.concat([comp(), off()], axis=1)
except NameError:
# become poison pill once latest is done
self.result = None
def output(self):
return self.result
class Consumer(multiprocessing.Process):
"""Handle process and re-queue complete tasks"""
def __init__(self, waiting_queue, complete_queue):
multiprocessing.Process.__init__(self)
self.waiting_queue = waiting_queue
self.complete_queue = complete_queue
def run(self):
"""process tasks until queue is empty"""
proc_name = self.name
while True:
current_task = self.waiting_queue.get()
current_task()
if current_task.output() is None:
print('{}: Exiting, poison pill reached'.format(proc_name))
self.waiting_queue.task_done()
break
self.waiting_queue.task_done()
self.complete_queue.put(current_task)
print('{}: complete'.format(proc_name))
class Shepard:
"""Handle life of Consumers, Queues and Tasks"""
def __init__(self):
pass
def __call__(self, prefix, start_point):
# initialize queues
todo = multiprocessing.JoinableQueue()
finished = multiprocessing.JoinableQueue()
# start consumers
num_consumers = multiprocessing.cpu_count() * 2
consumers = [Consumer(todo, finished) for i in range(num_consumers)]
for q in consumers:
q.start()
# decide on (max) end limit
start = int(start_point)
max_record_range = 1000
end = start + max_record_range
# Enqueue jobs
for i in range(start, end):
todo.put(Task(prefix, i))
print('Processes joining')
# wait for processes to join
for p in consumers:
p.join()
print('Processes joined')
# process results - UNFINISHED
pass
# return results - UNFINISHED
return 'results!'
if __name__ == '__main__':
# paths to data
data_directory = r'C:\Users\hdewinton\OneDrive - Advanced Payment Solutions\Python\Corporate DM\data'
base = r'\base'
# load start points:
init = {"England": 10926071, "Scotland": 574309, "Ireland": 647561}
# gather data for each catagory
master = Shepard()
ireland = master('NI', init['Ireland'])
scotland = master('SC', init['Scotland'])
england = master('', init['England'])
TL;DR - the consequence (getting stuck in limbo while the consumers fail to join) can be fixed by changing this:
finished = multiprocessing.JoinableQueue()
to this:
mananger = multiprocessing.Manager()
finished = mananger.Queue()
Details - "When an object is put on a queue, the object is pickled and a background thread later flushes the pickled data to an underlying pipe. This has some consequences which are a little surprising, but should not cause any practical difficulties – if they really bother you then you can instead use a queue created with a manager." from the documentation
The second queue, of finished items, triggers one of the aforementioned surprising consquences if a certain number of tasks are added to it. Below the limit there are no problems and above the limit the consequence occurs. This does not occur in the dummy because the second queue, while present, is not used. The limit depends on the size and complexity of the Task objects, so I recon this has something to do with the flushing of pickled data only occurring after a certain volume of data is reached - the volume of data triggers this consequence
Addendum - Another error also appears once the fix has been implemented: a pipe error occurs as the consumers of the todo queue are terminated before the queue
is empty leaving the pipe within the queue object with no connection object to send data to. This triggers a WinError 232. Not to worry though, the pipe error can be fixed by emptying the queue before exiting the consumers.
Simply add this to the consumers class run method:
while not self.waiting_queue.empty():
try:
self.waiting_queue.get(timeout=0.001)
except:
pass
self.waiting_queue.close()
this removes every element from the queue, make sure its after the main while loop and the pipe error should not occur because the consumers will empty the will queue before terminating.
class Main:
PROJECT_NAME = 'something'
HOMEPAGE = 'something'
DOMAIN_NAME = get_domain_name(HOMEPAGE)
QUEUE_FILE = PROJECT_NAME + '/queue.txt'
CRAWLED_FILE = PROJECT_NAME + '/crawled.txt'
DATA_FILE = PROJECT_NAME + '/data.txt'
NUMBER_OF_THREADS = 20
queue = Queue()
Spider(PROJECT_NAME, HOMEPAGE, DOMAIN_NAME)
# Create worker threads (will die when main exits)
def create_workers(self):
for _ in range(self.NUMBER_OF_THREADS):
t = self.threading.Thread(target=self.work)
t.daemon = True
t.start()
# Do the next job in the queue
def work(self):
while True:
url = self.queue.get()
Spider.crawl_page(self.threading.current_thread().name, url)
self.queue.task_done()
# Each queued link is a new job
def create_jobs(self):
for link in self.file_to_set(self.QUEUE_FILE):
self.queue.put(link)
self.queue.join()
self.crawl()
# Check if there are items in the queue, if so crawl them
def crawl(self):
queued_links = self.file_to_set(self.QUEUE_FILE)
if len(queued_links) > 0:
print(str(len(queued_links)) + ' links in the queue')
self.create_jobs()
create_workers()
crawl()
Above are my code. I have been receiving:
NameError: name 'create_workers' is not defined and NameError: name 'crawl' is not defined.
Any help or suggestion for beginner here?
You must create an object of the class Main and through that object to use its methods, for this you must change:
create_workers()
crawl()
to:
m = Main()
m.create_workers()
m.crawl()
Why is not Factory.rdyq visible to inherited classes?
bar.py
import Queue
import threading
class Factory:
name = ""
rdyQ = Queue.Queue()
dispQ = Queue.Queue()
def __init__(self):
self.worker_count = 1
self.worker_thread = None
def worker(self): pass
def initialize(self):
for i in range(self.worker_count):
t = threading.Thread(target=self.worker)
t.daemon = True
t.start()
self.worker_thread = t
#staticmethod
def joinQ():
Factory.rdyQ.join()
Factory.dispQ.join()
return
#staticmethod
def getFactory(factory_name):
if factory_name == "setup":
return SetupFactory()
elif factory_name == "dispatch":
return DispatchFactory()
elif factory_name == "complete":
return CompleteFactory()
else:
return None
class SetupFactory(Factory):
name = "setup"
def worker(self):
while True:
try:
item = Factory.rdyq.get(timeout=1)
print "setup: %s" % item
Factory.rdyq.task_done()
Factory.dispQ.put(item)
except Queue.Empty, msg:
continue
class DispatchFactory(Factory):
name = "dispatch"
def worker(self):
while True:
try:
item = Factory.dispQ.get(timeout=1)
print "dispQ: %s" % item
Factory.dispQ.task_done()
except Queue.Empty, msg:
continue
class CompleteFactory(Factory):
name = "complete"
def worker(self):
for i in range(10):
Factory.rdyq.put(i)
foo.py
import bar as df
setup = df.Factory.getFactory("setup")
dispatch = df.Factory.getFactory("dispatch")
complete = df.Factory.getFactory("complete")
setup.initialize()
dispatch.initialize()
complete.initialize()
df.Factory.joinQ()
setup.worker_thread.join()
dispatch.worker_thread.join()
complete.worker_thread.join()
python foo.py
File "/u/bar.py", line 73, in worker
Factory.rdyq.put(i)
AttributeError: class Factory has no attribute 'rdyq'
You should use a classmethod instead of a staticmethod for joinQ.
You also had a rdyq (bar.py last line) instead of the expected rdyQ and Python is case-sensitive.
bar.py
import Queue
import threading
class Factory:
name = ""
rdyQ = Queue.Queue()
dispQ = Queue.Queue()
def __init__(self):
self.worker_count = 1
self.worker_thread = None
def worker(self): pass
def initialize(self):
for i in range(self.worker_count):
t = threading.Thread(target=self.worker)
t.daemon = True
t.start()
self.worker_thread = t
#classmethod
def joinQ(cls):
cls.rdyQ.join()
cls.dispQ.join()
return
#staticmethod
def getFactory(factory_name):
if factory_name == "setup":
return SetupFactory()
elif factory_name == "dispatch":
return DispatchFactory()
elif factory_name == "complete":
return CompleteFactory()
else:
return None
class SetupFactory(Factory):
name = "setup"
def worker(self):
while True:
try:
item = Factory.rdyq.get(timeout=1)
print "setup: %s" % item
Factory.rdyq.task_done()
Factory.dispQ.put(item)
except Queue.Empty, msg:
continue
class DispatchFactory(Factory):
name = "dispatch"
def worker(self):
while True:
try:
item = Factory.dispQ.get(timeout=1)
print "dispQ: %s" % item
Factory.dispQ.task_done()
except Queue.Empty, msg:
continue
class CompleteFactory(Factory):
name = "complete"
def worker(self):
for i in range(10):
Factory.rdyQ.put(i)
Python variables and attributes are case sensitive, so "rdyq" is not the same as "rdyQ".
You set the name with a capital q, so maybe this will fix it?
Factory.rdyQ.put(i)
I recommend using_underscores_for_variables as you avoid issues camelCase can cause.
Below I have an example program. When the button is pressed, it takes a second before it can calculate the value to show. If the user presses the button in rapid succession they end up waiting a long time to see the last answer, which is the only answer they care about. In the code, you can see that the _dataCruncher function needs to know self._count, but self._count does not depend on the output of _dataCruncher.
My question, therefore, is how can I interrupt the normal execution of _dataCruncher on subsequent calls in order to keep the GUI free to do other stuff, and to not waste processing time when it is not needed? I realize that I will likely need to use a thread to run _dataCruncher and some sort of Queue to get the appropriate val to display, but I do not understand how to put this all together.
from PyQt4 import QtGui, QtCore
import sys
import time
import random
import random
class MainWindow(QtGui.QMainWindow):
def __init__(self):
self.app = QtGui.QApplication(sys.argv)
super(MainWindow, self).__init__()
self.count = 0
self.initUI()
def initUI(self):
# Layouts
central = QtGui.QWidget()
layout = QtGui.QVBoxLayout()
self.button = QtGui.QPushButton('Press Me')
self.text = QtGui.QLabel('?')
layout.addWidget(self.button)
layout.addWidget(self.text)
central.setLayout(layout)
self.setCentralWidget(central)
self.button.clicked.connect(self._buttonClicked)
def _dataCruncher(self, val):
time.sleep(1) # takes a long time to process data using val
return val * random.randint(1,10)
def _buttonClicked(self):
self.count += 1
val = self._dataCruncher(self.count)
self.text.setText('Value {}'.format(val))
def startup(self):
self.show()
result = self.app.exec_()
sys.exit(result)
if __name__ == '__main__':
random.seed()
myWindow = MainWindow()
myWindow.startup()
So, finding an answer to this was more complicated than I thought. As #MTset mentions in one of the comments, python does not offer any means by which to cancel the execution of a Thread. So, what I did was create a 'threadHandler' class that, well, handles thread. It keeps track of the last thread that was created and offers a means by which to get the result from the execution of the last thread.
I am posting an modified version of the test code from the original post as well as the threadHandler code in full in case anyone has use for it.
File 1 here
# tester.py, run this file
from PyQt4 import QtGui, QtCore
import random, sys, time
from threadHandler import MyThreadHandler
class MyModel(object):
def dataCruncher(self, val):
delay = random.randint(1,5)
print('{} sleeping for {}'.format(val, delay))
time.sleep(delay) # takes a long time to process data using val
print('{} done sleeping'.format(val))
return val
class MainWindow(QtGui.QMainWindow):
def __init__(self, threadHandler):
self.app = QtGui.QApplication(sys.argv)
super(MainWindow, self).__init__()
self.count = 0
self.initUI()
self.button_clicked_events = Event()
self.threadHandler = threadHandler
def initUI(self):
# Layouts
central = QtGui.QWidget()
layout = QtGui.QVBoxLayout()
self.button = QtGui.QPushButton('Press Me')
self.text = QtGui.QLabel('?')
layout.addWidget(self.button)
layout.addWidget(self.text)
central.setLayout(layout)
self.setCentralWidget(central)
self.button.clicked.connect(self._buttonClicked)
def _buttonClicked(self):
self.count += 1
self.button_clicked_events(self.count)
def setLabel(self, val):
self.text.setText(str(val))
def startup(self):
self.show()
result = self.app.exec_()
return result
class Event(list):
"""Event subscription.
A list of callable objects. Calling an instance of this will cause a
call to each item in the list in ascending order by index.
Example Usage:
>>> def f(x):
... print 'f(%s)' % x
>>> def g(x):
... print 'g(%s)' % x
>>> e = Event()
>>> e()
>>> e.append(f)
>>> e(123)
f(123)
>>> e.remove(f)
>>> e()
>>> e += (f, g)
>>> e(10)
f(10)
g(10)
>>> del e[0]
>>> e(2)
g(2)
"""
def __init__(self):
self.output = {}
def __call__(self, *args, **kwargs):
for f,key in self:
output = f(*args, **kwargs)
self.output[key] = output
return self.output
def __repr__(self):
return "Event({})".format(list.__repr__(self))
if __name__ == '__main__':
def checker(handler, window):
if handler.isLastDone():
val = handler.getLastResult()
window.setLabel(val)
else:
window.setLabel('calculating...')
random.seed()
model = MyModel()
threadHandler = MyThreadHandler()
myWindow = MainWindow(threadHandler)
threadHandler.createTimer(1, checker, threadHandler, myWindow)
def getData(count):
threadHandler.createOneShot(model.dataCruncher, count)
myWindow.button_clicked_events.append((getData, 'dt'))
result = myWindow.startup()
print('ending')
threadHandler.end()
print('ended')
sys.exit(result)
File 2 below
#threadHandler.py, save this file in the same folder as tester.py
import threading, time
class MyThreadHandler(object):
def __init__(self):
self.oneShots = []
self.timers = []
self.oldOneShots = []
self.latest = None
self.cleaning = False
self._startCleaner()
def _startCleaner(self):
print('-'*20+'Starting cleaner'+'-'*20)
self.cleaner = self.createTimer(1, self._cleanupThreads)
def _stopCleaner(self):
print('-'*20+'Stopping cleaner'+'-'*20)
self.cleaner.stop()
def getNumThreads(self):
return len(self.oneShots)
def getNumOldThreads(self):
return len(self.oldOneShots)
def end(self):
for i,timer in enumerate(self.timers):
timer.stop()
self.timers.pop(i)
def createTimer(self, interval, func, *args, **kwargs):
timer = myTimer(interval, func, args, kwargs)
self.timers.append(timer)
return timer
def createOneShot(self, func, *args, **kwargs):
oneshot = myOneShot(func, args, kwargs)
self.oneShots.append(oneshot)
self.latest = oneshot
def isLastDone(self):
if not self.latest is None:
return not self.latest.running()
else:
return None
def getLastResult(self):
if self.latest is None:
raise ValueError('There have not been any oneshots created.')
while self.latest.running():
pass
result = self.latest.getResult()
if len(self.oneShots) > 0:
self.oldOneShots.append(myOneShot(self._cleanAll, (self.oneShots,)))
self.oneShots = []
return result
def _cleanAll(self, toClean):
# loop through toClean and pop up anything that's done. this DOES lock
while len(toClean) > 0:
toClean = self._cleanup(toClean)
def _cleanup(self, toCleanup):
while not self.cleaning:
self.cleaning = True
for i, thread in enumerate(toCleanup):
if not thread.running():
toCleanup.pop(i)
self.cleaning = False
return toCleanup
def _cleanupThreads(self):
# check each of these lists and pop out any threads that are done. This
# does not lock. This function should really only be called by the
# cleaner, which is set up in __init__
self.oneShots = self._cleanup(self.oneShots)
self.timers = self._cleanup(self.timers)
self.oldOneShots = self._cleanup(self.oldOneShots)
class myTimer(object):
def __init__(self, delay, func, args=tuple(), kwargs={}):
self.delay = delay
self.func = func
self.loop = True
self.args = args
self.kwargs = kwargs
self.thread = threading.Thread(target=self.run, daemon=True)
self.thread.start()
self.output = None
def run(self):
while self.loop:
self.output = self.func(*self.args, **self.kwargs)
if self.delay > 0.1:
count = 0
while count <= self.delay:
count += 0.1
time.sleep(0.1)
else:
time.sleep(self.delay)
def stop(self):
self.loop = False
def running(self):
return self.loop
def getResult(self):
return self.output
class myOneShot(object):
def __init__(self, func, args=tuple(), kwargs={}):
self.func = func
self.args = args
self.kwargs = kwargs
self.thread = threading.Thread(target=self.run, daemon=True)
self.thread.start()
self.output = None
def run(self):
self.output = self.func(*self.args, **self.kwargs)
def running(self):
return self.thread.is_alive()
def getResult(self):
return self.output
if __name__ == '__main__':
import random
random.seed()
def longFunc(num):
delay = random.randint(5,8)
if num in (3, 6):
delay = 2
print('-'*30+'func {} has sleep {}'.format(num, delay))
time.sleep(delay)
print('-'*30+'func {} is done'.format(num))
return num
def checker(handler):
if handler.isLastDone():
return handler.getLastResult()
else:
return None
myHandler = MyThreadHandler()
# The 'checker' function simulates something in my program that uses the
# data generated by the 'longFunc'. It waits until there are no more threads
# in the threadHandler, as that would indicate that the user is done
# switching back-and-forth between different values
checkTimer = myHandler.createTimer(1, checker, myHandler)
# create 10 one-shot threads that take a 'long' time. The delay is to keep
# them in order, as this loop is meant to simulate a user switching between
# items using a keyboard or mouse, which I imagine they couldn't do any
# faster than every 1/10th of a second
start = time.time()
for i in range(4):
myHandler.createOneShot(longFunc, i)
time.sleep(0.1)
# wait until there are no more threads executing
last = myHandler.getLastResult()
print('result from last = {}'.format(last))
for i in range(4, 7):
myHandler.createOneShot(longFunc, i)
time.sleep(0.1)
last = myHandler.getLastResult()
print('result from last = {}'.format(last))
while myHandler.getNumOldThreads() >0 or myHandler.getNumThreads() > 0:
pass
myHandler.end()
print('done ending')
You could disable the button after it's pressed until an answer is ready using:
setEnabled(False)
Then reset it just before providing the result.
I'm trying to scrape a large website of government records which requires a "snowball" method, i.e., starting at the main search page and then following each link that the scraper finds to the next page.
I've been able to load the main page using PyQt this SiteScraper tutorial.
import sys
from PySide.QtGui import *
from PySide.QtCore import *
from PySide.QtWebKit import *
from BeautifulSoup import BeautifulSoup
class Render(QWebPage):
def __init__(self, url):
self.app = QApplication(sys.argv)
QWebPage.__init__(self)
self.loadFinished.connect(self._loadFinished)
self.mainFrame().load(QUrl(url))
self.app.exec_()
def _loadFinished(self, result):
self.frame = self.mainFrame()
self.app.quit()
def main():
baseUrl = 'http://www.thesite.gov'
url = 'http://www.thesite.gov/search'
r = Render(url)
html = r.frame.toHtml()
# use BeautifulSoup to cycle through each regulation
soup = BeautifulSoup(html)
regs = soup.find('div',{'class':'x-grid3-body'}).findAll('a')
# cycle through list and call up each page separately
for reg in regs:
link = baseUrl + reg['href']
link = str(link)
# use Qt to load each regulation page
r = Render(link)
html = r.frame.toHtml() # get actual rendered web page
The problem is I get this error when I try to render a new webpage:
RuntimeError: A QApplication instance already exists.
I get it that the function is trying to call another QApplication instance. But how do I navigate to a new page with the same instance?
class Render(QWebPage):
def __init__(self, app, url):
QWebPage.__init__(self)
self.loadFinished.connect(self._loadFinished)
self.mainFrame().load(QUrl(url))
app.exec_()
def _loadFinished(self, result):
self.frame = self.mainFrame()
def main():
app = QApplication(sys.argv)
baseUrl = 'http://www.thesite.gov'
url = 'http://www.thesite.gov/search'
r = Render(app, url)
html = r.frame.toHtml()
I had the same problem (needing to load multiple pages with QWebPage) but I couldn't get any of these answers to work for me. Here's what did work, the key is to use a QEventLoop and connect loadFinished to loop.quit:
from PySide import QtCore, QtGui, QtWebKit
import sys
def loadPage(url):
page = QtWebKit.QWebPage()
loop = QtCore.QEventLoop() # Create event loop
page.mainFrame().loadFinished.connect(loop.quit) # Connect loadFinished to loop quit
page.mainFrame().load(url)
loop.exec_() # Run event loop, it will end on loadFinished
return page.mainFrame().toHtml()
app = QtGui.QApplication(sys.argv)
urls = ['https://google.com', 'http://reddit.com', 'http://wikipedia.org']
for url in urls:
print '-----------------------------------------------------'
print 'Loading ' + url
html = loadPage(url)
print html
app.exit()
Posting a simplified example here compared to OP's to demonstrate the essential problem and solution.
You're crazy man! QT has a much better DOM than beautifulsoup.
Replace:
soup = BeautifulSoup(html)
With
page = QWebPage()
page.settings().setAttribute(QWebSettings.AutoLoadImages, False)
page.settings().setAttribute(QWebSettings.PluginsEnabled, False)
page.mainFrame().setHtml(html)
dom = page.mainFrame().documentElement()
Then you can simply scrape data like so:
li = dom.findFirst("body div#content div#special ul > li")
if not li.isNull():
class = li.attribute("class")
text = li.toPlainText()
Finally you should use QWebView instead of QWebPage. You can set it up to act like a server which can be controlled with a socket. This is what I do:
class QTimerWithPause(QTimer):
def __init__(self, parent = None):
super(QTimerWithPause, self).__init__ (parent)
self.startTime = 0
self.interval = 0
return
def start(self, interval):
from time import time
self.interval = interval
self.startTime = time()
super(QTimerWithPause, self).start(interval)
return
def pause(self):
from time import time
if self.isActive ():
self.stop()
elapsedTime = self.startTime - time()
self.startTime -= elapsedTime
# time() returns float secs, interval is int msec
self.interval -= int(elapsedTime*1000)+1
return
def resume(self):
if not self.isActive():
self.start(self.interval)
return
class CrawlerWebServer(QWebView):
TIMEOUT = 60
STUPID = r"(bing|yahoo|google)"
def __init__(self, host="0.0.0.0", port=50007, parent=None, enableImages=True, enablePlugins=True):
# Constructor
super(CrawlerWebServer, self).__init__(parent)
self.command = None
self.isLoading = True
self.isConnected = False
self.url = QUrl("http://mast3rpee.tk/")
self.timeout = QTimerWithPause(self)
self.socket = QTcpServer(self)
# 1: Settings
self.settings().enablePersistentStorage()
self.settings().setAttribute(QWebSettings.AutoLoadImages, enableImages)
self.settings().setAttribute(QWebSettings.PluginsEnabled, enablePlugins)
self.settings().setAttribute(QWebSettings.DeveloperExtrasEnabled, True)
# 2: Server
if args.verbosity > 0: print "Starting server..."
self.socket.setProxy(QNetworkProxy(QNetworkProxy.NoProxy))
self.socket.listen(QHostAddress(host), int(port))
self.connect(self.socket, SIGNAL("newConnection()"), self._connect)
if args.verbosity > 1:
print " Waiting for connection(" + host + ":" + str(port) + ")..."
# 3: Default page
self._load(10*1000, self._loadFinished)
return
def __del__(self):
try:
self.conn.close()
self.socket.close()
except:
pass
return
def _sendAuth(self):
self.conn.write("Welcome to WebCrawler server (http://mast3rpee.tk)\r\n\rLicenced under GPL\r\n\r\n")
def _connect(self):
self.disconnect(self.socket, SIGNAL("newConnection()"), self._connect)
self.conn = self.socket.nextPendingConnection()
self.conn.nextBlockSize = 0
self.connect(self.conn, SIGNAL("readyRead()"), self.io)
self.connect(self.conn, SIGNAL("disconnected()"), self.close)
self.connect(self.conn, SIGNAL("error()"), self.close)
self._sendAuth()
if args.verbosity > 1:
print " Connection by:", self.conn.peerAddress().toString()
self.isConnected = True
if self.isLoading == False:
self.conn.write("\r\nEnter command:")
return
def io(self):
if self.isLoading: return None
if args.verbosity > 0:
print "Reading command..."
data = self.conn.read(1024).strip(" \r\n\t")
if not data: return None
elif self.command is not None:
r = self.command(data)
self.command = None
return r
return self._getCommand(data)
def _getCommand(self, d):
from re import search
d = unicode(d, errors="ignore")
if search(r"(help|HELP)", d) is not None:
self.conn.write("URL | JS | WAIT | QUIT\r\n\r\nEnter Command:")
elif search(r"(url|URL)", d) is not None:
self.command = self._print
self.conn.write("Enter address:")
elif search(r"(js|JS|javascript|JAVASCRIPT)", d) is not None:
self.command = self._js
self.conn.write("Enter javascript to execte:")
elif search(r"(wait|WAIT)", d) is not None:
self.loadFinished.connect(self._loadFinishedPrint)
self.loadFinished.connect(self._loadFinished)
elif search(r"(quit|QUIT|exit|EXIT)", d) is not None:
self.close()
else:
self.conn.write("Invalid command!\r\n\r\nEnter Command:")
return
def _print(self, d):
u = d[:250]
self.out(u)
return True
def _js(self, d):
try:
self.page().mainFrame().evaluateJavaScript(d)
except:
pass
self.conn.write("Enter Javascript:")
return True
def _stop(self):
from time import sleep
if self.isLoading == False: return
if args.verbosity > 0:
print " Stopping..."
self.timeout.stop()
self.stop()
def _load(self, timeout, after):
# Loads a page into frame / sets up timeout
self.timeout.timeout.connect(self._stop)
self.timeout.start(timeout)
self.loadFinished.connect(after)
self.load(self.url)
return
def _loadDone(self, disconnect = None):
from re import search
from time import sleep
self.timeout.timeout.disconnect(self._stop)
self.timeout.stop()
if disconnect is not None:
self.loadFinished.disconnect(disconnect)
# Stick a while on the page
if search(CrawlerWebServer.STUPID, self.url.toString(QUrl.RemovePath)) is not None:
sleep(5)
else:
sleep(1)
return
def _loadError(self):
from time import sleep, time
if not self.timeout.isActive(): return True
if args.verbosity > 0: print " Error retrying..."
# 1: Pause timeout
self.timeout.pause()
# 2: Check for internet connection
while self.page().networkAccessManager().networkAccessible() == QNetworkAccessManager.NotAccessible: sleep(1)
# 3: Wait then try again
sleep(2)
self.reload()
self.timeout.resume()
return False
def go(self, url, after = None):
# Go to a specific address
global args
if after is None:
after = self._loadFinished
if args.verbosity > 0:
print "Loading url..."
self.url = QUrl(url)
self.isLoading = True
if args.verbosity > 1:
print " ", self.url.toString()
self._load(CrawlerWebServer.TIMEOUT * 1000, after)
return
def out(self, url):
# Print html of a a specific url
self.go(url, self._loadFinishedPrint)
return
def createWindow(self, windowType):
# Load links in the same web-view.
return self
def _loadFinished(self, ok):
# Default LoadFinished
from time import sleep
from re import search
if self.isLoading == False: return
if ok == False:
if not self._loadError(): return
self._loadDone(self._loadFinished)
if args.verbosity > 1:
print " Done"
if self.isConnected == True:
self.conn.write("\r\nEnter command:")
self.isLoading = False
return
def _loadFinishedPrint(self, ok):
# Print the evaluated HTML to stdout
if self.isLoading == False: return
if ok == False:
if not self._loadError(): return
self._loadDone(self._loadFinishedPrint)
if args.verbosity > 1:
print " Done"
h = unicode( self.page().mainFrame().toHtml(), errors="ignore" )
if args.verbosity > 2:
print "------------------\n" + h + "\n--------------------"
self.conn.write(h)
self.conn.write("\r\nEnter command:")
self.isLoading = False
return
def contextMenuEvent(self, event):
# Context Menu
menu = self.page().createStandardContextMenu()
menu.addSeparator()
action = menu.addAction('ReLoad')
#action.triggered.connect
def refresh():
self.load(self.url)
menu.exec_(QCursor.pos())
class CrawlerWebClient(object):
def __init__(self, host, port):
import socket
global args
# CONNECT TO SERVER
self.socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
self.socket.connect((host, port))
o = self.read()
if args.verbosity > 2:
print "\n------------------------------\n" + o + "\n------------------------------\n"
return
def __del__(self):
try: self.socket.close()
except: pass
def read(self):
from re import search
r = ""
while True:
out = self.socket.recv(64*1024).strip("\r\n")
if out.startswith(r"Enter"):
break
if out.endswith(r"Enter command:"):
r += out[:-14]
break
r += out
return r
def command(self, command):
global args
if args.verbosity > 2:
print " Command: [" + command + "]\n------------------------------"
self.socket.sendall(unicode(command))
r = self.read()
if args.verbosity > 2:
print r, "\n------------------------------\n"
return r
OK then. If you really need JavaScript. (Can you get the answer from JSON at all? That would probably be easier still with simplejson or json.) The answer is don't make more than one QApplication. You're not allowed to. Make main make a QApplication and then use the QWebPage without bothering to call QApplication.exec_(). If that doesn't work, run it all in another QThread.
I am not familiar with PyQt, but as an option, you could write your script without using a class. That way, you can more easily re-use that application instance.
Hope it helps.