I'm building a Sublime Text 3 plugin to shorten URLs using the goo.gl API. Bear in mind that the following code is hacked together from other plugins and tutorial code. I have no previous experience with Python.
The plugin does actually work as it is. The URL is shortened and replaced inline. Here is the plugin code:
import sublime
import sublime_plugin
import urllib.request
import urllib.error
import json
import threading
class ShortenUrlCommand(sublime_plugin.TextCommand):
def run(self, edit):
sels = self.view.sel()
threads = []
for sel in sels:
url = self.view.substr(sel)
thread = GooglApiCall(sel, url, 5) # Send the selection, the URL and timeout to the class
threads.append(thread)
thread.start()
# Wait for threads
for thread in threads:
thread.join()
self.view.sel().clear()
self.handle_threads(edit, threads, sels)
def handle_threads(self, edit, threads, sels, offset=0, i=0, dir=1):
next_threads = []
for thread in threads:
sel = thread.sel
result = thread.result
if thread.is_alive():
next_threads.append(thread)
continue
if thread.result == False:
continue
offset = self.replace(edit, thread, sels, offset)
thread = next_threads
if len(threads):
before = i % 8
after = (7) - before
if not after:
dir = -1
if not before:
dir = 1
i += dir
self.view.set_status("shorten_url", "[%s=%s]" % (" " * before, " " * after))
sublime.set_timeout(lambda: self.handle_threads(edit, threads, sels, offset, i, dir), 100)
return
self.view.erase_status("shorten_url")
selections = len(self.view.sel())
sublime.status_message("URL shortener successfully ran on %s URL%s" %
(selections, "" if selections == 1 else "s"))
def replace(self, edit, thread, sels, offset):
sel = thread.sel
result = thread.result
if offset:
sel = sublime.Region(edit, thread.sel.begin() + offset, thread.sel.end() + offset)
self.view.replace(edit, sel, result)
return
class GooglApiCall(threading.Thread):
def __init__(self, sel, url, timeout):
self.sel = sel
self.url = url
self.timeout = timeout
self.result = None
threading.Thread.__init__(self)
def run(self):
try:
apiKey = "xxxxxxxxxxxxxxxxxxxxxxxx"
requestUrl = "https://www.googleapis.com/urlshortener/v1/url"
data = json.dumps({"longUrl": self.url})
binary_data = data.encode("utf-8")
headers = {
"User-Agent": "Sublime URL Shortener",
"Content-Type": "application/json"
}
request = urllib.request.Request(requestUrl, binary_data, headers)
response = urllib.request.urlopen(request, timeout=self.timeout)
self.result = json.loads(response.read().decode())
self.result = self.result["id"]
return
except (urllib.error.HTTPError) as e:
err = "%s: HTTP error %s contacting API. %s." % (__name__, str(e.code), str(e.reason))
except (urllib.error.URLError) as e:
err = "%s: URL error %s contacting API" % (__name__, str(e.reason))
sublime.error_message(err)
self.result = False
The problem is that I get the following error in the console every time the plugin runs:
Traceback (most recent call last):
File "/Users/joejoinerr/Library/Application Support/Sublime Text 3/Packages/URL Shortener/url_shortener.py", line 51, in <lambda>
sublime.set_timeout(lambda: self.handle_threads(edit, threads, sels, offset, i, dir), 100)
File "/Users/joejoinerr/Library/Application Support/Sublime Text 3/Packages/URL Shortener/url_shortener.py", line 39, in handle_threads
offset = self.replace(edit, thread, sels, offset)
File "/Users/joejoinerr/Library/Application Support/Sublime Text 3/Packages/URL Shortener/url_shortener.py", line 64, in replace
self.view.replace(edit, sel, result)
File "/Applications/Sublime Text.app/Contents/MacOS/sublime.py", line 657, in replace
raise ValueError("Edit objects may not be used after the TextCommand's run method has returned")
ValueError: Edit objects may not be used after the TextCommand's run method has returned
I'm not sure what the problem is from that error. I have done some research and I understand that the solution may be held in the answer to this question, but due to my lack of Python knowledge I can't figure out how to adapt it to my use case.
I was searching for a Python autocompletion plugin for Sublime and found this question. I like your plugin idea. Did you ever get it working? The ValueError is telling you that you are trying to use the edit argument to ShortenUrlCommand.run after ShortenUrlCommand.run has returned. I think you could do this in Sublime Text 2 using begin_edit and end_edit, but in 3 your plugin has to finish all of its edits before run returns (https://www.sublimetext.com/docs/3/porting_guide.html).
In your code, the handle_threads function is checking the GoogleApiCall threads every 100 ms and executing the replacement for any thread that has finished. But handle_threads has a typo that causes it to run forever: thread = next_threads where it should be threads = next_threads. This means that finished threads are never removed from the list of active threads and all threads get processed in each invocation of handle_threads (eventually throwing the exception that you see).
You actually don't need to worry about whether the GoogleApiCall treads are finished in handle_threads, though, because you call join on each one before calling handle_threads (see the python threading docs for more detail on join: https://docs.python.org/2/library/threading.html). You know the threads are finished, so you can just do something like:
def handle_threads(self, edit, threads, sels):
offset = 0
for thread in threads:
if thread.result:
offset = self.replace(edit, thread, sels, offset)
selections = len(threads)
sublime.status_message("URL shortener successfully ran on %s URL%s" %
(selections, "" if selections == 1 else "s"))
This still has problems: it does not properly handle multiple selections and it blocks the UI thread in Sublime.
Multiple Selections
When you replace multiple selections you have to consider that the replacement text might not be the same length as the text it replaces. This shifts the text after it and you have to adjust the indexes for subsequent selected regions. For example, suppose the URLs are selected in the following text and that you are replacing them with shortened URLs:
1 2 3 4 5 6 7
01234567890123456789012345678901234567890123456789012345678901234567890123
blah blah http://example.com/long blah blah http://example.com/longer blah
The second URL occupies indexes 44 to 68. After replacing the first URL we have:
1 2 3 4 5 6 7
01234567890123456789012345678901234567890123456789012345678901234567890123
blah blah http://goo.gl/abc blah blah http://example.com/longer blah
Now the second URL occupies indexes 38 to 62. It is shifted by -6: the difference between the length of the string we just replaced and the length of the string we replaced it with. You need keep track of that difference and update it after each replacement as you go along. It looks like you had this in mind with your offset argument, but never got around to implementing it.
def handle_threads(self, edit, threads, sels):
offset = 0
for thread in threads:
if thread.result:
offset = self.replace(edit, thread.sel, thread.result, offset)
selections = len(threads)
sublime.status_message("URL shortener successfully ran on %s URL%s" %
(selections, "" if selections == 1 else "s"))
def replace(self, edit, selection, replacement_text, offset):
# Adjust the selection region to account for previous replacements
adjusted_selection = sublime.Region(selection.begin() + offset,
selection.end() + offset)
self.view.replace(edit, adjusted_selection, replacement_text)
# Update the offset for the next replacement
old_len = selection.size()
new_len = len(replacement_text)
delta = new_len - old_len
new_offset = offset + delta
return new_offset
Blocking the UI Thread
I'm not familiar with Sublime plugins, so I looked at how this is handled in the Gist plugin (https://github.com/condemil/Gist). They block the UI thread for the duration of the HTTP requests. This seems undesirable, but I think there might be a problem if you don't block: the user could change the text buffer and invalidate the selection indexes before your plugin finishes its updates. If you want to go down this road, you might try moving the URL shortening calls into a WindowCommand. Then once you have the replacement text you could execute a replacement command on the current view for each one. This example gets the current view and executes ShortenUrlCommand on it. You will have to move the code that collects the shortened URLs out into ShortenUrlWrapperCommand.run:
class ShortenUrlWrapperCommand(sublime_plugin.WindowCommand):
def run(self):
view = self.window.active_view()
view.run_command("shorten_url")
Related
I need to get all the elements on a page and iterate through them to search each element.
currently I am using, driver.find_elements_by_xpath('//*[#*]')
However, there can be a delay in completing the line of code above on larger pages. Is there a way to retrieve the results in increments of 100 elements? Or at least add a timeout?
Terminating driver.find_elements_by_xpath('//*[#*]') inside a multithread is the only why I currently think I can solve this.
I need to find all elements on a page that contain certain strings. For example. elem.get_attribute('outerHTML').find('type="submit"') != -1 … and so on and so forth … I also need their proximity to each other to compare index positions
Thanks!
import Globalz ###### globals import is an empty .py file
import threading
import time
import ctypes
def find_xpath():
for i in range(5):
print(i)
time.sleep(1)
Globalz.curr_value = 'DONE!'
### this is where the xpath retrieval goes (ABOVE loop is for example purposes only)
def stopwatch(info):
curr_time = 0
failed = False
Globalz.curr_value = ''
thread1 = threading.Thread(target=info['function'])
thread1.start()
while thread1.is_alive() is True:
if curr_time >= info['timeout']: failed = True; ctypes.pythonapi.PyThreadState_SetAsyncExc(ctypes.c_long(thread1.ident), ctypes.py_object(SystemExit))
curr_time += 1; time.sleep(1)
if failed is True: return info['failed_returns']
if failed is False: return Globalz.curr_value
betty = stopwatch({'function': find_xpath, 'timeout': 10, 'failed_returns': 'failed'})
print(betty)
If anyone is interested here is a solution. I've created a wrapper called stopwatch()
It is fairly easy to do parallel work with Python 3's concurrent.futures module as shown below.
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
future_to = {executor.submit(do_work, input, 60): input for input in dictionary}
for future in concurrent.futures.as_completed(future_to):
data = future.result()
It is also very handy to insert and retrieve items into a Queue.
q = queue.Queue()
for task in tasks:
q.put(task)
while not q.empty():
q.get()
I have a script running in background listening for updates. Now, in theory assume that, as those updates arrive, I would queue them and do work on them concurrently using the ThreadPoolExecutor.
Now, individually, all of these components work in isolation, and make sense, but how do I go about using them together? I am not aware if it is possible to feed the ThreadPoolExecutor work from the queue in real time unless the data to work from is predetermined?
In a nutshell, all I want to do is, receive updates of say 4 messages a second, shove them in a queue, and get my concurrent.futures to work on them. If I don't, then I am stuck with a sequential approach which is slow.
Let's take the canonical example in the Python documentation below:
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
future_to_url = {executor.submit(load_url, url, 60): url for url in URLS}
for future in concurrent.futures.as_completed(future_to_url):
url = future_to_url[future]
try:
data = future.result()
except Exception as exc:
print('%r generated an exception: %s' % (url, exc))
else:
print('%r page is %d bytes' % (url, len(data)))
The list of URLS is fixed. Is it possible to feed this list in real-time and get the worker to process it as they come by, perhaps from a queue for management purposes? I am a bit confused on whether my approach is actually possible?
The example from the Python docs, expanded to take its work from a queue. A change to note, is that this code uses concurrent.futures.wait instead of concurrent.futures.as_completed to allow new work to be started while waiting for other work to complete.
import concurrent.futures
import urllib.request
import time
import queue
q = queue.Queue()
URLS = ['http://www.foxnews.com/',
'http://www.cnn.com/',
'http://europe.wsj.com/',
'http://www.bbc.co.uk/',
'http://some-made-up-domain.com/']
def feed_the_workers(spacing):
""" Simulate outside actors sending in work to do, request each url twice """
for url in URLS + URLS:
time.sleep(spacing)
q.put(url)
return "DONE FEEDING"
def load_url(url, timeout):
""" Retrieve a single page and report the URL and contents """
with urllib.request.urlopen(url, timeout=timeout) as conn:
return conn.read()
# We can use a with statement to ensure threads are cleaned up promptly
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
# start a future for a thread which sends work in through the queue
future_to_url = {
executor.submit(feed_the_workers, 0.25): 'FEEDER DONE'}
while future_to_url:
# check for status of the futures which are currently working
done, not_done = concurrent.futures.wait(
future_to_url, timeout=0.25,
return_when=concurrent.futures.FIRST_COMPLETED)
# if there is incoming work, start a new future
while not q.empty():
# fetch a url from the queue
url = q.get()
# Start the load operation and mark the future with its URL
future_to_url[executor.submit(load_url, url, 60)] = url
# process any completed futures
for future in done:
url = future_to_url[future]
try:
data = future.result()
except Exception as exc:
print('%r generated an exception: %s' % (url, exc))
else:
if url == 'FEEDER DONE':
print(data)
else:
print('%r page is %d bytes' % (url, len(data)))
# remove the now completed future
del future_to_url[future]
Output from fetching each url twice:
'http://www.foxnews.com/' page is 67574 bytes
'http://www.cnn.com/' page is 136975 bytes
'http://www.bbc.co.uk/' page is 193780 bytes
'http://some-made-up-domain.com/' page is 896 bytes
'http://www.foxnews.com/' page is 67574 bytes
'http://www.cnn.com/' page is 136975 bytes
DONE FEEDING
'http://www.bbc.co.uk/' page is 193605 bytes
'http://some-made-up-domain.com/' page is 896 bytes
'http://europe.wsj.com/' page is 874649 bytes
'http://europe.wsj.com/' page is 874649 bytes
At work I found a situation where I wanted to do parallel work on an unbounded stream of data. I created a small library inspired by the excellent answer already provided by Stephen Rauch.
I originally approached this problem by thinking about two separate threads, one that submits work to a queue and one that monitors the queue for any completed tasks and makes more room for new work to come in. This is similar to what Stephen Rauch proposed, where he consumes the stream using a feed_the_workers function that runs in a separate thread.
Talking to one of my colleagues, he helped me realize that you can get away with doing everything in a single thread if you define a buffered iterator that allows you to control how many elements are let out of the input stream every time you are ready to submit more work to the thread pool.
So we introduce the BufferedIter class
class BufferedIter(object):
def __init__(self, iterator):
self.iter = iterator
def nextN(self, n):
vals = []
for _ in range(n):
vals.append(next(self.iter))
return vals
which allows us to define the stream processor in the following way
import logging
import queue
import signal
import sys
import time
from concurrent.futures import ThreadPoolExecutor, wait, ALL_COMPLETED
level = logging.DEBUG
log = logging.getLogger(__name__)
handler = logging.StreamHandler(sys.stdout)
handler.setFormatter(logging.Formatter('%(asctime)s %(message)s'))
handler.setLevel(level)
log.addHandler(handler)
log.setLevel(level)
WAIT_SLEEP = 1 # second, adjust this based on the timescale of your tasks
def stream_processor(input_stream, task, num_workers):
# Use a queue to signal shutdown.
shutting_down = queue.Queue()
def shutdown(signum, frame):
log.warning('Caught signal %d, shutting down gracefully ...' % signum)
# Put an item in the shutting down queue to signal shutdown.
shutting_down.put(None)
# Register the signal handler
signal.signal(signal.SIGTERM, shutdown)
signal.signal(signal.SIGINT, shutdown)
def is_shutting_down():
return not shutting_down.empty()
futures = dict()
buffer = BufferedIter(input_stream)
with ThreadPoolExecutor(num_workers) as executor:
num_success = 0
num_failure = 0
while True:
idle_workers = num_workers - len(futures)
if not is_shutting_down():
items = buffer.nextN(idle_workers)
for data in items:
futures[executor.submit(task, data)] = data
done, _ = wait(futures, timeout=WAIT_SLEEP, return_when=ALL_COMPLETED)
for f in done:
data = futures[f]
try:
f.result(timeout=0)
except Exception as exc:
log.error('future encountered an exception: %r, %s' % (data, exc))
num_failure += 1
else:
log.info('future finished successfully: %r' % data)
num_success += 1
del futures[f]
if is_shutting_down() and len(futures) == 0:
break
log.info("num_success=%d, num_failure=%d" % (num_success, num_failure))
Below we show an example for how to use the stream processor
import itertools
def integers():
"""Simulate an infinite stream of work."""
for i in itertools.count():
yield i
def task(x):
"""The task we would like to perform in parallel.
With some delay to simulate a time consuming job.
With a baked in exception to simulate errors.
"""
time.sleep(3)
if x == 4:
raise ValueError('bad luck')
return x * x
stream_processor(integers(), task, num_workers=3)
The output for this example is shown below
2019-01-15 22:34:40,193 future finished successfully: 1
2019-01-15 22:34:40,193 future finished successfully: 0
2019-01-15 22:34:40,193 future finished successfully: 2
2019-01-15 22:34:43,201 future finished successfully: 5
2019-01-15 22:34:43,201 future encountered an exception: 4, bad luck
2019-01-15 22:34:43,202 future finished successfully: 3
2019-01-15 22:34:46,208 future finished successfully: 6
2019-01-15 22:34:46,209 future finished successfully: 7
2019-01-15 22:34:46,209 future finished successfully: 8
2019-01-15 22:34:49,215 future finished successfully: 11
2019-01-15 22:34:49,215 future finished successfully: 10
2019-01-15 22:34:49,215 future finished successfully: 9
^C <=== THIS IS WHEN I HIT Ctrl-C
2019-01-15 22:34:50,648 Caught signal 2, shutting down gracefully ...
2019-01-15 22:34:52,221 future finished successfully: 13
2019-01-15 22:34:52,222 future finished successfully: 14
2019-01-15 22:34:52,222 future finished successfully: 12
2019-01-15 22:34:52,222 num_success=14, num_failure=1
I really liked the interesting approach by #pedro above. However, when processing thousands of files, I noticed that at the end a StopIteration would be thrown and some files would always be skipped. I had to make a little modification to as follows. Very useful answer again.
class BufferedIter(object):
def __init__(self, iterator):
self.iter = iterator
def nextN(self, n):
vals = []
try:
for _ in range(n):
vals.append(next(self.iter))
return vals, False
except StopIteration as e:
return vals, True
-- Call as follows
...
if not is_shutting_down():
items, is_finished = buffer.nextN(idle_workers)
if is_finished:
stop()
...
-- Where stop is a function that simply tells to shutdown
def stop():
shutting_down.put(None)
It is possible to gain the benefits of the executor without strictly having to use a Queue. New tasks are submitted from the main thread. The undone futures are tracked and waited on until all futures are done.
import concurrent.futures
import sys
import time
sys.setrecursionlimit(64) # This is only for demonstration purposes to trigger a RecursionError. Do not set in practice.
def slow_factorial(n: int) -> int:
time.sleep(0.01)
if n == 0:
return 1
else:
return n * slow_factorial(n-1)
initial_inputs = [0, 1, 5, 20, 200, 100, 50, 51, 55, 40, 44, 21, 222, 333, 202, 1000, 10, 9000, 9009, 99, 9999]
for executor_class in (concurrent.futures.ThreadPoolExecutor, concurrent.futures.ProcessPoolExecutor):
for max_workers in (4, 8, 16, 32):
start_time = time.monotonic()
with executor_class(max_workers=max_workers) as executor:
futures_to_n = {executor.submit(slow_factorial, n): n for n in initial_inputs}
while futures_to_n:
futures_done, futures_not_done = concurrent.futures.wait(futures_to_n, return_when=concurrent.futures.FIRST_COMPLETED)
# Note: Length of futures_done is often > 1.
for future in futures_done:
n = futures_to_n.pop(future)
try:
factorial_n = future.result()
except RecursionError:
n_smaller = int(n ** 0.9)
future = executor.submit(slow_factorial, n_smaller)
futures_to_n[future] = n_smaller
# print(f'Failed to compute factorial of {n}. Trying to compute factorial of a smaller number {n_smaller} instead.')
else:
# print(f'Factorial of {n} is {factorial_n}.')
pass
used_time = time.monotonic() - start_time
executor_type = executor_class.__name__.removesuffix('PoolExecutor').lower()
print(f'Workflow took {used_time:.1f}s with {max_workers} {executor_type} workers.')
print()
Output:
Workflow took 9.4s with 4 thread workers.
Workflow took 6.3s with 8 thread workers.
Workflow took 5.4s with 16 thread workers.
Workflow took 5.2s with 32 thread workers.
Workflow took 9.0s with 4 process workers.
Workflow took 5.9s with 8 process workers.
Workflow took 5.1s with 16 process workers.
Workflow took 4.9s with 32 process workers.
For more clarity, uncomment the two print statements. As per the output above, there is an asymptotic speed benefit with more workers.
I have a checkin list which contains about 600000 checkins, and there is a url in each checkin, I need to expand them back to original long ones. I do so by
now = time.time()
files_without_url = 0
for i, checkin in enumerate(NYC_checkins):
try:
foursquare_url = urllib2.urlopen(re.search("(?P<url>https?://[^\s]+)", checkin[5]).group("url")).url
except:
files_without_url += 1
if i%1000 == 0:
print("from %d to %d: %2.5f seconds" %(i-1000, i, time.time()-now))
now = time.time()
But this takes too long time: from 0 to 1000 checkins, it takes 3241 seconds! Is this normal? What's the most efficient way to expand url by Python?
MODIFIED: Some Urls are from Bitly while some others are not, and I am not sure where they come from. In this case, I wanna simply use urllib2 module.
for your information, here is an example of checkin[5]:
I'm at The Diner (2453 18th Street NW, Columbia Rd., Washington) w/ 4 others. http...... (this is the short url)
I thought I would expand on my comment regarding the use of multiprocessing to speed up this task.
Let's start with a simple function that will take a url and resolve it as far as possible (following redirects until it gets a 200 response code):
import requests
def resolve_url(url):
try:
r = requests.get(url)
except requests.exceptions.RequestException:
return (url, None)
if r.status_code != 200:
longurl = None
else:
longurl = r.url
return (url, longurl)
This will either return a (shorturl, longurl) tuple, or it will
return (shorturl, None) in the event of a failure.
Now, we create a pool of workers:
import multiprocessing
pool = multiprocessing.Pool(10)
And then ask our pool to resolve a list of urls:
resolved_urls = []
for shorturl, longurl in pool.map(resolve_url, urls):
resolved_urls.append((shorturl, longurl))
Using the above code...
With a pool of 10 workers, I can resolve 500 URLs in 900 seconds.
If I increase the number of workers to 100, I can resolve 500 URLs in 30 seconds.
If I increase the number of workers to 200, I can resolve 500 URLs in 25 seconds.
This is hopefully enough to get you started.
(NB: you could write a similar solution using the threading module rather than multiprocessing. I usually just grab for multiprocessing first, but in this case either would work, and threading might even be slightly more efficient.)
Thread are most appropriate in case of network I/O. But you could try the following first.
pat = re.compile("(?P<url>https?://[^\s]+)") # always compile it
missing_urls = 0
bad_urls = 0
def check(checkin):
match = pat.search(checkin[5])
if not match:
global missing_urls
missing_urls += 1
else:
url = match.group("url")
try:
urllib2.urlopen(url) # don't lookup .url if you don't need it later
except URLError: # or just Exception
global bad_urls
bad_urls += 1
for i, checkin in enumerate(NYC_checkins):
check(checkin)
print(bad_urls, missing_urls)
If you get no improvement, now that we have a nice check function, create a threadpool and feed it. Speedup is guaranteed. Using processes for network I/O is pointless
I have a rather large program which loads some data from an excel file and populates a form, this can take a long time due to the size of the file so I have been moving the loading function onto a separate thread, the only problem is for some reason in this new thread I am not getting an automatic stack trace in the console whenever an error occurs. It has just been failing silently which is making debugging it a real pain.
I am using pydev in eclipse, I wrote the following test case to be sure everything is working correctly.
from PyQt4 import QtCore
class OtherThread(QtCore.QThread):
def __init__(self):
super(OtherThread, self).__init__()
def run(self):
try:
print(1/0)
except Exception as e:
print("exception caught in other thread: \n{0}".format(e))
class MainThread():
def __init__(self):
self.otherThread = OtherThread()
def run(self):
try:
print(1/0)
except Exception as e:
print("exception caught in main thread: \n{0}".format(e))
self.otherThread.run()
def main():
mainThread = MainThread()
mainThread.run()
if __name__ == '__main__':
main()
When I run this both exceptions are caught properly and when I comment out the try block in the tread object it also works just fine, I get my stack trace as expected. I am really at a loss as to what is going on. Is there something I could have done to cause this behavior?
Here is the code of the program I am working on.
def run(self):
print("excel thread running")
workbook = xlrd.open_workbook(self.path)
worksheet = workbook.sheet_by_name('PNA Form')
#
currentRow = 13 # start grabbing pna data
numRowStart = currentRow
newPartCol= 0
oldPartCol = 10
descriptionCol = 2
#
numberOfRows = worksheet.nrows - 1
#
print("number of rows = {0}".format(numberOfRows))
PNA = []
#
current_color = False
while (currentRow < numberOfRows):
print("about to parse excel rows")
newPartCell = int(worksheet.cell(currentRow,newPartCol).value)
oldPartCell = int(worksheet.cell(currentRow,oldPartCol).value)
descriptionCell = QtCore.QString(worksheet.cell(currentRow,descriptionCol).value)
print("excel rows parsed: {0}, {1}, {2}, {3}".format(oldPartCell,newPartCell,descriptionCell,current_color))
print("running line excel row {0}: {1}".format(currentRow, str(descriptionCell)))
if not self.isStrikethrough(currentRow,0): #make sure the line does not have strike through
#self.guiHandel.BOMVal.addPNARow(oldPN = oldPartCell, newPN = newPartCell, disc = descriptionCell)
print("about to emit pna row tracker for {0}".format(descriptionCell))
self.addPNARowTracker.emit(oldPartCell,newPartCell,descriptionCell)
print("thread still running after pna row tracker emit")
if (oldPartCell != "" and not self.isStrikethrough(currentRow,0)):
PNA.append((num(oldPartCell),num(newPartCell)))
current_color = not current_color
#self.guiHandel.pnaVerticalLayoutScroll.addWidget(PNACell(oldPartCell,newPartCell,descriptionCell,color = current_color))
print("about to emit addPNARow: {0}, {1}, {2}, {3}".format(oldPartCell,newPartCell,descriptionCell,current_color))
self.addPNARow.emit(oldPartCell,newPartCell,descriptionCell,current_color)
#self.guiHandel.widgetStack.append(PNACell(oldPartCell,newPartCell,descriptionCell,color = current_color))
print("thread still running after add pna row emit")
currentRow += 1
#self.guiHandel.pbar.setValue(int(100*(currentRow-13)/numberOfRows))
print("currentRow =",currentRow)
self.updateProgress.emit(int(100*(currentRow-numRowStart)/(numberOfRows-numRowStart)))
print(PNA)
self.done.emit()
Here is the console output when it fails.
slot add pna row tracker called
running is about to return
about to emit addPNARow: 28458820, 28489881, INST CSTR-ASM,DIESEL,KM,UP,GAT, False
thread still running after add pna row emit
('currentRow =', 29slot add pna row called)
about to parse excel rows
Added addPNARow: 28458820, 28489881, INST CSTR-ASM,DIESEL,KM,UP,GAT, False
excel progress update called ------- progress = 20
When running through a debugger it stops at this line:
newPartCell = int(worksheet.cell(currentRow,newPartCol).value)
I tried wrapping it in a try block but it never got to the exception. The cell it is trying to read is blank.
What is going on here? Any ideas would be greatly appreciated.
Answer to the question found here: error in pyqt qthread not printed
Basically you need to manually encapsulate the entire run method of the QThread object and then manually rethrow errors to stderr
Using the python watchdog file system events watching library I noticed that when being used under Windows Server 2003 it entered into "Polling Mode" thus stoping using asynchronous OS notification and, therefore, heavily reducing system performance under big amount of file changes.
I traced the problem to watchdog/observers/winapi.py file where CancelIoEx system call is used in order to stop ReadDirectoryChangesW call lock when the user wants to stop monitoring the watched directory or file:
(winapi.py)
CancelIoEx = ctypes.windll.kernel32.CancelIoEx
CancelIoEx.restype = ctypes.wintypes.BOOL
CancelIoEx.errcheck = _errcheck_bool
CancelIoEx.argtypes = (
ctypes.wintypes.HANDLE, # hObject
ctypes.POINTER(OVERLAPPED) # lpOverlapped
)
...
...
...
def close_directory_handle(handle):
try:
CancelIoEx(handle, None) # force ReadDirectoryChangesW to return
except WindowsError:
return
The problem with CancelIoEx call is that it is not available until Windows Server 2008:
http://msdn.microsoft.com/en-us/library/windows/desktop/aa363792(v=vs.85).aspx
One possible alternative is to change close_directory_handle in order to make it create a mock file within the monitored directory, thus unlocking the thread waiting for ReadDirectoryChangesW to return.
However, I noticed that CancelIo system call is in fact available in Windows Server 2003:
Cancels all pending input and output (I/O) operations that are issued
by the calling thread for the specified file. The function does not
cancel I/O operations that other threads issue for a file handle. To
cancel I/O operations from another thread, use the CancelIoEx
function.
But calling CancelIo won't affect the waiting thread.
Do you have any idea on how to solve this problem?
May be threading.enumerate() could be used issue a signal to be handled by each thread being CancelIo called from these handlers?
The natural approach is to implement a completion routine and call to ReadDirectoryChangesW using its overlapped mode. The following example shows the way to do that:
RDCW_CALLBACK_F = ctypes.WINFUNCTYPE(None, ctypes.wintypes.DWORD, ctypes.wintypes.DWORD, ctypes.POINTER(OVERLAPPED))
First, create a WINFUNCTYPE factory which will be used to generate (callable from Windows API) C like functions from python methods. In this case, no return value and 3 parameters corresponding to
VOID CALLBACK FileIOCompletionRoutine(
_In_ DWORD dwErrorCode,
_In_ DWORD dwNumberOfBytesTransfered,
_Inout_ LPOVERLAPPED lpOverlapped
);
FileIOCompletionRoutine header.
The callback reference as well as the overlapped structure need to be added to ReadDirectoryChangesW arguments list:
ReadDirectoryChangesW = ctypes.windll.kernel32.ReadDirectoryChangesW
ReadDirectoryChangesW.restype = ctypes.wintypes.BOOL
ReadDirectoryChangesW.errcheck = _errcheck_bool
ReadDirectoryChangesW.argtypes = (
ctypes.wintypes.HANDLE, # hDirectory
LPVOID, # lpBuffer
ctypes.wintypes.DWORD, # nBufferLength
ctypes.wintypes.BOOL, # bWatchSubtree
ctypes.wintypes.DWORD, # dwNotifyFilter
ctypes.POINTER(ctypes.wintypes.DWORD), # lpBytesReturned
ctypes.POINTER(OVERLAPPED), # lpOverlapped
RDCW_CALLBACK_F # FileIOCompletionRoutine # lpCompletionRoutine
)
From here, we are ready to perform the overlapped system call.
This is a simple call bacl just usefult to test that everything works fine:
def dir_change_callback(dwErrorCode,dwNumberOfBytesTransfered,p):
print("dir_change_callback! PID:" + str(os.getpid()))
print("CALLBACK THREAD: " + str(threading.currentThread()))
Prepare and perform the call:
event_buffer = ctypes.create_string_buffer(BUFFER_SIZE)
nbytes = ctypes.wintypes.DWORD()
overlapped_read_dir = OVERLAPPED()
call2pass = RDCW_CALLBACK_F(dir_change_callback)
hand = get_directory_handle(os.path.abspath("/test/"))
def docall():
ReadDirectoryChangesW(hand, ctypes.byref(event_buffer),
len(event_buffer), False,
WATCHDOG_FILE_NOTIFY_FLAGS,
ctypes.byref(nbytes),
ctypes.byref(overlapped_read_dir), call2pass)
print("Waiting!")
docall()
If you load and execute all this code into a DreamPie interactive shell you can check the system call is done and that the callback executes thus printing the thread and pid numbers after the first change done under c:\test directory. Besides, you will notice those are the same than the main thread and process: Despite the event is raised by a separated thread, the callback runs in the same process and thread as our main program thus providing an undesired behaviour:
lck = threading.Lock()
def dir_change_callback(dwErrorCode,dwNumberOfBytesTransfered,p):
print("dir_change_callback! PID:" + str(os.getpid()))
print("CALLBACK THREAD: " + str(threading.currentThread()))
...
...
...
lck.acquire()
print("Waiting!")
docall()
lck.acquire()
This program will lock the main thread and the callback will never execute.
I tried many synchronization tools, even Windows API semaphores always getting the same behaviour so, finally, I decided to implement the ansynchronous call using the synchronous configuration for ReadDirectoryChangesW within a separate process managed and synchronized using multiprocessing python library:
Calls to get_directory_handle won't return the handle number given by windows API but one managed by winapi library, for that I implemented a handle generator:
class FakeHandleFactory():
_hl = threading.Lock()
_next = 0
#staticmethod
def next():
FakeHandleFactory._hl.acquire()
ret = FakeHandleFactory._next
FakeHandleFactory._next += 1
FakeHandleFactory._hl.release()
return ret
Each generated handle has to be globally associated with a file system path:
handle2file = {}
Each call to read_directory_changes will now generate ReadDirectoryRequest (derived from multiprocessing.Process) object:
class ReadDirectoryRequest(multiprocessing.Process):
def _perform_and_wait4request(self, path, recursive, event_buffer, nbytes):
hdl = CreateFileW(path, FILE_LIST_DIRECTORY, WATCHDOG_FILE_SHARE_FLAGS,
None, OPEN_EXISTING, WATCHDOG_FILE_FLAGS, None)
#print("path: " + path)
aux_buffer = ctypes.create_string_buffer(BUFFER_SIZE)
aux_n = ctypes.wintypes.DWORD()
#print("_perform_and_wait4request! PID:" + str(os.getpid()))
#print("CALLBACK THREAD: " + str(threading.currentThread()) + "\n----------")
try:
ReadDirectoryChangesW(hdl, ctypes.byref(aux_buffer),
len(event_buffer), recursive,
WATCHDOG_FILE_NOTIFY_FLAGS,
ctypes.byref(aux_n), None, None)
except WindowsError as e:
print("!" + str(e))
if e.winerror == ERROR_OPERATION_ABORTED:
nbytes = 0
event_buffer = []
else:
nbytes = 0
event_buffer = []
# Python 2/3 compat
nbytes.value = aux_n.value
for i in xrange(self.int_class(aux_n.value)):
event_buffer[i] = aux_buffer[i]
CloseHandle(hdl)
try:
self.lck.release()
except:
pass
def __init__(self, handle, recursive):
buffer = ctypes.create_string_buffer(BUFFER_SIZE)
self.event_buffer = multiprocessing.Array(ctypes.c_char, buffer)
self.nbytes = multiprocessing.Value(ctypes.wintypes.DWORD, 0)
targetPath = handle2file.get(handle, None)
super(ReadDirectoryRequest, self).__init__(target=self._perform_and_wait4request, args=(targetPath, recursive, self.event_buffer, self.nbytes))
self.daemon = True
self.lck = multiprocessing.Lock()
self.result = None
try:
self.int_class = long
except NameError:
self.int_class = int
if targetPath is None:
self.result = ([], -1)
def CancelIo(self):
try:
self.result = ([], 0)
self.lck.release()
except:
pass
def read_changes(self):
#print("read_changes! PID:" + str(os.getpid()))
#print("CALLBACK THREAD: " + str(threading.currentThread()) + "\n----------")
if self.result is not None:
raise Exception("ReadDirectoryRequest object can be used only once!")
self.lck.acquire()
self.start()
self.lck.acquire()
self.result = (self.event_buffer, self.int_class(self.nbytes.value))
return self.result
This class specifies Process providing a process which perform the system call and waits until (or):
A change event has been raised.
The main thread cancels the request by calling to the ReadDirectoryRequest object CancelIo method.
Note that:
get_directory_handle
close_directory_handle
read_directory_changes
Roles are now to manage requests. For that, thread locks and auxiliary data structures are needed:
rqIndexLck = threading.Lock() # Protects the access to `rqIndex`
rqIndex = {} # Maps handles to request objects sets.
get_directory_handle
def get_directory_handle(path):
rqIndexLck.acquire()
ret = FakeHandleFactory.next()
handle2file[ret] = path
rqIndexLck.release()
return ret
close_directory_handle
def close_directory_handle(handle):
rqIndexLck.acquire()
rqset4handle = rqIndex.get(handle, None)
if rqset4handle is not None:
for rq in rqset4handle:
rq.CancelIo()
del rqIndex[handle]
if handle in handle2file:
del handle2file[handle]
rqIndexLck.release()
And last but not least: read_directory_changes
def read_directory_changes(handle, recursive):
rqIndexLck.acquire()
rq = ReadDirectoryRequest(handle, recursive)
set4handle = None
if handle in rqIndex:
set4handle = rqIndex[handle]
else:
set4handle = set()
rqIndex[handle] = set4handle
set4handle.add(rq)
rqIndexLck.release()
ret = rq.read_changes()
rqIndexLck.acquire()
if rq in set4handle:
set4handle.remove(rq)
rqIndexLck.release()
return ret