I'm writing a Python command line utility that involves converting a string into a TextBlob, which is part of a natural language processing module. Importing the module is very slow, ~300 ms on my system. For speediness, I created a memoized function that converts text to a TextBlob only the first time the function is called. Importantly, if I run my script over the same text twice, I want to avoid reimporting TextBlob and recomputing the blob, instead pulling it from the cache. That's all done and works fine, except, for some reason, the function is still very slow. In fact, it's as slow as it was before. I think this must be because the module is getting reimported even though the function is memoized and the import statement happens inside the memoized function.
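One way to check that assumption is to look at sys.modules after a cached call to the make_blob function shown below; if textblob shows up there on the second run, the import is happening regardless of the cache:
import sys

make_blob("hello")
print('textblob' in sys.modules)  # True would mean textblob was imported anyway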
The goal here is to fix the following code so that the memoized runs are as speedy as they ought to be, given that the result does not need to be recomputed.
Here's a minimal example of the core code:
@memoize
def make_blob(text):
    import textblob
    return textblob.TextBlob(text)

if __name__ == '__main__':
    make_blob("hello")
And here's the memoization decorator:
import os
import shelve
import functools
import inspect


def memoize(f):
    """Cache results of computations on disk in a directory called 'cache'."""
    path_of_this_file = os.path.dirname(os.path.realpath(__file__))
    cache_dirname = os.path.join(path_of_this_file, "cache")

    if not os.path.isdir(cache_dirname):
        os.mkdir(cache_dirname)

    cache_filename = f.__module__ + "." + f.__name__
    cachepath = os.path.join(cache_dirname, cache_filename)

    try:
        cache = shelve.open(cachepath, protocol=2)
    except:
        print 'Could not open cache file %s, maybe name collision' % cachepath
        cache = None

    @functools.wraps(f)
    def wrapped(*args, **kwargs):
        argdict = {}

        # handle instance methods
        if hasattr(f, '__self__'):
            args = args[1:]

        tempargdict = inspect.getcallargs(f, *args, **kwargs)
        for k, v in tempargdict.iteritems():
            argdict[k] = v

        key = str(hash(frozenset(argdict.items())))

        try:
            return cache[key]
        except KeyError:
            value = f(*args, **kwargs)
            cache[key] = value
            cache.sync()
            return value
        except TypeError:
            call_to = f.__module__ + '.' + f.__name__
            print ('Warning: could not disk cache call to '
                   '%s; it probably has unhashable args' % call_to)
            return f(*args, **kwargs)

    return wrapped
And here's a demonstration that the memoization doesn't currently save any time:
❯ time python test.py
python test.py 0.33s user 0.11s system 100% cpu 0.437 total
~/Desktop
❯ time python test.py
python test.py 0.33s user 0.11s system 100% cpu 0.436 total
This is happening even though the function is correctly being memoized (print statements put inside the memoized function only give output the first time the script is run).
I've put everything together into a GitHub Gist in case it's helpful.
What about a different approach:
import pickle

CACHE_FILE = 'cache.pkl'

try:
    with open(CACHE_FILE) as pkl:
        obj = pickle.load(pkl)
except:
    import slowmodule
    obj = "something"
    with open(CACHE_FILE, 'w') as pkl:
        pickle.dump(obj, pkl)

print obj
Here we cache the object, not the module. Note that this will not give you any savings if the object you're caching requires slowmodule. In the above example you would see savings, since "something" is a plain string and doesn't need the slowmodule module to be understood. But if you did something like
obj = slowmodule.Foo("bar")
The unpickling process would automatically import slowmodule, negating any benefit of caching.
So if you can turn textblob.TextBlob(text) into something that, when unpickled, doesn't require the textblob module, then you'll see savings with this approach.
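For example, here is one possible shape for the cached function (a sketch, reusing the memoize decorator from the question and assuming plain strings and lists are all the rest of the script needs):
@memoize
def make_blob_data(text):
    import textblob
    blob = textblob.TextBlob(text)
    # plain built-in types round-trip through pickle without importing textblob
    return {
        'words': [str(w) for w in blob.words],
        'sentences': [str(s) for s in blob.sentences],
    }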
Related
Ok, so I am having a weird one. I am running python in a SideFX Hython (their custom build) implementation that is using PDG. The only real difference between Hython and vanilla Python is some internal functions for handling geometry data and compiled nodes, which shouldn't be an issue even though they are being used.
The way the code runs, I am generating a list of files from the disk which creates PDG work items. Those work items are then processed in parallel by PDG. Here is the code for that:
import importlib.util
import pdg
import os
from pdg.processor import PyProcessor
import json


class CustomProcessor(PyProcessor):
    def __init__(self, node):
        PyProcessor.__init__(self, node)
        self.extractor_module = 'GeoExtractor'

    def onGenerate(self, item_holder, upstream_items, generation_type):
        for upstream_item in upstream_items:
            new_item = item_holder.addWorkItem(parent=upstream_item, inProcess=True)
        return pdg.result.Success

    def onCookTask(self, work_item):
        spec = importlib.util.spec_from_file_location("callback", "Geo2Custom.py")
        GE = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(GE)
        GE.convert(f"{work_item.attribValue('directory')}/{work_item.attribValue('filename')}{work_item.attribValue('extension')}", work_item.index, f'FRAME {work_item.index}', self.extractor_module)
        return pdg.result.Success


def bulk_convert(path_pattern, extractor_module='GeoExtractor'):
    type_registry = pdg.TypeRegistry.types()
    try:
        type_registry.registerNode(CustomProcessor, pdg.nodeType.Processor, name="customprocessor", label="Custom Processor", category="Custom")
    except Exception:
        pass
    whereItWorks = pdg.GraphContext("testBed")
    whatWorks = whereItWorks.addScheduler("localscheduler")
    whatWorks.setWorkingDir(os.getcwd(), '$HIP')
    whereItWorks.setValues(f'{whatWorks.name}', {'maxprocsmenu': -1, 'tempdirmenu': 0, 'verbose': 1})
    findem = whereItWorks.addNode("filepattern")
    whereItWorks.setValue(f'{findem.name}', 'pattern', path_pattern, 0)
    generic = whereItWorks.addNode("genericgenerator")
    whereItWorks.setValue(generic.name, 'itemcount', 4, 0)
    custom = whereItWorks.addNode("customprocessor")
    custom.extractor_module = extractor_module
    node1 = [findem]
    node2 = [custom] * len(node1)
    for n1, n2 in zip(node1, node2):
        whereItWorks.connect(f'{n1.name}.output', f'{n2.name}.input')
        n2.cook(True)
        for node in whereItWorks.graph.nodes():
            node.dirty(False)
        whereItWorks.disconnect(f'{n1.name}.output', f'{n2.name}.input')
    print("FULLY DONE")
import os
import hou
import traceback
import CustomWriter
import importlib


def convert(filename, frame_id, marker, extractor_module='GeoExtractor'):
    Extractor = importlib.__import__(extractor_module)
    base, ext = os.path.splitext(filename)
    if ext == '.sc':
        base = os.path.splitext(base)[0]
    dest_file = base + ".custom"
    geo = hou.Geometry()
    geo.loadFromFile(filename)
    try:
        frame = Extractor.extract_geometry(geo, frame_id)
    except Exception as e:
        print(f'F{frame_id} Geometry extraction failed: {traceback.format_exc()}.')
        return None
    print(f'F{frame_id} Geometry extracted. Writing file {dest_file}.')
    try:
        CustomWriter.write_frame(frame, dest_file)
    except Exception as e:
        print(f'F{frame_id} writing failed: {e}.')
    print(marker + " SUCCESS")
The onCookTask code is run when the work item is processed.
Inside of the GeoExtractor.py program I am importing the geometry file defined by the work item, then converting it into a couple Pandas dataframes to collate and process the massive volumes of data quickly, which is then passed to a custom set of functions for writing binary files to disk from the Pandas data.
Everything appears to run flawlessly until I check my output binaries and see that they grow in size far more than they should, indicating that something is either being shared between instances or not cleared from memory, and that subsequent loads of the extractor code are appending to dataframes that share the same names.
I have run the GeoExtractor code sequentially, with the Python instance closing between each file conversion, using the exact same code, and the files are fine, growing only slowly as the geometry data volume grows. So the issue has to lie somewhere in the parallelization via PDG and the repeated loading of the GeoExtractor.py code for each work item.
I have contemplated moving the importlib stuff to the class's __init__(), leaving only the call to the member function in onCookTask(), along the lines of the sketch below. Maybe even going as far as passing a unique variable for each work item, used inside GeoExtractor to create a closure over the internal functions so they are unique instances in memory.
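Roughly this (untested, and it assumes Geo2Custom.py keeps no module-level mutable state between calls to convert()):
class CustomProcessor(PyProcessor):
    def __init__(self, node):
        PyProcessor.__init__(self, node)
        self.extractor_module = 'GeoExtractor'
        # load Geo2Custom.py once per processor instead of once per work item
        spec = importlib.util.spec_from_file_location("callback", "Geo2Custom.py")
        self.GE = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(self.GE)

    def onCookTask(self, work_item):
        path = f"{work_item.attribValue('directory')}/{work_item.attribValue('filename')}{work_item.attribValue('extension')}"
        self.GE.convert(path, work_item.index, f'FRAME {work_item.index}', self.extractor_module)
        return pdg.result.Success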
I tried to do a stripped down version of GeoExtractor and since I'm not sure where the leak is, I just ended up pulling out comments with proprietary or superfluous information and changing some custom library names, but the file ended up kinda long so I am including a pastebin: https://pastebin.com/4HHS8D2W
As for CustomGeometry and CustomWriter, there is no working form of either of those libraries that will be NDA safe, so unfortunately they have to stay blackboxed. The CustomGeometry is a handful of container classes which organize all of the data coming out of the geometry, and the writer is a formatter/writer for the binary format we are utilizing. I am hoping the issue wouldn't be in either of them.
Edit 1: I fixed an issue in the example code.
Edit 2: Added larger examples.
I want to test a Python function that reads a gzip file and extracts something from the file (using pytest).
import gzip


def my_function(file_path):
    output = []
    with gzip.open(file_path, 'rt') as f:
        for line in f:
            output.append('something from line')
    return output
Can I create a gzip file-like object that I can pass to my_function? The object should have defined content and should work with gzip.open().
I know that I can create a temporary gzip file in a fixture but this depends on the filesystem and other properties of the environment. Creating a file-like object from code would be more portable.
You can use the io and gzip libraries to create in-memory file objects. Example:
import gzip
import io


def inmem():
    stream = io.BytesIO()
    with gzip.open(stream, 'wb') as f:
        f.write(b'spam\neggs\n')
    stream.seek(0)
    return stream
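gzip.open() accepts an existing file object as well as a path (on Python 3.3+), so assuming Python 3 the in-memory stream can be passed straight to my_function in a test:
def test_my_function():
    assert my_function(inmem()) == ['something from line', 'something from line']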
You should never try to test outside code in a unit test; only test the code you wrote. If you find yourself testing gzip, then gzip is doing something wrong (its authors should be writing their own unit tests). Instead, do something like this:
from unittest import mock


@mock.patch('gzip.open')
def test_my_function(mock_gzip_open):
    # make the mocked gzip.open() usable as a context manager that yields lines
    mock_gzip_open.return_value.__enter__.return_value = [b'<whatever you expect to be returned from gzip>']
    file_path = 'testpath'
    output = my_function(file_path=file_path)
    mock_gzip_open.assert_called_with(file_path, 'rt')
    assert output == [b'<whatever you expect to be returned from your method>']
That's your whole unit test. All you want to know is that gzip.open() was called (and you assume it works or else gzip is failing and that's their problem) and that you got back what you expected from the method being tested. You specify what gzip returns based on what you expect it to return, but you don't actually call the function in your test.
It's a bit verbose but I'd do something like this (I have assumed that you saved my_function to a file called patch_one.py):
import patch_one  # this is the file with my_function in it
from unittest.mock import patch
from unittest import TestCase


class MyTestCase(TestCase):

    def test_my_function(self):

        # because you used "with open(...) as f", we need a mock context
        class MyContext:
            def __enter__(self, *args, **kwargs):
                return [1, 2]  # note the two items

            def __exit__(self, *args, **kwargs):
                return None

        # in case we want to know the arguments to open()
        open_args = None

        def f(*args, **kwargs):
            def my_open(*args, **kwargs):
                nonlocal open_args
                open_args = args
                return MyContext()
            return my_open

        # patch the gzip.open in our file under test
        with patch('patch_one.gzip.open', new_callable=f):
            # finally, we can call the function we want to test
            ret_val = patch_one.my_function('not a real file path')

        # note the two items, corresponding to the list in __enter__()
        self.assertListEqual(['something from line', 'something from line'], ret_val)

        # check the arguments, just for fun
        self.assertEqual('rt', open_args[1])
If you want to try anything more complicated, I would recommend reading the unittest mock docs because how you import the "patch_one" file matters as does the string you pass to patch().
There will definitely be a way to do this with Mock or MagicMock but I find them a bit hard to debug so I went the long way round.
Using python 2 (atm) and ruamel.yaml 0.13.14 (RedHat EPEL)
I'm currently writing some code to load YAML definitions, but they are split up into multiple files. The user-editable part contains e.g.:
users:
  xxxx1:
    timestamp: '2018-10-22 11:38:28.541810'
    << : *userdefaults
  xxxx2:
    << : *userdefaults
    timestamp: '2018-10-22 11:38:28.541810'
the defaults are stored in another file, which is not editable:
userdefaults: &userdefaults
  # Default values for user settings
  fileCountQuota: 1000
  diskSizeQuota: "300g"
I can process these together by loading both and concatenating the strings, then running them through merged_data = list(yaml.load_all("{}\n{}".format(defaults_data, user_data), Loader=yaml.RoundTripLoader)), which correctly resolves everything. (When not using RoundTripLoader I get errors that the references cannot be resolved, which is expected.)
Now I want to do some updates via Python code (e.g. update the timestamp), and for that I need to write back just the user part. And that's where things get hairy: so far I haven't found a way to write out just that one YAML document, not both.
First of all, unless there are multiple documents in your defaults file, you
don't have to use load_all, as you don't concatenate two documents into a
multiple-document stream. If you had (by using a format string with a document-end
marker, "{}\n...\n{}", or a directives-end marker, "{}\n---\n{}"),
your aliases would not carry over from one document to another, as per the
YAML specification:
It is an error for an alias node to use an anchor that does not
previously occur in the document.
The anchor has to be in the document, not just in the stream (which can consist of multiple
documents).
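To illustrate (a small sketch using the same old-style ruamel.yaml API as the rest of this answer):
from ruamel import yaml
from ruamel.yaml.error import YAMLError

broken = "a: &anchor 1\n---\nb: *anchor\n"
try:
    list(yaml.load_all(broken, Loader=yaml.RoundTripLoader))
except YAMLError as exc:
    print(exc)  # complains that the alias 'anchor' is undefined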
I tried some hocus pocus, pre-populating the already represented dictionary
of anchored nodes:
import sys
import datetime
from ruamel import yaml


def load():
    with open('defaults.yaml') as fp:
        defaults_data = fp.read()
    with open('user.yaml') as fp:
        user_data = fp.read()
    merged_data = yaml.load("{}\n{}".format(defaults_data, user_data),
                            Loader=yaml.RoundTripLoader)
    return merged_data


class MyRTDGen(object):
    class MyRTD(yaml.RoundTripDumper):
        def __init__(self, *args, **kw):
            pps = kw.pop('pre_populate', None)
            yaml.RoundTripDumper.__init__(self, *args, **kw)
            if pps is not None:
                for pp in pps:
                    try:
                        anchor = pp.yaml_anchor()
                    except AttributeError:
                        anchor = None
                    node = yaml.nodes.MappingNode(
                        u'tag:yaml.org,2002:map', [], flow_style=None, anchor=anchor)
                    self.represented_objects[id(pp)] = node

    def __init__(self, pre_populate=None):
        assert isinstance(pre_populate, list)
        self._pre_populate = pre_populate

    def __call__(self, *args, **kw):
        kw1 = kw.copy()
        kw1['pre_populate'] = self._pre_populate
        myrtd = self.MyRTD(*args, **kw1)
        return myrtd


def update(md, file_name):
    ud = md.pop('userdefaults')
    MyRTD = MyRTDGen([ud])
    yaml.dump(md, sys.stdout, Dumper=MyRTD)
    with open(file_name, 'w') as fp:
        yaml.dump(md, fp, Dumper=MyRTD)


md = load()
md['users']['xxxx2']['timestamp'] = str(datetime.datetime.utcnow())
update(md, 'user.yaml')
Since the PyYAML based API requires a class instead of an object, you need to
use a class generator that actually adds the data elements to pre-populate on
the fly from within yaml.dump().
But this doesn't work, as a node only gets written out with an anchor once it is
determined that the anchor is used (i.e. there is a second reference). So actually the
first merge key gets written out as an anchor. And although I am quite familiar
with the code base, I could not get this to work properly in a reasonable amount of time.
So instead, I would just rely on the fact that there is only one key that matches
the first key of user.yaml at the root level of the dump of the combined, updated
file, and strip anything before that.
import sys
import datetime
from ruamel import yaml

with open('defaults.yaml') as fp:
    defaults_data = fp.read()
with open('user.yaml') as fp:
    user_data = fp.read()
merged_data = yaml.load("{}\n{}".format(defaults_data, user_data),
                        Loader=yaml.RoundTripLoader)

# find the key
for line in user_data.splitlines():
    line = line.split('# ')[0].rstrip()  # end of line comment, not checking for strings
    if line and line[-1] == ':' and line[0] != ' ':
        split_key = line
        break

merged_data['users']['xxxx2']['timestamp'] = str(datetime.datetime.utcnow())

buf = yaml.compat.StringIO()
yaml.dump(merged_data, buf, Dumper=yaml.RoundTripDumper)
document = split_key + buf.getvalue().split('\n' + split_key)[1]
sys.stdout.write(document)
which gives:
users:
  xxxx1:
    <<: *userdefaults
    timestamp: '2018-10-22 11:38:28.541810'
  xxxx2:
    <<: *userdefaults
    timestamp: '2018-10-23 09:59:13.829978'
I had to make a virtualenv to make sure I could run the above with ruamel.yaml==0.13.14.
That version is from the time I was still young (I won't claim to have been innocent).
There have been over 85 releases of the library since then.
I can understand that you might not be able to run anything but
Python 2 at the moment and cannot compile/use a newer version. But what
you really should do is install virtualenv (which can be done using EPEL, but also without
further "polluting" your system installation), make a virtualenv for the
code you are developing, and install the latest version of ruamel.yaml (and
your other libraries) in there. You can also do that if you need
to distribute your software to other systems; just install virtualenv there as well.
I have all my utilities under /opt/util, and manage them with
virtualenvutils, a wrapper around virtualenv.
For writing the user part, you will have to manually split the output of yaml.dump() and write the appropriate part back to the users.yaml file.
import datetime
import StringIO
import ruamel.yaml

yaml = ruamel.yaml.YAML(typ='rt')

data = None
with open('defaults.yaml', 'r') as defaults:
    with open('users.yaml', 'r') as users:
        raw = "{}\n{}".format(''.join(defaults.readlines()), ''.join(users.readlines()))
        data = list(yaml.load_all(raw))

data[0]['users']['xxxx1']['timestamp'] = datetime.datetime.now().isoformat()

with open('users.yaml', 'w') as outfile:
    sio = StringIO.StringIO()
    yaml.dump(data[0], sio)
    out = sio.getvalue()
    outfile.write(out.split('\n\n')[1])  # write the second part here as this is the contents of users.yaml
I am aware of the way independent scripts are profiled using kernprof/profile/cProfile. But how can I profile a Python web application running as a background service / long-running application?
After some drilling down and exploring potential solutions, I came up with the following:
Add the following function to the source file and decorate the original function you want to profile with @do_cprofile:
import cProfile


def do_cprofile(func):
    def profiled_func(*args, **kwargs):
        profile = cProfile.Profile()
        try:
            profile.enable()
            result = func(*args, **kwargs)
            profile.disable()
            return result
        finally:
            profile.dump_stats('/tmp/profile_bin.prof')
    return profiled_func
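For example, applied to a hypothetical request handler inside the long-running service:
@do_cprofile
def handle_request(payload):
    # the service's real work goes here
    return payload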
Convert the generated /tmp/profile_bin.prof to a human-readable file:
import pstats
f = open('/tmp/human_readable_profile.prof', 'w')
stats = pstats.Stats('/tmp/profile_bin.prof', stream=f)
stats.sort_stats('cumulative').print_stats()
f.close()
I work on an application that uses texts from different languages, so, for viewing or reporting purposes, some texts (strings) need to be sorted in a specific language.
Currently I have a workaround messing with the global locale settings, which is bad, and I don't want to put it in production:
import locale

default_locale = locale.getlocale(locale.LC_COLLATE)

def sort_strings(strings, locale_=None):
    if locale_ is None:
        return sorted(strings)
    locale.setlocale(locale.LC_COLLATE, locale_)
    sorted_strings = sorted(strings, cmp=locale.strcoll)
    locale.setlocale(locale.LC_COLLATE, default_locale)
    return sorted_strings
The official python locale documentation explicitly says that saving and restoring is a bad idea, but does not give any suggestions: http://docs.python.org/library/locale.html#background-details-hints-tips-and-caveats
You could use a PyICU collator to avoid changing global settings:
import icu  # PyICU


def sorted_strings(strings, locale=None):
    if locale is None:
        return sorted(strings)
    collator = icu.Collator.createInstance(icu.Locale(locale))
    return sorted(strings, key=collator.getSortKey)
Example:
>>> L = [u'sandwiches', u'angel delight', u'custard', u'éclairs', u'glühwein']
>>> sorted_strings(L)
['angel delight', 'custard', 'glühwein', 'sandwiches', 'éclairs']
>>> sorted_strings(L, 'en_US')
['angel delight', 'custard', 'éclairs', 'glühwein', 'sandwiches']
Disadvantage: dependency on PyICU library; the behavior is slightly different from locale.strcoll.
I don't know how to get the locale.strxfrm function for a given locale name without changing the locale globally. As a hack, you could run your function in a different child process:
pool = multiprocessing.Pool()
# ...
pool.apply(locale_aware_sort, [strings, loc])
Disadvantage: might be slow, resource hungry
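For completeness, a minimal sketch of the locale_aware_sort helper assumed above; it runs only in the worker process, so the setlocale call never touches the parent:
import locale

def locale_aware_sort(strings, loc):
    # safe here: this only ever executes in the pool's child process
    locale.setlocale(locale.LC_COLLATE, loc)
    return sorted(strings, key=locale.strxfrm)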
Using an ordinary threading.Lock won't work unless you can control every place where locale-aware functions (they are not limited to the locale module, e.g. re uses them too) could be called from multiple threads.
You could compile your function with Cython to synchronize access using the GIL. The GIL will make sure that no other Python code can be executed while your function is running.
Disadvantage: not pure Python
The ctypes solution is fine, but if anyone in the future would like just to modify your original solution, here is a way to do so:
Temporary changes of global settings can safely be accomplished with a context manager.
from contextlib import contextmanager
import locale


@contextmanager
def changedlocale(newone):
    old_locale = locale.getlocale(locale.LC_COLLATE)
    try:
        locale.setlocale(locale.LC_COLLATE, newone)
        yield locale.strcoll
    finally:
        locale.setlocale(locale.LC_COLLATE, old_locale)


def sort_strings(strings, locale_=None):
    if locale_ is None:
        return sorted(strings)

    with changedlocale(locale_) as strcoll:
        return sorted(strings, cmp=strcoll)
This ensures a clean restoration of the original locale - as long as you don't use threading.
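For example (a sketch: it assumes the de_DE.UTF-8 locale is installed, and that the byte strings match that locale's encoding, since this is Python 2 code):
# -*- coding: utf-8 -*-
words = ['Zebra', 'Ärger', 'Apfel']
print sort_strings(words, 'de_DE.UTF-8')  # German collation should place 'Ärger' with the A's
print sort_strings(words)                 # plain byte order puts 'Ärger' after 'Zebra'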
Glibc does support a locale API with an explicit state. Here's a quick wrapper for that API made with ctypes.
# -*- coding: utf-8 -*-
import ctypes


class Locale(object):
    def __init__(self, locale):
        LC_ALL_MASK = 8127
        # LC_COLLATE_MASK = 8
        self.libc = ctypes.CDLL("libc.so.6")
        self.ctx = self.libc.newlocale(LC_ALL_MASK, locale, 0)

    def strxfrm(self, src, iteration=1):
        size = 3 * iteration * len(src)
        dest = ctypes.create_string_buffer('\000' * size)
        n = self.libc.strxfrm_l(dest, src, size, self.ctx)
        if n < size:
            return dest.value
        elif iteration <= 4:
            return self.strxfrm(src, iteration + 1)
        else:
            raise Exception('max number of iterations trying to increase dest reached')

    def __del__(self):
        self.libc.freelocale(self.ctx)
and a short test
locale1 = Locale('C')
locale2 = Locale('mk_MK.UTF-8')
a_list = ['а', 'б', 'в', 'ј', 'ќ', 'џ', 'ш']
import random
random.shuffle(a_list)
assert sorted(a_list, key=locale1.strxfrm) == ['а', 'б', 'в', 'ш', 'ј', 'ќ', 'џ']
assert sorted(a_list, key=locale2.strxfrm) == ['а', 'б', 'в', 'ј', 'ќ', 'џ', 'ш']
What's left to do is to implement the rest of the locale functions, add support for Python unicode strings (with the wchar_t* functions, I guess), and pull the constants from the C include files automatically instead of hard-coding them.