Cannot pickle a Python Class instance - python

Here I have this class definition. When I run the code below, it raises the following errors.
sm = SaliencyMaskSlic()
operations = [('img_resize', img_resize), ('sal_mask', sm.transform)]
args_list = [{'h_size':258}, {'cropped':True}]
pre_pipeline = Pipeline(ops=operations, arg_list=args_list)
ch = ColorHist('RGB', [6,6,6], [2,2], center=True, pre_pipeline = pre_pipeline)
dill.dump(ch, open('erogol.pkl','wb'))
...
dill.loads('erogol.pkl')
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
<ipython-input-11-c8a5937780b5> in <module>()
----> 1 dill.loads('erogol.pkl')
/usr/local/lib/python2.7/dist-packages/dill/dill.pyc in loads(str)
158 """unpickle an object from a string"""
159 file = StringIO(str)
--> 160 return load(file)
161
162 # def dumpzs(obj, protocol=None):
/usr/local/lib/python2.7/dist-packages/dill/dill.pyc in load(file)
148 pik = Unpickler(file)
149 pik._main_module = _main_module
--> 150 obj = pik.load()
151 if type(obj).__module__ == _main_module.__name__: # point obj class to main
152 try: obj.__class__ == getattr(pik._main_module, type(obj).__name__)
/usr/lib/python2.7/pickle.pyc in load(self)
856 while 1:
857 key = read(1)
--> 858 dispatch[key](self)
859 except _Stop, stopinst:
860 return stopinst.value
/usr/lib/python2.7/pickle.pyc in load_appends(self)
1185 def load_appends(self):
1186 stack = self.stack
-> 1187 mark = self.marker()
1188 list = stack[mark - 1]
1189 list.extend(stack[mark + 1:])
/usr/lib/python2.7/pickle.pyc in marker(self)
872 mark = self.mark
873 k = len(stack)-1
--> 874 while stack[k] is not mark: k = k-1
875 return k
876
IndexError: list index out of range
Basically I have one class instance using another class instance inside. I also tried cPickle, but it raises the following as I dump:
TypeError: can't pickle instancemethod objects
Any idea for the solution?

This isn't a pickling error. You can't pickle instance methods with pickle or cPickle (that's the TypeError you saw), but you can with dill. The IndexError comes from a bug in your code: dill.loads expects the pickled string itself, but you are passing it the filename 'erogol.pkl', so it tries to unpickle the filename (see below).
Also, rather than giving your class its own dump and load methods, you might just use dump and load from dill directly... then, if you are doing something complicated, you can still add __getstate__ and __setstate__ methods.
Also, your loading from a pickled file has a bug. You are doing this:
self = dill.loads(in_path)
While you should (1) be using dill.load instead, with an open file handle rather than a path, and (2) load into _self, then replace the relevant state:
_self = dill.load(open(in_path, 'rb'))
self.nbins = _self.nbins
self.mask = _self.mask
# and so on... (or update all at once using `__dict__`)
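Putting both fixes together, a minimal sketch of a load method (copying the state all at once via `__dict__`, as suggested above; the surrounding class body is elided):

import dill

class ColorHist(object):
    # ... __init__ and the rest of the class go here ...

    def load(self, in_path):
        # dill.load expects an open file handle, not a path string
        with open(in_path, 'rb') as f:
            _self = dill.load(f)
        # replace this instance's state with the loaded one
        self.__dict__.update(_self.__dict__)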

Related

Overriding * imports globally for jupyter

I'm running Jupyter Lab on Windows, and fastai.vision.utils.verify_images(fns) is giving me problems because it calls fastcore.parallel.parallel with the default n_workers=8. There are many ways around it, but I was trying to figure out a code block that I could slap into any notebook so that all underlying calls to parallel run with n_workers=1.
I tried the following cell:
import fastcore
import sys
_fastcore = fastcore
_parallel = lambda *args, **kwargs: fastcore.parallel.parallel(*args, **kwargs, n_workers=1)
_fastcore.parallel.parallel = _parallel
sys.modules['fastcore'] = _fastcore
fastcore.parallel.parallel
which prints
<function __main__.<lambda>(*args, **kwargs)>
but when I try running verify_images it still fails, as if the patch never happened:
---------------------------------------------------------------------------
BrokenProcessPool Traceback (most recent call last)
<ipython-input-37-f1773f2c9e62> in <module>
3 # from mock import patch
4 # with patch('fastcore.parallel.parallel') as _parallel:
----> 5 failed = verify_images(fns)
6 # failed = L(fns[i] for i,o in enumerate(_parallel(verify_image, fns)) if not o)
7 failed
~\anaconda3\lib\site-packages\fastai\vision\utils.py in verify_images(fns)
59 def verify_images(fns):
60 "Find images in `fns` that can't be opened"
---> 61 return L(fns[i] for i,o in enumerate(parallel(verify_image, fns)) if not o)
62
63 # Cell
~\anaconda3\lib\site-packages\fastcore\parallel.py in parallel(f, items, n_workers, total, progress, pause, threadpool, timeout, chunksize, *args, **kwargs)
121 if total is None: total = len(items)
122 r = progress_bar(r, total=total, leave=False)
--> 123 return L(r)
124
125 # Cell
~\anaconda3\lib\site-packages\fastcore\foundation.py in __call__(cls, x, *args, **kwargs)
95 def __call__(cls, x=None, *args, **kwargs):
96 if not args and not kwargs and x is not None and isinstance(x,cls): return x
---> 97 return super().__call__(x, *args, **kwargs)
98
99 # Cell
~\anaconda3\lib\site-packages\fastcore\foundation.py in __init__(self, items, use_list, match, *rest)
103 def __init__(self, items=None, *rest, use_list=False, match=None):
104 if (use_list is not None) or not is_array(items):
--> 105 items = listify(items, *rest, use_list=use_list, match=match)
106 super().__init__(items)
107
~\anaconda3\lib\site-packages\fastcore\basics.py in listify(o, use_list, match, *rest)
54 elif isinstance(o, list): res = o
55 elif isinstance(o, str) or is_array(o): res = [o]
---> 56 elif is_iter(o): res = list(o)
57 else: res = [o]
58 if match is not None:
~\anaconda3\lib\concurrent\futures\process.py in _chain_from_iterable_of_lists(iterable)
482 careful not to keep references to yielded objects.
483 """
--> 484 for element in iterable:
485 element.reverse()
486 while element:
~\anaconda3\lib\concurrent\futures\_base.py in result_iterator()
609 # Careful not to keep a reference to the popped future
610 if timeout is None:
--> 611 yield fs.pop().result()
612 else:
613 yield fs.pop().result(end_time - time.monotonic())
~\anaconda3\lib\concurrent\futures\_base.py in result(self, timeout)
437 raise CancelledError()
438 elif self._state == FINISHED:
--> 439 return self.__get_result()
440 else:
441 raise TimeoutError()
~\anaconda3\lib\concurrent\futures\_base.py in __get_result(self)
386 def __get_result(self):
387 if self._exception:
--> 388 raise self._exception
389 else:
390 return self._result
BrokenProcessPool: A process in the process pool was terminated abruptly while the future was running or pending.
I suspect it has to do with fastai.vision.utils using * imports for fastcore. Is there a way to achieve what I want?
Since the parallel function has already been imported into the fastai.vision.utils module (its name is bound in that module's namespace), the correct way is to monkeypatch that module rather than fastcore.parallel; rebinding fastcore.parallel.parallel has no effect on the reference fastai.vision.utils already holds:
... # your code for custom `parallel` function goes here
import fastai.vision.utils
fastai.vision.utils.parallel = _parallel # assign your custom function here
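Putting it together, a minimal sketch (the wrapper adapts the lambda from the question; n_workers is the parameter name visible in the parallel signature in the traceback):

import fastcore.parallel
import fastai.vision.utils

def _parallel(f, items, *args, **kwargs):
    # force single-worker execution, overriding whatever the caller passed
    kwargs['n_workers'] = 1
    return fastcore.parallel.parallel(f, items, *args, **kwargs)

# rebind the name that fastai.vision.utils looks up at call time;
# patching fastcore.parallel alone leaves this reference untouched
fastai.vision.utils.parallel = _parallel

After this, verify_images(fns) resolves parallel to the wrapper and runs with a single worker.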

Problem loading xlsx file with pandas in python

I installed a new pandas version and now cannot import xlsx files as I used to. I have looked at similar issues, and most seem to be resolved by adding engine="openpyxl"; however, in my case, when I run the following code:
df = pd.read_excel("IPO_10.xlsx", engine="openpyxl")
df.head()
I get the following error:
AttributeError Traceback (most recent call last)
~/opt/anaconda3/lib/python3.8/site-packages/IPython/core/formatters.py in __call__(self, obj)
343 method = get_real_method(obj, self.print_method)
344 if method is not None:
--> 345 return method()
346 return None
347 else:
~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/frame.py in _repr_html_(self)
732 GH3541, GH3573
733 """
--> 734 width, height = console.get_console_size()
735 max_columns = get_option("display.max_columns")
736 nb_columns = len(self.columns)
~/opt/anaconda3/lib/python3.8/site-packages/pandas/io/formats/format.py in to_html(self, buf, encoding, classes, notebook, border)
980 Buffer to write to. If None, the output is returned as a string.
981 encoding : str, default “utf-8”
--> 982 Set character encoding.
983 classes : str or list-like
984 classes to include in the `class` attribute of the opening
~/opt/anaconda3/lib/python3.8/site-packages/pandas/io/formats/html.py in __init__(self, formatter, classes, border, table_id, render_links)
54 self.col_space = {
55 column: f"{value}px" if isinstance(value, int) else value
---> 56 for column, value in self.fmt.col_space.items()
57 }
58
AttributeError: 'NoneType' object has no attribute 'items'
and my data frame is then printed as output. How can I solve this? Thank you.

H2O python rbind error

I have a 2000-row data frame, and I'm trying to slice it into two pieces and combine them back together.
t1 = test[:10, :]
t2 = test[20:, :]
temp = t1.rbind(t2)
temp.show()
Then I got this error:
---------------------------------------------------------------------------
EnvironmentError Traceback (most recent call last)
<ipython-input-37-8daeb3375743> in <module>()
2 t2 = test[20:, :]
3 temp = t1.rbind(t2)
----> 4 temp.show()
5 print len(temp)
6 print len(test)
/usr/local/lib/python2.7/dist-packages/h2o/frame.pyc in show(self, use_pandas)
383 print("This H2OFrame has been removed.")
384 return
--> 385 if not self._ex._cache.is_valid(): self._frame()._ex._cache.fill()
386 if H2ODisplay._in_ipy():
387 import IPython.display
/usr/local/lib/python2.7/dist-packages/h2o/frame.pyc in _frame(self, fill_cache)
423
424 def _frame(self, fill_cache=False):
--> 425 self._ex._eager_frame()
426 if fill_cache:
427 self._ex._cache.fill()
/usr/local/lib/python2.7/dist-packages/h2o/expr.pyc in _eager_frame(self)
67 if not self._cache.is_empty(): return self
68 if self._cache._id is not None: return self # Data already computed under ID, but not cached locally
---> 69 return self._eval_driver(True)
70
71 def _eager_scalar(self): # returns a scalar (or a list of scalars)
/usr/local/lib/python2.7/dist-packages/h2o/expr.pyc in _eval_driver(self, top)
81 def _eval_driver(self, top):
82 exec_str = self._do_it(top)
---> 83 res = ExprNode.rapids(exec_str)
84 if 'scalar' in res:
85 if isinstance(res['scalar'], list): self._cache._data = [float(x) for x in res['scalar']]
/usr/local/lib/python2.7/dist-packages/h2o/expr.pyc in rapids(expr)
163 The JSON response (as a python dictionary) of the Rapids execution
164 """
--> 165 return H2OConnection.post_json("Rapids", ast=expr,session_id=H2OConnection.session_id(), _rest_version=99)
166
167 class ASTId:
/usr/local/lib/python2.7/dist-packages/h2o/connection.pyc in post_json(url_suffix, file_upload_info, **kwargs)
515 if __H2OCONN__ is None:
516 raise ValueError("No h2o connection. Did you run `h2o.init()` ?")
--> 517 return __H2OCONN__._rest_json(url_suffix, "POST", file_upload_info, **kwargs)
518
519 def _rest_json(self, url_suffix, method, file_upload_info, **kwargs):
/usr/local/lib/python2.7/dist-packages/h2o/connection.pyc in _rest_json(self, url_suffix, method, file_upload_info, **kwargs)
518
519 def _rest_json(self, url_suffix, method, file_upload_info, **kwargs):
--> 520 raw_txt = self._do_raw_rest(url_suffix, method, file_upload_info, **kwargs)
521 return self._process_tables(raw_txt.json())
522
/usr/local/lib/python2.7/dist-packages/h2o/connection.pyc in _do_raw_rest(self, url_suffix, method, file_upload_info, **kwargs)
592 raise EnvironmentError(("h2o-py got an unexpected HTTP status code:\n {} {} (method = {}; url = {}). \n"+ \
593 "detailed error messages: {}")
--> 594 .format(http_result.status_code,http_result.reason,method,url,detailed_error_msgs))
595
596
EnvironmentError: h2o-py got an unexpected HTTP status code:
500 Server Error (method = POST; url = http://localhost:54321/99/Rapids).
detailed error messages: []
If I count rows (len(temp)), it works fine. Also, if I change the slicing indices a little, it works fine too. For example, if I change to this, it shows the data frame:
t1 = test[:10, :]
t2 = test[:5, :]
Am I missing something here? Thanks.
It's unclear what happened without more information (the logs would probably say why the rbind did not take).
What version are you using? I tried your code with iris on the bleeding edge and it all worked as expected.
By the way, rbind is typically going to be expensive, especially since what you're semantically after is a subset:
test[range(10) + range(20,test.nrow),:]
should also give you the desired subset (with the caveat that you build the full list of row indices in Python and pass it over REST to H2O).
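For instance, a minimal sketch of that subsetting approach (assuming Python 2, where range() returns a list, matching the snippet above):

# build the combined row-index list once, then take a single slice
rows = range(10) + range(20, test.nrow)                 # Python 2
# rows = list(range(10)) + list(range(20, test.nrow))   # Python 3 equivalent
temp = test[rows, :]
temp.show()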

ipython parallel works over the default setup but not over ssh, why?

I have created a setup to connect via SSH to multiple machines. This is my configuration file:
c = get_config()
c.IPClusterEngines.engine_launcher_class = 'SSHEngineSetLauncher'
Clusters = [36,31,1,24,10,11,4,3,6,26,7,2,9]
c.SSHEngineSetLauncher.engines = dict( [ ('hostname%02d'%x,7) for x in Clusters ] )
c.SSHEngineSetLauncher.engine_args = ['--profile-dir=~/.ipython/profile_ssh']
c.LocalControllerLauncher.controller_args = ["--ip='*'"]
I have a custom class and get the error below. The thing I cannot understand is that if I connect to the standard ipcluster profile I get no error. Why the difference?
from IPython.parallel import Client
rc = Client() # standard
rcSSH = Client(profile='ssh') # SSH (this gives the error)
rc[:].use_dill()
rcSSH[:].use_dill()
rc[:].load_balanced_view().map_sync(customInstance.function, *args) # <- this runs fine
rcSSH[:].load_balanced_view().map_sync(customInstance.function, *args) # <- this gives the error
And the error
ImportError Traceback (most recent call last)~/.local/lib/python2.7/site-packages/IPython/kernel/zmq/serialize.pyc in unpack_apply_message(bufs, g, copy)
171 args = []
172 for i in range(info['nargs']):
--> 173 arg, arg_bufs = unserialize_object(arg_bufs, g)
174 args.append(arg)
175 args = tuple(args)
~/.local/lib/python2.7/site-packages/IPython/kernel/zmq/serialize.pyc in unserialize_object(buffers, g)
110 # a zmq message
111 pobj = bytes(pobj)
--> 112 canned = pickle.loads(pobj)
113 if istype(canned, sequence_types) and len(canned) < MAX_ITEMS:
114 for c in canned:
~/.local/lib/python2.7/site-packages/dill/dill.pyc in loads(str)
158 """unpickle an object from a string"""
159 file = StringIO(str)
--> 160 return load(file)
161
162 # def dumpzs(obj, protocol=None):
~/.local/lib/python2.7/site-packages/dill/dill.pyc in load(file)
148 pik = Unpickler(file)
149 pik._main_module = _main_module
--> 150 obj = pik.load()
151 if type(obj).__module__ == _main_module.__name__: # point obj class to main
152 try: obj.__class__ == getattr(pik._main_module, type(obj).__name__)
/usr/lib/python2.7/pickle.pyc in load(self)
856 while 1:
857 key = read(1)
--> 858 dispatch[key](self)
859 except _Stop, stopinst:
860 return stopinst.value
/usr/lib/python2.7/pickle.pyc in load_global(self)
1088 module = self.readline()[:-1]
1089 name = self.readline()[:-1]
-> 1090 klass = self.find_class(module, name)
1091 self.append(klass)
1092 dispatch[GLOBAL] = load_global
~/.local/lib/python2.7/site-packages/dill/dill.pyc in find_class(self, module, name)
224 if (module, name) == ('__builtin__', '__main__'):
225 return self._main_module.__dict__ #XXX: above set w/save_module_dict
--> 226 return StockUnpickler.find_class(self, module, name)
227 pass
228
/usr/lib/python2.7/pickle.pyc in find_class(self, module, name)
1122 def find_class(self, module, name):
1123 # Subclasses may override this
-> 1124 __import__(module)
1125 mod = sys.modules[module]
1126 klass = getattr(mod, name)
ImportError: No module named fiberModes.GRINmediumArbPrec
EDIT:
I should mention that doing the following doesn't change anything:
dview.execute('import fiberModes.GRINmediumArbPrec')

Whoosh - accessing search_page result items throws ReaderClosed exception

Following is a simple pagination function.
from whoosh import index
from whoosh.query import Term

def _search(q):
    wix = index.open_dir(settings.WHOOSH_INDEX_DIR)
    term = Term("title", q) | Term("content", q)
    page_id = 1
    with wix.searcher() as s:
        return s.search_page(term, page_id, pagelen=settings.ITEMS_PER_PAGE)
In [15]: p = _search("like")
In [16]: p.results[0].reader.is_closed
Out[16]: True
If I try to access an attribute of a Hit, I get a ReaderClosed exception.
In [19]: p.results
Out[19]: <Top 10 Results for Or([Term('title', 'like'), Term('content', 'like')]) runtime=0.0214910507202>
In [21]: p.results[0]["title"]
---------------------------------------------------------------------------
ReaderClosed Traceback (most recent call last)
/usr/local/lib/python2.7/dist-packages/Django-1.5.3-py2.7.egg/django/core/management/commands/shell.p yc in <module>()
----> 1 p.results[0]["title"]
/usr/local/lib/python2.7/dist-packages/whoosh/searching.pyc in __getitem__(self, fieldname)
1500
1501 def __getitem__(self, fieldname):
-> 1502 if fieldname in self.fields():
1503 return self._fields[fieldname]
1504
/usr/local/lib/python2.7/dist-packages/whoosh/searching.pyc in fields(self)
1388
1389 if self._fields is None:
-> 1390 self._fields = self.searcher.stored_fields(self.docnum)
1391 return self._fields
1392
/usr/local/lib/python2.7/dist-packages/whoosh/reading.pyc in stored_fields(self, docnum)
1197 def stored_fields(self, docnum):
1198 segmentnum, segmentdoc = self._segment_and_docnum(docnum)
-> 1199 return self.readers[segmentnum].stored_fields(segmentdoc)
1200
1201 # Per doc methods
/usr/local/lib/python2.7/dist-packages/whoosh/reading.pyc in stored_fields(self, docnum)
693 def stored_fields(self, docnum):
694 if self.is_closed:
--> 695 raise ReaderClosed
696 assert docnum >= 0
697 schema = self.schema
ReaderClosed:
How can I access a hit's attributes?
Browsing through Whoosh's documentation (http://whoosh.readthedocs.org/en/latest/quickstart.html#the-searcher-object) I've understood the problem. Leaving it here in case anyone gets stuck on the same issue.
Every file descriptor related to the search is closed when the `with` block ends. Therefore the result set should be copied into another data structure, such as a list of dictionaries, inside the `with` block, so it can be used outside the block.
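For example, a minimal sketch of that fix (using Hit.fields(), which the traceback above shows returning a hit's stored fields as a dict; iterating the page yields its hits):

from whoosh import index
from whoosh.query import Term

def _search(q):
    wix = index.open_dir(settings.WHOOSH_INDEX_DIR)
    term = Term("title", q) | Term("content", q)
    page_id = 1
    with wix.searcher() as s:
        page = s.search_page(term, page_id, pagelen=settings.ITEMS_PER_PAGE)
        # copy the stored fields out while the searcher's reader is still open
        return [hit.fields() for hit in page]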
