Whoosh - accessing search_page result items throws ReaderClosed exception - python

The following is a simple pagination function.
from django.conf import settings
from whoosh import index
from whoosh.query import Term

def _search(q):
    wix = index.open_dir(settings.WHOOSH_INDEX_DIR)
    term = Term("title", q) | Term("content", q)
    page_id = 1
    with wix.searcher() as s:
        return s.search_page(term, page_id, pagelen=settings.ITEMS_PER_PAGE)
In [15]: p = _search("like")
In [16]: p.results[0].reader.is_closed
Out[16]: True
If I try to access an attribute of a Hit, I get a ReaderClosed exception.
In [19]: p.results
Out[19]: <Top 10 Results for Or([Term('title', 'like'), Term('content', 'like')]) runtime=0.0214910507202>
In [21]: p.results[0]["title"]
---------------------------------------------------------------------------
ReaderClosed Traceback (most recent call last)
/usr/local/lib/python2.7/dist-packages/Django-1.5.3-py2.7.egg/django/core/management/commands/shell.pyc in <module>()
----> 1 p.results[0]["title"]
/usr/local/lib/python2.7/dist-packages/whoosh/searching.pyc in __getitem__(self, fieldname)
1500
1501 def __getitem__(self, fieldname):
-> 1502 if fieldname in self.fields():
1503 return self._fields[fieldname]
1504
/usr/local/lib/python2.7/dist-packages/whoosh/searching.pyc in fields(self)
1388
1389 if self._fields is None:
-> 1390 self._fields = self.searcher.stored_fields(self.docnum)
1391 return self._fields
1392
/usr/local/lib/python2.7/dist-packages/whoosh/reading.pyc in stored_fields(self, docnum)
1197 def stored_fields(self, docnum):
1198 segmentnum, segmentdoc = self._segment_and_docnum(docnum)
-> 1199 return self.readers[segmentnum].stored_fields(segmentdoc)
1200
1201 # Per doc methods
/usr/local/lib/python2.7/dist-packages/whoosh/reading.pyc in stored_fields(self, docnum)
693 def stored_fields(self, docnum):
694 if self.is_closed:
--> 695 raise ReaderClosed
696 assert docnum >= 0
697 schema = self.schema
ReaderClosed:
How can I access a hit's attributes?

Browsing through Whoosh's documentation (http://whoosh.readthedocs.org/en/latest/quickstart.html#the-searcher-object), I've understood the problem. Leaving it here in case anyone gets stuck on the same issue.
Every file handle related to the search is closed when the with block ends. Therefore the result set should be copied into another data structure, such as a list of dictionaries, inside the with block, so it can be used outside the block.
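For example, a minimal sketch of that approach (same settings names as above): materialize each hit's stored fields into plain dicts before the searcher closes.
from django.conf import settings
from whoosh import index
from whoosh.query import Term

def _search(q, page_id=1):
    wix = index.open_dir(settings.WHOOSH_INDEX_DIR)
    term = Term("title", q) | Term("content", q)
    with wix.searcher() as s:
        page = s.search_page(term, page_id, pagelen=settings.ITEMS_PER_PAGE)
        # Hit.fields() reads the stored fields while the reader is still
        # open; the returned list of plain dicts stays usable afterwards.
        return [hit.fields() for hit in page]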

How to add additional transforms to get_transforms in fastai?

I would like to add additional augmentations this way:
additional_aug = [zoom_crop(scale=(0.75, 1.25), do_rand=False),
                  brightness(),
                  contrast()]
tfms = get_transforms(do_flip=True, flip_vert=True, max_lighting=0.2, xtra_tfms=additional_aug)
data = (ImageList.from_df(df=df, path='./', cols='path')
        .split_by_rand_pct(0.2)
        .label_from_df(cols='diagnosis', label_cls=FloatList)
        .transform(tfms, size=sz, resize_method=ResizeMethod.SQUISH, padding_mode='zeros')
        .databunch(bs=bs, num_workers=4)
        .normalize(imagenet_stats))
But I get errors:
--------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
/opt/conda/lib/python3.6/site-packages/fastai/data_block.py in _check_kwargs(ds, tfms, **kwargs)
590 x = ds[0]
--> 591 try: x.apply_tfms(tfms, **kwargs)
592 except Exception as e:
/opt/conda/lib/python3.6/site-packages/fastai/vision/image.py in apply_tfms(self, tfms, do_resolve, xtra, size, resize_method, mult, padding_mode, mode, remove_out)
105 if resize_method <= 2 and size is not None: tfms = self._maybe_add_crop_pad(tfms)
--> 106 tfms = sorted(tfms, key=lambda o: o.tfm.order)
107 if do_resolve: _resolve_tfms(tfms)
/opt/conda/lib/python3.6/site-packages/fastai/vision/image.py in <lambda>(o)
105 if resize_method <= 2 and size is not None: tfms = self._maybe_add_crop_pad(tfms)
--> 106 tfms = sorted(tfms, key=lambda o: o.tfm.order)
107 if do_resolve: _resolve_tfms(tfms)
AttributeError: 'list' object has no attribute 'tfm'
During handling of the above exception, another exception occurred:
Exception Traceback (most recent call last)
<ipython-input-27-3daf86c69a96> in <module>
2 .split_by_rand_pct(0.2)
3 .label_from_df(cols='diagnosis',label_cls=FloatList)
----> 4 .transform(tfms,size=sz,resize_method=ResizeMethod.SQUISH,padding_mode='zeros')
5 .databunch(bs=bs,num_workers=4)
6 .normalize(imagenet_stats)
/opt/conda/lib/python3.6/site-packages/fastai/data_block.py in transform(self, tfms, **kwargs)
500 if not tfms: tfms=(None,None)
501 assert is_listy(tfms) and len(tfms) == 2, "Please pass a list of two lists of transforms (train and valid)."
--> 502 self.train.transform(tfms[0], **kwargs)
503 self.valid.transform(tfms[1], **kwargs)
504 if self.test: self.test.transform(tfms[1], **kwargs)
/opt/conda/lib/python3.6/site-packages/fastai/data_block.py in transform(self, tfms, tfm_y, **kwargs)
719 def transform(self, tfms:TfmList, tfm_y:bool=None, **kwargs):
720 "Set the `tfms` and `tfm_y` value to be applied to the inputs and targets."
--> 721 _check_kwargs(self.x, tfms, **kwargs)
722 if tfm_y is None: tfm_y = self.tfm_y
723 tfms_y = None if tfms is None else list(filter(lambda t: getattr(t, 'use_on_y', True), listify(tfms)))
/opt/conda/lib/python3.6/site-packages/fastai/data_block.py in _check_kwargs(ds, tfms, **kwargs)
591 try: x.apply_tfms(tfms, **kwargs)
592 except Exception as e:
--> 593 raise Exception(f"It's not possible to apply those transforms to your dataset:\n {e}")
594
595 class LabelList(Dataset):
Exception: It's not possible to apply those transforms to your dataset:
'list' object has no attribute 'tfm'
According to the documentation, the parameter is xtra_tfms:Optional[Collection[Transform]]=None.
How can I get it to work?
I have faced this problem and the solution is very simple. Wrap each transform function you want to apply in its own list, enclose those lists in an outer list, and pass that to the xtra_tfms parameter of get_transforms. (It could even be a tuple of tuples, or any collection of collections.)
additional_aug = [[zoom_crop(scale=(0.75, 1.25), do_rand=False)],
                  [brightness()],
                  [contrast()]]
tfms = get_transforms(do_flip=True,
                      flip_vert=True,
                      max_lighting=0.2,
                      xtra_tfms=additional_aug)
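With tfms built this way, the same data-block pipeline from the question should then pass the transform check (df, sz and bs as defined in the original code):
data = (ImageList.from_df(df=df, path='./', cols='path')
        .split_by_rand_pct(0.2)
        .label_from_df(cols='diagnosis', label_cls=FloatList)
        .transform(tfms, size=sz, resize_method=ResizeMethod.SQUISH, padding_mode='zeros')
        .databunch(bs=bs, num_workers=4)
        .normalize(imagenet_stats))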
Hope this solves your problem.

Read all range names and values

Follow-up to Felix's response:
I am still getting errors after changing refers_to_range.value to name.
Below are the updated code and the error output from Jupyter.
If I use a single range name and then look up the value, it works fine, but when I loop through the names returned by for name in wb.names, I am not able to return a value for each of them.
import xlwings as xw
wb = xw.Book(r'C:\Agency\wkbk utility\Uploaded to HDS\108 Place.xlsm')
for name in wb.names:
    range_value = wb.names(name).name
    print(range_value)
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-3-e5c7c7a0215d> in <module>
2
3 for name in wb.names:
----> 4 range_value = wb.names(name).name
5
6 print(range_value)
~\Anaconda3\lib\site-packages\xlwings\main.py in __call__(self, name_or_index)
2519
2520 def __call__(self, name_or_index):
-> 2521 return Name(impl=self.impl(name_or_index))
2522
2523 def contains(self, name_or_index):
~\Anaconda3\lib\site-packages\xlwings\_xlwindows.py in __call__(self, name_or_index)
1348
1349 def __call__(self, name_or_index):
-> 1350 return Name(xl=self.xl(name_or_index))
1351
1352 def contains(self, name_or_index):
~\Anaconda3\lib\site-packages\xlwings\_xlwindows.py in __call__(self, *args, **kwargs)
150 for i in range(N_COM_ATTEMPTS + 1):
151 try:
--> 152 v = self._inner(*args, **kwargs)
153 t = type(v)
154 if t is CDispatch:
~\Anaconda3\lib\site-packages\win32com\client\dynamic.py in __call__(self, *args)
195 if invkind is not None:
196 allArgs = (dispid,LCID,invkind,1) + args
--> 197 return self._get_good_object_(self._oleobj_.Invoke(*allArgs),self._olerepr_.defaultDispatchName,None)
198 raise TypeError("This dispatch object does not define a default method")
199
TypeError: Objects of type 'Name' can not be converted to a COM VARIANT (but obtaining the buffer() of this object could)
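For what it's worth, the TypeError suggests that the loop passes a Name object back into wb.names(...), which cannot be converted to a COM VARIANT. A minimal sketch that avoids that round trip, since iterating wb.names already yields Name objects (same workbook path as above):
import xlwings as xw

wb = xw.Book(r'C:\Agency\wkbk utility\Uploaded to HDS\108 Place.xlsm')

for name in wb.names:
    # `name` is already an xlwings Name object; read its attributes
    # directly instead of passing the object back into wb.names(...).
    print(name.name, name.refers_to_range.value)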

H2O python rbind error

I have a 2,000-row data frame and I'm trying to slice it into two pieces and combine them back together.
t1 = test[:10, :]
t2 = test[20:, :]
temp = t1.rbind(t2)
temp.show()
Then I got this error:
---------------------------------------------------------------------------
EnvironmentError Traceback (most recent call last)
<ipython-input-37-8daeb3375743> in <module>()
2 t2 = test[20:, :]
3 temp = t1.rbind(t2)
----> 4 temp.show()
5 print len(temp)
6 print len(test)
/usr/local/lib/python2.7/dist-packages/h2o/frame.pyc in show(self, use_pandas)
383 print("This H2OFrame has been removed.")
384 return
--> 385 if not self._ex._cache.is_valid(): self._frame()._ex._cache.fill()
386 if H2ODisplay._in_ipy():
387 import IPython.display
/usr/local/lib/python2.7/dist-packages/h2o/frame.pyc in _frame(self, fill_cache)
423
424 def _frame(self, fill_cache=False):
--> 425 self._ex._eager_frame()
426 if fill_cache:
427 self._ex._cache.fill()
/usr/local/lib/python2.7/dist-packages/h2o/expr.pyc in _eager_frame(self)
67 if not self._cache.is_empty(): return self
68 if self._cache._id is not None: return self # Data already computed under ID, but not cached locally
---> 69 return self._eval_driver(True)
70
71 def _eager_scalar(self): # returns a scalar (or a list of scalars)
/usr/local/lib/python2.7/dist-packages/h2o/expr.pyc in _eval_driver(self, top)
81 def _eval_driver(self, top):
82 exec_str = self._do_it(top)
---> 83 res = ExprNode.rapids(exec_str)
84 if 'scalar' in res:
85 if isinstance(res['scalar'], list): self._cache._data = [float(x) for x in res['scalar']]
/usr/local/lib/python2.7/dist-packages/h2o/expr.pyc in rapids(expr)
163 The JSON response (as a python dictionary) of the Rapids execution
164 """
--> 165 return H2OConnection.post_json("Rapids", ast=expr,session_id=H2OConnection.session_id(), _rest_version=99)
166
167 class ASTId:
/usr/local/lib/python2.7/dist-packages/h2o/connection.pyc in post_json(url_suffix, file_upload_info, **kwargs)
515 if __H2OCONN__ is None:
516 raise ValueError("No h2o connection. Did you run `h2o.init()` ?")
--> 517 return __H2OCONN__._rest_json(url_suffix, "POST", file_upload_info, **kwargs)
518
519 def _rest_json(self, url_suffix, method, file_upload_info, **kwargs):
/usr/local/lib/python2.7/dist-packages/h2o/connection.pyc in _rest_json(self, url_suffix, method, file_upload_info, **kwargs)
518
519 def _rest_json(self, url_suffix, method, file_upload_info, **kwargs):
--> 520 raw_txt = self._do_raw_rest(url_suffix, method, file_upload_info, **kwargs)
521 return self._process_tables(raw_txt.json())
522
/usr/local/lib/python2.7/dist-packages/h2o/connection.pyc in _do_raw_rest(self, url_suffix, method, file_upload_info, **kwargs)
592 raise EnvironmentError(("h2o-py got an unexpected HTTP status code:\n {} {} (method = {}; url = {}). \n"+ \
593 "detailed error messages: {}")
--> 594 .format(http_result.status_code,http_result.reason,method,url,detailed_error_msgs))
595
596
EnvironmentError: h2o-py got an unexpected HTTP status code:
500 Server Error (method = POST; url = http://localhost:54321/99/Rapids).
detailed error messages: []
If I count rows (len(temp)), it works fine. Also, if I change the slicing indices a little, it works fine too. For example, if I change to this, it shows the data frame:
t1 = test[:10, :]
t2 = test[:5, :]
Am I missing something here? Thanks.
It's unclear what happened without more information (the logs would probably say why the rbind did not take).
What version are you using? I tried your code with iris on the bleeding edge and it all worked as expected.
By the way, rbind is typically going to be expensive, especially since what you're semantically after is a subset:
test[range(10) + range(20,test.nrow),:]
should also give you the desired subset (with the caveat that you build the full list of row indices in Python and pass it over REST to H2O).
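Note that range(10) + range(20, test.nrow) relies on Python 2, where range returns a list. A sketch of the Python 3 equivalent:
# In Python 3, range objects do not support `+`, so build the
# row-index list explicitly before slicing.
rows = list(range(10)) + list(range(20, test.nrow))
subset = test[rows, :]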

Cannot pickle a Python Class instance

Here I have this class definition. When I run the code below, it raises the following errors.
sm = SaliencyMaskSlic()
operations = [('img_resize', img_resize), ('sal_mask', sm.transform)]
args_list = [{'h_size':258}, {'cropped':True}]
pre_pipeline = Pipeline(ops=operations, arg_list=args_list)
ch = ColorHist('RGB', [6,6,6], [2,2], center=True, pre_pipeline = pre_pipeline)
dill.dump(ch, open('erogol.pkl','wb'))
...
dill.loads('erogol.pkl')
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
<ipython-input-11-c8a5937780b5> in <module>()
----> 1 dill.loads('erogol.pkl')
/usr/local/lib/python2.7/dist-packages/dill/dill.pyc in loads(str)
158 """unpickle an object from a string"""
159 file = StringIO(str)
--> 160 return load(file)
161
162 # def dumpzs(obj, protocol=None):
/usr/local/lib/python2.7/dist-packages/dill/dill.pyc in load(file)
148 pik = Unpickler(file)
149 pik._main_module = _main_module
--> 150 obj = pik.load()
151 if type(obj).__module__ == _main_module.__name__: # point obj class to main
152 try: obj.__class__ == getattr(pik._main_module, type(obj).__name__)
/usr/lib/python2.7/pickle.pyc in load(self)
856 while 1:
857 key = read(1)
--> 858 dispatch[key](self)
859 except _Stop, stopinst:
860 return stopinst.value
/usr/lib/python2.7/pickle.pyc in load_appends(self)
1185 def load_appends(self):
1186 stack = self.stack
-> 1187 mark = self.marker()
1188 list = stack[mark - 1]
1189 list.extend(stack[mark + 1:])
/usr/lib/python2.7/pickle.pyc in marker(self)
872 mark = self.mark
873 k = len(stack)-1
--> 874 while stack[k] is not mark: k = k-1
875 return k
876
IndexError: list index out of range
Basically I have one class instance using another class instance inside. I also tried cPickle, but it raises the following as I dump:
TypeError: can't pickle instancemethod objects
Any ideas for a solution?
This isn't a pickling error. You can't pickle class instances with pickle or cPickle, but you can with dill. Your code has a bug somewhere that's giving you an IndexError.
Also, rather than your class having its own dump and load methods, you might just use dump and load from dill directly... then if you are doing something complicated, you can still add __getstate__ and __setstate__ methods.
Also, your loading from a pickled file has a bug. You are doing this:
self = dill.loads(in_path)
while you should (1) use dill.load instead (dill.loads expects a pickled string, not a file path), and (2) load into _self, then replace the relevant state:
with open(in_path, 'rb') as f:
    _self = dill.load(f)
self.nbins = _self.nbins
self.mask = _self.mask
# and so on... (or update all at once using `__dict__`)
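As an illustration of that suggestion, here is a simplified, hypothetical stand-in for the class (not the original ColorHist signature) that drops hard-to-pickle state via __getstate__/__setstate__ and uses dill.dump/dill.load on file handles:
import dill

class ColorHist(object):              # simplified stand-in for the real class
    def __init__(self, nbins):
        self.nbins = nbins
        self._cache = {}              # stand-in for unpicklable state

    def __getstate__(self):
        state = self.__dict__.copy()
        state.pop('_cache', None)     # drop what shouldn't be pickled
        return state

    def __setstate__(self, state):
        self.__dict__.update(state)
        self._cache = {}              # rebuild the dropped state on load

with open('erogol.pkl', 'wb') as f:
    dill.dump(ColorHist(6), f)

with open('erogol.pkl', 'rb') as f:
    ch = dill.load(f)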

KeyError when using s.loc and s.first_valid_index()

I have data similar to this post: pandas: Filling missing values within a group
That is, I have data in a number of observation sessions, and there is a focal individual for each session. That focal individual is only noted once, but I want to fill in the focal ID data for each line during that session. So, the data look something like this:
   Focal  Session
0    NaN        1
1  50101        1
2    NaN        1
3    NaN        2
4  50408        2
5    NaN        2
Based on the post linked above, I was using this code:
g = data.groupby('Session')
g['Focal'].transform(lambda s: s.loc[s.first_valid_index()])
But this returns a KeyError (specifically, KeyError: None). According to the .loc documentation, KeyErrors can result when the data isn't found. So I've checked, and while I have 152 sessions, I only have 150 non-null data points in the Focal column. Before I decide to manually search my data for which of the sessions is missing a Focal ID, I have two questions:
I am very much a beginner. So is this a reasonable explanation for why I am getting a KeyError?
If it is reasonable, is there a way to figure out which Session is missing Focal ID data, that will save me from manually looking through the data?
Output here:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-330-0e4f27aa7e14> in <module>()
----> 1 data['Focal'] = g['Focal'].transform(lambda s: s.loc[s.first_valid_index()])
2 g['Focal'].transform(lambda s: s.loc[s.first_valid_index()])
//anaconda/lib/python2.7/site-packages/pandas/core/groupby.pyc in transform(self, func, *args, **kwargs)
1540 for name, group in self:
1541 object.__setattr__(group, 'name', name)
-> 1542 res = wrapper(group)
1543 # result[group.index] = res
1544 indexer = self.obj.index.get_indexer(group.index)
//anaconda/lib/python2.7/site-packages/pandas/core/groupby.pyc in <lambda>(x)
1536 wrapper = lambda x: getattr(x, func)(*args, **kwargs)
1537 else:
-> 1538 wrapper = lambda x: func(x, *args, **kwargs)
1539
1540 for name, group in self:
<ipython-input-330-0e4f27aa7e14> in <lambda>(s)
----> 1 data['Focal'] = g['Focal'].transform(lambda s: s.loc[s.first_valid_index()])
2 g['Focal'].transform(lambda s: s.loc[s.first_valid_index()])
//anaconda/lib/python2.7/site-packages/pandas/core/indexing.pyc in __getitem__(self, key)
669 return self._getitem_tuple(key)
670 else:
--> 671 return self._getitem_axis(key, axis=0)
672
673 def _getitem_axis(self, key, axis=0):
//anaconda/lib/python2.7/site-packages/pandas/core/indexing.pyc in _getitem_axis(self, key, axis)
756 return self._getitem_iterable(key, axis=axis)
757 else:
--> 758 return self._get_label(key, axis=axis)
759
760 class _iLocIndexer(_LocationIndexer):
//anaconda/lib/python2.7/site-packages/pandas/core/indexing.pyc in _get_label(self, label, axis)
58 return self.obj._xs(label, axis=axis, copy=False)
59 except Exception:
---> 60 return self.obj._xs(label, axis=axis, copy=True)
61
62 def _get_loc(self, key, axis=0):
//anaconda/lib/python2.7/site-packages/pandas/core/series.pyc in _xs(self, key, axis, level, copy)
570
571 def _xs(self, key, axis=0, level=None, copy=True):
--> 572 return self.__getitem__(key)
573
574 def _ixs(self, i, axis=0):
//anaconda/lib/python2.7/site-packages/pandas/core/series.pyc in __getitem__(self, key)
611 def __getitem__(self, key):
612 try:
--> 613 return self.index.get_value(self, key)
614 except InvalidIndexError:
615 pass
//anaconda/lib/python2.7/site-packages/pandas/core/index.pyc in get_value(self, series, key)
761 """
762 try:
--> 763 return self._engine.get_value(series, key)
764 except KeyError, e1:
765 if len(self) > 0 and self.inferred_type == 'integer':
//anaconda/lib/python2.7/site-packages/pandas/index.so in pandas.index.IndexEngine.get_value (pandas/index.c:2565)()
//anaconda/lib/python2.7/site-packages/pandas/index.so in pandas.index.IndexEngine.get_value (pandas/index.c:2380)()
//anaconda/lib/python2.7/site-packages/pandas/index.so in pandas.index.IndexEngine.get_loc (pandas/index.c:3166)()
KeyError: None
The problem is that first_valid_index returns None if there are no valid values (some groups in your DataFrame are all NaN):
In [1]: s = pd.Series([np.nan])
In [2]: s.first_valid_index() # None
Now, loc throws an error because there is no index None:
In [3]: s.loc[s.first_valid_index()]
KeyError: None
What do you want your code to do in this particular case? ...
If you wanted it to be NaN, you could backfill and then take the first element:
g['Focal'].transform(lambda s: s.bfill().iloc[0])
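And for the second question (figuring out which sessions lack any Focal ID), a short sketch that lists the all-NaN groups instead of scanning manually:
# Sessions whose Focal column is entirely NaN:
missing = data.groupby('Session')['Focal'].apply(lambda s: s.isnull().all())
print(missing[missing].index.tolist())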
If you want to handle the case where some groups contain only NaN, you could do the following:
g = data.groupby('Session')
g['Focal'].transform(lambda s: 'No values to aggregate' if pd.isnull(s).all() else s.loc[s.first_valid_index()])
df['Focal'] = g['Focal'].transform(lambda s: 'No values to aggregate' if pd.isnull(s).all() else s.loc[s.first_valid_index()])
This way you fill in 'No values to aggregate' (or whatever you want) when the program finds all NaN for a particular group, instead of stopping execution with an error.
Hope this helps :)
Federico
