Sktime Assertion During Execution - python

I am trying to execute the code below, but I keep getting an assertion error:
import pandas as pd

from sktime.forecasting.base import ForecastingHorizon

fh = ForecastingHorizon(
    pd.PeriodIndex(pd.date_range("2023-01-01", periods=6, freq="M")), is_relative=False
)
cutoff = pd.Period("2022-12-01", freq="M")
fh.to_relative(cutoff)

from sktime.forecasting.naive import NaiveForecaster

forecaster = NaiveForecaster(strategy="last", sp=12)
forecaster.fit(water_data)
water_data.info()
y_pred = forecaster.predict(fh)
The error stack looks like this:
AssertionError Traceback (most recent call last)
Cell In[111], line 1
----> 1 y_pred = forecaster.predict(fh)
File D:\ML Projects\Bityarn-UtilitiesAnalysis\venv\lib\site-packages\sktime\forecasting\base\_base.py:404, in BaseForecaster.predict(self, fh, X)
402 # we call the ordinary _predict if no looping/vectorization needed
403 if not self._is_vectorized:
--> 404 y_pred = self._predict(fh=fh, X=X_inner)
405 else:
406 # otherwise we call the vectorized version of predict
407 y_pred = self._vectorize("predict", X=X_inner, fh=fh)
File D:\ML Projects\Bityarn-UtilitiesAnalysis\venv\lib\site-packages\sktime\forecasting\naive.py:336, in NaiveForecaster._predict(self, fh, X)
326 def _predict(self, fh=None, X=None):
327 """Forecast time series at future horizon.
328
329 Parameters
(...)
334 Exogenous time series
335 """
--> 336 y_pred = super(NaiveForecaster, self)._predict(fh=fh, X=X)
338 # test_predict_time_index_in_sample_full[ForecastingPipeline-0-int-int-True]
339 # causes a pd.DataFrame to appear as y_pred, which upsets the next lines
340 # reasons are unclear, this is coming from the _BaseWindowForecaster
341 # todo: investigate this
342 if isinstance(y_pred, pd.DataFrame):
File D:\ML Projects\Bityarn-UtilitiesAnalysis\venv\lib\site-packages\sktime\forecasting\base\_sktime.py:30, in _BaseWindowForecaster._predict(self, fh, X)
27 kwargs = {"X": X}
29 # all values are out-of-sample
---> 30 if fh.is_all_out_of_sample(self.cutoff):
31 return self._predict_fixed_cutoff(
32 fh.to_out_of_sample(self.cutoff), **kwargs
33 )
35 # all values are in-sample
File D:\ML Projects\Bityarn-UtilitiesAnalysis\venv\lib\site-packages\sktime\forecasting\base\_fh.py:612, in ForecastingHorizon.is_all_out_of_sample(self, cutoff)
597 def is_all_out_of_sample(self, cutoff=None) -> bool:
598 """Whether the forecasting horizon is purely out-of-sample for given cutoff.
599
600 Parameters
(...)
610 cutoff.
611 """
--> 612 return sum(self._is_out_of_sample(cutoff)) == len(self)
File D:\ML Projects\Bityarn-UtilitiesAnalysis\venv\lib\site-packages\sktime\forecasting\base\_fh.py:595, in ForecastingHorizon._is_out_of_sample(self, cutoff)
593 def _is_out_of_sample(self, cutoff=None) -> np.ndarray:
594 """Get index location of out-of-sample values."""
--> 595 return np.logical_not(self._is_in_sample(cutoff))
File D:\ML Projects\Bityarn-UtilitiesAnalysis\venv\lib\site-packages\sktime\forecasting\base\_fh.py:574, in ForecastingHorizon._is_in_sample(self, cutoff)
572 def _is_in_sample(self, cutoff=None) -> np.ndarray:
573 """Get index location of in-sample values."""
--> 574 relative = self.to_relative(cutoff).to_pandas()
575 null = 0 if is_integer_index(relative) else pd.Timedelta(0)
576 return relative <= null
File D:\ML Projects\Bityarn-UtilitiesAnalysis\venv\lib\site-packages\sktime\forecasting\base\_fh.py:461, in ForecastingHorizon.to_relative(self, cutoff)
446 """Return forecasting horizon values relative to a cutoff.
447
448 Parameters
(...)
458 Relative representation of forecasting horizon.
459 """
460 cutoff = self._coerce_cutoff_to_index_element(cutoff)
--> 461 return _to_relative(fh=self, cutoff=cutoff)
File D:\ML Projects\Bityarn-UtilitiesAnalysis\venv\lib\site-packages\sktime\forecasting\base\_fh.py:682, in _to_relative(fh, cutoff)
680 else:
681 absolute = fh.to_pandas()
--> 682 _check_cutoff(cutoff, absolute)
684 if isinstance(absolute, pd.DatetimeIndex):
685 # coerce to pd.Period for reliable arithmetics and computations of
686 # time deltas
687 absolute = _coerce_to_period(absolute, freq=fh.freq)
File D:\ML Projects\Bityarn-UtilitiesAnalysis\venv\lib\site-packages\sktime\forecasting\base\_fh.py:776, in _check_cutoff(cutoff, index)
773 raise ValueError("`cutoff` must be given, but found none.")
775 if isinstance(index, pd.PeriodIndex):
--> 776 assert isinstance(cutoff, pd.Period)
777 assert index.freqstr == cutoff.freqstr
779 if isinstance(index, pd.DatetimeIndex):
AssertionError:
water_data has 49 data points, with a datetime index and floating-point values.
What could be the issue?
I am following the tutorial on the official website.
Could the problem be the data I am passing to the model?
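The failing assertion in the traceback is isinstance(cutoff, pd.Period) inside _check_cutoff: fh was built on a pd.PeriodIndex, but because water_data has a datetime index, the cutoff the forecaster stores at fit time is a pd.Timestamp rather than a pd.Period. The manual fh.to_relative(cutoff) call works only because there a pd.Period is passed explicitly; inside predict, the cutoff comes from the fitted data's index. A minimal sketch of one possible fix, assuming water_data is a pandas Series or DataFrame with a monthly DatetimeIndex, is to convert its index so the two types agree:

import pandas as pd
from sktime.forecasting.base import ForecastingHorizon
from sktime.forecasting.naive import NaiveForecaster

# water_data: the asker's monthly series (defined elsewhere)
water_data = water_data.copy()
water_data.index = water_data.index.to_period("M")  # DatetimeIndex -> PeriodIndex

fh = ForecastingHorizon(
    pd.PeriodIndex(pd.date_range("2023-01-01", periods=6, freq="M")), is_relative=False
)
forecaster = NaiveForecaster(strategy="last", sp=12)
forecaster.fit(water_data)
y_pred = forecaster.predict(fh)  # cutoff is now a pd.Period, so the assertion holds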

Related

Failure to parallelize code trying to load the same numpy array with joblib

I am new to the world of parallelization, and encountered a very odd bug when running, across several cores, a function that loads the same .npy file.
My code is of the form:
import os
from pathlib import Path
import multiprocessing

import numpy as np
from joblib import Parallel, delayed

num_cores = multiprocessing.cpu_count()

mydir = 'path/of/your/choice'
myfile = 'myArray.npy'
mydir = Path(mydir)
myfile = mydir / myfile
os.chdir(mydir)
myarray = np.zeros(12345)
np.save(myfile, myarray)

def foo(myfile, x):
    # function loading myArray and working with it
    arr = np.load(myfile)
    return arr + x

if __name__ == '__main__':
    foo_results = Parallel(n_jobs=num_cores, backend="threading")(
        delayed(foo)(myfile, i) for i in range(10))
In my case, this script would run fine about 40% of the way, then return
--> 17 arr=np.load(mydir/'myArray.npy')
ValueError: cannot reshape array of size 0 into shape (12345,)
What blows my mind is that if I enter %pdb debug mode and actually try to run arr=np.load(mydir/'myArray.npy'), this works! So I assume that the issue stems from all the parallel processes running foo trying to load the same numpy array at the same time (as in debug mode, all the processes are paused and only the code that I execute actually runs).
This very minimal example actually works, presumably because the function is very simple and joblib handles it gracefully, but my real code is too long and complicated to post here. First of all, has anyone encountered a similar issue in the past? If no one manages to identify my issue, I will post my whole script.
Thanks for your help!
-------------------- EDIT ------------------
Given that there doesn't seem to be an easy answer with the toy code I posted, here are the full error logs. I played around with the backends following #psarka's recommendation, and for some reason the following error arises with the default loky backend (again, there is no problem running the code non-parallel):
/media/maxime/ut_data/Dropbox/NeuroPyxels/npyx/corr.py in ccg_stack(dp, U_src, U_trg, cbin, cwin, normalize, all_to_all, name, sav, again, periods)
541
542 ccg_results=Parallel(n_jobs=num_cores)(\
--> 543 delayed(ccg)(*ccg_inputs[i]) for i in tqdm(range(len(ccg_inputs)), desc=f'Computing ccgs over {num_cores} cores'))
544 for ((i1, u1, i2, u2), CCG) in zip(ccg_ids,ccg_results):
545 if i1==i2:
~/miniconda3/envs/npyx/lib/python3.7/site-packages/joblib-1.0.1-py3.7.egg/joblib/parallel.py in __call__(self, iterable)
1052
1053 with self._backend.retrieval_context():
-> 1054 self.retrieve()
1055 # Make sure that we get a last message telling us we are done
1056 elapsed_time = time.time() - self._start_time
~/miniconda3/envs/npyx/lib/python3.7/site-packages/joblib-1.0.1-py3.7.egg/joblib/parallel.py in retrieve(self)
931 try:
932 if getattr(self._backend, 'supports_timeout', False):
--> 933 self._output.extend(job.get(timeout=self.timeout))
934 else:
935 self._output.extend(job.get())
~/miniconda3/envs/npyx/lib/python3.7/site-packages/joblib-1.0.1-py3.7.egg/joblib/_parallel_backends.py in wrap_future_result(future, timeout)
540 AsyncResults.get from multiprocessing."""
541 try:
--> 542 return future.result(timeout=timeout)
543 except CfTimeoutError as e:
544 raise TimeoutError from e
~/miniconda3/envs/npyx/lib/python3.7/concurrent/futures/_base.py in result(self, timeout)
426 raise CancelledError()
427 elif self._state == FINISHED:
--> 428 return self.__get_result()
429
430 self._condition.wait(timeout)
~/miniconda3/envs/npyx/lib/python3.7/concurrent/futures/_base.py in __get_result(self)
382 def __get_result(self):
383 if self._exception:
--> 384 raise self._exception
385 else:
386 return self._result
ValueError: Cannot load file containing pickled data when allow_pickle=False
but a more informative error arises with the threading backend (the one originally used in my question); again, it is possible to actually run train = np.load(Path(dprm,fn)) in debug mode:
/media/maxime/ut_data/Dropbox/NeuroPyxels/npyx/corr.py in ccg_stack(dp, U_src, U_trg, cbin, cwin, normalize, all_to_all, name, sav, again, periods)
541
542 ccg_results=Parallel(n_jobs=num_cores, backend='threading')(\
--> 543 delayed(ccg)(*ccg_inputs[i]) for i in tqdm(range(len(ccg_inputs)), desc=f'Computing ccgs over {num_cores} cores'))
544 for ((i1, u1, i2, u2), CCG) in zip(ccg_ids,ccg_results):
545 if i1==i2:
~/miniconda3/envs/npyx/lib/python3.7/site-packages/joblib-1.0.1-py3.7.egg/joblib/parallel.py in __call__(self, iterable)
1052
1053 with self._backend.retrieval_context():
-> 1054 self.retrieve()
1055 # Make sure that we get a last message telling us we are done
1056 elapsed_time = time.time() - self._start_time
~/miniconda3/envs/npyx/lib/python3.7/site-packages/joblib-1.0.1-py3.7.egg/joblib/parallel.py in retrieve(self)
931 try:
932 if getattr(self._backend, 'supports_timeout', False):
--> 933 self._output.extend(job.get(timeout=self.timeout))
934 else:
935 self._output.extend(job.get())
~/miniconda3/envs/npyx/lib/python3.7/multiprocessing/pool.py in get(self, timeout)
655 return self._value
656 else:
--> 657 raise self._value
658
659 def _set(self, i, obj):
~/miniconda3/envs/npyx/lib/python3.7/multiprocessing/pool.py in worker(inqueue, outqueue, initializer, initargs, maxtasks, wrap_exception)
119 job, i, func, args, kwds = task
120 try:
--> 121 result = (True, func(*args, **kwds))
122 except Exception as e:
123 if wrap_exception and func is not _helper_reraises_exception:
~/miniconda3/envs/npyx/lib/python3.7/site-packages/joblib-1.0.1-py3.7.egg/joblib/_parallel_backends.py in __call__(self, *args, **kwargs)
593 def __call__(self, *args, **kwargs):
594 try:
--> 595 return self.func(*args, **kwargs)
596 except KeyboardInterrupt as e:
597 # We capture the KeyboardInterrupt and reraise it as
~/miniconda3/envs/npyx/lib/python3.7/site-packages/joblib-1.0.1-py3.7.egg/joblib/parallel.py in __call__(self)
261 with parallel_backend(self._backend, n_jobs=self._n_jobs):
262 return [func(*args, **kwargs)
--> 263 for func, args, kwargs in self.items]
264
265 def __reduce__(self):
~/miniconda3/envs/npyx/lib/python3.7/site-packages/joblib-1.0.1-py3.7.egg/joblib/parallel.py in <listcomp>(.0)
261 with parallel_backend(self._backend, n_jobs=self._n_jobs):
262 return [func(*args, **kwargs)
--> 263 for func, args, kwargs in self.items]
264
265 def __reduce__(self):
/media/maxime/ut_data/Dropbox/NeuroPyxels/npyx/corr.py in ccg(dp, U, bin_size, win_size, fs, normalize, ret, sav, verbose, periods, again, trains)
258 if verbose: print("File {} not found in routines memory.".format(fn))
259 crosscorrelograms = crosscorrelate_cyrille(dp, bin_size, win_size, sortedU, fs, True,
--> 260 periods=periods, verbose=verbose, trains=trains)
261 crosscorrelograms = np.asarray(crosscorrelograms, dtype='float64')
262 if crosscorrelograms.shape[0]<len(U): # no spikes were found in this period
/media/maxime/ut_data/Dropbox/NeuroPyxels/npyx/corr.py in crosscorrelate_cyrille(dp, bin_size, win_size, U, fs, symmetrize, periods, verbose, trains)
88 U=list(U)
89
---> 90 spike_times, spike_clusters = make_phy_like_spikeClustersTimes(dp, U, periods=periods, verbose=verbose, trains=trains)
91
92 return crosscorr_cyrille(spike_times, spike_clusters, win_size, bin_size, fs, symmetrize)
/media/maxime/ut_data/Dropbox/NeuroPyxels/npyx/corr.py in make_phy_like_spikeClustersTimes(dp, U, periods, verbose, trains)
46 for iu, u in enumerate(U):
47 # Even lists of strings can be dealt with as integers by being replaced by their indices
---> 48 trains_dic[iu]=trn(dp, u, sav=True, periods=periods, verbose=verbose) # trains in samples
49 else:
50 assert len(trains)>1
/media/maxime/ut_data/Dropbox/NeuroPyxels/npyx/spk_t.py in trn(dp, unit, sav, verbose, periods, again, enforced_rp)
106 if op.exists(Path(dprm,fn)) and not again:
107 if verbose: print("File {} found in routines memory.".format(fn))
--> 108 train = np.load(Path(dprm,fn))
109
110 # if not, compute it
~/miniconda3/envs/npyx/lib/python3.7/site-packages/numpy-1.21.0rc2-py3.7-linux-x86_64.egg/numpy/lib/npyio.py in load(file, mmap_mode, allow_pickle, fix_imports, encoding)
443 # Try a pickle
444 if not allow_pickle:
--> 445 raise ValueError("Cannot load file containing pickled data "
446 "when allow_pickle=False")
447 try:
ValueError: Cannot load file containing pickled data when allow_pickle=False
The original error ValueError: cannot reshape array of size 0 into shape (12345,) doesn't show up anymore for some reason.
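For what it's worth, both tracebacks are consistent with a worker calling np.load on a .npy file that another process is still writing: a truncated file can yield an empty array ("size 0") or a header that numpy misreads as pickled data. A minimal sketch of one possible workaround, assuming the cause really is this read-during-write race; it serializes access with the third-party filelock package (load_cached/save_cached are hypothetical helper names, not part of the asker's code):

import numpy as np
from filelock import FileLock  # pip install filelock

def load_cached(path):
    # Serialize reads and writes of the cached array across workers;
    # the .lock file lives next to the protected .npy file.
    with FileLock(str(path) + ".lock"):
        return np.load(path)

def save_cached(path, arr):
    with FileLock(str(path) + ".lock"):
        np.save(path, arr)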

What is this error with sklearn's skpca.fit when doing a PCA analysis

I am doing a simple PCA analysis with some satellite data. All the land points are removed, and the mean and standard deviation are close to 0 and 1. However, I get an error when fitting the PCA:
from sklearn import preprocessing
scaler = preprocessing.StandardScaler()
scaler_sst = scaler.fit(sss_data)
import joblib
joblib.dump(scaler_sst, './scaler_sst.pkl', compress=9)
scaler_sst = joblib.load('./scaler_sst.pkl')
X = scaler_sst.transform(sss_data)
print(X.mean())
print(X.std())
#X.shape
5.7725416769826885e-15
0.9999999999999993
from sklearn.decomposition import pca

skpca = pca.PCA()
skpca.fit(X)
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
~/miniconda3/envs/py3_std_maps/lib/python3.8/site-packages/IPython/core/formatters.py in __call__(self, obj, include, exclude)
968
969 if method is not None:
--> 970 return method(include=include, exclude=exclude)
971 return None
972 else:
~/miniconda3/envs/py3_std_maps/lib/python3.8/site-packages/sklearn/base.py in _repr_mimebundle_(self, **kwargs)
461 def _repr_mimebundle_(self, **kwargs):
462 """Mime bundle used by jupyter kernels to display estimator"""
--> 463 output = {"text/plain": repr(self)}
464 if get_config()["display"] == 'diagram':
465 output["text/html"] = estimator_html_repr(self)
~/miniconda3/envs/py3_std_maps/lib/python3.8/site-packages/sklearn/base.py in __repr__(self, N_CHAR_MAX)
273
274 # use ellipsis for sequences with a lot of elements
--> 275 pp = _EstimatorPrettyPrinter(
276 compact=True, indent=1, indent_at_name=True,
277 n_max_elements_to_show=N_MAX_ELEMENTS_TO_SHOW)
~/miniconda3/envs/py3_std_maps/lib/python3.8/site-packages/sklearn/utils/_pprint.py in __init__(self, indent, width, depth, stream, compact, indent_at_name, n_max_elements_to_show)
162 if self._indent_at_name:
163 self._indent_per_level = 1 # ignore indent param
--> 164 self._changed_only = get_config()['print_changed_only']
165 # Max number of elements in a list, dict, tuple until we start using
166 # ellipsis. This also affects the number of arguments of an estimators
KeyError: 'print_changed_only'
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
~/miniconda3/envs/py3_std_maps/lib/python3.8/site-packages/IPython/core/formatters.py in __call__(self, obj)
700 type_pprinters=self.type_printers,
701 deferred_pprinters=self.deferred_printers)
--> 702 printer.pretty(obj)
703 printer.flush()
704 return stream.getvalue()
~/miniconda3/envs/py3_std_maps/lib/python3.8/site-packages/IPython/lib/pretty.py in pretty(self, obj)
392 if cls is not object \
393 and callable(cls.__dict__.get('__repr__')):
--> 394 return _repr_pprint(obj, self, cycle)
395
396 return _default_pprint(obj, self, cycle)
~/miniconda3/envs/py3_std_maps/lib/python3.8/site-packages/IPython/lib/pretty.py in _repr_pprint(obj, p, cycle)
698 """A pprint that just redirects to the normal repr function."""
699 # Find newlines and replace them with p.break_()
--> 700 output = repr(obj)
701 lines = output.splitlines()
702 with p.group():
~/miniconda3/envs/py3_std_maps/lib/python3.8/site-packages/sklearn/base.py in __repr__(self, N_CHAR_MAX)
273
274 # use ellipsis for sequences with a lot of elements
--> 275 pp = _EstimatorPrettyPrinter(
276 compact=True, indent=1, indent_at_name=True,
277 n_max_elements_to_show=N_MAX_ELEMENTS_TO_SHOW)
~/miniconda3/envs/py3_std_maps/lib/python3.8/site-packages/sklearn/utils/_pprint.py in __init__(self, indent, width, depth, stream, compact, indent_at_name, n_max_elements_to_show)
162 if self._indent_at_name:
163 self._indent_per_level = 1 # ignore indent param
--> 164 self._changed_only = get_config()['print_changed_only']
165 # Max number of elements in a list, dict, tuple until we start using
166 # ellipsis. This also affects the number of arguments of an estimators
KeyError: 'print_changed_only'
The error occurs at skpca.fit(X). I reinstalled the sklearn and scikit packages. I have used PCA with sklearn before and this has never happened.
I don't know the answer but maybe this is a bug in sklearn:
Try:
import sklearn
sklearn.get_config()
In my case it returns a dict:
{'assume_finite': False, 'working_memory': 1024, 'print_changed_only': False}
The error indicates that print_changed_only does not exist in your config.
My sklearn version is '0.21.2' on python 3.6. Maybe it helps to downgrade the sklearn version?
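If downgrading is not an option, a hedged sketch of a possible workaround, assuming a scikit-learn version whose set_config accepts print_changed_only (0.21 and later): re-set the missing key explicitly and check the config first.

import sklearn

print(sklearn.__version__)
print(sklearn.get_config())  # check whether 'print_changed_only' is present

# If the key is missing, setting it explicitly may restore the estimator repr:
sklearn.set_config(print_changed_only=False)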

Pipeline error (ValueError: Specifying the columns using strings is only supported for pandas DataFrames)

The example is fully reproducible. Here is full notebook (which downloads data too): https://github.com/ageron/handson-ml2/blob/master/02_end_to_end_machine_learning_project.ipynb
After this part in notebook above:
full_pipeline_with_predictor = Pipeline([
    ("preparation", full_pipeline),
    ("linear", LinearRegression())
])
full_pipeline_with_predictor.fit(housing, housing_labels)
full_pipeline_with_predictor.predict(some_data)
I am trying to get predictions on the test set with this code:
X_test_prepared = full_pipeline.transform(X_test)
final_predictions = full_pipeline_with_predictor.predict(X_test_prepared)
But I am receiving error:
C:\Users\Alex\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\compose\_column_transformer.py:430: FutureWarning: Given feature/column names or counts do not match the ones for the data given during fit. This will fail from v0.24.
FutureWarning)
---------------------------------------------------------------------------
Empty Traceback (most recent call last)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\joblib\parallel.py in dispatch_one_batch(self, iterator)
796 try:
--> 797 tasks = self._ready_batches.get(block=False)
798 except queue.Empty:
~\AppData\Local\Continuum\anaconda3\lib\queue.py in get(self, block, timeout)
166 if not self._qsize():
--> 167 raise Empty
168 elif timeout is None:
Empty:
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
<ipython-input-141-dc87b1c9e658> in <module>
5
6 X_test_prepared = full_pipeline.transform(X_test)
----> 7 final_predictions = full_pipeline_with_predictor.predict(X_test_prepared)
8
9 final_mse = mean_squared_error(y_test, final_predictions)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\utils\metaestimators.py in <lambda>(*args, **kwargs)
114
115 # lambda, but not partial, allows help() to work with update_wrapper
--> 116 out = lambda *args, **kwargs: self.fn(obj, *args, **kwargs)
117 # update the docstring of the returned function
118 update_wrapper(out, self.fn)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\pipeline.py in predict(self, X, **predict_params)
417 Xt = X
418 for _, name, transform in self._iter(with_final=False):
--> 419 Xt = transform.transform(Xt)
420 return self.steps[-1][-1].predict(Xt, **predict_params)
421
~\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\compose\_column_transformer.py in transform(self, X)
586
587 self._validate_features(X.shape[1], X_feature_names)
--> 588 Xs = self._fit_transform(X, None, _transform_one, fitted=True)
589 self._validate_output(Xs)
590
~\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\compose\_column_transformer.py in _fit_transform(self, X, y, func, fitted)
455 message=self._log_message(name, idx, len(transformers)))
456 for idx, (name, trans, column, weight) in enumerate(
--> 457 self._iter(fitted=fitted, replace_strings=True), 1))
458 except ValueError as e:
459 if "Expected 2D array, got 1D array instead" in str(e):
~\AppData\Local\Continuum\anaconda3\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
1002 # remaining jobs.
1003 self._iterating = False
-> 1004 if self.dispatch_one_batch(iterator):
1005 self._iterating = self._original_iterator is not None
1006
~\AppData\Local\Continuum\anaconda3\lib\site-packages\joblib\parallel.py in dispatch_one_batch(self, iterator)
806 big_batch_size = batch_size * n_jobs
807
--> 808 islice = list(itertools.islice(iterator, big_batch_size))
809 if len(islice) == 0:
810 return False
~\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\compose\_column_transformer.py in <genexpr>(.0)
454 message_clsname='ColumnTransformer',
455 message=self._log_message(name, idx, len(transformers)))
--> 456 for idx, (name, trans, column, weight) in enumerate(
457 self._iter(fitted=fitted, replace_strings=True), 1))
458 except ValueError as e:
~\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\utils\__init__.py in _safe_indexing(X, indices, axis)
404 if axis == 1 and indices_dtype == 'str' and not hasattr(X, 'loc'):
405 raise ValueError(
--> 406 "Specifying the columns using strings is only supported for "
407 "pandas DataFrames"
408 )
ValueError: Specifying the columns using strings is only supported for pandas DataFrames
Question: How can I correct that error? And why does that error happen?
Since your final pipeline:
full_pipeline_with_predictor = Pipeline([
    ("preparation", full_pipeline),
    ("linear", LinearRegression())
])
already contains full_pipeline, you should not "prepare" X_test again; doing so "prepares" X_test twice, which is wrong. So your code should simply be
final_predictions = full_pipeline_with_predictor.predict(X_test)
exactly as it is for getting predictions for some_data, i.e.
full_pipeline_with_predictor.predict(some_data)
where you correctly do not "prepare" some_data before feeding it into the final pipeline.
The whole point of using pipelines is exactly this: to avoid running fit and predict separately for each of possibly several preparation steps by wrapping all of them into a single pipeline. You correctly apply this process when you predict some_data, but you somehow seem to have forgotten it in the next step, when you try to predict X_test.
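As a minimal self-contained illustration of this principle (toy data and a toy preparation step, not the notebook's actual pipeline):

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Toy data standing in for the housing set
X = pd.DataFrame({"rooms": [2, 3, 4, 5], "ocean": ["NEAR", "FAR", "FAR", "NEAR"]})
y = [100, 150, 200, 260]

prep = ColumnTransformer([
    ("num", StandardScaler(), ["rooms"]),
    ("cat", OneHotEncoder(), ["ocean"]),
])
model = Pipeline([("preparation", prep), ("linear", LinearRegression())])
model.fit(X, y)

# Predict on the *raw* test frame; the pipeline applies "preparation" itself.
X_test = pd.DataFrame({"rooms": [3], "ocean": ["FAR"]})
print(model.predict(X_test))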

category_encoders: TargetEncoder error "TypeError: Categorical cannot perform the operation mean"

I'm getting the below error when I try to target encode a categorical column.
"TypeError: Categorical cannot perform the operation mean"
When I run similar code through a Jupyter notebook it works fine, but when I run it as part of a Python file it errors out with the above message.
I know this sounds a bit crazy, but I am not able to understand what's going on in the background.
Error:
TypeError Traceback (most recent call last)
<ipython-input-4-9ba53c6b7375> in <module>()
----> 1 tpmodeller.initialize()
~/SageMaker/tp/tp_kvr.py in initialize(self)
127
128 # Target Encode Te_cat_col Features
--> 129 df_cat_te = target_encode_bin(self.train[self.Te_cat_col], self.train['vol_trmnt_in_4_quarters'])
130 self.train = pd.concat([self.train, df_cat_te], axis=1)
131
~/SageMaker/tp/tp_kvr.py in target_encode_bin(df_te, target)
366 te = TargetEncoder(smoothing = 1, min_samples_leaf = 5, handle_unknown='ignore')
--> 367 df_te = te.fit_transform(df_te, target)
368 #
369 # Binning and then placing it in {col}_bin feature
~/anaconda3/envs/python3/lib/python3.6/site-packages/category_encoders/target_encoder.py in fit_transform(self, X, y, **fit_params)
249 transform(X)
250
--> 251 return self.fit(X, y, **fit_params).transform(X, y)
252
~/anaconda3/envs/python3/lib/python3.6/site-packages/category_encoders/target_encoder.py in fit(self, X, y, **kwargs)
138 self.ordinal_encoder = self.ordinal_encoder.fit(X)
139 X_ordinal = self.ordinal_encoder.transform(X)
--> 140 self.mapping = self.fit_target_encoding(X_ordinal, y)
141
142 X_temp = self.transform(X, override_return_df=True)
~/anaconda3/envs/python3/lib/python3.6/site-packages/category_encoders/target_encoder.py in fit_target_encoding(self, X, y)
164 values = switch.get('mapping')
165
--> 166 prior = self._mean = y.mean()
167
168 stats = y.groupby(X[col]).agg(['count', 'mean'])
~/anaconda3/envs/python3/lib/python3.6/site-packages/pandas/core/generic.py in stat_func(self, axis, skipna, level, numeric_only, **kwargs)
10954 skipna=skipna)
10955 return self._reduce(f, name, axis=axis, skipna=skipna,
> 10956 numeric_only=numeric_only)
10957
10958 return set_function_name(stat_func, name, cls)
~/anaconda3/envs/python3/lib/python3.6/site-packages/pandas/core/series.py in _reduce(self, op, name, axis, skipna, numeric_only, filter_type, **kwds)
3614 # TODO deprecate numeric_only argument for Categorical and use
3615 # skipna as well, see GH25303
-> 3616 return delegate._reduce(name, numeric_only=numeric_only, **kwds)
3617 elif isinstance(delegate, ExtensionArray):
3618 # dispatch to ExtensionArray interface
~/anaconda3/envs/python3/lib/python3.6/site-packages/pandas/core/arrays/categorical.py in _reduce(self, name, axis, **kwargs)
2170 if func is None:
2171 msg = 'Categorical cannot perform the operation {op}'
-> 2172 raise TypeError(msg.format(op=name))
2173 return func(**kwargs)
2174
TypeError: Categorical cannot perform the operation mean
Your target feature needs to be converted to a numeric type.
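A minimal sketch reproducing and fixing the error on toy data (not the asker's columns); casting the Categorical target to a numeric dtype before fitting the encoder is enough:

import pandas as pd
from category_encoders import TargetEncoder

X = pd.DataFrame({"cat_col": ["a", "b", "a", "b"]})
y = pd.Series(pd.Categorical([0, 1, 1, 0]))  # Categorical target triggers the error

y_numeric = y.astype("int64")  # cast the target to a numeric dtype first

te = TargetEncoder(smoothing=1, min_samples_leaf=5)
print(te.fit_transform(X, y_numeric))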

Error running Theil-Sen Regression in Python

I have a dataframe similar to the following, which we'll call "df":
id  value  time
a   1      1
a   1.5    2
a   2      3
a   2.5    4
b   1      1
b   1.5    2
b   2      3
b   2.5    4
I am running various regressions by "id" in Python on this dataframe. Generally, this requires a grouping by "id" and then applying a function to those groupings that calculates the regression.
I am working with 2 similar regression techniques in Scipy's stats library:
Theil-Sen estimator:
(https://docs.scipy.org/doc/scipy-0.15.1/reference/generated/scipy.stats.mstats.theilslopes.html)
Siegel estimator:
(https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.siegelslopes.html).
Both of these take the same type of data. Therefore the function to calculate them should be the same aside from the actual technique used.
For Theil-Sen, I wrote the following function and the groupby statement that would be applied to that function:
def theil_reg(df, xcol, ycol):
    model = stats.theilslopes(ycol, xcol)
    return pd.Series(model)

out = df.groupby('id').apply(theil_reg, xcol='time', ycol='value')
However, I get the following error, which I've been having the hardest time understanding how to address:
ValueError: could not convert string to float: 'time'
The actual variable time is a numpy float object, so it isn't a string. This makes me believe that stats.theilslopes is not recognizing that time is a column in the dataframe, and is instead using 'time' as a string input to the function.
If that's the case, then this seems to be a bug in stats.theilslopes that would need to be addressed by Scipy. The reason I believe this is that the exact same function as above, but using siegelslopes instead, works perfectly fine and produces the output I'm expecting, and the two are essentially the same estimation with the same inputs.
Doing the following on Siegel:
def siegel_reg(df, xcol, ycol):
    model = stats.siegelslopes(ycol, xcol)
    return pd.Series(model)

out = df.groupby('id').apply(siegel_reg, xcol='time', ycol='value')
Does not create any errors about the time variable and conducts the regression as needed.
Does anyone have thoughts on whether I'm missing something here? If so I would appreciate any thoughts, or if not, any thoughts on how to address this with Scipy.
Edit: here is the full error message that shows up when I run this script:
ValueError Traceback (most recent call last)
C:\Anaconda\lib\site-packages\pandas\core\groupby\groupby.py in apply(self, func, *args, **kwargs)
688 try:
--> 689 result = self._python_apply_general(f)
690 except Exception:
C:\Anaconda\lib\site-packages\pandas\core\groupby\groupby.py in _python_apply_general(self, f)
706 keys, values, mutated = self.grouper.apply(f, self._selected_obj,
--> 707 self.axis)
708
C:\Anaconda\lib\site-packages\pandas\core\groupby\ops.py in apply(self, f, data, axis)
189 group_axes = _get_axes(group)
--> 190 res = f(group)
191 if not _is_indexed_like(res, group_axes):
C:\Anaconda\lib\site-packages\pandas\core\groupby\groupby.py in f(g)
678 with np.errstate(all='ignore'):
--> 679 return func(g, *args, **kwargs)
680 else:
<ipython-input-506-0a1696f0aecd> in theil_reg(df, xcol, ycol)
1 def theil_reg(df, xcol, ycol):
----> 2 model = stats.theilslopes(ycol,xcol)
3 return pd.Series(model)
C:\Anaconda\lib\site-packages\scipy\stats\_stats_mstats_common.py in
theilslopes(y, x, alpha)
221 else:
--> 222 x = np.array(x, dtype=float).flatten()
223 if len(x) != len(y):
ValueError: could not convert string to float: 'time'
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
<ipython-input-507-9a199e0ce924> in <module>
----> 1 df_accel_correct.groupby('chart').apply(theil_reg, xcol='time',
ycol='value')
C:\Anaconda\lib\site-packages\pandas\core\groupby\groupby.py in apply(self, func, *args, **kwargs)
699
700 with _group_selection_context(self):
--> 701 return self._python_apply_general(f)
702
703 return result
C:\Anaconda\lib\site-packages\pandas\core\groupby\groupby.py in _python_apply_general(self, f)
705 def _python_apply_general(self, f):
706 keys, values, mutated = self.grouper.apply(f,
self._selected_obj,
--> 707 self.axis)
708
709 return self._wrap_applied_output(
C:\Anaconda\lib\site-packages\pandas\core\groupby\ops.py in apply(self, f, data, axis)
188 # group might be modified
189 group_axes = _get_axes(group)
--> 190 res = f(group)
191 if not _is_indexed_like(res, group_axes):
192 mutated = True
C:\Anaconda\lib\site-packages\pandas\core\groupby\groupby.py in f(g)
677 def f(g):
678 with np.errstate(all='ignore'):
--> 679 return func(g, *args, **kwargs)
680 else:
681 raise ValueError('func must be a callable if args or '
<ipython-input-506-0a1696f0aecd> in theil_reg(df, xcol, ycol)
1 def theil_reg(df, xcol, ycol):
----> 2 model = stats.theilslopes(ycol,xcol)
3 return pd.Series(model)
C:\Anaconda\lib\site-packages\scipy\stats\_stats_mstats_common.py in theilslopes(y, x, alpha)
220 x = np.arange(len(y), dtype=float)
221 else:
--> 222 x = np.array(x, dtype=float).flatten()
223 if len(x) != len(y):
224 raise ValueError("Incompatible lengths ! (%s<>%s)" % (len(y), len(x)))
ValueError: could not convert string to float: 'time'
Update 2: after referencing df inside the function, I received the following error message:
ValueError Traceback (most recent call last)
C:\Anaconda\lib\site-packages\pandas\core\groupby\groupby.py in apply(self, func, *args, **kwargs)
688 try:
--> 689 result = self._python_apply_general(f)
690 except Exception:
C:\Anaconda\lib\site-packages\pandas\core\groupby\groupby.py in _python_apply_general(self, f)
706 keys, values, mutated = self.grouper.apply(f, self._selected_obj,
--> 707 self.axis)
708
C:\Anaconda\lib\site-packages\pandas\core\groupby\ops.py in apply(self, f, data, axis)
189 group_axes = _get_axes(group)
--> 190 res = f(group)
191 if not _is_indexed_like(res, group_axes):
C:\Anaconda\lib\site-packages\pandas\core\groupby\groupby.py in f(g)
678 with np.errstate(all='ignore'):
--> 679 return func(g, *args, **kwargs)
680 else:
<ipython-input-563-5db69048f347> in theil_reg(df, xcol, ycol)
1 def theil_reg(df, xcol, ycol):
----> 2 model = stats.theilslopes(df[ycol],df[xcol])
3 return pd.Series(model)
C:\Anaconda\lib\site-packages\scipy\stats\_stats_mstats_common.py in theilslopes(y, x, alpha)
248 sigma = np.sqrt(sigsq)
--> 249 Ru = min(int(np.round((nt - z*sigma)/2.)), len(slopes)-1)
250 Rl = max(int(np.round((nt + z*sigma)/2.)) - 1, 0)
ValueError: cannot convert float NaN to integer
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
<ipython-input-564-d7794bd1d495> in <module>
----> 1 correct_theil = df_accel_correct.groupby('chart').apply(theil_reg, xcol='time', ycol='value')
C:\Anaconda\lib\site-packages\pandas\core\groupby\groupby.py in apply(self, func, *args, **kwargs)
699
700 with _group_selection_context(self):
--> 701 return self._python_apply_general(f)
702
703 return result
C:\Anaconda\lib\site-packages\pandas\core\groupby\groupby.py in _python_apply_general(self, f)
705 def _python_apply_general(self, f):
706 keys, values, mutated = self.grouper.apply(f, self._selected_obj,
--> 707 self.axis)
708
709 return self._wrap_applied_output(
C:\Anaconda\lib\site-packages\pandas\core\groupby\ops.py in apply(self, f, data, axis)
188 # group might be modified
189 group_axes = _get_axes(group)
--> 190 res = f(group)
191 if not _is_indexed_like(res, group_axes):
192 mutated = True
C:\Anaconda\lib\site-packages\pandas\core\groupby\groupby.py in f(g)
677 def f(g):
678 with np.errstate(all='ignore'):
--> 679 return func(g, *args, **kwargs)
680 else:
681 raise ValueError('func must be a callable if args or '
<ipython-input-563-5db69048f347> in theil_reg(df, xcol, ycol)
1 def theil_reg(df, xcol, ycol):
----> 2 model = stats.theilslopes(df[ycol],df[xcol])
3 return pd.Series(model)
C:\Anaconda\lib\site-packages\scipy\stats\_stats_mstats_common.py in theilslopes(y, x, alpha)
247 # Find the confidence interval indices in `slopes`
248 sigma = np.sqrt(sigsq)
--> 249 Ru = min(int(np.round((nt - z*sigma)/2.)), len(slopes)-1)
250 Rl = max(int(np.round((nt + z*sigma)/2.)) - 1, 0)
251 delta = slopes[[Rl, Ru]]
ValueError: cannot convert float NaN to integer
However, I have no null values in either column, and both columns are floats. Any suggestions on this error?
Essentially, you are passing the string names of the columns (not any actual values) into the methods, but the slopes calls require numpy arrays (or pandas Series that can be coerced into arrays). Specifically, you are attempting this call with no reference to df, hence your error:
model = stats.theilslopes('value', 'time')
Simply reference df in the calls:
model = stats.theilslopes(df['value'], df['time'])
model = stats.theilslopes(df[ycol], df[xcol])
Different results across packages do not indicate bugs in Scipy; packages simply run different implementations, so read the docs carefully to see how to call each method. Possibly the other package you refer to accepts a data argument inside the call, with named strings referencing its columns, like below:
slopes_call(y='y_string', x='x_string', data=df)
In general, the Python object model requires explicit references to objects in calls and does not assume context from surrounding data structures.
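Putting it together, a minimal runnable sketch of the corrected groupby-apply on toy data shaped like the df above:

import pandas as pd
from scipy import stats

df = pd.DataFrame({
    "id": ["a"] * 4 + ["b"] * 4,
    "value": [1, 1.5, 2, 2.5] * 2,
    "time": [1, 2, 3, 4] * 2,
})

def theil_reg(g, xcol, ycol):
    # Pass the column values, not the column names
    slope, intercept, lo_slope, up_slope = stats.theilslopes(g[ycol], g[xcol])
    return pd.Series({"slope": slope, "intercept": intercept,
                      "lo_slope": lo_slope, "up_slope": up_slope})

out = df.groupby("id").apply(theil_reg, xcol="time", ycol="value")
print(out)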
