sklearn image.PatchExtractor in a pipeline throws ValueError - python

I want to train my image classifier on extracted patches, so I developed the following pipeline, but I keep getting a ValueError and I'm not sure how to solve it:
mlp_pipeline = pipeline.make_pipeline(
    image.PatchExtractor(),
    DictVectorizer(),
    preprocessing.MinMaxScaler(**hyperparameters),
    neural_network.MLPClassifier(**hyperparameters))
When I attempt to fit the model:
_ = mlp_pipeline.fit(train_features, train_target)
I get the following error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Input In [18], in <cell line: 1>()
----> 1 _ = mlp_pipeline.fit(train_features, train_target)
3 _predi = mlp_pipeline.predict(train_features)
5 _report = metrics.classification_report(
6 train_target,
7 _predi,
8 )
File ~/anaconda3/envs/DSA/lib/python3.10/site-packages/sklearn/pipeline.py:378, in Pipeline.fit(self, X, y, **fit_params)
352 """Fit the model.
353
354 Fit all the transformers one after the other and transform the
(...)
375 Pipeline with fitted steps.
376 """
377 fit_params_steps = self._check_fit_params(**fit_params)
--> 378 Xt = self._fit(X, y, **fit_params_steps)
379 with _print_elapsed_time("Pipeline", self._log_message(len(self.steps) - 1)):
380 if self._final_estimator != "passthrough":
File ~/anaconda3/envs/DSA/lib/python3.10/site-packages/sklearn/pipeline.py:336, in Pipeline._fit(self, X, y, **fit_params_steps)
334 cloned_transformer = clone(transformer)
335 # Fit or load from cache the current transformer
--> 336 X, fitted_transformer = fit_transform_one_cached(
337 cloned_transformer,
338 X,
339 y,
340 None,
341 message_clsname="Pipeline",
342 message=self._log_message(step_idx),
343 **fit_params_steps[name],
344 )
345 # Replace the transformer of the step with the fitted
346 # transformer. This is necessary when loading the transformer
347 # from the cache.
348 self.steps[step_idx] = (name, fitted_transformer)
File ~/anaconda3/envs/DSA/lib/python3.10/site-packages/joblib/memory.py:349, in NotMemorizedFunc.__call__(self, *args, **kwargs)
348 def __call__(self, *args, **kwargs):
--> 349 return self.func(*args, **kwargs)
File ~/anaconda3/envs/DSA/lib/python3.10/site-packages/sklearn/pipeline.py:872, in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
870 res = transformer.fit_transform(X, y, **fit_params)
871 else:
--> 872 res = transformer.fit(X, y, **fit_params).transform(X)
874 if weight is None:
875 return res, transformer
File ~/anaconda3/envs/DSA/lib/python3.10/site-packages/sklearn/feature_extraction/image.py:551, in PatchExtractor.transform(self, X)
532 """Transform the image samples in `X` into a matrix of patch data.
533
534 Parameters
(...)
548 number of patches that can be extracted.
549 """
550 self.random_state = check_random_state(self.random_state)
--> 551 n_images, i_h, i_w = X.shape[:3]
552 X = np.reshape(X, (n_images, i_h, i_w, -1))
553 n_channels = X.shape[-1]
ValueError: not enough values to unpack (expected 3, got 2)
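For context, the failing line in the traceback (n_images, i_h, i_w = X.shape[:3]) shows what PatchExtractor expects: a batch of images, i.e. an array with at least three dimensions, (n_samples, image_height, image_width), optionally followed by a channel axis. A flat 2-D feature matrix cannot be unpacked that way, which is exactly the "expected 3, got 2" message. A minimal sketch of the expected input, using a hypothetical batch of 32x32 RGB images:

import numpy as np
from sklearn.feature_extraction import image

# hypothetical batch of 10 RGB images: (n_samples, height, width, channels)
X_images = np.random.rand(10, 32, 32, 3)

pe = image.PatchExtractor(patch_size=(8, 8), max_patches=5, random_state=0)
patches = pe.fit(X_images).transform(X_images)
print(patches.shape)  # (50, 8, 8, 3): 5 patches from each of the 10 images

If train_features is a flattened 2-D matrix, it would need to be reshaped back to (n_images, height, width[, channels]) before the PatchExtractor step.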

Related

Confusion matrix visualization: Specifying the columns using strings is only supported for pandas DataFrames

Based on this post: https://towardsdatascience.com/how-to-plot-a-confusion-matrix-from-a-k-fold-cross-validation-b607317e9874, I want to visualize the confusion matrix of my dataset. But I get the following error message:
AttributeError Traceback (most recent call last)
/usr/local/lib/python3.7/dist-packages/sklearn/utils/__init__.py in _get_column_indices(X, key)
408 try:
--> 409 all_columns = X.columns
410 except AttributeError:
AttributeError: 'numpy.ndarray' object has no attribute 'columns'
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
<ipython-input-76-827f7c5dca49> in <module>()
1 # call the functions
----> 2 actual_classes, predicted_classes, _ = cross_val_predict(best_grid, kf, X.to_numpy(), y.to_numpy())
3 plot_confusion_matrix(actual_classes, predicted_classes, ['Rating_1', 'Rating_2', 'Rating_3', 'Rating_4', 'Rating_5'])
<ipython-input-68-f1bff3bd9ebf> in cross_val_predict(best_grid, kf, X, y)
19 actual_classes = np.append(actual_classes, test_y)
20
---> 21 model_.fit(train_X, train_y)
22 predicted_classes = np.append(predicted_classes, model_.predict(test_X))
23
/usr/local/lib/python3.7/dist-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
388 """
389 fit_params_steps = self._check_fit_params(**fit_params)
--> 390 Xt = self._fit(X, y, **fit_params_steps)
391 with _print_elapsed_time("Pipeline", self._log_message(len(self.steps) - 1)):
392 if self._final_estimator != "passthrough":
/usr/local/lib/python3.7/dist-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params_steps)
353 message_clsname="Pipeline",
354 message=self._log_message(step_idx),
--> 355 **fit_params_steps[name],
356 )
357 # Replace the transformer of the step with the fitted
/usr/local/lib/python3.7/dist-packages/joblib/memory.py in __call__(self, *args, **kwargs)
347
348 def __call__(self, *args, **kwargs):
--> 349 return self.func(*args, **kwargs)
350
351 def call_and_shelve(self, *args, **kwargs):
/usr/local/lib/python3.7/dist-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
891 with _print_elapsed_time(message_clsname, message):
892 if hasattr(transformer, "fit_transform"):
--> 893 res = transformer.fit_transform(X, y, **fit_params)
894 else:
895 res = transformer.fit(X, y, **fit_params).transform(X)
/usr/local/lib/python3.7/dist-packages/sklearn/compose/_column_transformer.py in fit_transform(self, X, y)
670 self._check_n_features(X, reset=True)
671 self._validate_transformers()
--> 672 self._validate_column_callables(X)
673 self._validate_remainder(X)
674
/usr/local/lib/python3.7/dist-packages/sklearn/compose/_column_transformer.py in _validate_column_callables(self, X)
350 columns = columns(X)
351 all_columns.append(columns)
--> 352 transformer_to_input_indices[name] = _get_column_indices(X, columns)
353
354 self._columns = all_columns
/usr/local/lib/python3.7/dist-packages/sklearn/utils/__init__.py in _get_column_indices(X, key)
410 except AttributeError:
411 raise ValueError(
--> 412 "Specifying the columns using strings is only "
413 "supported for pandas DataFrames"
414 )
ValueError: Specifying the columns using strings is only supported for pandas DataFrames
Here is my code:
def cross_val_predict(best_grid, kf: KFold, X: np.array, y: np.array) -> Tuple[np.array, np.array, np.array]:
    model_ = cp.deepcopy(best_grid)
    no_classes = len(np.unique(y))
    actual_classes = np.empty([0], dtype=int)
    predicted_classes = np.empty([0], dtype=int)
    predicted_proba = np.empty([0, no_classes])
    for train_idx, test_idx in kf.split(X):
        train_X, train_y, test_X, test_y = X[train_idx], y[train_idx], X[test_idx], y[test_idx]
        actual_classes = np.append(actual_classes, test_y)
        model_.fit(train_X, train_y)
        predicted_classes = np.append(predicted_classes, model_.predict(test_X))
        try:
            predicted_proba = np.append(predicted_proba, model_.predict_proba(test_X), axis=0)
        except:
            predicted_proba = np.append(predicted_proba, np.zeros((len(test_X), no_classes), dtype=float), axis=0)
    return actual_classes, predicted_classes, predicted_proba

# Visualise the Confusion Matrix
def plot_confusion_matrix(actual_classes: np.array, predicted_classes: np.array, sorted_labels: list):
    matrix = confusion_matrix(actual_classes, predicted_classes, labels=sorted_labels)
    plt.figure(figsize=(12.8, 6))
    sns.heatmap(matrix, annot=True, xticklabels=sorted_labels, yticklabels=sorted_labels, cmap='Blues', fmt='g')
    plt.xlabel('Predicted'); plt.ylabel('Actual'); plt.title('Confusion Matrix Visualization')
    plt.show()

# call the functions
actual_classes, predicted_classes, _ = cross_val_predict(best_grid, kf, X.to_numpy(), y.to_numpy())
plot_confusion_matrix(actual_classes, predicted_classes, ['Rating_1', 'Rating_2', 'Rating_3', 'Rating_4', 'Rating_5'])
The error message appeared when I ran the code that calls the functions. Thanks in advance.
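A hedged fix, going by the error text alone: best_grid evidently contains a ColumnTransformer that selects columns by name, and string column selection only works when X is a pandas DataFrame. Keeping X as a DataFrame and splitting the folds positionally with .iloc inside cross_val_predict avoids the NumPy conversion (a sketch under that assumption, reusing the names from the question):

# inside cross_val_predict, split positionally but keep the DataFrame,
# so string column selection in best_grid's ColumnTransformer still works
for train_idx, test_idx in kf.split(X):
    train_X, test_X = X.iloc[train_idx], X.iloc[test_idx]
    train_y, test_y = y.iloc[train_idx], y.iloc[test_idx]

# and call the helper with the pandas objects, not their NumPy copies:
actual_classes, predicted_classes, _ = cross_val_predict(best_grid, kf, X, y)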

Feature-Engine RareLabelEncoder: ValueError: could not convert string to float: 'Rare'

from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler
from feature_engine.encoding import RareLabelEncoder
from feature_engine.encoding import MeanEncoder
from feature_engine.encoding import OrdinalEncoder
from feature_engine.transformation import BoxCoxTransformer, PowerTransformer

high_card_cols = ['brand', 'model', 'location']
cat_cols = ['fuel_type', 'transmission', 'is_first_owner']

processor = make_column_transformer(
    (RareLabelEncoder(n_categories=9), ['brand', 'model']),  # to group rare categorical observations
    (MeanEncoder(), high_card_cols),  # to encode categorical observations with the target mean
    (OrdinalEncoder(), cat_cols),  # to encode low-cardinality variables
    (PowerTransformer(), ['milage_kmpl']),  # transform continuous variables using an exponential transformation
    (BoxCoxTransformer(), ['kilometers_driven', 'engine', 'power']),  # transform continuous variables using Box-Cox
    remainder="passthrough"
)
I am currently working on a regression task. I have 2 categorical columns with high cardinality and rare observations, so I created a pipeline that applies RareLabelEncoder followed by MeanEncoder and other encoders.
When I try to fit a simple linear regression model, I get the following error:
ValueError: could not convert string to float: 'Rare'
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression

lr_pipe = make_pipeline(
    processor,
    StandardScaler(),
    LinearRegression()
)
lr_pipe.fit(X_train, y_train.price)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-91-1c31eaf7c59a> in <module>
8 )
9
---> 10 lr_pipe.fit(X_train, y_train.price)
~\anaconda3\lib\site-packages\sklearn\pipeline.py in fit(self, X, y, **fit_params)
339 """
340 fit_params_steps = self._check_fit_params(**fit_params)
--> 341 Xt = self._fit(X, y, **fit_params_steps)
342 with _print_elapsed_time('Pipeline',
343 self._log_message(len(self.steps) - 1)):
~\anaconda3\lib\site-packages\sklearn\pipeline.py in _fit(self, X, y, **fit_params_steps)
301 cloned_transformer = clone(transformer)
302 # Fit or load from cache the current transformer
--> 303 X, fitted_transformer = fit_transform_one_cached(
304 cloned_transformer, X, y, None,
305 message_clsname='Pipeline',
~\anaconda3\lib\site-packages\joblib\memory.py in __call__(self, *args, **kwargs)
350
351 def __call__(self, *args, **kwargs):
--> 352 return self.func(*args, **kwargs)
353
354 def call_and_shelve(self, *args, **kwargs):
~\anaconda3\lib\site-packages\sklearn\pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
752 with _print_elapsed_time(message_clsname, message):
753 if hasattr(transformer, 'fit_transform'):
--> 754 res = transformer.fit_transform(X, y, **fit_params)
755 else:
756 res = transformer.fit(X, y, **fit_params).transform(X)
~\anaconda3\lib\site-packages\sklearn\base.py in fit_transform(self, X, y, **fit_params)
700 else:
701 # fit method of arity 2 (supervised transformation)
--> 702 return self.fit(X, y, **fit_params).transform(X)
703
704
~\anaconda3\lib\site-packages\sklearn\preprocessing\_data.py in fit(self, X, y, sample_weight)
728 # Reset internal state before fitting
729 self._reset()
--> 730 return self.partial_fit(X, y, sample_weight)
731
732 def partial_fit(self, X, y=None, sample_weight=None):
~\anaconda3\lib\site-packages\sklearn\preprocessing\_data.py in partial_fit(self, X, y, sample_weight)
764 """
765 first_call = not hasattr(self, "n_samples_seen_")
--> 766 X = self._validate_data(X, accept_sparse=('csr', 'csc'),
767 estimator=self, dtype=FLOAT_DTYPES,
768 force_all_finite='allow-nan', reset=first_call)
~\anaconda3\lib\site-packages\sklearn\base.py in _validate_data(self, X, y, reset, validate_separately, **check_params)
419 out = X
420 elif isinstance(y, str) and y == 'no_validation':
--> 421 X = check_array(X, **check_params)
422 out = X
423 else:
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
61 extra_args = len(args) - len(all_args)
62 if extra_args <= 0:
---> 63 return f(*args, **kwargs)
64
65 # extra_args > 0
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)
671 array = array.astype(dtype, casting="unsafe", copy=False)
672 else:
--> 673 array = np.asarray(array, order=order, dtype=dtype)
674 except ComplexWarning as complex_warning:
675 raise ValueError("Complex data not supported\n"
~\anaconda3\lib\site-packages\numpy\core\_asarray.py in asarray(a, dtype, order)
81
82 """
---> 83 return array(a, dtype, copy=False, order=order)
84
85
ValueError: could not convert string to float: 'Rare'
How can I overcome this issue?
The beauty of Feature-engine transformers is that you can select the variables directly in the transformer, so there is no need to use sklearn's ColumnTransformer at all. You can place all Feature-engine transformers directly within a Pipeline.
lr_pipe = make_pipeline(
    RareLabelEncoder(n_categories=9, variables=['brand', 'model']),
    MeanEncoder(variables=high_card_cols),
    OrdinalEncoder(variables=cat_cols),
    # etc.
    StandardScaler(),
    LinearRegression()
)
lr_pipe.fit(X_train, y_train.price)
Update:
I managed to solve the problem the following way: I added RareLabelEncoder to the pipeline rather than to the ColumnTransformer. This solved the issue for me.
lr_pipe = make_pipeline(
    RareLabelEncoder(0.002, variables=['brand', 'model']),
    nontree_processor,
    StandardScaler(),
    LinearRegression()
)
ColumnTransformer applies its transformers in parallel, so the brand column actually comes out of the processor twice: once with rare labels grouped but not otherwise encoded (throwing the error), and again mean-encoded (but with the rare groups getting different values). You can use pipelines to get around that:
cat_cols = ['fuel_type', 'transmission', 'is_first_owner']

brandmodel_pipe = make_pipeline(
    RareLabelEncoder(n_categories=9),
    MeanEncoder(),
)

processor = make_column_transformer(
    (brandmodel_pipe, ['brand', 'model']),
    (MeanEncoder(), ['location']),
    (OrdinalEncoder(), cat_cols),
    (PowerTransformer(), ['milage_kmpl']),
    (BoxCoxTransformer(), ['kilometers_driven', 'engine', 'power']),
    remainder="passthrough"
)

scikit-learn FeatureUnion not working to combine text & numeric features

I'm trying to combine a textual column of movie plots I have in a dataset with a categorical column of each movie's rating (the MPAA rating: G, PG, PG-13, R; not an IMDb user score). I'm using sklearn's FeatureUnion object, but I keep getting an error about the fit_transform method being called with too many positional arguments. Here's my code:
# create training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    movie_ratings[['Genre', 'Plot']],
    pd.get_dummies(movie_ratings['Rated']),
    random_state=56)

''' create a processing pipeline and feature union '''
# create function transformers
get_genre_data = FunctionTransformer(lambda x: x['Genre'], validate=False)
get_plot_data = FunctionTransformer(lambda x: x['Plot'], validate=False)

# obtain the data
genres = get_genre_data.fit_transform(movie_ratings)
plots = get_plot_data.fit_transform(movie_ratings)

# join the processing in a feature union
join_data_formats = FeatureUnion(
    transformer_list=[
        ('genres', Pipeline([
            ('selector', get_genre_data),
            ('one_hot_encoder', LabelEncoder())
        ])),
        ('plots', Pipeline([
            ('selector', get_plot_data),
            ('count_vectorizer', CountVectorizer(tokenizer=nltk.tokenize)),
            ('tfidf_transformer', TfidfTransformer())
        ]))
    ]
)

# instantiate a nested pipeline
pipeline = Pipeline([
    ('feature_union', join_data_formats),
    ('neural_network', MLPClassifier(alpha=0.01, hidden_layer_sizes=(100,), early_stopping=False, verbose=True))
])

# fit the pipeline to the training data
pipeline.fit(X_train, y_train)
...and the error being thrown is:
34 # # fit the pipeline to the training data
---> 35 pipeline.fit(X_train, y_train)
...
TypeError: fit_transform() takes 2 positional arguments but 3 were given
Where am I going wrong? Thanks much for the help!
UPDATE: here's the full stack trace:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-171-f57d9b24a9c8> in <module>()
28 # print(y_test.shape)
29
---> 30 pipeline.fit(X_train, y_train)
31 y_pred = pipeline.predict(X_test)
32
~\Anaconda3\lib\site-packages\sklearn\pipeline.py in fit(self, X, y, **fit_params)
246 This estimator
247 """
--> 248 Xt, fit_params = self._fit(X, y, **fit_params)
249 if self._final_estimator is not None:
250 self._final_estimator.fit(Xt, y, **fit_params)
~\Anaconda3\lib\site-packages\sklearn\pipeline.py in _fit(self, X, y, **fit_params)
211 Xt, fitted_transformer = fit_transform_one_cached(
212 cloned_transformer, None, Xt, y,
--> 213 **fit_params_steps[name])
214 # Replace the transformer of the step with the fitted
215 # transformer. This is necessary when loading the transformer
~\Anaconda3\lib\site-packages\sklearn\externals\joblib\memory.py in __call__(self, *args, **kwargs)
360
361 def __call__(self, *args, **kwargs):
--> 362 return self.func(*args, **kwargs)
363
364 def call_and_shelve(self, *args, **kwargs):
~\Anaconda3\lib\site-packages\sklearn\pipeline.py in _fit_transform_one(transformer, weight, X, y, **fit_params)
579 **fit_params):
580 if hasattr(transformer, 'fit_transform'):
--> 581 res = transformer.fit_transform(X, y, **fit_params)
582 else:
583 res = transformer.fit(X, y, **fit_params).transform(X)
~\Anaconda3\lib\site-packages\sklearn\pipeline.py in fit_transform(self, X, y, **fit_params)
737 delayed(_fit_transform_one)(trans, weight, X, y,
738 **fit_params)
--> 739 for name, trans, weight in self._iter())
740
741 if not result:
~\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self, iterable)
777 # was dispatched. In particular this covers the edge
778 # case of Parallel used with an exhausted iterator.
--> 779 while self.dispatch_one_batch(iterator):
780 self._iterating = True
781 else:
~\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in dispatch_one_batch(self, iterator)
623 return False
624 else:
--> 625 self._dispatch(tasks)
626 return True
627
~\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in _dispatch(self, batch)
586 dispatch_timestamp = time.time()
587 cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 588 job = self._backend.apply_async(batch, callback=cb)
589 self._jobs.append(job)
590
~\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in apply_async(self, func, callback)
109 def apply_async(self, func, callback=None):
110 """Schedule a func to be run"""
--> 111 result = ImmediateResult(func)
112 if callback:
113 callback(result)
~\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in __init__(self, batch)
330 # Don't delay the application, to avoid keeping the input
331 # arguments in memory
--> 332 self.results = batch()
333
334 def get(self):
~\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
~\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in <listcomp>(.0)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
~\Anaconda3\lib\site-packages\sklearn\pipeline.py in _fit_transform_one(transformer, weight, X, y, **fit_params)
579 **fit_params):
580 if hasattr(transformer, 'fit_transform'):
--> 581 res = transformer.fit_transform(X, y, **fit_params)
582 else:
583 res = transformer.fit(X, y, **fit_params).transform(X)
~\Anaconda3\lib\site-packages\sklearn\pipeline.py in fit_transform(self, X, y, **fit_params)
281 Xt, fit_params = self._fit(X, y, **fit_params)
282 if hasattr(last_step, 'fit_transform'):
--> 283 return last_step.fit_transform(Xt, y, **fit_params)
284 elif last_step is None:
285 return Xt
TypeError: fit_transform() takes 2 positional arguments but 3 were given
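The last frame pinpoints the problem: LabelEncoder.fit_transform is defined as fit_transform(self, y), so when the FeatureUnion's inner Pipeline calls fit_transform(X, y) it passes one positional argument too many. LabelEncoder is meant for target labels, not feature columns. A hedged sketch of the genres branch using OneHotEncoder instead (this assumes a scikit-learn version whose OneHotEncoder accepts string categories, and a hypothetical selector that returns a 2-D frame, which OneHotEncoder requires):

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder

# hypothetical selector: double brackets keep the result 2-D for the encoder
get_genre_2d = FunctionTransformer(lambda x: x[['Genre']], validate=False)

join_data_formats = FeatureUnion(transformer_list=[
    ('genres', Pipeline([
        ('selector', get_genre_2d),
        ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore')),
    ])),
    # the 'plots' branch can stay exactly as in the question
])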

AttributeError: 'NoneType' object has no attribute 'lower' python using spacy

Here is the full error message:
AttributeError Traceback (most recent call last)
in <module>()
24
25 # train
---> 26 pipe.fit(train1, labelsTrain1)
27
28 # test
C:\Users\mcichonski\AppData\Local\Continuum\anaconda3\envs\py27\lib\site-packages\sklearn\pipeline.pyc
in fit(self, X, y, **fit_params)
246 This estimator
247 """
--> 248 Xt, fit_params = self._fit(X, y, **fit_params)
249 if self._final_estimator is not None:
250 self._final_estimator.fit(Xt, y, **fit_params)
C:\Users\mcichonski\AppData\Local\Continuum\anaconda3\envs\py27\lib\site-packages\sklearn\pipeline.pyc
in _fit(self, X, y, **fit_params)
211 Xt, fitted_transformer = fit_transform_one_cached(
212 cloned_transformer, None, Xt, y,
--> 213 **fit_params_steps[name])
214 # Replace the transformer of the step with the fitted
215 # transformer. This is necessary when loading the transformer
C:\Users\mcichonski\AppData\Local\Continuum\anaconda3\envs\py27\lib\site-packages\sklearn\externals\joblib\memory.pyc
in __call__(self, *args, **kwargs)
360
361 def __call__(self, *args, **kwargs):
--> 362 return self.func(*args, **kwargs)
363
364 def call_and_shelve(self, *args, **kwargs):
C:\Users\mcichonski\AppData\Local\Continuum\anaconda3\envs\py27\lib\site-packages\sklearn\pipeline.pyc
in _fit_transform_one(transformer, weight, X, y, **fit_params)
579 **fit_params):
580 if hasattr(transformer, 'fit_transform'):
--> 581 res = transformer.fit_transform(X, y, **fit_params)
582 else:
583 res = transformer.fit(X, y, **fit_params).transform(X)
C:\Users\mcichonski\AppData\Local\Continuum\anaconda3\envs\py27\lib\site-packages\sklearn\feature_extraction\text.pyc
in fit_transform(self, raw_documents, y)
867
868 vocabulary, X = self._count_vocab(raw_documents,
--> 869 self.fixed_vocabulary_)
870
871 if self.binary:
C:\Users\mcichonski\AppData\Local\Continuum\anaconda3\envs\py27\lib\site-packages\sklearn\feature_extraction\text.pyc
in _count_vocab(self, raw_documents, fixed_vocab)
790 for doc in raw_documents:
791 feature_counter = {}
--> 792 for feature in analyze(doc):
793 try:
794 feature_idx = vocabulary[feature]
C:\Users\mcichonski\AppData\Local\Continuum\anaconda3\envs\py27\lib\site-packages\sklearn\feature_extraction\text.pyc
in <lambda>(doc)
264
265 return lambda doc: self._word_ngrams(
--> 266 tokenize(preprocess(self.decode(doc))), stop_words)
267
268 else:
C:\Users\mcichonski\AppData\Local\Continuum\anaconda3\envs\py27\lib\site-packages\sklearn\feature_extraction\text.pyc
in <lambda>(x)
230
231 if self.lowercase:
--> 232 return lambda x: strip_accents(x.lower())
233 else:
234 return strip_accents
AttributeError: 'NoneType' object has no attribute 'lower'
Here is the code:
def printNMostInformative(vectorizer, clf, N):
    feature_names = vectorizer.get_feature_names()
    coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))
    topClass1 = coefs_with_fns[:N]
    topClass2 = coefs_with_fns[:-(N + 1):-1]
    print("Class 1 best: ")
    for feat in topClass1:
        print(feat)
    print("Class 2 best: ")
    for feat in topClass2:
        print(feat)

vectorizer = CountVectorizer(tokenizer=tokenizeText, ngram_range=(1, 1))
clf = LinearSVC()
pipe = Pipeline([('cleanText', CleanTextTransformer()), ('vectorizer', vectorizer), ('clf', clf)])

# data
train1 = train['Title'].tolist()
labelsTrain1 = train['Conference'].tolist()
test1 = test['Title'].tolist()
labelsTest1 = test['Conference'].tolist()

# train
pipe.fit(train1, labelsTrain1)

# test
preds = pipe.predict(test1)
print("accuracy:", accuracy_score(labelsTest1, preds))
print("Top 10 features used to predict: ")
printNMostInformative(vectorizer, clf, 10)

pipe = Pipeline([('cleanText', CleanTextTransformer()), ('vectorizer', vectorizer)])
transform = pipe.fit_transform(train1, labelsTrain1)
vocab = vectorizer.get_feature_names()
for i in range(len(train1)):
    s = ""
    indexIntoVocab = transform.indices[transform.indptr[i]:transform.indptr[i+1]]
    numOccurences = transform.data[transform.indptr[i]:transform.indptr[i+1]]
    for idx, num in zip(indexIntoVocab, numOccurences):
        s += str((vocab[idx], num))
It looks like it has something to do with the train1 data, but I'm not sure how to fix it. This is after cleaning the data; I'm now trying to use this function to print out the most informative features, the ones with the highest coefficients.
For those looking for more information: this is based on a tutorial, https://towardsdatascience.com/machine-learning-for-text-classification-using-spacy-in-python-b276b4051a49. I was also getting the same error.
The problem is the cleanText() function, which did not return anything for the pipeline to work with, hence the NoneType traceback:
def cleanText(text):
    text = text.strip().replace("\n", " ").replace("\r", " ")
    text = text.lower()
If you add return text, it should fix the error:
def cleanText(text):
    text = text.strip().replace("\n", " ").replace("\r", " ")
    text = text.lower()
    return text
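For completeness, the tutorial's CleanTextTransformer is essentially a thin wrapper that maps cleanText over each document; a minimal sketch of that pattern (a hypothetical reconstruction, not the tutorial's exact code) shows why the missing return broke the vectorizer downstream:

from sklearn.base import BaseEstimator, TransformerMixin

class CleanTextTransformer(BaseEstimator, TransformerMixin):
    """Applies cleanText to every document in X."""

    def fit(self, X, y=None):
        return self  # stateless: nothing to learn

    def transform(self, X):
        # if cleanText returns None, every document becomes None here,
        # and CountVectorizer's x.lower() call fails exactly as in the traceback
        return [cleanText(text) for text in X]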

How to use numeric indices in cross-validation with pd.DataFrame in scikit-learn (disable _safe_split)?

I want to disable safe_indexing and force the indices that I've given my model.
I can't simply do X.values and y.values because I have a custom classifier in which the column/attribute labels are used during __init__ (they are crucial for the algorithm).
The error comes from the following line of code:
model_selection.cross_val_score(model, X=X, y=y, cv=cv, n_jobs=1, scoring="accuracy")
where cv is a list of lists of numeric indices.
X has to be a pd.DataFrame and cv has to be predefined indices. How can I make this work?
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-74-e1775ca32abb> in <module>()
1 smc.fit(X,y)
----> 2 smc.cross_validate(X,y,cv=cv, n_jobs=1)
<ipython-input-72-61f814fd075c> in cross_validate(self, X, y, cv, scoring, n_jobs, **args)
150 cv_idx.append((idx_tr.map(lambda x:X.index.get_loc(x)), idx_te.map(lambda x:X.index.get_loc(x))))
151 cv = cv_idx
--> 152 return model_selection.cross_val_score(self, X=X, y=y, cv=cv, n_jobs=n_jobs, scoring=scoring, **args)
~/anaconda/envs/python3/lib/python3.6/site-packages/sklearn/model_selection/_validation.py in cross_val_score(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch)
340 n_jobs=n_jobs, verbose=verbose,
341 fit_params=fit_params,
--> 342 pre_dispatch=pre_dispatch)
343 return cv_results['test_score']
344
~/anaconda/envs/python3/lib/python3.6/site-packages/sklearn/model_selection/_validation.py in cross_validate(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, return_train_score)
204 fit_params, return_train_score=return_train_score,
205 return_times=True)
--> 206 for train, test in cv.split(X, y, groups))
207
208 if return_train_score:
~/anaconda/envs/python3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
777 # was dispatched. In particular this covers the edge
778 # case of Parallel used with an exhausted iterator.
--> 779 while self.dispatch_one_batch(iterator):
780 self._iterating = True
781 else:
~/anaconda/envs/python3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)
623 return False
624 else:
--> 625 self._dispatch(tasks)
626 return True
627
~/anaconda/envs/python3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch)
586 dispatch_timestamp = time.time()
587 cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 588 job = self._backend.apply_async(batch, callback=cb)
589 self._jobs.append(job)
590
~/anaconda/envs/python3/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in apply_async(self, func, callback)
109 def apply_async(self, func, callback=None):
110 """Schedule a func to be run"""
--> 111 result = ImmediateResult(func)
112 if callback:
113 callback(result)
~/anaconda/envs/python3/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in __init__(self, batch)
330 # Don't delay the application, to avoid keeping the input
331 # arguments in memory
--> 332 self.results = batch()
333
334 def get(self):
~/anaconda/envs/python3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
~/anaconda/envs/python3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
~/anaconda/envs/python3/lib/python3.6/site-packages/sklearn/model_selection/_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, error_score)
446 start_time = time.time()
447
--> 448 X_train, y_train = _safe_split(estimator, X, y, train)
449 X_test, y_test = _safe_split(estimator, X, y, test, train)
450
~/anaconda/envs/python3/lib/python3.6/site-packages/sklearn/utils/metaestimators.py in _safe_split(estimator, X, y, indices, train_indices)
198 X_subset = X[np.ix_(indices, train_indices)]
199 else:
--> 200 X_subset = safe_indexing(X, indices)
201
202 if y is not None:
~/anaconda/envs/python3/lib/python3.6/site-packages/sklearn/utils/__init__.py in safe_indexing(X, indices)
144 if hasattr(X, "iloc"):
145 # Work-around for indexing with read-only indices in pandas
--> 146 indices = indices if indices.flags.writeable else indices.copy()
147 # Pandas Dataframes and Series
148 try:
AttributeError: 'list' object has no attribute 'flags'
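A hedged workaround, based on the final frames of the traceback: safe_indexing checks indices.flags, an attribute that NumPy arrays have but plain Python lists do not. Converting each fold's train/test index lists to arrays should get past the AttributeError, while X can stay a DataFrame, because safe_indexing indexes pandas objects with .iloc:

import numpy as np

# convert each (train, test) pair from plain lists to NumPy integer arrays
cv = [(np.asarray(train_idx), np.asarray(test_idx)) for train_idx, test_idx in cv]

model_selection.cross_val_score(model, X=X, y=y, cv=cv, n_jobs=1, scoring="accuracy")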
In response to the suggestions in the comments (2018-June-04):
