Custom Transformer and FeatureUnion for word2vec - python

I am trying to classify a set of text documents using multiple sets of features. I am using sklearn's FeatureUnion to combine different features for fitting into a single model. One of the feature sets consists of word embeddings from gensim's word2vec.
import numpy as np
from gensim.models.word2vec import Word2Vec
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
data = fetch_20newsgroups(subset='train', categories=categories)  # dummy dataset

# Word2Vec expects tokenized sentences, so split each document into tokens
w2v_model = Word2Vec([doc.split() for doc in data.data], size=100, window=5, min_count=5, workers=2)
word2vec = {w: vec for w, vec in zip(w2v_model.wv.index2word, w2v_model.wv.syn0)}  # dictionary of word embeddings

feat_select = SelectKBest(score_func=chi2, k=10)  # other features
TSVD = TruncatedSVD(n_components=50, algorithm="randomized", n_iter=5)  # other features
In order to include transformers/estimators not already available in sklearn, I am attempting to wrap my word2vec results into a custom transformer class that returns the vector averages.
class w2vTransformer(TransformerMixin):
    """
    Wrapper class for running word2vec in pipelines and FeatureUnions
    """
    def __init__(self, word2vec, **kwargs):
        self.word2vec = word2vec
        self.kwargs = kwargs
        self.dim = len(next(iter(word2vec.values())))  # dimensionality of the embedding vectors

    def fit(self, x, y=None):
        return self

    def transform(self, X):
        return np.array([
            np.mean([self.word2vec[w] for w in words if w in self.word2vec]
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])
However, when it comes time to fit the model, I receive an error.
combined_features = FeatureUnion([("w2v_class", w2vTransformer(word2vec)),
                                  ("feat", feat_select),
                                  ("TSVD", TSVD)])  # join features into combined_features
# combined_features = FeatureUnion([("feat", feat_select), ("TSVD", TSVD)])  # runs when word embeddings are not included
text_clf_svm = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('feature_selection', combined_features),
                         ('clf-svm', SGDClassifier(loss="modified_huber")),
                         ])
text_clf_svm_1 = text_clf_svm.fit(data.data, data.target)  # fits data
Traceback (most recent call last):
File "<ipython-input-8-a085b7d40f8f>", line 1, in <module>
text_clf_svm_1 = text_clf_svm.fit(data.data,data.target) # fits data
File "C:\Users\rlusk\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\pipeline.py", line 248, in fit
Xt, fit_params = self._fit(X, y, **fit_params)
File "C:\Users\rlusk\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\pipeline.py", line 213, in _fit
**fit_params_steps[name])
File "C:\Users\rlusk\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\externals\joblib\memory.py", line 362, in __call__
return self.func(*args, **kwargs)
File "C:\Users\rlusk\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\pipeline.py", line 581, in _fit_transform_one
res = transformer.fit_transform(X, y, **fit_params)
File "C:\Users\rlusk\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\pipeline.py", line 739, in fit_transform
for name, trans, weight in self._iter())
File "C:\Users\rlusk\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 779, in __call__
while self.dispatch_one_batch(iterator):
File "C:\Users\rlusk\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 625, in dispatch_one_batch
self._dispatch(tasks)
File "C:\Users\rlusk\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 588, in _dispatch
job = self._backend.apply_async(batch, callback=cb)
File "C:\Users\rlusk\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py", line 111, in apply_async
result = ImmediateResult(func)
File "C:\Users\rlusk\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py", line 332, in __init__
self.results = batch()
File "C:\Users\rlusk\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 131, in __call__
return [func(*args, **kwargs) for func, args, kwargs in self.items]
File "C:\Users\rlusk\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 131, in <listcomp>
return [func(*args, **kwargs) for func, args, kwargs in self.items]
File "C:\Users\rlusk\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\pipeline.py", line 581, in _fit_transform_one
res = transformer.fit_transform(X, y, **fit_params)
File "C:\Users\rlusk\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\base.py", line 520, in fit_transform
return self.fit(X, y, **fit_params).transform(X)
File "<ipython-input-6-cbc52cd420cd>", line 16, in transform
for words in X
File "<ipython-input-6-cbc52cd420cd>", line 16, in <listcomp>
for words in X
File "<ipython-input-6-cbc52cd420cd>", line 14, in <listcomp>
np.mean([self.word2vec[w] for w in words if w in self.word2vec]
TypeError: unhashable type: 'csr_matrix'
I understand that the error occurs because the variable "words" is a csr_matrix, whereas it needs to be an iterable such as a list. My question is: how do I modify the transformer class or the data so that I can use the word embeddings as features to feed into the FeatureUnion? This is my first SO post, please be gentle.

Instead of your custom transformer, you can avoid the bug by using the scikit-learn API provided directly by Gensim: https://radimrehurek.com/gensim/sklearn_api/w2vmodel.html
Also, it depends on your version of Gensim, but in my case I could solve the same bug by indexing the wv attribute of the word2vec object instead of indexing the object itself. In the transform method of your w2vTransformer class, use
self.word2vec.wv[w]
instead of
self.word2vec[w]
Hope it helps!
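For reference, a minimal sketch of the first suggestion, using Gensim's own scikit-learn wrapper (Gensim 3.x API; the whitespace tokenization is my assumption, since the wrapper expects tokenized sentences rather than raw strings):
from gensim.sklearn_api import W2VTransformer

# Gensim 3.x wrapper that trains word2vec behind a fit/transform interface.
# Note that transform() maps individual words to vectors, so averaging the
# vectors per document still needs a custom step like the class above.
w2v = W2VTransformer(size=100, window=5, min_count=5, seed=1)
tokenized = [doc.split() for doc in data.data]  # naive whitespace tokenization
w2v.fit(tokenized)
vec = w2v.transform(["space"])  # shape (1, 100): the embedding for one word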

Related

Simultaneous feature selection and hyperparameter tuning

I'm trying to conduct both hyperparameter tuning and feature selection on a sklearn SVC model.
I tried the below code, but am getting an error which I have included.
import numpy as np
from sklearn.experimental import enable_halving_search_cv  # noqa: needed before importing HalvingGridSearchCV
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

clf = Pipeline([('anova', SelectPercentile(f_classif)),
                ('svc', SVC(probability=True))])

score_means = list()
score_params = list()
percentiles = (1, 3, 6, 10, 15, 20, 30, 40, 60, 80, 100)

params = {
    "C": np.logspace(-3, 17, 21),
    "gamma": np.logspace(-20, 1, 21),
    'class_weight': [None, 'balanced']
}

halving_search = HalvingGridSearchCV(estimator=clf,
                                     param_grid=params,
                                     scoring='neg_brier_score',
                                     factor=2,
                                     verbose=2,
                                     cv=2)

for percentile in percentiles:
    clf.set_params(anova__percentile=percentile)
    this_scores = halving_search.fit(x_train, y_train)
    score_means.append(this_scores.best_score_)
    score_params.append(this_scores.best_params_)
Running the pipeline code with a cross_val_score separate from the HalvingGridSearchCV works, but I want to conduct both feature selection and hyperparameter tuning to find which combination of features and hyperparameters produces the best model.
When I run the above code, I get the following error:
Traceback (most recent call last):
File "<ipython-input-83-cf714445297c>", line 4, in <module>
this_scores = halving_search.fit(x_train, y_train)
File "C:\Users\fredd\Anaconda3\lib\site-packages\sklearn\model_selection\_search_successive_halving.py", line 213, in fit
super().fit(X, y=y, groups=None, **fit_params)
File "C:\Users\fredd\Anaconda3\lib\site-packages\sklearn\utils\validation.py", line 63, in inner_f
return f(*args, **kwargs)
File "C:\Users\fredd\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py", line 841, in fit
self._run_search(evaluate_candidates)
File "C:\Users\fredd\Anaconda3\lib\site-packages\sklearn\model_selection\_search_successive_halving.py", line 320, in _run_search
more_results=more_results)
File "C:\Users\fredd\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py", line 809, in evaluate_candidates
enumerate(cv.split(X, y, groups))))
File "C:\Users\fredd\Anaconda3\lib\site-packages\joblib\parallel.py", line 1041, in __call__
if self.dispatch_one_batch(iterator):
File "C:\Users\fredd\Anaconda3\lib\site-packages\joblib\parallel.py", line 859, in dispatch_one_batch
self._dispatch(tasks)
File "C:\Users\fredd\Anaconda3\lib\site-packages\joblib\parallel.py", line 777, in _dispatch
job = self._backend.apply_async(batch, callback=cb)
File "C:\Users\fredd\Anaconda3\lib\site-packages\joblib\_parallel_backends.py", line 208, in apply_async
result = ImmediateResult(func)
File "C:\Users\fredd\Anaconda3\lib\site-packages\joblib\_parallel_backends.py", line 572, in __init__
self.results = batch()
File "C:\Users\fredd\Anaconda3\lib\site-packages\joblib\parallel.py", line 263, in __call__
for func, args, kwargs in self.items]
File "C:\Users\fredd\Anaconda3\lib\site-packages\joblib\parallel.py", line 263, in <listcomp>
for func, args, kwargs in self.items]
File "C:\Users\fredd\Anaconda3\lib\site-packages\sklearn\utils\fixes.py", line 222, in __call__
return self.function(*args, **kwargs)
File "C:\Users\fredd\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 581, in _fit_and_score
estimator = estimator.set_params(**cloned_parameters)
File "C:\Users\fredd\Anaconda3\lib\site-packages\sklearn\pipeline.py", line 150, in set_params
self._set_params('steps', **kwargs)
File "C:\Users\fredd\Anaconda3\lib\site-packages\sklearn\utils\metaestimators.py", line 54, in _set_params
super().set_params(**params)
File "C:\Users\fredd\Anaconda3\lib\site-packages\sklearn\base.py", line 233, in set_params
(key, self))
ValueError: Invalid parameter C for estimator Pipeline(steps=[('anova', SelectPercentile(percentile=1)),
('svc', SVC(probability=True))]). Check the list of available parameters with `estimator.get_params().keys()`.
It reads as if the halving search is trying to set C on the Pipeline itself.
You want to perform a grid search over a Pipeline object. When defining the parameters for the different steps of the pipeline, you have to use the <step>__<parameter> syntax:
params = {
    "svc__C": np.logspace(-3, 17, 21),
    "svc__gamma": np.logspace(-20, 1, 21),
    "svc__class_weight": [None, 'balanced']
}
See the user guide for more information.
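As the error message itself suggests, you can list the exact parameter names the pipeline accepts before building the grid. A quick sketch, reusing the clf pipeline defined above:
# Grid keys must match these names exactly, including the step prefix.
for name in sorted(clf.get_params().keys()):
    print(name)
# prints (among others): anova__percentile, svc__C, svc__class_weight, svc__gamma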

Sklearn: not receiving transformer parameters while trying to crossvalidate pipeline

I am trying to use the cross_validate function on a pipeline. The pipeline works correctly if I train it normally with fit, but I get an error when I use cross_validate: the parameters I pass to the transformer in the pipeline are NoneType. Why is this the case, and how could I fix it? I tried to create a minimal example here:
from sklearn.model_selection import cross_validate
from simpletransformers.classification import ClassificationModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_20newsgroups
import pandas as pd

class Transformer(BaseEstimator, TransformerMixin):
    def __init__(self, model, mtype, mname, num_labels: int):
        self._model = model
        self._inst_model = None
        self._num_labels = num_labels
        self._type = mtype
        self._name = mname

    def fit(self, train_input, y=None):
        self._create_model()
        self._train_model(train_input)
        return self

    def transform(self, eval_df, y=None):
        result, model_outputs, wrong_predictions = self._inst_model.eval_model(eval_df=eval_df)
        return model_outputs

    def _create_model(self):
        self._inst_model = self._model(self._type, self._name, args={"output_dir": 'min_ex'},
                                       num_labels=self._num_labels)

    def _train_model(self, train_input):
        train_df, eval_df = train_test_split(train_input, test_size=0.20)
        return self._inst_model.train_model(train_df, eval_df=eval_df)
if __name__ == '__main__':
    categories = ['sci.med', 'sci.space']
    X_t, y_t = fetch_20newsgroups(random_state=1,
                                  subset='train',
                                  categories=categories,
                                  remove=('footers', 'quotes'),
                                  return_X_y=True)
    X = pd.DataFrame({
        'text': X_t,
        'labels': y_t,
    })
    y = y_t
    transformer_grid = {
        "model": ClassificationModel,
        "num_labels": 14,
        "mtype": "electra",
        "mname": "german-nlp-group/electra-base-german-uncased"
    }
    classifier_grid = {
        'n_estimators': 100,
        'random_state': 42
    }
    pipe = Pipeline([
        ('feats', FeatureUnion([
            ('transformer', Pipeline([
                ('transformer', Transformer(**transformer_grid)),
            ])),
        ])),
        ('classifier', RandomForestClassifier(**classifier_grid))
    ])
    # pipe.fit(X, y)
    cv_results = cross_validate(pipe, X, y, cv=5, scoring='accuracy', n_jobs=1)
And the error I am getting is this
2021-03-04 16:05:54.861544: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1
/root/complex_semantics/lib/python3.8/site-packages/sklearn/base.py:209: FutureWarning: From version 0.24, get_params will raise an AttributeError if a parameter cannot be retrieved as an instance attribute. Previously it would return None.
warnings.warn('From version 0.24, get_params will raise an '
/root/complex_semantics/lib/python3.8/site-packages/sklearn/model_selection/_validation.py:548: FitFailedWarning: Estimator fit failed. The score on this train-test partition for these parameters will be set to nan. Details:
Traceback (most recent call last):
File "/root/complex_semantics/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 531, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "/root/complex_semantics/lib/python3.8/site-packages/sklearn/pipeline.py", line 330, in fit
Xt = self._fit(X, y, **fit_params_steps)
File "/root/complex_semantics/lib/python3.8/site-packages/sklearn/pipeline.py", line 292, in _fit
X, fitted_transformer = fit_transform_one_cached(
File "/root/complex_semantics/lib/python3.8/site-packages/joblib/memory.py", line 352, in __call__
return self.func(*args, **kwargs)
File "/root/complex_semantics/lib/python3.8/site-packages/sklearn/pipeline.py", line 740, in _fit_transform_one
res = transformer.fit_transform(X, y, **fit_params)
File "/root/complex_semantics/lib/python3.8/site-packages/sklearn/pipeline.py", line 953, in fit_transform
results = self._parallel_func(X, y, fit_params, _fit_transform_one)
File "/root/complex_semantics/lib/python3.8/site-packages/sklearn/pipeline.py", line 978, in _parallel_func
return Parallel(n_jobs=self.n_jobs)(delayed(func)(
File "/root/complex_semantics/lib/python3.8/site-packages/joblib/parallel.py", line 1029, in __call__
if self.dispatch_one_batch(iterator):
File "/root/complex_semantics/lib/python3.8/site-packages/joblib/parallel.py", line 847, in dispatch_one_batch
self._dispatch(tasks)
File "/root/complex_semantics/lib/python3.8/site-packages/joblib/parallel.py", line 765, in _dispatch
job = self._backend.apply_async(batch, callback=cb)
File "/root/complex_semantics/lib/python3.8/site-packages/joblib/_parallel_backends.py", line 208, in apply_async
result = ImmediateResult(func)
File "/root/complex_semantics/lib/python3.8/site-packages/joblib/_parallel_backends.py", line 572, in __init__
self.results = batch()
File "/root/complex_semantics/lib/python3.8/site-packages/joblib/parallel.py", line 252, in __call__
return [func(*args, **kwargs)
File "/root/complex_semantics/lib/python3.8/site-packages/joblib/parallel.py", line 252, in <listcomp>
return [func(*args, **kwargs)
File "/root/complex_semantics/lib/python3.8/site-packages/sklearn/pipeline.py", line 740, in _fit_transform_one
res = transformer.fit_transform(X, y, **fit_params)
File "/root/complex_semantics/lib/python3.8/site-packages/sklearn/pipeline.py", line 376, in fit_transform
return last_step.fit_transform(Xt, y, **fit_params_last_step)
File "/root/complex_semantics/lib/python3.8/site-packages/sklearn/base.py", line 693, in fit_transform
return self.fit(X, y, **fit_params).transform(X)
File "min_ex.py", line 23, in fit
self._create_model()
File "min_ex.py", line 32, in _create_model
self._inst_model = self._model(self._type, self._name, args={"output_dir": 'min_ex'},
TypeError: 'NoneType' object is not callable
warnings.warn("Estimator fit failed. The score on this train-test"
Edit:
I changed the code to use another variable for when the model is instantiated; the issue is still the same. I took out print statements for easier reading. The only thing missing for the code to run is the loading of data. It still gives me the same error, and only with cross_validate.
Edit 2:
Created a minimal reproducible example by adding a synthetic dataset.
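No answer is recorded here, but the FutureWarning in the log ("get_params will raise an AttributeError if a parameter cannot be retrieved as an instance attribute. Previously it would return None.") points at a plausible cause: cross_validate clones the estimator for each fold, and BaseEstimator.get_params looks up each __init__ parameter as an attribute of the same name. The class above stores model under self._model, so the clone is rebuilt with model=None, which would explain the 'NoneType' object is not callable error. A sketch of the attribute-naming convention sklearn's clone() expects (my reconstruction, not an answer from the thread):
from sklearn.base import BaseEstimator, TransformerMixin

class Transformer(BaseEstimator, TransformerMixin):
    def __init__(self, model, mtype, mname, num_labels: int):
        # clone()/get_params() retrieve each __init__ parameter via
        # getattr(self, <parameter name>), so store them under identical names.
        self.model = model
        self.mtype = mtype
        self.mname = mname
        self.num_labels = num_labels
        self._inst_model = None  # derived state may still use a private name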

Custom estimator can't be deepcopied by cross_val_score

I have a custom estimator that I implemented myself, and I am not able to use cross_val_score() on it, which I believe has something to do with my predict() method. Here is the full error trace:
Traceback (most recent call last):
File "/Users/joann/Desktop/Implementações ML/Adaboost Classifier/test.py", line 30, in <module>
ada2_score = cross_val_score(ada_2, X, y, cv=5)
File "/Users/joann/opt/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 390, in cross_val_score
error_score=error_score)
File "/Users/joann/opt/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 236, in cross_validate
for train, test in cv.split(X, y, groups))
File "/Users/joann/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py", line 1004, in __call__
if self.dispatch_one_batch(iterator):
File "/Users/joann/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py", line 835, in dispatch_one_batch
self._dispatch(tasks)
File "/Users/joann/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py", line 754, in _dispatch
job = self._backend.apply_async(batch, callback=cb)
File "/Users/joann/opt/anaconda3/lib/python3.7/site-packages/joblib/_parallel_backends.py", line 209, in apply_async
result = ImmediateResult(func)
File "/Users/joann/opt/anaconda3/lib/python3.7/site-packages/joblib/_parallel_backends.py", line 590, in __init__
self.results = batch()
File "/Users/joann/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py", line 256, in __call__
for func, args, kwargs in self.items]
File "/Users/joann/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py", line 256, in <listcomp>
for func, args, kwargs in self.items]
File "/Users/joann/opt/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 544, in _fit_and_score
test_scores = _score(estimator, X_test, y_test, scorer)
File "/Users/joann/opt/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 591, in _score
scores = scorer(estimator, X_test, y_test)
File "/Users/joann/opt/anaconda3/lib/python3.7/site-packages/sklearn/metrics/_scorer.py", line 89, in __call__
score = scorer(estimator, *args, **kwargs)
File "/Users/joann/opt/anaconda3/lib/python3.7/site-packages/sklearn/metrics/_scorer.py", line 371, in _passthrough_scorer
return estimator.score(*args, **kwargs)
File "/Users/joann/Desktop/Implementações ML/Adaboost Classifier/Adaboost.py", line 92, in score
scr_pred = self.predict(X)
File "/Users/joann/Desktop/Implementações ML/Adaboost Classifier/Adaboost.py", line 73, in predict
clf_pred = clf.predict(X)
File "/Users/joann/opt/anaconda3/lib/python3.7/site-packages/sklearn_extensions/extreme_learning_machines/elm.py", line 614, in predict
class_predictions = self.binarizer.inverse_transform(raw_predictions)
File "/Users/joann/opt/anaconda3/lib/python3.7/site-packages/sklearn/preprocessing/_label.py", line 528, in inverse_transform
self.classes_, threshold)
File "/Users/joann/opt/anaconda3/lib/python3.7/site-packages/sklearn/preprocessing/_label.py", line 750, in _inverse_binarize_thresholding
format(y.shape))
ValueError: output_type='binary', but y.shape = (30, 3)
My predict(self, X) method returns a vector of size n_samples with the predictions for the X parameter. I also wrote a score() method as follows:
def score(self, X, y):
    scr_pred = self.predict(X)
    return sum(scr_pred == y) / X.shape[0]
This method simply computes the accuracy of the model on the given samples. It does not work whether I use this score() method or set cross_val_score(..., scoring="accuracy").
Note: I am aware of this question/answer, but it doesn't apply to my case because I can confirm the consistency of my constructor:
def __init__(self, estimators=["MLP"], n_rounds=5, random_state=10):
    self.estimators = estimators
    self.n_rounds = n_rounds
    self.random_state = random_state
UPDATE:
Further research led me to this topic, where it is explained that sklearn cannot deepcopy estimators with transformers. However, it is mandatory for my estimator to run LabelBinarizer to transform the data to get the predictions, so I have updated the question title to reflect the proper issue.
Your problem statement is not entirely clear, but looking at the error it seems you are attempting a multiclass classification.
The problem is that at some point the preprocessing has not been done correctly: the error is raised from _inverse_binarize_thresholding, due to the following check in sklearn's preprocessing code:
def _inverse_binarize_thresholding(y, output_type, classes, threshold):
    if output_type == "binary" and y.ndim == 2 and y.shape[1] > 2:
        raise ValueError("output_type='binary', but y.shape = {0}".
                         format(y.shape))
There must be some missing transformation or preprocessing in your code, and you have to use LabelBinarizer correctly.
Go through the documentation below and backtrack the error to fix your code:
documentation
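To make the failure mode concrete, here is a small hypothetical demonstration of the check quoted above: a LabelBinarizer fitted on two classes has output_type='binary' and rejects an (n, 3) prediction matrix, while one fitted on three classes accepts it:
import numpy as np
from sklearn.preprocessing import LabelBinarizer

raw = np.random.rand(30, 3)  # e.g. one score column per class

lb_binary = LabelBinarizer().fit([0, 1])
# lb_binary.inverse_transform(raw)  # ValueError: output_type='binary', but y.shape = (30, 3)

lb_multi = LabelBinarizer().fit([0, 1, 2])
labels = lb_multi.inverse_transform(raw)  # works: argmax over the 3 columns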

RandomizedSearchCV example from "Machine Learning with Python and H2O" manual not working

I'm a bit puzzled, since I can't get the last example from the "Machine Learning with Python and H2O" manual (page 36) working.
Here's the code:
import h2o
h2o.init()
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.transforms.preprocessing import H2OScaler
from h2o.cross_validation import H2OKFold
from h2o.model.regression import h2o_r2_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics.scorer import make_scorer

h2o.__PROGRESS_BAR__ = False
h2o.no_progress()

iris_data_path = "http://h2o-public-test-data.s3.amazonaws.com/smalldata/iris/iris.csv"  # load demonstration data
iris_df = h2o.import_file(path=iris_data_path)

params = {"standardize__center": [True, False],
          "standardize__scale": [True, False],
          "gbm__ntrees": [10, 20],
          "gbm__max_depth": [1, 2, 3],
          "gbm__learn_rate": [0.1, 0.2]}

custom_cv = H2OKFold(iris_df, n_folds=5, seed=42)
pipeline = Pipeline([("standardize", H2OScaler()),
                     ("gbm", H2OGradientBoostingEstimator(distribution="gaussian"))])

random_search = RandomizedSearchCV(pipeline, params, n_iter=5, scoring=make_scorer(h2o_r2_score),
                                   cv=custom_cv, random_state=42, n_jobs=1)
random_search.fit(iris_df[1:], iris_df[0])
It returns the error ValueError: No valid specification of the columns. Only a scalar, list or slice of all integers or all strings, or boolean mask is allowed.
The full terminal message:
Traceback (most recent call last):
File "untitled-Copy1.py", line 34, in <module>
random_search.fit(iris_df[1:], iris_df[0])
File "/department/jupyter-dev/anaconda3/envs/python36/lib/python3.6/site-packages/sklearn/model_selection/_search.py", line 710, in fit
self._run_search(evaluate_candidates)
File "/department/jupyter-dev/anaconda3/envs/python36/lib/python3.6/site-packages/sklearn/model_selection/_search.py", line 1484, in _run_search
random_state=self.random_state))
File "/department/jupyter-dev/anaconda3/envs/python36/lib/python3.6/site-packages/sklearn/model_selection/_search.py", line 689, in evaluate_candidates
cv.split(X, y, groups)))
File "/department/jupyter-dev/anaconda3/envs/python36/lib/python3.6/site-packages/joblib/parallel.py", line 1004, in __call__
if self.dispatch_one_batch(iterator):
File "/department/jupyter-dev/anaconda3/envs/python36/lib/python3.6/site-packages/joblib/parallel.py", line 835, in dispatch_one_batch
self._dispatch(tasks)
File "/department/jupyter-dev/anaconda3/envs/python36/lib/python3.6/site-packages/joblib/parallel.py", line 754, in _dispatch
job = self._backend.apply_async(batch, callback=cb)
File "/department/jupyter-dev/anaconda3/envs/python36/lib/python3.6/site-packages/joblib/_parallel_backends.py", line 209, in apply_async
result = ImmediateResult(func)
File "/department/jupyter-dev/anaconda3/envs/python36/lib/python3.6/site-packages/joblib/_parallel_backends.py", line 590, in __init__
self.results = batch()
File "/department/jupyter-dev/anaconda3/envs/python36/lib/python3.6/site-packages/joblib/parallel.py", line 256, in __call__
for func, args, kwargs in self.items]
File "/department/jupyter-dev/anaconda3/envs/python36/lib/python3.6/site-packages/joblib/parallel.py", line 256, in <listcomp>
for func, args, kwargs in self.items]
File "/department/jupyter-dev/anaconda3/envs/python36/lib/python3.6/site-packages/sklearn/model_selection/_validation.py", line 508, in _fit_and_score
X_train, y_train = _safe_split(estimator, X, y, train)
File "/department/jupyter-dev/anaconda3/envs/python36/lib/python3.6/site-packages/sklearn/utils/metaestimators.py", line 201, in _safe_split
X_subset = _safe_indexing(X, indices)
File "/department/jupyter-dev/anaconda3/envs/python36/lib/python3.6/site-packages/sklearn/utils/__init__.py", line 390, in _safe_indexing
indices_dtype = _determine_key_type(indices)
File "/department/jupyter-dev/anaconda3/envs/python36/lib/python3.6/site-packages/sklearn/utils/__init__.py", line 288, in _determine_key_type
raise ValueError(err_msg)
ValueError: No valid specification of the columns. Only a scalar, list or slice of all integers or all strings, or boolean mask is allowed
Closing connection _sid_b8c1 at exit
H2O session _sid_b8c1 closed.
I'm using python 3.6.10 with sklearn 0.22.1 and h2o 3.28.0.3.
What am I doing wrong? Any help appreciated!
Have a great day :)

How can I pass multiple columns from a DataFrame to a scikit-learn FeatureUnion?

Solved: see edit below
I'm experimenting with scikit-learn's SGDClassifier to classify a small number of bank transactions. The data is in a Pandas DataFrame – each row is a transaction, with a text description field (which I'm feeding into TfidfVectorizer) and a continuous amount field:
Date Description Amount Balance Category
0 2011-03-23 CO-OP GROUP 102786 STEVENAGE -22.55 483.14 Petrol
1 2011-03-23 O2 UK LTD -12.77 505.69 Phone
2 2011-03-22 SAINSBURYS COV GDN -30.84 691.75 Food
So far I've been classifying solely on the Description field, but I now want to add in the Amount field. I've been using the FeatureUnion example on this page to get started, but I'm struggling to get my data into the expected format.
My code currently looks like this:
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline, FeatureUnion

class ItemSelector(BaseEstimator, TransformerMixin):
    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return X[self.column]

pipeline = Pipeline([
    ("union", FeatureUnion(
        transformer_list=[
            ("description", Pipeline([
                ("selector", ItemSelector(column="Description")),
                ("tfidf", TfidfVectorizer())
            ]))
        ]
    )),
    ("classifier", SGDClassifier(loss="log", penalty="l2", alpha=1e-3, n_iter=5, random_state=42))
])

print(training_data.head(n=3))
print(training_data.shape)
print(len(training_data["Category"].tolist()))

model = pipeline.fit(training_data, training_data["Category"].tolist())
Where training_data is the DataFrame. The df.shape of training_data is (538, 5) and the len() of training_data["Category"].tolist() is 538.
When I run this, I'm getting TypeError: list indices must be integers or slices, not str. I assume something is wrong with the format of the data returned by the ItemSelector.transform method, but I'm not sure what.
Here's the full traceback:
Traceback (most recent call last):
File "sci-class.py", line 80, in <module>
predictions = model.predict(test_data["Description"].tolist())
File "/usr/local/lib/python3.5/site-packages/sklearn/utils/metaestimators.py", line 54, in <lambda>
out = lambda *args, **kwargs: self.fn(obj, *args, **kwargs)
File "/usr/local/lib/python3.5/site-packages/sklearn/pipeline.py", line 326, in predict
Xt = transform.transform(Xt)
File "/usr/local/lib/python3.5/site-packages/sklearn/pipeline.py", line 763, in transform
for name, trans, weight in self._iter())
File "/usr/local/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py", line 758, in __call__
while self.dispatch_one_batch(iterator):
File "/usr/local/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py", line 608, in dispatch_one_batch
self._dispatch(tasks)
File "/usr/local/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py", line 571, in _dispatch
job = self._backend.apply_async(batch, callback=cb)
File "/usr/local/lib/python3.5/site-packages/sklearn/externals/joblib/_parallel_backends.py", line 109, in apply_async
result = ImmediateResult(func)
File "/usr/local/lib/python3.5/site-packages/sklearn/externals/joblib/_parallel_backends.py", line 326, in __init__
self.results = batch()
File "/usr/local/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py", line 131, in __call__
return [func(*args, **kwargs) for func, args, kwargs in self.items]
File "/usr/local/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py", line 131, in <listcomp>
return [func(*args, **kwargs) for func, args, kwargs in self.items]
File "/usr/local/lib/python3.5/site-packages/sklearn/pipeline.py", line 567, in _transform_one
res = transformer.transform(X)
File "/usr/local/lib/python3.5/site-packages/sklearn/pipeline.py", line 445, in _transform
Xt = transform.transform(Xt)
File "sci-class.py", line 60, in transform
return X[self.column]
TypeError: list indices must be integers or slices, not str
What's the correct way to pass multiple columns from a DataFrame to a scikit-learn FeatureUnion?
Edit
Vivek Kumar pointed out that the error was thrown by a line I didn't include in the extract above: predictions = model.predict(test_data["Description"].tolist()). I was using tolist() in an earlier iteration of the code, before I added the FeatureUnion. So it was all a silly mistake, really.
The below (in combination with the above) is working for me:
model = pipeline.fit(training_data, training_data["Category"].tolist())
predictions = model.predict(test_data)
probabilities = model.predict_proba(test_data)
score = model.score(test_data, test_data["Category"].tolist())
labels = sorted(set(training_labels + test_labels))
matrix = metrics.confusion_matrix(test_data["Category"].tolist(), predictions, labels=labels)
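For completeness, the original question (passing multiple columns to a FeatureUnion) can then be answered by adding a second branch next to the tfidf one. The following is a sketch building on the ItemSelector above; the ColumnReshaper helper is hypothetical, and the (n, 1) reshape reflects an assumption that the numeric column should reach the classifier as a 2-D array:
class ColumnReshaper(BaseEstimator, TransformerMixin):
    """Select one numeric column and return it as a 2-D (n_samples, 1) array."""
    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.column].values.reshape(-1, 1)

pipeline = Pipeline([
    ("union", FeatureUnion(transformer_list=[
        ("description", Pipeline([
            ("selector", ItemSelector(column="Description")),
            ("tfidf", TfidfVectorizer())
        ])),
        ("amount", ColumnReshaper(column="Amount"))  # hypothetical numeric branch
    ])),
    ("classifier", SGDClassifier(loss="log", penalty="l2", alpha=1e-3, n_iter=5, random_state=42))
])
FeatureUnion stacks the sparse tfidf matrix and the dense amount column side by side, so the classifier sees both feature sets at once.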
