Sklearn: not receiving transformer parameters while trying to cross-validate a pipeline - python

I am trying to use the cross_validate function on a pipeline. The pipeline works correctly if I train it normally with fit, but I get an error when I use cross_validate. Essentially, the parameters I pass to the transformer in the pipeline are NoneType when I use the cross_validate function. Why is this the case, and how could I fix it? I tried to put together a minimal example here:
from sklearn.model_selection import cross_validate
from simpletransformers.classification import ClassificationModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_20newsgroups
import pandas as pd
class Transformer(BaseEstimator, TransformerMixin):
    def __init__(self, model, mtype, mname, num_labels: int):
        self._model = model
        self._inst_model = None
        self._num_labels = num_labels
        self._type = mtype
        self._name = mname

    def fit(self, train_input, y=None):
        self._create_model()
        self._train_model(train_input)
        return self

    def transform(self, eval_df, y=None):
        result, model_outputs, wrong_predictions = self._inst_model.eval_model(eval_df=eval_df)
        return model_outputs

    def _create_model(self):
        self._inst_model = self._model(self._type, self._name, args={"output_dir": 'min_ex'},
                                       num_labels=self._num_labels)

    def _train_model(self, train_input):
        train_df, eval_df = train_test_split(train_input, test_size=0.20)
        return self._inst_model.train_model(train_df, eval_df=eval_df)
if __name__ == '__main__':
    categories = ['sci.med', 'sci.space']
    X_t, y_t = fetch_20newsgroups(random_state=1,
                                  subset='train',
                                  categories=categories,
                                  remove=('footers', 'quotes'),
                                  return_X_y=True)
    X = pd.DataFrame({
        'text': X_t,
        'labels': y_t,
    })
    y = y_t
    transformer_grid = {
        "model": ClassificationModel,
        "num_labels": 14,
        "mtype": "electra",
        "mname": "german-nlp-group/electra-base-german-uncased"
    }
    classifier_grid = {
        'n_estimators': 100,
        'random_state': 42
    }
    pipe = Pipeline([
        ('feats', FeatureUnion([
            ('transformer', Pipeline([
                ('transformer', Transformer(**transformer_grid)),
            ])),
        ])),
        ('classifier', RandomForestClassifier(**classifier_grid))
    ])
    # pipe.fit(X, y)
    cv_results = cross_validate(pipe, X, y, cv=5, scoring='accuracy', n_jobs=1)
And the error I am getting is this:
2021-03-04 16:05:54.861544: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1
/root/complex_semantics/lib/python3.8/site-packages/sklearn/base.py:209: FutureWarning: From version 0.24, get_params will raise an AttributeError if a parameter cannot be retrieved as an instance attribute. Previously it would return None.
warnings.warn('From version 0.24, get_params will raise an '
/root/complex_semantics/lib/python3.8/site-packages/sklearn/model_selection/_validation.py:548: FitFailedWarning: Estimator fit failed. The score on this train-test partition for these parameters will be set to nan. Details:
Traceback (most recent call last):
File "/root/complex_semantics/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 531, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "/root/complex_semantics/lib/python3.8/site-packages/sklearn/pipeline.py", line 330, in fit
Xt = self._fit(X, y, **fit_params_steps)
File "/root/complex_semantics/lib/python3.8/site-packages/sklearn/pipeline.py", line 292, in _fit
X, fitted_transformer = fit_transform_one_cached(
File "/root/complex_semantics/lib/python3.8/site-packages/joblib/memory.py", line 352, in __call__
return self.func(*args, **kwargs)
File "/root/complex_semantics/lib/python3.8/site-packages/sklearn/pipeline.py", line 740, in _fit_transform_one
res = transformer.fit_transform(X, y, **fit_params)
File "/root/complex_semantics/lib/python3.8/site-packages/sklearn/pipeline.py", line 953, in fit_transform
results = self._parallel_func(X, y, fit_params, _fit_transform_one)
File "/root/complex_semantics/lib/python3.8/site-packages/sklearn/pipeline.py", line 978, in _parallel_func
return Parallel(n_jobs=self.n_jobs)(delayed(func)(
File "/root/complex_semantics/lib/python3.8/site-packages/joblib/parallel.py", line 1029, in __call__
if self.dispatch_one_batch(iterator):
File "/root/complex_semantics/lib/python3.8/site-packages/joblib/parallel.py", line 847, in dispatch_one_batch
self._dispatch(tasks)
File "/root/complex_semantics/lib/python3.8/site-packages/joblib/parallel.py", line 765, in _dispatch
job = self._backend.apply_async(batch, callback=cb)
File "/root/complex_semantics/lib/python3.8/site-packages/joblib/_parallel_backends.py", line 208, in apply_async
result = ImmediateResult(func)
File "/root/complex_semantics/lib/python3.8/site-packages/joblib/_parallel_backends.py", line 572, in __init__
self.results = batch()
File "/root/complex_semantics/lib/python3.8/site-packages/joblib/parallel.py", line 252, in __call__
return [func(*args, **kwargs)
File "/root/complex_semantics/lib/python3.8/site-packages/joblib/parallel.py", line 252, in <listcomp>
return [func(*args, **kwargs)
File "/root/complex_semantics/lib/python3.8/site-packages/sklearn/pipeline.py", line 740, in _fit_transform_one
res = transformer.fit_transform(X, y, **fit_params)
File "/root/complex_semantics/lib/python3.8/site-packages/sklearn/pipeline.py", line 376, in fit_transform
return last_step.fit_transform(Xt, y, **fit_params_last_step)
File "/root/complex_semantics/lib/python3.8/site-packages/sklearn/base.py", line 693, in fit_transform
return self.fit(X, y, **fit_params).transform(X)
File "min_ex.py", line 23, in fit
self._create_model()
File "min_ex.py", line 32, in _create_model
self._inst_model = self._model(self._type, self._name, args={"output_dir": 'min_ex'},
TypeError: 'NoneType' object is not callable
warnings.warn("Estimator fit failed. The score on this train-test"
Edit:
I changed the code to use a separate variable for the instantiated model; the issue is the same. I also took out print statements for easier reading. The only thing missing for the code to run was the loading of data. It still gives me the same error, and only when cross-validating.
Edit 2:
Created a minimal reproducible example by adding a synthetic dataset.
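The FutureWarning about get_params in the traceback above points at the likely cause: cross_validate clones the pipeline for every fold, and sklearn's clone reconstructs each estimator from get_params(), which reads instance attributes named exactly like the __init__ parameters. Because this Transformer stores model under self._model (and so on), get_params() finds no matching attributes and returns None for every parameter, so the clone is instantiated with all-None arguments, hence the 'NoneType' object is not callable inside _create_model. A minimal sketch of the sklearn-compliant convention (reusing the imports from the example above; illustrative, not the asker's final code):
class Transformer(BaseEstimator, TransformerMixin):
    def __init__(self, model=None, mtype=None, mname=None, num_labels=None):
        # clone() rebuilds the estimator from get_params(), and
        # BaseEstimator.get_params() reads attributes named exactly like
        # the __init__ arguments, so store them under those names unchanged
        self.model = model
        self.mtype = mtype
        self.mname = mname
        self.num_labels = num_labels

    def fit(self, train_input, y=None):
        # derived, fitted state belongs in trailing-underscore attributes
        self.inst_model_ = self.model(self.mtype, self.mname,
                                      args={"output_dir": 'min_ex'},
                                      num_labels=self.num_labels)
        train_df, eval_df = train_test_split(train_input, test_size=0.20)
        self.inst_model_.train_model(train_df, eval_df=eval_df)
        return self

    def transform(self, eval_df, y=None):
        result, model_outputs, wrong_predictions = self.inst_model_.eval_model(eval_df=eval_df)
        return model_outputs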

Related

GridSearchCV for multioutput RandomForest Regressor

I have created a multioutput RandomForestRegressor using sklearn.ensemble.RandomForestRegressor. I now want to perform a GridSearchCV to find good hyperparameters and output the r^2 scores for each individual target feature. The code I use looks as follows:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
param_grid = {
    'model__bootstrap': [True],
    'model__max_depth': [8, 10, 12],
    'model__max_features': [3, 4, 5],
    'model__min_samples_leaf': [3, 4, 5],
    'model__min_samples_split': [3, 5, 7],
    'model__n_estimators': [100, 200, 300]
}
model = RandomForestRegressor()
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('model', model)])
scorer = make_scorer(r2_score, multioutput='raw_values')
search = GridSearchCV(pipe, param_grid, scoring=scorer)
search.fit(X_train, y_train)
print(f'Best parameter score {ship_type} {target}: {search.best_score_}')
When running this code I get the following error:
File "run_xgb_rf_regressor.py", line 75, in <module>
model, X = run_regression(ship_types[2], targets)
File "run_xgb_rf_regressor.py", line 50, in run_regression
search.fit(X_train, y_train)
File "/home/lucas/.local/lib/python3.8/site-packages/sklearn/utils/validation.py", line 63, in inner_f
return f(*args, **kwargs)
File "/home/lucas/.local/lib/python3.8/site-packages/sklearn/model_selection/_search.py", line 841, in fit
self._run_search(evaluate_candidates)
File "/home/lucas/.local/lib/python3.8/site-packages/sklearn/model_selection/_search.py", line 1296, in _run_search
evaluate_candidates(ParameterGrid(self.param_grid))
File "/home/lucas/.local/lib/python3.8/site-packages/sklearn/model_selection/_search.py", line 795, in evaluate_candidates
out = parallel(delayed(_fit_and_score)(clone(base_estimator),
File "/home/lucas/.local/lib/python3.8/site-packages/joblib/parallel.py", line 1043, in __call__
if self.dispatch_one_batch(iterator):
File "/home/lucas/.local/lib/python3.8/site-packages/joblib/parallel.py", line 861, in dispatch_one_batch
self._dispatch(tasks)
File "/home/lucas/.local/lib/python3.8/site-packages/joblib/parallel.py", line 779, in _dispatch
job = self._backend.apply_async(batch, callback=cb)
File "/home/lucas/.local/lib/python3.8/site-packages/joblib/_parallel_backends.py", line 208, in apply_async
result = ImmediateResult(func)
File "/home/lucas/.local/lib/python3.8/site-packages/joblib/_parallel_backends.py", line 572, in __init__
self.results = batch()
File "/home/lucas/.local/lib/python3.8/site-packages/joblib/parallel.py", line 262, in __call__
return [func(*args, **kwargs)
File "/home/lucas/.local/lib/python3.8/site-packages/joblib/parallel.py", line 262, in <listcomp>
return [func(*args, **kwargs)
File "/home/lucas/.local/lib/python3.8/site-packages/sklearn/utils/fixes.py", line 222, in __call__
return self.function(*args, **kwargs)
File "/home/lucas/.local/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 625, in _fit_and_score
test_scores = _score(estimator, X_test, y_test, scorer, error_score)
File "/home/lucas/.local/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 721, in _score
raise ValueError(error_msg % (scores, type(scores), scorer))
ValueError: scoring must return a number, got [0.57359176 0.54407165 0.40313057 0.32515033 0.346224 0.39513717
0.34375699] (<class 'numpy.ndarray'>) instead. (scorer=make_scorer(r2_score, multioutput=raw_values))
Clearly the error suggests that I can only use a single numeric value, which in my case would be the average r^2 score over all target features. Does anybody know how I can use GridSearchCV so that I can output the individual r^2 scores?
Many thanks in advance.
I think I would use the following option for scoring parameter (from the docs):
a callable returning a dictionary where the keys are the metric names and the values are the metric scores;
So something like
def my_scorer(estimator, X, y):
    preds = estimator.predict(X)
    scores = r2_score(y, preds, multioutput='raw_values')
    return {f'r2_y{i}': score for i, score in enumerate(scores)}
Note though in the docs that refit will need to be set more carefully with multimetric searches. Maybe deciding the "best" parameters should be done by some average, in which case you can add another entry to the custom scorer.
Other useful parts of the User Guide:
https://scikit-learn.org/stable/modules/grid_search.html#multimetric-grid-search
https://scikit-learn.org/stable/modules/model_evaluation.html#implementing-your-own-scoring-object
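A hedged sketch of wiring this in (the refit key simply picks which metric drives best_estimator_; 'r2_y0' is illustrative):
# my_scorer as defined above; with multimetric scoring, refit must name
# one of the returned keys (or be a callable that picks the best index)
search = GridSearchCV(pipe, param_grid, scoring=my_scorer, refit='r2_y0')
search.fit(X_train, y_train)
print(search.cv_results_['mean_test_r2_y0'])  # one mean score per parameter combination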

Performing K-fold Cross Validation with scoring = 'f1', Recall, or Precision for a multi-class problem

I know this can easily be implemented for a binary classification problem. But it seems to be a bit tough in the case of a multi-class problem.
I have a dataset that is un-balanced and is an example of a 4-class classification problem. I have applied the RandomForestClassifier() on it to test various measures of the algorithm such as accuracy, precision, recall, f1_score, etc. Now I wanted to perform the K-fold Cross Validation on the training set with 10 splits and I want the 'scoring' parameter of the cross_val_score() function to be 'f1' instead of 'accuracy'.
My code:
# Random Forest
np.random.seed(123)
from sklearn.ensemble import RandomForestClassifier
classifier_RF = RandomForestClassifier(random_state = 0)
classifier_RF.fit(X_train, Y_train)
# Applying k-Fold Cross Validation
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = classifier_RF, X = X_train, y = Y_train, cv = 10, scoring = 'f1')
print("F1_Score: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))
However, when I try to run this code, I am getting an error as follows:
ValueError: Target is multiclass but average='binary'. Please choose another average setting, one of [None, 'micro', 'macro', 'weighted'].
I have tried setting the average parameter to 'weighted' in the cross_val_score() call as follows:
accuracies = cross_val_score(estimator = classifier_RF, X = X_train, y = Y_train, cv = 10, scoring = 'f1', average = 'weighted')
but that's giving an error as follows:
TypeError: cross_val_score() got an unexpected keyword argument 'average'
The entire traceback is as follows:
Traceback (most recent call last):
File "<ipython-input-1-ba4a5e1de09a>", line 97, in <module>
accuracies = cross_val_score(estimator = classifier_RF, X = X_train, y = Y_train, cv = 10, scoring = 'f1')
File "/Users/vivekchowdary/opt/anaconda3/lib/python3.7/site-packages/sklearn/utils/validation.py", line 72, in inner_f
return f(**kwargs)
File "/Users/vivekchowdary/opt/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 406, in cross_val_score
error_score=error_score)
File "/Users/vivekchowdary/opt/anaconda3/lib/python3.7/site-packages/sklearn/utils/validation.py", line 72, in inner_f
return f(**kwargs)
File "/Users/vivekchowdary/opt/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 248, in cross_validate
for train, test in cv.split(X, y, groups))
File "/Users/vivekchowdary/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py", line 1048, in __call__
if self.dispatch_one_batch(iterator):
File "/Users/vivekchowdary/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py", line 866, in dispatch_one_batch
self._dispatch(tasks)
File "/Users/vivekchowdary/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py", line 784, in _dispatch
job = self._backend.apply_async(batch, callback=cb)
File "/Users/vivekchowdary/opt/anaconda3/lib/python3.7/site-packages/joblib/_parallel_backends.py", line 208, in apply_async
result = ImmediateResult(func)
File "/Users/vivekchowdary/opt/anaconda3/lib/python3.7/site-packages/joblib/_parallel_backends.py", line 572, in __init__
self.results = batch()
File "/Users/vivekchowdary/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py", line 263, in __call__
for func, args, kwargs in self.items]
File "/Users/vivekchowdary/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py", line 263, in <listcomp>
for func, args, kwargs in self.items]
File "/Users/vivekchowdary/opt/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 560, in _fit_and_score
test_scores = _score(estimator, X_test, y_test, scorer)
File "/Users/vivekchowdary/opt/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 607, in _score
scores = scorer(estimator, X_test, y_test)
File "/Users/vivekchowdary/opt/anaconda3/lib/python3.7/site-packages/sklearn/metrics/_scorer.py", line 88, in __call__
*args, **kwargs)
File "/Users/vivekchowdary/opt/anaconda3/lib/python3.7/site-packages/sklearn/metrics/_scorer.py", line 213, in _score
**self._kwargs)
File "/Users/vivekchowdary/opt/anaconda3/lib/python3.7/site-packages/sklearn/utils/validation.py", line 72, in inner_f
return f(**kwargs)
File "/Users/vivekchowdary/opt/anaconda3/lib/python3.7/site-packages/sklearn/metrics/_classification.py", line 1047, in f1_score
zero_division=zero_division)
File "/Users/vivekchowdary/opt/anaconda3/lib/python3.7/site-packages/sklearn/utils/validation.py", line 72, in inner_f
return f(**kwargs)
File "/Users/vivekchowdary/opt/anaconda3/lib/python3.7/site-packages/sklearn/metrics/_classification.py", line 1175, in fbeta_score
zero_division=zero_division)
File "/Users/vivekchowdary/opt/anaconda3/lib/python3.7/site-packages/sklearn/utils/validation.py", line 72, in inner_f
return f(**kwargs)
File "/Users/vivekchowdary/opt/anaconda3/lib/python3.7/site-packages/sklearn/metrics/_classification.py", line 1434, in precision_recall_fscore_support
pos_label)
File "/Users/vivekchowdary/opt/anaconda3/lib/python3.7/site-packages/sklearn/metrics/_classification.py", line 1265, in _check_set_wise_labels
% (y_type, average_options))
ValueError: Target is multiclass but average='binary'. Please choose another average setting, one of [None, 'micro', 'macro', 'weighted'].
You need to use make_scorer to define your metric and its parameters:
from sklearn.metrics import make_scorer, f1_score
scoring = make_scorer(f1_score, average='weighted')
and then use this in your cross_val_score:
results = cross_val_score(estimator=classifier_RF,
                          X=X_train,
                          y=Y_train,
                          cv=10,
                          scoring=scoring)
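Alternatively, since weighted F1 ships as a predefined scorer string, the make_scorer step can be skipped entirely:
accuracies = cross_val_score(estimator=classifier_RF, X=X_train, y=Y_train,
                             cv=10, scoring='f1_weighted')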

Custom estimator can't be deepcopied by cross_val_score

I have a custom estimator that I implemented myself, and I am not able to use cross_val_score(), which I believe has something to do with my predict() method. Here is the full error trace:
Traceback (most recent call last):
File "/Users/joann/Desktop/Implementações ML/Adaboost Classifier/test.py", line 30, in <module>
ada2_score = cross_val_score(ada_2, X, y, cv=5)
File "/Users/joann/opt/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 390, in cross_val_score
error_score=error_score)
File "/Users/joann/opt/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 236, in cross_validate
for train, test in cv.split(X, y, groups))
File "/Users/joann/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py", line 1004, in __call__
if self.dispatch_one_batch(iterator):
File "/Users/joann/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py", line 835, in dispatch_one_batch
self._dispatch(tasks)
File "/Users/joann/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py", line 754, in _dispatch
job = self._backend.apply_async(batch, callback=cb)
File "/Users/joann/opt/anaconda3/lib/python3.7/site-packages/joblib/_parallel_backends.py", line 209, in apply_async
result = ImmediateResult(func)
File "/Users/joann/opt/anaconda3/lib/python3.7/site-packages/joblib/_parallel_backends.py", line 590, in __init__
self.results = batch()
File "/Users/joann/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py", line 256, in __call__
for func, args, kwargs in self.items]
File "/Users/joann/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py", line 256, in <listcomp>
for func, args, kwargs in self.items]
File "/Users/joann/opt/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 544, in _fit_and_score
test_scores = _score(estimator, X_test, y_test, scorer)
File "/Users/joann/opt/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 591, in _score
scores = scorer(estimator, X_test, y_test)
File "/Users/joann/opt/anaconda3/lib/python3.7/site-packages/sklearn/metrics/_scorer.py", line 89, in __call__
score = scorer(estimator, *args, **kwargs)
File "/Users/joann/opt/anaconda3/lib/python3.7/site-packages/sklearn/metrics/_scorer.py", line 371, in _passthrough_scorer
return estimator.score(*args, **kwargs)
File "/Users/joann/Desktop/Implementações ML/Adaboost Classifier/Adaboost.py", line 92, in score
scr_pred = self.predict(X)
File "/Users/joann/Desktop/Implementações ML/Adaboost Classifier/Adaboost.py", line 73, in predict
clf_pred = clf.predict(X)
File "/Users/joann/opt/anaconda3/lib/python3.7/site-packages/sklearn_extensions/extreme_learning_machines/elm.py", line 614, in predict
class_predictions = self.binarizer.inverse_transform(raw_predictions)
File "/Users/joann/opt/anaconda3/lib/python3.7/site-packages/sklearn/preprocessing/_label.py", line 528, in inverse_transform
self.classes_, threshold)
File "/Users/joann/opt/anaconda3/lib/python3.7/site-packages/sklearn/preprocessing/_label.py", line 750, in _inverse_binarize_thresholding
format(y.shape))
ValueError: output_type='binary', but y.shape = (30, 3)
My predict(self, X) method returns a vector of size n_samples with the predictions for the X parameter. I also made a score() function as follows:
def score(self, X, y):
    scr_pred = self.predict(X)
    return sum(scr_pred == y) / X.shape[0]
This method simply computes the accuracy of the model given the samples. Whether I use this score() method or set cross_val_score(..., scoring="accuracy"), it does not work.
Note: I am aware of this question/answer, but it doesn't apply to my case, because I can confirm the consistency of my constructor:
def __init__(self, estimators=["MLP"], n_rounds=5, random_state=10):
    self.estimators = estimators
    self.n_rounds = n_rounds
    self.random_state = random_state
UPDATE:
Further research led me to this topic, where it is explained that sklearn can't deepcopy estimators with transformers. However, it is mandatory for my estimator to run LabelBinarizer to transform the data to get the predictions. So I updated the question title to reflect the proper issue.
Your problem statement is not entirely clear, but from the error it seems you are attempting a multiclass classification.
The problem is that at some point the preprocessing is not done correctly: the error is logged from _inverse_binarize_thresholding, which is raised by the following check in sklearn's preprocessing:
def _inverse_binarize_thresholding(y, output_type, classes, threshold):
    if output_type == "binary" and y.ndim == 2 and y.shape[1] > 2:
        raise ValueError("output_type='binary', but y.shape = {0}".
                         format(y.shape))
There must be some missing transformation or preprocessing in your code; you have to use LabelBinarizer correctly.
Go through the documentation below and backtrack the error to fix your code:
documentation
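For illustration, a minimal reproduction of that check firing (synthetic shapes, not the asker's data): a LabelBinarizer fitted on binary labels records output_type='binary', so feeding its inverse_transform a three-column score matrix raises exactly this error:
import numpy as np
from sklearn.preprocessing import LabelBinarizer

lb = LabelBinarizer()
lb.fit([0, 1])                       # binary fit: lb.y_type_ == 'binary'
raw_predictions = np.zeros((30, 3))  # but the downstream model emits 3 columns per sample
lb.inverse_transform(raw_predictions)
# ValueError: output_type='binary', but y.shape = (30, 3)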

How to use SMOTENC inside pipeline (Error: Some of the categorical indices are out of range)?

I would greatly appreciate it if you could let me know how to use SMOTENC. I wrote:
# Data
XX = pd.read_csv('Financial Distress.csv')
y = np.array(XX['Financial Distress'].values.tolist())
y = np.array([0 if i > -0.50 else 1 for i in y])
Na = np.array(pd.read_csv('Na.csv', header=None).values)
XX = XX.iloc[:, 3:127]
# Use get_dummies to convert categorical features into dummy ones
dis_features = ['x121']
X = pd.get_dummies(XX, columns=dis_features)
# Divide data into train and test
indices = np.arange(y.shape[0])
X_train, X_test, y_train, y_test, idx_train, idx_test = train_test_split(X, y, indices, stratify=y,
                                                                         test_size=0.3, random_state=42)
num_indices = list(X)[:X.shape[1] - 37]
cat_indices = list(X)[X.shape[1] - 37:]
num_indices1 = list(X.iloc[:, np.r_[0:94, 95, 97, 100:123]].columns.values)
cat_indices1 = list(X.iloc[:, np.r_[94, 96, 98, 99, 123:160]].columns.values)
print(len(num_indices1))
print(len(cat_indices1))
pipeline = Pipeline(steps=[
    # Categorical features
    ('feature_processing', FeatureUnion(transformer_list=[
        ('categorical', MultiColumn(cat_indices)),
        # Numeric features
        ('numeric', Pipeline(steps=[
            ('select', MultiColumn(num_indices)),
            ('scale', StandardScaler())
        ]))
    ])),
    ('clf', rg)
])
pipeline_with_resampling = make_pipeline(SMOTENC(categorical_features=cat_indices1), pipeline)
# Grid search to determine the best params
cv = StratifiedKFold(n_splits=5, random_state=42)
rg_cv = GridSearchCV(pipeline_with_resampling, param_grid, cv=cv, scoring='f1')
rg_cv.fit(X_train, y_train)
As indicated, I have 5 categorical features. In fact, indices 123 to 160 relate to a single categorical feature with 37 possible values, which is converted into 37 columns using get_dummies. Unfortunately, it throws the following error:
Traceback (most recent call last):
File "D:/mifs-master_2/MU/learning-from-imbalanced-classes-master/learning-from-imbalanced-classes-master/continuous/Final Logit/SMOTENC/logit-final - Copy.py", line 424, in <module>
rg_cv.fit(X_train, y_train)
File "C:\Users\Markazi.co\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py", line 722, in fit
self._run_search(evaluate_candidates)
File "C:\Users\Markazi.co\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py", line 1191, in _run_search
evaluate_candidates(ParameterGrid(self.param_grid))
File "C:\Users\Markazi.co\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py", line 711, in evaluate_candidates
cv.split(X, y, groups)))
File "C:\Users\Markazi.co\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 917, in __call__
if self.dispatch_one_batch(iterator):
File "C:\Users\Markazi.co\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 759, in dispatch_one_batch
self._dispatch(tasks)
File "C:\Users\Markazi.co\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 716, in _dispatch
job = self._backend.apply_async(batch, callback=cb)
File "C:\Users\Markazi.co\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py", line 182, in apply_async
result = ImmediateResult(func)
File "C:\Users\Markazi.co\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py", line 549, in __init__
self.results = batch()
File "C:\Users\Markazi.co\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 225, in __call__
for func, args, kwargs in self.items]
File "C:\Users\Markazi.co\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 225, in <listcomp>
for func, args, kwargs in self.items]
File "C:\Users\Markazi.co\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 528, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "C:\Users\Markazi.co\Anaconda3\lib\site-packages\imblearn\pipeline.py", line 237, in fit
Xt, yt, fit_params = self._fit(X, y, **fit_params)
File "C:\Users\Markazi.co\Anaconda3\lib\site-packages\imblearn\pipeline.py", line 200, in _fit
cloned_transformer, Xt, yt, **fit_params_steps[name])
File "C:\Users\Markazi.co\Anaconda3\lib\site-packages\sklearn\externals\joblib\memory.py", line 342, in __call__
return self.func(*args, **kwargs)
File "C:\Users\Markazi.co\Anaconda3\lib\site-packages\imblearn\pipeline.py", line 576, in _fit_resample_one
X_res, y_res = sampler.fit_resample(X, y, **fit_params)
File "C:\Users\Markazi.co\Anaconda3\lib\site-packages\imblearn\base.py", line 85, in fit_resample
output = self._fit_resample(X, y)
File "C:\Users\Markazi.co\Anaconda3\lib\site-packages\imblearn\over_sampling\_smote.py", line 940, in _fit_resample
self._validate_estimator()
File "C:\Users\Markazi.co\Anaconda3\lib\site-packages\imblearn\over_sampling\_smote.py", line 933, in _validate_estimator
' should be between 0 and {}'.format(self.n_features_))
ValueError: Some of the categorical indices are out of range. Indices should be between 0 and 160
Thanks in advance.
As shown below, two pipelines should be used:
num_indices1 = list(X.iloc[:, np.r_[0:94, 95, 97, 100:120, 121:123]].columns.values)
cat_indices1 = list(X.iloc[:, np.r_[94, 96, 98, 99, 120]].columns.values)
print(len(num_indices1))
print(len(cat_indices1))
cat_indices = [94, 96, 98, 99, 120]
from imblearn.pipeline import make_pipeline
pipeline = Pipeline(steps=[
    # Categorical features
    ('feature_processing', FeatureUnion(transformer_list=[
        ('categorical', MultiColumn(cat_indices1)),
        # Numeric features
        ('numeric', Pipeline(steps=[
            ('select', MultiColumn(num_indices1)),
            ('scale', StandardScaler())
        ]))
    ])),
    ('clf', rg)
])
pipeline_with_resampling = make_pipeline(SMOTENC(categorical_features=cat_indices), pipeline)
You cannot apply get_dummies to your categorical variables and then use SMOTENC, because SMOTENC already implements the equivalent of get_dummies in its algorithm, which would bias your model.
Alternatively, I recommend using SMOTE() instead of SMOTENC(); in that case you must first apply get_dummies.
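A hedged sketch of that alternative (the classifier here is a stand-in for the asker's rg):
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier

X = pd.get_dummies(XX, columns=dis_features)  # dummies first, as noted above
pipe = make_pipeline(SMOTE(random_state=42),
                     RandomForestClassifier(random_state=42))
pipe.fit(X_train, y_train)  # SMOTE resamples only during fit, never at predict time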
You cannot mix the scikit-learn pipeline with the imblearn pipeline: the imblearn pipeline implements fit_resample in addition to fit_transform, while the sklearn pipeline implements only fit_transform. You cannot combine them.
First, don't do the get_dummies. Then change the way you build categorical_features, and pass a list of booleans indicating whether each column is categorical or not.
Try this:
cat_cols = []
for col in x.columns:
    if x[col].dtype == 'object':  # or 'category' if that's the case
        cat_cols.append(True)
    else:
        cat_cols.append(False)
Then pass cat_cols to your SMOTENC:
smote_nc = SMOTENC(categorical_features=cat_cols, random_state=0)
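Putting it together, a hedged sketch of the mask feeding a single imblearn pipeline (the classifier again stands in for the asker's rg):
from imblearn.over_sampling import SMOTENC
from imblearn.pipeline import make_pipeline  # imblearn's pipeline, so the sampler step is understood
from sklearn.ensemble import RandomForestClassifier

# boolean mask built as above, on the raw (non-dummied) DataFrame
cat_cols = [x[col].dtype == 'object' for col in x.columns]
smote_nc = SMOTENC(categorical_features=cat_cols, random_state=0)
pipe = make_pipeline(smote_nc, RandomForestClassifier(random_state=0))
pipe.fit(X_train, y_train)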

Custom Transformer and FeatureUnion for word2vec

I am trying to classify a set of text documents using multiple sets of features. I am using sklearn's FeatureUnion to combine different features for fitting into a single model. One of the features includes word embeddings using gensim's word2vec.
import numpy as np
from gensim.models.word2vec import Word2Vec
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
data = fetch_20newsgroups(subset='train', categories=categories)  # dummy dataset
w2v_model = Word2Vec(data.data, size=100, window=5, min_count=5, workers=2)
word2vec = {w: vec for w, vec in zip(w2v_model.wv.index2word, w2v_model.wv.syn0)}  # dictionary of word embeddings
feat_select = SelectKBest(score_func=chi2, k=10)  # other features
TSVD = TruncatedSVD(n_components=50, algorithm="randomized", n_iter=5)
# other features
In order to include transformers/estimators not already available in sklearn, I am attempting to wrap my word2vec results into a custom transformer class that returns the vector averages.
class w2vTransformer(TransformerMixin):
    """
    Wrapper class for running word2vec into pipelines and FeatureUnions
    """
    def __init__(self, word2vec, **kwargs):
        self.word2vec = word2vec
        self.kwargs = kwargs
        self.dim = len(word2vec.values())

    def fit(self, x, y=None):
        return self

    def transform(self, X):
        return np.array([
            np.mean([self.word2vec[w] for w in words if w in self.word2vec]
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])
However, when it comes time to fit the model, I receive an error.
combined_features = FeatureUnion([("w2v_class", w2vTransformer(word2vec)),
                                  ("feat", feat_select), ("TSVD", TSVD)])  # join features into combined_features
# combined_features = FeatureUnion([("feat", feat_select), ("TSVD", TSVD)])  # runs when word embeddings are not included
text_clf_svm = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('feature_selection', combined_features),
                         ('clf-svm', SGDClassifier(loss="modified_huber")),
                         ])
text_clf_svm_1 = text_clf_svm.fit(data.data, data.target)  # fits data
Traceback (most recent call last):
File "<ipython-input-8-a085b7d40f8f>", line 1, in <module>
text_clf_svm_1 = text_clf_svm.fit(data.data,data.target) # fits data
File "C:\Users\rlusk\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\pipeline.py", line 248, in fit
Xt, fit_params = self._fit(X, y, **fit_params)
File "C:\Users\rlusk\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\pipeline.py", line 213, in _fit
**fit_params_steps[name])
File "C:\Users\rlusk\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\externals\joblib\memory.py", line 362, in __call__
return self.func(*args, **kwargs)
File "C:\Users\rlusk\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\pipeline.py", line 581, in _fit_transform_one
res = transformer.fit_transform(X, y, **fit_params)
File "C:\Users\rlusk\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\pipeline.py", line 739, in fit_transform
for name, trans, weight in self._iter())
File "C:\Users\rlusk\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 779, in __call__
while self.dispatch_one_batch(iterator):
File "C:\Users\rlusk\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 625, in dispatch_one_batch
self._dispatch(tasks)
File "C:\Users\rlusk\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 588, in _dispatch
job = self._backend.apply_async(batch, callback=cb)
File "C:\Users\rlusk\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py", line 111, in apply_async
result = ImmediateResult(func)
File "C:\Users\rlusk\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py", line 332, in __init__
self.results = batch()
File "C:\Users\rlusk\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 131, in __call__
return [func(*args, **kwargs) for func, args, kwargs in self.items]
File "C:\Users\rlusk\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 131, in <listcomp>
return [func(*args, **kwargs) for func, args, kwargs in self.items]
File "C:\Users\rlusk\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\pipeline.py", line 581, in _fit_transform_one
res = transformer.fit_transform(X, y, **fit_params)
File "C:\Users\rlusk\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\base.py", line 520, in fit_transform
return self.fit(X, y, **fit_params).transform(X)
File "<ipython-input-6-cbc52cd420cd>", line 16, in transform
for words in X
File "<ipython-input-6-cbc52cd420cd>", line 16, in <listcomp>
for words in X
File "<ipython-input-6-cbc52cd420cd>", line 14, in <listcomp>
np.mean([self.word2vec[w] for w in words if w in self.word2vec]
TypeError: unhashable type: 'csr_matrix'
I understand that the error is because the variable "words" is a csr_matrix, but it needs to be an iterable such as a list. My question is how do I modify the transformer class or data so I can use the word embeddings as features to feed into FeatureUnion? This is my first SO post, please be gentle.
Instead of your custom transformer, you can avoid the bug by using the new scikit-learn API provided directly by Gensim: https://radimrehurek.com/gensim/sklearn_api/w2vmodel.html
Also, it depends on your version of Gensim, but in my case I could solve the same bug using the wv attribute of your word2vec object, instead of indexing on the object itself.
In the transform method of your w2vTransformer class:
self.word2vec.wv[w]
instead of
self.word2vec[w]
Hope it helps!
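As an aside, the csr_matrix appears because the FeatureUnion in the question sits after CountVectorizer and TfidfTransformer, so w2vTransformer receives tf-idf rows rather than token lists. A hedged sketch of one possible restructuring (the whitespace tokenizer is illustrative), feeding raw text to the embedding branch while the selection branch keeps its tf-idf input:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

# branch 1: tf-idf followed by chi2 feature selection
tfidf_branch = Pipeline([('tfidf', TfidfVectorizer()),
                         ('select', SelectKBest(score_func=chi2, k=10))])

# branch 2: naive whitespace tokenization, then averaged word vectors
tokenize = FunctionTransformer(lambda docs: [d.split() for d in docs], validate=False)
w2v_branch = Pipeline([('tokens', tokenize),
                       ('w2v', w2vTransformer(word2vec))])

text_clf_svm = Pipeline([
    ('feats', FeatureUnion([('tfidf_chi2', tfidf_branch), ('w2v', w2v_branch)])),
    ('clf-svm', SGDClassifier(loss="modified_huber")),
])
text_clf_svm.fit(data.data, data.target)
One caveat in the original class: self.dim is set to len(word2vec.values()), i.e. the vocabulary size; the zero-vector fallback presumably wants the embedding length, e.g. len(next(iter(word2vec.values()))).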
