I want to use UMAP in my sklearn Pipeline, and I would like to cache that step to speed things up. However, since I have a custom Transformer, the suggested method doesn't work.
Example code:
from sklearn.preprocessing import FunctionTransformer
from tempfile import mkdtemp
from sklearn.pipeline import Pipeline
from umap import UMAP
from hdbscan import HDBSCAN
import seaborn as sns
iris = sns.load_dataset("iris")
X = iris.drop(columns='species')
y = iris.species
#FunctionTransformer
def transform_something(iris):
    """Return a copy of ``iris`` with a derived ``sepal_sum`` column.

    The input frame is copied first, so the caller's data is left untouched.
    ``sepal_sum`` is the row-wise sum of the ``sepal_length`` and
    ``sepal_width`` columns.
    """
    iris = iris.copy()
    iris['sepal_sum'] = iris.sepal_length + iris.sepal_width
    return iris
# Temporary directory handed to the pipeline as its `memory` cache location.
cachedir = mkdtemp()
pipe = Pipeline(
    [
        # Intermediate Pipeline steps must implement fit/transform, so the
        # plain function is wrapped in a FunctionTransformer rather than
        # being passed bare.
        ('transformer', FunctionTransformer(transform_something)),
        ('umap', UMAP()),
        ('hdb', HDBSCAN()),
    ],
    memory=cachedir,
)
pipe.fit_predict(X)
If you run this, you will get a PicklingError, saying it cannot pickle the custom transformer. But I only need to cache the UMAP step. Any suggestions to make it work?
Not the cleanest, but you could nest pipelines?
pipe = Pipeline(
    [
        # The outer pipeline has no `memory`, so the custom step is never
        # cached. It is wrapped in a FunctionTransformer because Pipeline
        # steps must implement fit/transform.
        ('transformer', FunctionTransformer(transform_something)),
        # Only the nested pipeline is given `memory=cachedir`.
        (
            'the_rest',
            Pipeline(
                [
                    ('umap', UMAP()),
                    ('hdb', HDBSCAN()),
                ],
                memory=cachedir,
            ),
        ),
    ]
)
What also works is, instead of using the FunctionTransformer, writing your custom transform function from scratch like this:
from sklearn.base import BaseEstimator, TransformerMixin
class transform_something(BaseEstimator, TransformerMixin):
    """Stateless transformer that appends a ``sepal_sum`` column."""

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Learn nothing and return ``self``.

        ``y`` is accepted (and ignored) so the transformer also works when a
        pipeline is fitted with a target, in which case ``fit`` is called
        with two positional arguments.
        """
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``sepal_sum = sepal_length + sepal_width``."""
        X = X.copy()
        X['sepal_sum'] = X.sepal_length + X.sepal_width
        return X
Unfortunately it is a bit more code, but it is picklable.
Related
I'm using a StackingClassifier in sklearn, where I want the component models to be custom classifiers. In order to do this, I wanted to test it out with some dummy code where the custom classifier is exactly the same as an already existing model (KNN, in this example). However, this throws an error, and I'm not sure I understand why, so I'm looking for help with this. It's probably something fairly obvious (I'm new to writing custom classifiers and using ClassifierMixin), but I can't seem to figure out what I'm missing:
Code -- the baseline example without my custom class (works):
from sklearn.ensemble import StackingClassifier
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_breast_cancer
# Breast-cancer data, requested as pandas objects (as_frame=True).
X, y = load_breast_cancer(return_X_y=True, as_frame=True)
model = StackingClassifier(
    estimators=[
        ('tree', Pipeline(steps=[('tree', DecisionTreeClassifier(random_state=42))])),
        ('knn', Pipeline(steps=[('knn', KNeighborsClassifier())])),
    ]
)
model.fit(X, y)
Code -- with my custom class (doesn't work):
class MyOwnClassifier(ClassifierMixin):
    """Thin wrapper that delegates fitting and prediction to ``classifier``.

    NOTE: ``fit`` does not set a ``classes_`` attribute on the wrapper, which
    is the attribute named in the AttributeError reported for this snippet.
    """

    def __init__(self, classifier):
        self.classifier = classifier

    def fit(self, X, y):
        """Fit the wrapped classifier and return ``self``."""
        self.classifier.fit(X, y)
        return self

    def predict(self, X):
        """Return the wrapped classifier's predictions for ``X``."""
        return self.classifier.predict(X)

    def predict_proba(self, X):
        """Return the wrapped classifier's class probabilities for ``X``."""
        return self.classifier.predict_proba(X)
model = StackingClassifier(
    estimators=[
        ('tree', Pipeline(steps=[('tree', DecisionTreeClassifier(random_state=42))])),
        # Same layout as the baseline, with KNN wrapped in the custom class.
        ('knn', Pipeline(steps=[('knn', MyOwnClassifier(KNeighborsClassifier()))])),
    ]
)
model.fit(X, y)
returns the error
AttributeError: 'MyOwnClassifier' object has no attribute 'classes_'
What really puzzles me about this is that in this answer, an identity transform could be used as part of the pipeline, and I can't imagine that object had 'classes_' either.
You've got 3 problems with your code:
StackingClassifier expects an attribute classes_ to be available on a fitted classifier, which is clearly stated in the error message. The linked example does have it, whereas yours doesn't. It can be checked if you run like dir(MyOwnClassifier(KNeighborsClassifier()).fit(X,y)).
BaseEstimator is missing from your class definition (you can do without it, but its presence makes life easier)
The Pipelines in your code are extraneous clutter: they are not necessary to reproduce the problem and only complicate debugging.
Once you correct these problems you have a working code:
from sklearn.ensemble import StackingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.base import ClassifierMixin, BaseEstimator
X, y = load_breast_cancer(return_X_y=True, as_frame=True)
class MyOwnClassifier(ClassifierMixin, BaseEstimator):
    """Wrapper classifier that delegates all work to ``classifier``.

    After ``fit`` the wrapper exposes ``classes_``, copied from the wrapped
    classifier.
    """

    def __init__(self, classifier):
        self.classifier = classifier

    def fit(self, X, y):
        """Fit the wrapped classifier, publish its ``classes_`` and return ``self``."""
        self.classifier.fit(X, y)
        # Mirror the fitted attribute onto the wrapper itself.
        self.classes_ = self.classifier.classes_
        return self

    def predict(self, X):
        """Return the wrapped classifier's predictions for ``X``."""
        return self.classifier.predict(X)

    def predict_proba(self, X):
        """Return the wrapped classifier's class probabilities for ``X``."""
        return self.classifier.predict_proba(X)
model = StackingClassifier(
    estimators=[
        ('tree', DecisionTreeClassifier(random_state=42)),
        ('knn', MyOwnClassifier(classifier=KNeighborsClassifier())),
    ]
)
model.fit(X, y)
StackingClassifier(estimators=[('tree',
DecisionTreeClassifier(random_state=42)),
('knn',
MyOwnClassifier(classifier=KNeighborsClassifier()))])
I would like to create a class that takes a scikit-learn pipeline and loop over it (as in the code example below).
In the example below I can however only pass the instance of my pipeline to the class and not create a new one in order to start with a fresh model.
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_iris
from sklearn.pipeline import Pipeline
class my_class:
    """Repeatedly fit and score one model object on the same data."""

    def __init__(self, model):
        self.model = model

    def evaluate(self, X, y):
        """Fit ``self.model`` on ``(X, y)`` ten times and return the accuracies.

        Each round predicts on the same ``X`` it was fitted on and records
        ``accuracy_score`` against ``y``. The returned list has ten entries.
        """
        results = []
        for _ in range(10):
            self.model.fit(X, y)  # I always use the same instance here.
            y_pred = self.model.predict(X)
            results.append(accuracy_score(y_pred=y_pred, y_true=y))
        return results
iris = load_iris()
X, y = iris.data, iris.target
# Single-step pipeline wrapping an AdaBoost classifier with default settings.
pipeline = Pipeline(steps=[('classifier', AdaBoostClassifier())])
test = my_class(pipeline)
scores = test.evaluate(X, y)
Your code might be initializing different models, but the result will always be the same because you are training and testing on the same data X every time, without any change in the hyperparameters and with random_state left as None. That is why the results list will always contain the same value.
I am using an sklearn pipeline in my code and saving the pipeline object to deploy in another environment. I have one custom class to drop features. I am saving the model successfully, but when I use the pipeline object in another environment which has the same version of sklearn, it throws an error. The pipeline works fine when I do not include my custom class DropFeatures. Below is the code
from sklearn import svm
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.externals import joblib
# Load the Iris dataset
df = pd.read_csv('Iris.csv')
label = 'Species'
labels = df[label]
df.drop(['Species'],axis=1,inplace=True)
# Set up a pipeline with a feature selection preprocessor that
# selects the top 2 features to use.
# The pipeline then uses a RandomForestClassifier to train the model.
class DropFeatures(BaseEstimator, TransformerMixin):
    """Transformer that removes a fixed set of columns from a DataFrame.

    Parameters
    ----------
    features_to_drop : list of column labels or None
        Columns removed by ``transform``. ``None`` or an empty list leaves
        the input untouched.
    """

    def __init__(self, features_to_drop=None):
        # Stored under the constructor argument's own name so that
        # BaseEstimator.get_params() / clone() can look the value up.
        self.features_to_drop = features_to_drop

    @property
    def features(self):
        """Backward-compatible read-only alias for ``features_to_drop``."""
        return self.features_to_drop

    def fit(self, X, y=None):
        """Nothing to learn; return ``self``."""
        return self

    def transform(self, X):
        """Return ``X`` without the configured columns.

        When there is nothing to drop, ``X`` itself is returned. Otherwise a
        new frame is returned and ``X`` is not modified.
        """
        if self.features_to_drop is None or len(self.features_to_drop) == 0:
            return X
        return X.drop(self.features_to_drop, axis=1)
pipeline = Pipeline(
    steps=[
        ('drop_features', DropFeatures(['Id'])),
        ('feature_selection', SelectKBest(chi2, k=1)),
        ('classification', RandomForestClassifier()),
    ]
)
pipeline.fit(df, labels)
# NOTE(review): `query` is not defined anywhere in this snippet; confirm where it comes from.
print(pipeline.predict(query))
# Persist the fitted pipeline to disk.
joblib.dump(pipeline, 'model.joblib')
When I am using the model.joblib in another environment, I am getting an error. Below is the code to load the model and error in the image
from sklearn.externals import joblib
# NOTE(review): the training snippet dumps to 'model.joblib'; confirm that
# 'model1.joblib' is the intended file name in this environment.
model = joblib.load('model1.joblib')
print(model)
Error stack trace:
If I exclude my custom transformer the GridSearchCV runs fine, but with, it errors.
Here is a fake dataset:
import pandas
import numpy
from sklearn_pandas import DataFrameMapper
from sklearn_pandas import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.base import TransformerMixin
from sklearn.preprocessing import LabelBinarizer
from sklearn.ensemble import RandomForestClassifier
import sklearn_pandas
from sklearn.preprocessing import MinMaxScaler
df = pandas.DataFrame({"Letter":["a","b","c","d","a","b","c","d","a","b","c","d","a","b","c","d"],
"Number":[1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4],
"Label":["G","G","B","B","G","G","B","B","G","G","B","B","G","G","B","B"]})
class MyTransformer(TransformerMixin):
    """Transformer that doubles the ``Number`` column of a DataFrame."""

    def transform(self, x, **transform_args):
        """Return a copy of ``x`` whose ``Number`` column is multiplied by 2."""
        # Work on a copy: writing into the caller's frame would double
        # "Number" again on every later call with the same frame.
        x = x.copy()
        x["Number"] = x["Number"] * 2
        return x

    def fit(self, x, y=None, **fit_args):
        """Nothing to learn; return ``self``."""
        return self
# x_train is the same object as df; pop() removes "Label" from it in place
# and returns that column as the target.
x_train = df
y_train = x_train.pop("Label")
mapper = DataFrameMapper(
    [
        ("Number", MinMaxScaler()),
        ("Letter", LabelBinarizer()),
    ]
)
pipe = Pipeline(
    steps=[
        ("custom", MyTransformer()),
        ("mapper", mapper),
        ("classifier", RandomForestClassifier()),
    ]
)
param_grid = {
    "classifier__min_samples_split": [10, 20],
    "classifier__n_estimators": [2, 3, 4],
}
model_grid = sklearn_pandas.GridSearchCV(
    pipe, param_grid, verbose=2, scoring="accuracy"
)
model_grid.fit(x_train, y_train)
and the error is
list indices must be integers, not str
How can I make GridSearchCV work while there is a custom transformer in my pipeline?
I know this answer comes rather late, but I've encountered the same behavior with sklearn and BaseSearchCV derivative classes. The problem actually seems to stem from the _PartitionIterator class in the sklearn cross_validation module, as it makes the assumption that everything emitted from every TransformerMixin class in the pipeline is going to be array-like, and thus it generates slices of indices that are used to index incoming X args in an array-like manner. Here's the __iter__ method:
def __iter__(self):
    """Yield ``(train_index, test_index)`` pairs of integer index arrays.

    For every boolean test mask produced by ``self._iter_test_masks()``, the
    positions where the mask is true become the test indices and all other
    positions in ``range(self.n)`` become the train indices.
    """
    ind = np.arange(self.n)
    for test_index in self._iter_test_masks():
        # Invert the mask to select the training rows, then convert both
        # boolean masks to integer positions.
        train_index = np.logical_not(test_index)
        train_index = ind[train_index]
        test_index = ind[test_index]
        yield train_index, test_index
And the BaseSearchCV grid search metaclass calls cross_validation's _fit_and_score, which uses a method called safe_split. Here's the relevant line:
X_subset = [X[idx] for idx in indices]
This will absolutely produce unexpected results if X is a pandas dataframe, which you're emitting from your transform function.
There are two ways I've found to fix this:
Make sure to return an array from your transformer:
return x.as_matrix()
This is a hack. If the pipe of transformers demands the input to the next transformer be a DataFrame, as was my case, you can write a utilities script that is essentially the same as the sklearn grid_search module, but includes some clever validation methods that are called in the _fit method of the BaseSearchCV class:
def _validate_X(X):
    """Returns X if X isn't a pandas frame, otherwise
    the underlying matrix in the frame. """
    # `DataFrame.as_matrix()` was deprecated and later removed from pandas;
    # `.values` returns the same underlying array on old and new versions.
    if isinstance(X, pd.DataFrame):
        return X.values
    return X
def _validate_y(y):
    """Returns y if y isn't a pandas object, otherwise a 1-D array.

    * ``None`` is returned unchanged.
    * A ``pd.Series`` is converted to a NumPy array.
    * A single-column ``pd.DataFrame`` is converted to a NumPy array of that
      column, matching the Series case.
    * Anything else is returned as-is.

    Raises:
        ValueError: if ``y`` is a DataFrame with more than one column.
    """
    if y is None:
        return y
    if isinstance(y, pd.Series):
        return np.array(y.tolist())
    if isinstance(y, pd.DataFrame):
        # A target must be one-dimensional: reject multi-column frames.
        if y.shape[1] > 1:
            raise ValueError('matrix provided as y')
        return np.array(y[y.columns[0]].tolist())
    # bail and let the sklearn function handle validation
    return y
As an example, here's my "custom grid_search module".
Short version: pandas and scikit-learn's cross validation methods didn't like to talk in that way (in my version, 0.15); this may be fixed simply by updating scikit-learn to 0.16/stable or 0.17/dev.
The GridSearchCV class validates the data and converts it to an array (so that it can perform CV splits correctly). So you don't get to use Pandas DataFrame features inside of built-in cross validation loops.
You will have to make your own cross-validation routines that don't do the validation if you want to do this kind of thing.
EDIT: This is my experience with scikit-learn's cross validation routines. It is why sklearn-pandas provides cross_val_score. However, so far as I can tell, GridSearchCV is not specialized by sklearn-pandas; your import of it accidentally imports the default sklearn version. Therefore, you may have to implement your own grid search using ParameterGrid and sklearn-pandas's cross_val_score.
How do you call partial_fit() on a scikit-learn classifier wrapped inside a Pipeline()?
I'm trying to build an incrementally trainable text classifier using SGDClassifier like:
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.multiclass import OneVsRestClassifier
classifier = Pipeline([
('vectorizer', HashingVectorizer(ngram_range=(1,4), non_negative=True)),
('tfidf', TfidfTransformer()),
('clf', OneVsRestClassifier(SGDClassifier())),
])
but I get an AttributeError trying to call classifier.partial_fit(x,y).
It supports fit(), so I don't see why partial_fit() isn't available. Would it be possible to introspect the pipeline, call the data transformers, and then directly call partial_fit() on my classifier?
Here is what I'm doing - where 'mapper' and 'clf' are the 2 steps in my Pipeline obj.
def partial_pipe_fit(pipeline_obj, df, classes=None):
    """Transform ``df`` with the 'mapper' step and partially fit the 'clf' step.

    Args:
        pipeline_obj: object whose ``named_steps`` mapping holds a 'mapper'
            step (with ``fit_transform``) and a 'clf' step (with
            ``partial_fit``).
        df: data batch; the target is read from ``df['class']``.
        classes: optional list of all class labels. When given, it is
            forwarded to ``clf.partial_fit`` (classifiers such as
            SGDClassifier need it on the first call). When ``None`` the
            keyword is not passed at all.
    """
    # NOTE: the mapper is re-fitted on every batch via fit_transform.
    X = pipeline_obj.named_steps['mapper'].fit_transform(df)
    Y = df['class']
    clf = pipeline_obj.named_steps['clf']
    if classes is None:
        clf.partial_fit(X, Y)
    else:
        clf.partial_fit(X, Y, classes=classes)
You probably want to keep track of performance as you keep adjusting/updating your classifier - but that is a secondary point
so more specifically - the original pipeline(s) were constructed as follows
to_vect = Pipeline([('vect', CountVectorizer(min_df=2, max_df=.9, ngram_range=(1, 1), max_features = 100)),
('tfidf', TfidfTransformer())])
full_mapper = DataFrameMapper([
('norm_text', to_vect),
('norm_fname', to_vect), ])
full_pipe = Pipeline([('mapper', full_mapper), ('clf', SGDClassifier(n_iter=15, warm_start=True,
n_jobs=-1, random_state=self.random_state))])
google DataFrameMapper to learn more about it - but here it just enables a transformation step that plays nice with pandas
Pipeline does not use partial_fit, hence does not expose it. We would probably need a dedicated pipelining scheme for out-of-core computation but that also depends on the capabilities of the previous models.
In particular in this case you would probably want to do several passes over your data, one to fit each stage of the pipeline and then to transform the dataset to fit the next one, except for the first stage which is stateless, hence does not fit parameters from the data.
In the mean time it's probably easier to roll your own wrapper code tailored to your needs.
Even though this question is 8 years old, it is still very relevant and has not been updated for quite some time now.
As a matter of fact, there is now a nice package created by Vincent Warmerdam called tokenwiser.
It is mostly used for NLP work that needs to fit within the sklearn infrastructure. However, its main building block can be used even for non-NLP tasks.
The package provides a PartialPipeline boilerplate and documentation.
Example here:
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import HashingVectorizer
from tokenwiser.textprep import Cleaner, Identity, HyphenTextPrep
from tokenwiser.pipeline import PartialPipeline, PartialFeatureUnion
pipe = PartialPipeline([
("clean", Cleaner()),
("union", PartialFeatureUnion([
("full_text_pipe", PartialPipeline([
("identity", Identity()),
("hash1", HashingVectorizer()),
])),
("hyphen_pipe", PartialPipeline([
("hyphen", HyphenTextPrep()),
("hash2", HashingVectorizer()),
]))
])),
("clf", SGDClassifier())
])
X = [
"i really like this post",
"thanks for that comment",
"i enjoy this friendly forum",
"this is a bad post",
"i dislike this article",
"this is not well written"
]
y = np.array([1, 1, 1, 0, 0, 0])
# Feed the same batch three times; every call passes the full label set.
for loop in range(3):
    pipe.partial_fit(X, y, classes=[0, 1])
I can imagine this template working even for non-NLP-related stuff. Hope someone will find this super useful.
I also propose my basic implementation of utilizing partial_fit within a sklearn pipeline.
We just need to use a model that allows for partial fitting (e.g. SGDRegressor, xgboost, etc.) and create our own sklearn-compatible classes
(Huge KUDOS to Vincent Warmerdam, who started this in his TOKENWISER project)
import xgboost as xgb
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklego.preprocessing import PatsyTransformer
class xgboost_partial_trainer(BaseEstimator, TransformerMixin):
    """
    Allows for incremental training of an xgboost model within a sklearn pipeline.

    Every ``partial_fit`` call continues training the booster kept in
    ``trained_model`` on the batch it receives. On the first call a split of
    that batch is stored, and its test part is evaluated on every later call
    under the name "eval_fixed".
    """

    def __init__(self, training_params: dict = None):
        self.training_params = training_params
        self.trained_model = None
        self._first_call = True
        # NOTE(review): never passed to xgb.train, so it stays empty.
        self.evals_result = {}
        self.iter_number = 1
        self._X_train = None
        self._X_test = None
        self._y_train = None
        self._y_test = None

    def partial_fit(self, X, y=None, classes=None, **fit_params):
        """Continue training on the batch ``(X, y)`` and return ``self``.

        ``classes`` and ``fit_params`` are accepted for interface
        compatibility and are not used.
        """
        print(f"firts run: {self._first_call}, n_iter = {self.iter_number}")
        self.iter_number += 1

        if self._first_call:
            # Select random subset of data and store within the model
            # (for error loss over time).
            (
                self._X_train,
                self._X_test,
                self._y_train,
                self._y_test,
            ) = train_test_split(X, y, test_size=0.6, random_state=1)
            self._xg_train = xgb.DMatrix(self._X_train, label=self._y_train)
            self._xg_test = xgb.DMatrix(self._X_test, label=self._y_test)
            # The first batch is still trained on in full.
            dtrain = xgb.DMatrix(X, y)
        else:
            self._xg_train = xgb.DMatrix(X, y)
            dtrain = self._xg_train

        # Validation sets to watch performance: same testing data,
        # changeable training data.
        self.watchlist = [
            (self._xg_train, "train_batch"),
            (self._xg_test, "eval_fixed"),
        ]
        # Passing the previous booster as xgb_model continues its training.
        self.trained_model = xgb.train(
            params=self.training_params,
            dtrain=dtrain,
            xgb_model=self.trained_model,
            evals=self.watchlist,
        )
        # Only switch off the first-batch path once training has succeeded.
        self._first_call = False
        return self

    def predict(self, X, y=None, **fit_params):
        """Return the trained booster's predictions for ``X``."""
        return self.trained_model.predict(xgb.DMatrix(X))

    def transform(self, X, y=None, **fit_params):
        """Same output as ``predict``, exposed under the transformer API."""
        return self.predict(X)

    def fit(self, X, y=None, **fit_params):
        """No-op that returns ``self``; training happens in ``partial_fit``."""
        return self
class PartialPipeline(Pipeline):
    """
    Pipeline that can additionally be trained batch by batch.

    Arguments:
        steps: list of ``(name, estimator)`` pairs, as for ``Pipeline``
    """

    def partial_fit(self, X, y=None, classes=None, **kwargs):
        """
        Fits the components, but allow for batches.

        Each step is handled by the first method it offers, in this order:
        ``partial_fit``, ``fit_transform``, ``transform``, ``fit``. The data
        passed on to the next step is the output of the current step.

        Arguments:
            X: batch of input data
            y: batch of targets, handed to ``partial_fit`` / ``fit``
            classes: all class labels; forwarded only to steps that have a
                ``predict`` method, and only when not ``None``
            **kwargs: extra keyword arguments for every ``partial_fit`` call
        """
        last = len(self.steps) - 1
        for position, (_, step) in enumerate(self.steps):
            if hasattr(step, "partial_fit"):
                step_kwargs = kwargs
                # Transformers' partial_fit usually has no `classes` keyword,
                # so it is only handed to predictors.
                if classes is not None and hasattr(step, "predict"):
                    step_kwargs = dict(kwargs, classes=classes)
                step.partial_fit(X, y, **step_kwargs)
                # An intermediate step must still pass transformed data on.
                if position != last and hasattr(step, "transform"):
                    X = step.transform(X)
            elif hasattr(step, "fit_transform"):
                X = step.fit_transform(X)
            elif hasattr(step, "transform"):
                X = step.transform(X)
            elif hasattr(step, "fit"):
                # fit() returns the estimator, so its result must not
                # replace the data.
                step.fit(X, y)
        return self
Once we have these sklearn classes we may utilize the Pipeline:
my_pipeline = PartialPipeline([
    ("patsy", PatsyTransformer(FORMULA2)),
    ("xgboost_model", xgboost_partial_trainer(training_params=params)),
])
# Read the CSV in chunks of 5,000 rows and train on one chunk at a time.
df_chunked = pd.read_csv(your_date, chunksize=5_000)
for df in df_chunked:
    my_pipeline.partial_fit(df, y=df["speed"])
Please, provide me with feedback and code cleaning suggestions. I am fully aware that this is not perfect. However, as a nice prototype - not too bad!