I have this code. Can I save 'pca' and 'svm_clf' to one file by using joblib.dump? If not, is there any other way to do this?
from sklearn.svm import SVC
from sklearn.externals import joblib
from sklearn import decomposition
from sklearn import svm
X = [[1,3,4,5,6,7,8,7,0],[1,3,4,5,6,7,9,5,1]]
Y = [1,0]
#pca
pca = decomposition.PCA(0.98)
X_new = pca.fit_transform(X)
svm_clf = svm.LinearSVC()  # note: this is immediately overwritten by the SVC below
#svm model
svm_clf = SVC(C=1, kernel='rbf')
svm_clf.fit(X_new, Y)
#save model: dumping both to the same filename means the second call overwrites the first
joblib.dump(pca, 'model.sav')
joblib.dump(svm_clf, 'model.sav')
You can simply use:
joblib.dump([pca, svm_clf], 'model.sav', compress=1)
And then load the two models back like this:
pca, svm_clf = joblib.load('model.sav')
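After loading, the two estimators are applied in the same order as before. A small sketch (assuming the same toy data shape as in the question):
# Sketch: apply the loaded models in the order they were fitted.
X_query = [[1, 3, 4, 5, 6, 7, 8, 7, 0]]
X_query_reduced = pca.transform(X_query)   # project with the saved PCA
print(svm_clf.predict(X_query_reduced))    # classify with the saved SVC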
A nicer way, if you always use these two models together, is to define a pipeline and dump the pipeline instead:
from sklearn.pipeline import make_pipeline
pipeline = make_pipeline(pca, svm_clf)
joblib.dump(pipeline, 'model.sav')
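Loading that single file later restores both steps at once. A small sketch of the round trip, assuming the same filename as above (in recent scikit-learn versions, joblib is imported directly rather than from sklearn.externals):
# Sketch: reload the saved pipeline and predict in one call.
import joblib  # newer scikit-learn: use the standalone joblib package

pipeline = joblib.load('model.sav')
print(pipeline.predict([[1, 3, 4, 5, 6, 7, 9, 5, 1]]))  # PCA transform + SVC predict chained internally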
I am using the iris flower dataset to do the classification. I need to build a confusion matrix through cross-validation (fold = 10), but I don't know how to do it. So far I have only generated the confusion matrix for a single split.
# I am using TPOT autoML library for python
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator
from sklearn.preprocessing import LabelEncoder
tpot_data = pd.read_csv('iris.csv')
tpot_data = tpot_data.apply(LabelEncoder().fit_transform)
features = tpot_data.drop('species', axis=1).values
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['species'].values, random_state=10)
exported_pipeline = make_pipeline(
    StackingEstimator(estimator=GaussianNB()),
    MultinomialNB(alpha=0.01, fit_prior=False)
)
exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
from sklearn import metrics
print("Accuracy:", metrics.accuracy_score(testing_target, results))
pd.crosstab(testing_target, results, rownames=['Actual Class'], colnames=['Predicted Class'])
from sklearn.model_selection import cross_val_score
array_cross_val_score = cross_val_score(estimator=exported_pipeline, X=training_features,
                                        y=training_target, cv=10, scoring='accuracy')
# I would like a confusion matrix that covers all 10 cross-validation folds, not just one split
np.mean(array_cross_val_score)
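A minimal sketch of one common way to get a single confusion matrix across all 10 folds (not part of the original post): use cross_val_predict so every sample receives an out-of-fold prediction, then tabulate those predictions. The names reuse the variables defined above.
# Sketch: aggregate a confusion matrix over the 10 folds via out-of-fold predictions.
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix

cv_predictions = cross_val_predict(exported_pipeline, training_features,
                                   training_target, cv=10)
print(confusion_matrix(training_target, cv_predictions))
# Or as a labelled table, mirroring the crosstab above:
print(pd.crosstab(training_target, cv_predictions,
                  rownames=['Actual Class'], colnames=['Predicted Class']))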
I am using imbalanced-learn to oversample my data. I want to know how many entries in each class there are after using the oversampling method.
This code works nicely:
from imblearn.over_sampling import SMOTE
from collections import Counter

def oversample(x_values, y_values):
    oversampler = SMOTE(random_state=42, n_jobs=-1)
    x_oversampled, y_oversampled = oversampler.fit_resample(x_values, y_values)
    print("Oversampling training set from {0} to {1} using {2}".format(
        dict(Counter(y_values)), dict(Counter(y_oversampled)), oversampling_method))
    return x_oversampled, y_oversampled
But I switched to using a pipeline so I can use GridSearchCV to find the best oversampling method (out of ADASYN, SMOTE and BorderlineSMOTE). Because of that I never call fit_resample myself, so I lose that output. The pipeline looks like this:
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
pipe = Pipeline([('scaler', MinMaxScaler()), ('sampler', SMOTE(random_state=42, n_jobs=-1)), ('estimator', RandomForestClassifier())])
pipe.fit(x_values, y_values)
The oversampling works, but I no longer get the output showing how many entries there are for each class in the training set.
Is there a way to get output similar to the first example while using a pipeline?
In theory, yes. When an over-sampler is fitted, an attribute sampling_strategy_ is created, containing the number of samples to be generated for the minority class(es) when fit_resample is invoked. You can use it to get output similar to your example above. Here is a modified example based on your code:
# Imports
from collections import Counter
from sklearn.datasets import make_classification
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
# Create toy dataset
X, y = make_classification(weights=[0.20, 0.80], random_state=0)
init_class_distribution = Counter(y)
min_class_label, _ = init_class_distribution.most_common()[-1]
print(f'Initial class distribution: {dict(init_class_distribution)}')
# Create and fit pipeline
pipe = Pipeline([('scaler', MinMaxScaler()), ('sampler', SMOTE(random_state=42, n_jobs=-1)), ('estimator', RandomForestClassifier(random_state=23))])
pipe.fit(X, y)
sampling_strategy = dict(pipe.steps).get('sampler').sampling_strategy_
expected_n_samples = sampling_strategy.get(min_class_label)
print(f'Expected number of generated samples: {expected_n_samples}')
# Fit and resample over-sampler pipeline
sampler_pipe = Pipeline(pipe.steps[:-1])
X_res, y_res = sampler_pipe.fit_resample(X, y)
actual_class_distribution = Counter(y_res)
print(f'Actual class distribution: {actual_class_distribution}')
For a text classification project I made a pipeline for the feature selection and the classifier. My question now is whether it is possible to also include the feature extraction module in the pipeline, and how. I looked up some examples, but they don't seem to fit my current code.
This is what I have now:
# feature_extraction module.
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_extraction import DictVectorizer
import numpy as np
vec = DictVectorizer()
X = vec.fit_transform(instances)
scaler = StandardScaler(with_mean=False) # we use cross validation, no train/test set
X_scaled = scaler.fit_transform(X) # To make sure everything is on the same scale
enc = LabelEncoder()
y = enc.fit_transform(labels)
# Feature selection and classification pipeline
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn import linear_model
from sklearn.pipeline import Pipeline
feat_sel = SelectKBest(mutual_info_classif, k=200)
clf = linear_model.LogisticRegression()
pipe = Pipeline([('mutual_info', feat_sel), ('logistregress', clf)])
y_pred = model_selection.cross_val_predict(pipe, X_scaled, y, cv=10)
How can I put the steps from the DictVectorizer up to the LabelEncoder into the pipeline?
Here's how you would do it. Assuming instances is an iterable of dict-like objects (feature-to-value mappings), as specified in the DictVectorizer API, just build your pipeline like so:
pipe = Pipeline([('vectorizer', DictVectorizer()),
                 ('scaler', StandardScaler(with_mean=False)),
                 ('mutual_info', feat_sel),
                 ('logistregress', clf)])
Then, to predict, call cross_val_predict, passing instances as X:
y_pred = model_selection.cross_val_predict(pipe, instances, y, cv=10)
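One detail worth keeping in mind (an addition, not part of the original answer): a Pipeline only transforms X, so the LabelEncoder for the target stays outside it, exactly as in your current code:
# Sketch: the target labels are still encoded separately, outside the pipeline.
enc = LabelEncoder()
y = enc.fit_transform(labels)  # 'labels' as defined in the question
y_pred = model_selection.cross_val_predict(pipe, instances, y, cv=10)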
For a machine learning experiment I have to perform feature selection. I have no division into training and test sets because I use 10-fold cross-validation. Someone told me that the feature selection has to be done per fold, but I have no idea how to do that. Here is part of my code.
vec = DictVectorizer()
X = vec.fit_transform(instances) # No train/ test set, because we'll use 10-fold cross validation
scaler = StandardScaler(with_mean=False)
X_scaled = scaler.fit_transform(X) # To make sure everything is on the same scale
enc = LabelEncoder()
y = enc.fit_transform(labels)
#feature selection
from sklearn.feature_selection import SelectKBest, mutual_info_classif
feat_sel = SelectKBest(mutual_info_classif, k=200)
X_fs = feat_sel.fit_transform(X_scaled, y)
#train a classifier
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
y_pred = model_selection.cross_val_predict(clf, X_fs, y, cv=10)
Can someone help me with the selection per fold?
Answering the second question that you posted: you can use cross-validation and see the results. Do the following:
from sklearn.feature_selection import SelectKBest, mutual_info_classif, RFECV
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
import numpy as np
feat_sel = SelectKBest(mutual_info_classif, k=200)
clf = MultinomialNB()
pipe = Pipeline([('mutual_info', feat_sel), ('naive_bayes', clf)])
scores = cross_val_score(pipe, X_scaled, y, cv=10, scoring='accuracy')
print(np.mean(scores))
The key is the Pipeline: joining the feature selector and the classifier into a single pipeline and cross-validating that pipeline means the feature selection is refit on the training part of every fold, which is exactly the per-fold selection you were asked for.
Reference: http://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html
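If you prefer per-sample predictions rather than fold accuracies, a minimal sketch (reusing the pipe defined above) is to call cross_val_predict on the pipeline, so SelectKBest is refit inside every training fold:
# Sketch: out-of-fold predictions with per-fold feature selection.
from sklearn.model_selection import cross_val_predict

y_pred = cross_val_predict(pipe, X_scaled, y, cv=10)
print(classification_report(y, y_pred))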
I use the dataset from UCI repo: http://archive.ics.uci.edu/ml/datasets/Energy+efficiency
Then I do the following:
from pandas import *
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.cross_validation import train_test_split  # note: in newer scikit-learn this lives in sklearn.model_selection
dataset = read_excel('/Users/Half_Pint_boy/Desktop/ENB2012_data.xlsx')
dataset = dataset.drop(['X1','X4'], axis=1)
trg = dataset[['Y1','Y2']]
trn = dataset.drop(['Y1','Y2'], axis=1)
Then I define the models and split the data:
models = [LinearRegression(),
RandomForestRegressor(n_estimators=100, max_features ='sqrt'),
KNeighborsRegressor(n_neighbors=6),
SVR(kernel='linear'),
LogisticRegression()
]
Xtrn, Xtest, Ytrn, Ytest = train_test_split(trn, trg, test_size=0.4)
I'm creating regression models for predicting values but I have a problem. Here is the code:
TestModels = DataFrame()
tmp = {}
for model in models:
    m = str(model)
    tmp['Model'] = m[:m.index('(')]
    for i in range(Ytrn.shape[1]):
        model.fit(Xtrn, Ytrn[:, i])
        tmp[str(i + 1)] = r2_score(Ytest[:, 0], model.predict(Xtest))
    TestModels = TestModels.append([tmp])
TestModels.set_index('Model', inplace=True)
It raises TypeError: unhashable type: 'slice' for the line model.fit(Xtrn, Ytrn[:, i]).
How can this be avoided and made to work?
Thanks!
I think that I had a similar problem before! Try converting your data to NumPy arrays before feeding it to the sklearn estimators; that will most probably solve the problem with the unhashable slice. For instance, you can do:
Xtrn_array = Xtrn.as_matrix()
Ytrn_array = Ytrn.as_matrix()
and use Xtrn_array and Ytrn_array (and likewise converted Xtest and Ytest) when you fit and evaluate the estimators.
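Note that as_matrix() has since been removed from pandas (1.0+); a hedged equivalent with current pandas is .to_numpy() (or .values):
# Sketch: modern replacement for as_matrix(); positional slicing then works as expected.
Xtrn_array = Xtrn.to_numpy()
Ytrn_array = Ytrn.to_numpy()
Xtest_array = Xtest.to_numpy()
Ytest_array = Ytest.to_numpy()

# e.g. model.fit(Xtrn_array, Ytrn_array[:, i])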