I have trained an SVM classifier and got good results. Now I need to save that model, load it whenever I want, and use it to predict new, unseen data.
This is my code; what should I add?
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import f_classif
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest
import pandas as pd
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
# Hyper-parameter search space handed to GridSearchCV: regularisation
# strength C, kernel coefficient gamma and kernel type.
param_grid = dict(
    C=[0.1, 1, 10, 100],
    gamma=[1, 0.1, 0.01, 0.001],
    kernel=['linear', 'rbf', 'poly', 'sigmoid'],
)
def select_features(X_train, y_train, X_test, k='all'):
    """Fit an ANOVA F-test feature selector on the training data and apply it.

    The selector is fitted on the training split only and then used to
    transform both splits, so the test split never influences the selection.

    Args:
        X_train: training feature matrix.
        y_train: training class labels.
        X_test: test feature matrix with the same columns as ``X_train``.
        k: value forwarded to ``SelectKBest(k=...)``; the default ``'all'``
            matches the previous hard-coded behaviour.

    Returns:
        A tuple ``(X_train_fs, X_test_fs, fs)``: the transformed training
        matrix, the transformed test matrix and the fitted selector. Keep
        ``fs`` around: new data must go through the same ``fs.transform``
        before it is passed to a model trained on ``X_train_fs``.
    """
    fs = SelectKBest(score_func=f_classif, k=k)
    fs.fit(X_train, y_train)
    X_train_fs = fs.transform(X_train)
    X_test_fs = fs.transform(X_test)
    return X_train_fs, X_test_fs, fs
# Load the dataset and separate the feature matrix from the label vector.
# NOTE(review): features are columns 0-206 and the label is column 208, so
# column 207 is never used -- confirm this is intentional.
data = pd.read_csv('d:/f.csv')
d = data.values
X, y = d[:, 0:207], d[:, 208]

# Hold out 33% of the rows for the final evaluation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=1
)
X_train_fs, X_test_fs, fs = select_features(X_train, y_train, X_test)

# Grid search over param_grid; refit=True is requested so that `model` can be
# used for prediction directly after fitting.
model = GridSearchCV(
    SVC(),
    param_grid,
    refit=True,
    verbose=2,
    n_jobs=-1,
)
model.fit(X_train_fs, y_train)
y_pred = model.predict(X_test_fs)

# Report the chosen hyper-parameters and the performance on the held-out rows.
print('Best GridSearchCV parameters: ', model.best_params_)
print("Accuracy: ", accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
Short answer:
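SVC does not have a load or save function. However, its hyperparameters are stored in a dict, which means you can save them as JSON. Afterwards you can load the dict and re-instantiate the SVM with the set_params method. Note that this restores only the hyperparameters, not the fitted model; to keep the trained model itself, serialize the fitted estimator object (e.g. with pickle or joblib).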
from sklearn.svm import SVC
# NOTE: loaded_params holds hyperparameters only. An estimator rebuilt this
# way is not fitted and has to be trained again before predict(); to keep a
# trained model, serialize the fitted estimator object itself instead
# (e.g. with pickle or joblib).
svc = SVC()
# set_params accepts keyword arguments only, so the loaded dict is unpacked.
svc.set_params(**loaded_params)
Is that your question?
Related
I have tried to evaluate different machine learning models and I am facing an error. The error points to the line cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring='accuracy'), where kfold = StratifiedShuffleSplit(n_splits=2, random_state=2, test_size=.25).
If anyone knows where the problem is, please let me know.
# comparing algorithms and training models
from pandas import read_csv
from matplotlib import pyplot
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold, StratifiedKFold, StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
# Spot Check Algorithms
models = [
    ('KNN', KNeighborsClassifier(n_neighbors=19)),
    ('Decision Tree', DecisionTreeClassifier(min_samples_leaf=60)),
    ('Naive Bayes', GaussianNB()),
    ('Random Forest', RandomForestClassifier(
        n_estimators=80, max_depth=3, random_state=0, min_samples_leaf=9)),
]

# Estimators need X shaped (n_samples, n_features) and y as a 1-D vector with
# one label per sample. Reshaping X to one row per scalar value (658448 rows,
# i.e. 5879 samples * 112 values) makes the sample counts of X and y disagree
# and cross_val_score raises. Flatten the labels first, then give every
# sample exactly one row of features.
# NOTE(review): assumes a single label per sample (5879 train / 1960 test) --
# confirm against the code that creates these arrays.
Y_train = Y_train.reshape(-1).astype('int')
Y_test = Y_test.reshape(-1).astype('int')
X_train = X_train.reshape(Y_train.shape[0], -1).astype('int')
# NOTE(review): X_test is not cast to int although X_train is -- verify that
# this asymmetry is intended before X_test is used for prediction.
X_test = X_test.reshape(Y_test.shape[0], -1)

# evaluate each model in turn
results = []
names = []
# The splitter has a fixed random_state, so one instance can be shared by
# every model and all of them are scored on the same folds.
kfold = StratifiedKFold(n_splits=3, random_state=1, shuffle=True)
for name, model in models:
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    print('%s: %f (%f)' % (name, cv_results.mean(), cv_results.std()))

# Compare Algorithms
pyplot.boxplot(results, labels=names)
pyplot.title('Algorithm Comparison')
pyplot.show()
I have built a random forest classifier using scikit-learn and Python, and I am having trouble actually feeding data in to see the predictions. I want to see the format of the output and convert it to a JSON file. I have attached the code for the random forest and what the data looks like. I believe I need to use 'y_pred', but I am not sure what format the input data needs to be in.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Features are the columns at positions 2 and 3; the target is the "pages" column.
X = dataset.iloc[:, 2:4].values
y = dataset["pages"]

# Keep 20% of the rows aside for testing (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Fit the scaler on the training split and apply it to both splits.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Train the forest and predict the held-out rows; y_pred holds one predicted
# "pages" value per row of X_test.
classifier = RandomForestClassifier(n_estimators=20, random_state=0).fit(X_train, y_train)
y_pred = classifier.predict(X_test)
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
You can simply concatenate the predicted values with the matrix of features.
Also note that the pipeline is exactly for this purpose, when you first want to transform the data and then apply some classifier.
This should work for you:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
# Features are the columns at positions 2 and 3; the target is the "pages" column.
X = dataset.iloc[:, 2:4].values
y = dataset["pages"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# The pipeline chains the scaler and the classifier, so fit() and predict()
# apply the scaling step before the forest without separate calls.
classifier = make_pipeline(StandardScaler(), RandomForestClassifier(n_estimators=20, random_state=0))
classifier = classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# X_test is a NumPy array (X was taken with .values) and pd.concat only
# accepts pandas objects, so it is wrapped in a DataFrame first. Both pieces
# then carry a default 0..n-1 index, which keeps every prediction on the row
# of the features it was computed from.
pred = pd.concat(
    [
        pd.DataFrame(X_test, columns=dataset.columns[2:4]),
        pd.Series(y_pred, name="pages"),
    ],
    axis=1,
)
Code #1 Pass a pipeline with PCA and XGBClassifier steps to scikit-learn's cross_validate function
from xgboost import XGBClassifier
from sklearn.model_selection import cross_validate, LeaveOneOut
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
import random

import numpy as np

# Seed both global random number generators.
random.seed(42)
np.random.seed(42)

# Arguments forwarded to cross_validate: leave-one-out splitting over X and y,
# with n_jobs=-1.
kwargs = dict(
    n_jobs=-1,
    cv=LeaveOneOut(),
    X=X,
    y=y,
)

# Project onto a single principal component, then classify with XGBoost.
pca_step = ('pca', PCA(1, random_state=42))
xgb_step = ('xgbc', XGBClassifier(random_state=42))
pipe = Pipeline([pca_step, xgb_step])

results = cross_validate(pipe, **kwargs)
# Mean of the per-fold test scores.
print(results['test_score'].mean())
Code #2 Write the cross-validation loop by hand and calculate the mean accuracy for exactly the same input X as Code #1
from xgboost import XGBClassifier
from sklearn.model_selection import LeaveOneOut
from sklearn.decomposition import PCA
import random
random.seed(42)
import numpy as np
np.random.seed(42)

# Hand-written cross-validation over the splits produced by LeaveOneOut;
# the score of every split is collected in `acc`.
acc = []
for train_idx, test_idx in LeaveOneOut().split(X, y):
    x_train, x_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # Fit the projection on the training part only, then apply it to both
    # parts so the held-out data never influences the component.
    pca = PCA(1, random_state=42)
    pca.fit(x_train)
    x_train = pca.transform(x_train)
    x_test = pca.transform(x_test)

    # A fresh classifier per split, trained on the projected training part.
    model = XGBClassifier(random_state=42, n_jobs=-1)
    model.fit(x_train, y_train)

    score = model.score(x_test, y_test)
    acc.append(score)

# Mean score over all splits.
print(np.mean(acc))
I am running the [code] for multi-label classification. How do I fix the NameError saying that "X_train" is not defined? The Python code is given below.
import scipy
from scipy.io import arff
# NOTE(review): `data` and `meta` are loaded here but not used below; the
# classifier is trained on the generated dataset instead -- confirm intent.
data, meta = scipy.io.arff.loadarff('./yeast/yeast-train.arff')
from sklearn.datasets import make_multilabel_classification
from sklearn.model_selection import train_test_split
# this will generate a random multi-label dataset
X, y = make_multilabel_classification(sparse = True, n_labels = 20,
                                      return_indicator = 'sparse', allow_unlabeled = False)
# Split the generated data so that X_train/X_test/y_train/y_test exist;
# without this step the fit() call below fails with a NameError.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
# using binary relevance
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.naive_bayes import GaussianNB
# initialize binary relevance multi-label classifier
# with a gaussian naive bayes base classifier
classifier = BinaryRelevance(GaussianNB())
# train
classifier.fit(X_train, y_train)
# predict
predictions = classifier.predict(X_test)
from sklearn.metrics import accuracy_score
# Print the score so it is visible when the file runs as a script.
print(accuracy_score(y_test,predictions))
You forgot to split the dataset into train and test sets.
Import the library
from sklearn.model_selection import train_test_split
Add this line before classifier.fit()
# Reserve 33% of the samples for testing; random_state is fixed at 42.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)
X_train does not exist; you have to split the data into train and test sets before any step that uses it, such as the scaling below:
from sklearn.preprocessing import StandardScaler
s = StandardScaler()
# Learn the scaling statistics from the training split only ...
X_train = s.fit_transform(X_train)
# ... and reuse exactly those statistics for the test split. Calling
# fit_transform here would re-fit the scaler on the test data, so the two
# splits would be scaled differently.
X_test = s.transform(X_test)
I am trying to use the SVC class from sklearn.
However I have this error when I run my program:
ValueError: Unknown label type: 'continuous'
I do not know if there is a separate library for a Support Vector Regressor; this is the only one I have found so far. Here is my code:
import sklearn
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
from sklearn.svm import SVC
# Load the data and split it into train and test parts (fixed seed).
X, Y = get_data(filename)
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, random_state=33
)

# Fit a support vector classifier with default settings.
svc = SVC()
svc.fit(X_train, y_train)

# Score on the training split first, then on the held-out split.
train_score = svc.score(X_train, y_train)
print(train_score)
test_score = svc.score(X_test, y_test)
print(test_score)
Thanks.
SVC is a classifier, so it does not support continuous values in targets. What you need is SVR. Just replace all occurrences of SVC with SVR and you are good to go.
from sklearn.svm import SVR
# Support vector regressor with default settings, fitted on the training split.
svr = SVR().fit(X_train, y_train)

# Report the score on the training split, then on the test split.
for features, targets in ((X_train, y_train), (X_test, y_test)):
    print(svr.score(features, targets))