ValueError: Found input variables with inconsistent numbers of samples: [658448, 5879] - python

I have tried to evalute the different Machine Learning model and facing this error. The error shows that cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring='accuracy') and the kfold = StratifiedShuffleSplit(n_splits=2, random_state=2, test_size=.25)
If anyone know where is the problem please let me know about it.
# comparing algorithms and training models
from pandas import read_csv
from matplotlib import pyplot
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold, StratifiedKFold, StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
# Spot Check Algorithms
models = []
models.append(('KNN', KNeighborsClassifier(n_neighbors=19)))
models.append(('Decision Tree', DecisionTreeClassifier(min_samples_leaf=60)))
models.append(('Naive Bayes', GaussianNB()))
models.append(('Random Forest', RandomForestClassifier(n_estimators=80, max_depth=3,random_state=0,min_samples_leaf=9)))
X_train = X_train.reshape(658448,-1)
Y_train = Y_train.reshape(5879,-1)
Y_test = Y_test.reshape(1960,1)
X_test = X_test.reshape(219520,-1)
Y_test = Y_test.astype('int')
X_train = X_train.astype('int')
Y_train = Y_train.astype('int')
type(Y_train )
type(X_train)
# evaluate each model in turn
results = []
names = []
for name, model in models:
kfold = StratifiedKFold(n_splits=3, random_state=1, shuffle=True)
#kfold = StratifiedShuffleSplit(n_splits=2, random_state=2, test_size=.25)
#kf = KFold(n_splits=5, random_state=3,shuffle=True )
cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring='accuracy')
results.append(cv_results)
names.append(name)
print('%s: %f (%f)' % (name, cv_results.mean(), cv_results.std()))
# Compare Algorithms
pyplot.boxplot(results, labels=names)
pyplot.title('Algorithm Comparison')
pyplot.show()

Related

gridsearch before RFE is taking super long

I try to do a gridsearch on my dataset to know how many features i want to select in my RFE, but it is taking super long. Does anyone know if this is normal, or do i have a foult in my script?
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFECV, RFE
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
import matplotlib.pyplot as plt
#%% train-test split
data = pd.read_csv('preprocesseddata.csv')
data.drop(['Date', 'About'], axis=1, inplace=True)
y = data['Class']
X = data[['Duration_Ball Training','Duration_Match','Duration_Other','Duration_Strenght Training','Positie','Gender','Voorkeursbeen','Instroomjaar','Age','Hours Sleep','Stress','Muscle Soreness','T-test','20m Sprint','CMJ 2b','Yo Yo Result','Heart Rate (Max)','Latest Height', 'Body Fat %','Repetitive Injury','Prefered Leg','AcuteLegs_1day','AcuteCardio_1day','AcuteLegs_3days','AcuteCardio_3days','AcuteLegs_7days','AcuteCardio_7days','ChronicLegs_14days','ChronicCardio_14days','ChronicLegs_21days','ChronicCardio_21days','ChronicLegs_28days','ChronicCardio_28days','TrainingmonotonyLegs','TrainingmonotonyCardio']]
y = y.astype('category')
y = y.cat.codes
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)
#%% RFE as part of pipeline
lr = LogisticRegression(solver='liblinear', random_state=123)
pipe = make_pipeline(RFE(estimator=lr, step=1), KNeighborsClassifier())
parameters = {'rfe__n_features_to_select': range(1,35), 'kneighborsclassifier__n_neighbors': range(1,30)}
grid = GridSearchCV(pipe, param_grid=parameters, cv=10, n_jobs=1)
grid.fit(X_train_std, y_train)
print('Best params:', grid.best_params_)
print('Best accuracy:', grid.best_score_)
#%% RFE
lr = LogisticRegression(solver='liblinear', random_state=123)
rfe= RFE(estimator=lr, n_features_to_select=5, step=-1)
rfe.fit(X_train_std, y_train)
X_train_sub = rfe.transform(X_train_std)
rfe.support_
It seems to get stuck at the print best parameters line

Save and Load SVM trained moedel

I have trained an SVM classifier and got good results now I need to save that model, load it when I want, and predict new unseen data.
this is my code what should I add
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import f_classif
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest
import pandas as pd
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
param_grid = {'C': [0.1,1, 10, 100], 'gamma': [1,0.1,0.01,0.001],'kernel': ['linear','rbf', 'poly', 'sigmoid']}
def select_features(X_train,y_train,X_test):
fs = SelectKBest(score_func=f_classif, k='all')
fs.fit(X_train, y_train)
X_train_fs = fs.transform(X_train)
X_test_fs = fs.transform(X_test)
return X_train_fs,X_test_fs,fs
data = pd.read_csv('d:/f.csv')
d=data.values
X = d[:,0:207]
y = d[:,208]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
X_train_fs, X_test_fs, fs =select_features(X_train,y_train,X_test)
model = GridSearchCV(SVC(),param_grid,refit=True,verbose=2,n_jobs=-1)
model.fit(X_train_fs, y_train)
y_pred = model.predict(X_test_fs)
print('Best GridSearchCV parameters: ',model.best_params_)
print("Accuracy: ",accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
Short answer:
SVC has does not have a load or save function. However, the params are stored in a dict which means you can save them as json. Afterwards you can load the dict and reinstantiate the SVM with the set_params method.
from sklearn.svm import SVC
svc = SVC()
svc.set_params(loaded_params)
Is that your question?

How to plot computational time for multi-model?

I'd like to compare the computational time for multiple models using bar or something esle. So, I need to know which one is fastest model and also slowest one easly using the figure instead of numbers.
This full code from here:
from pandas import read_csv
from matplotlib import pyplot
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
import time
# Load dataset
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/iris.csv"
names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class']
dataset = read_csv(url, names=names)
# Split-out validation dataset
array = dataset.values
X = array[:,0:4]
y = array[:,4]
X_train, X_validation, Y_train, Y_validation = train_test_split(X, y, test_size=0.20, random_state=1, shuffle=True)
# Spot Check Algorithms
models = []
models.append(('LR', LogisticRegression(solver='liblinear', multi_class='ovr')))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC(gamma='auto')))
# evaluate each model in turn
results = []
names = []
time_model = []
for name, model in models:
start = time.time()
kfold = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)
cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring='accuracy')
results.append(cv_results)
com_time = time.time() - start
time_model.append(com_time)
names.append(name)
print('%s: %f (%f) ' % (name, cv_results.mean(), cv_results.std()))
print ('time', time.time() - start)
# print time.mean
# Compare Algorithms
# pyplot.boxplot(results, labels=names)
# pyplot.title('Algorithm Comparison')
# pyplot.show()
# print time_model, names
pyplot.figure()
pyplot.title('Algorithm Comparison')
pyplot.bar(time_model, labels=names)
pyplot.show()
How to do so that it will be similar this figure below and the same order (Ascending)?

Why the output of cross_validate differ from the hardcode loop when using XGBClassifier?

Code #1 Pass pipeline with PCA, XGBClassifier steps to scikit-learn cross_validate function
from xgboost import XGBClassifier
from sklearn.model_selection import cross_validate, LeaveOneOut
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
import random
random.seed(42)
import numpy as np
np.random.seed(42)
kwargs = {
'n_jobs': -1,
'cv': LeaveOneOut(),
'X': X,
'y': y
}
pipe = Pipeline([
('pca', PCA(1, random_state=42)),
('xgbc', XGBClassifier(random_state=42))
])
results = cross_validate(pipe, **kwargs)
print(results['test_score'].mean())
Code #2 Write cross-validation loop hardcode and calculate mean accuracy for exactly same input X as Code #1
from xgboost import XGBClassifier
from sklearn.model_selection import LeaveOneOut
from sklearn.decomposition import PCA
import random
random.seed(42)
import numpy as np
np.random.seed(42)
acc = []
for train_idx, test_idx in LeaveOneOut().split(X, y):
x_train, x_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]
pca = PCA(1, random_state=42)
pca.fit(x_train)
x_train = pca.transform(x_train)
x_test = pca.transform(x_test)
model = XGBClassifier(random_state=42, n_jobs=-1)
model.fit(x_train, y_train)
score = model.score(x_test, y_test)
acc.append(score)
print(np.mean(acc))

Changing the evaluation metric of a model_selection

I'm trying to modify the example from this tutorial to use my own data.
In the tutorial Y-data can only have 3 different values, but in my case it can be between 0 and 200. I consider it a successful estimate if the prediction gets to +-3.
I suspect I have to make some modification to the scoring variable, but I'm not sure how to proceed.
import pandas
from pandas.tools.plotting import scatter_matrix
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
url = "testdata2.csv"
dataset = pandas.read_csv(url)
# Test options and evaluation metric
seed = 7
scoring = 'accuracy'
# Split-out validation dataset
array = dataset.values
X = array[:,0:6]
Y = array[:,6]
validation_size = 0.20
seed = 7
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(X, Y, test_size=validation_size, random_state=seed)
# Spot Check Algorithms
models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))
# evaluate each model in turn
results = []
names = []
for name, model in models:
kfold = model_selection.KFold(n_splits=10, random_state=seed)
cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
results.append(cv_results)
names.append(name)
msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
print(cv_results)

Categories