I'm testing different classifiers. I've created a list of (name, model) tuples and I loop through it to print the accuracy and cross-validation score for each one. It works fine.
What I'd like to do is show the results ordered by descending accuracy_score (metrics.accuracy_score(y_test, y_pred) in the code below). How do I do that easily?
Thanks a lot to anyone who'll be willing to help!
#create an array of models
models = []
models.append(("Random Forest",RandomForestClassifier(n_estimators = 100, random_state = 0)))
#models.append(("Logistic Regression",LogisticRegression()))
models.append(("Naive Bayes",GaussianNB()))
models.append(("SVM",SVC()))
models.append(("Dtree",DecisionTreeClassifier()))
models.append(("KNN",KNeighborsClassifier()))
models.append(("Gradient Boosting",GradientBoostingClassifier()))
#measure the accuracy and show results per model
for name, model in models:
    # fit the model with x and y data
    model.fit(X_train, y_train)
    #Prediction of test set
    y_pred = model.predict(X_test)
    kfold = KFold(n_splits=4)#, random_state=22)
    cv_result = cross_val_score(model, X_train, y_train, cv = kfold, scoring = "accuracy")
    print('\033[1m', name, '\033[0m')
    print('accuracy score is: \033[1m', metrics.accuracy_score(y_test, y_pred),'\033[0m')
    print('cross validation score is: ' ,cv_result,'\n------------------------------------------------------------------------------------')
Append your scores to a new list, and then sort that list using the .sort() method, like so:
#create an array of models
models = []
models.append(("Random Forest",RandomForestClassifier(n_estimators = 100, random_state = 0)))
#models.append(("Logistic Regression",LogisticRegression()))
models.append(("Naive Bayes",GaussianNB()))
models.append(("SVM",SVC()))
models.append(("Dtree",DecisionTreeClassifier()))
models.append(("KNN",KNeighborsClassifier()))
models.append(("Gradient Boosting",GradientBoostingClassifier()))
results = [] # New list to store results
#measure the accuracy and show results per model
for name, model in models:
    # fit the model with x and y data
    model.fit(X_train, y_train)
    #Prediction of test set
    y_pred = model.predict(X_test)
    kfold = KFold(n_splits=4)#, random_state=22)
    cv_result = cross_val_score(model, X_train, y_train, cv = kfold, scoring = "accuracy")
    results.append((name, metrics.accuracy_score(y_test, y_pred)))
    print('\033[1m', name, '\033[0m')
    print('accuracy score is: \033[1m', metrics.accuracy_score(y_test, y_pred),'\033[0m')
    print('cross validation score is: ' ,cv_result,'\n------------------------------------------------------------------------------------')
results.sort(key=lambda tup: tup[1], reverse=True) # sort in-place
print(results) # print results
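If you prefer a cleaner readout than printing the raw list of tuples, you can loop over the sorted results; this is just a formatting sketch on top of the results list built above:
# one model per line, highest accuracy first
for name, acc in results:
    print('{}: {:.4f}'.format(name, acc))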
Rather than doing everything in one big chunk of code inside the loop, I suggest identifying the different types of operations you're doing and separating them into their own functions:
Run the model, fit, predict, compute score;
Sort the list;
Print the model;
import operator # itemgetter
#create an array of models
models = []
models.append(("Random Forest",RandomForestClassifier(n_estimators = 100, random_state = 0)))
#models.append(("Logistic Regression",LogisticRegression()))
models.append(("Naive Bayes",GaussianNB()))
models.append(("SVM",SVC()))
models.append(("Dtree",DecisionTreeClassifier()))
models.append(("KNN",KNeighborsClassifier()))
models.append(("Gradient Boosting",GradientBoostingClassifier()))
def run_model(m):
    name, model = m
    # fit the model with x and y data
    model.fit(X_train, y_train)
    #Prediction of test set
    y_pred = model.predict(X_test)
    accuracy = metrics.accuracy_score(y_test, y_pred)  # compute the accuracy returned below
    kfold = KFold(n_splits=4)#, random_state=22)
    cv_result = cross_val_score(model, X_train, y_train, cv = kfold, scoring = "accuracy")
    return (name, accuracy, cv_result)
def print_model(name, accuracy, cv_result):
    print('\033[1m', name, '\033[0m')
    print('accuracy score is: \033[1m', accuracy,'\033[0m')
    print('cross validation score is: ' ,cv_result,'\n------------------------------------------------------------------------------------')
results = sorted(map(run_model, models), key=operator.itemgetter(1), reverse=True)  # descending accuracy, as requested
for name, accuracy, cv_result in results:
    print_model(name, accuracy, cv_result)
Disclaimer: Contrary to all best practices, I did not test this code before posting it, because the OP didn't provide example values for X_train, y_train, X_test, y_test, nor the relevant imports to make their code work.
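If you want to smoke-test the snippets above, a minimal sketch with stand-in data could look like this (make_classification and the 25% split are assumptions, not part of the OP's code):
# Hypothetical stand-in data and imports, only for exercising the snippets above
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

X, y = make_classification(n_samples=500, n_features=20, n_informative=5, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)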
Related
I thought these two methods (cross_val_score and LassoCV's .score()) should produce similar scores, but I got very different ones. Here is my code:
#prepare the data and model
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,train_size = 0.7,test_size = 0.3,random_state = 10)
kf = KFold(n_splits=10,shuffle=True,random_state=1)
lasso = Lasso(alpha=0.1)
lassocv = LassoCV(cv=kf)
And from the following code I got very different scores:
#see the scores
print(np.mean(cross_val_score(lasso, X_train, Y_train, scoring='r2',
cv=kf, n_jobs=-1)))
#score above is 0.18371746011784781
lassocv.fit(X_train,Y_train)
print(lassocv.score(X_train,Y_train))
#the score above is 0.30164049053598596
BTW, both the cross_val_score result and the .score() result fluctuate if I delete random_state=10 when splitting the train and test data, but cross_val_score always gives a lower R² score than .score().
Thanks for your help!
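(For what it's worth, part of the gap is expected: lassocv.score(X_train, Y_train) evaluates the model on the same data it was fitted on, while cross_val_score only scores on held-out folds. A minimal sketch of that contrast, using assumed synthetic data via make_regression rather than the question's dataset:)
import numpy as np
from sklearn.datasets import make_regression  # assumed stand-in data
from sklearn.linear_model import Lasso
from sklearn.model_selection import KFold, cross_val_score

X, Y = make_regression(n_samples=200, n_features=20, noise=10.0, random_state=1)
kf = KFold(n_splits=10, shuffle=True, random_state=1)
lasso = Lasso(alpha=0.1)

# R^2 estimated on held-out folds (what cross_val_score reports)
print(np.mean(cross_val_score(lasso, X, Y, scoring='r2', cv=kf)))

# R^2 on the very data the model was fitted on: usually higher, because it is in-sample
print(lasso.fit(X, Y).score(X, Y))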
I am working on building a prediction model. I have managed to get as far as the cross-validation scores, but now I have no idea how to continue. What function should I use to make predictions using the cross-validation scores?
X = data.iloc[:,0:16]
Y = data.iloc[:,16]
validation_size = 0.20
seed = 7
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(X, Y,
test_size=validation_size, random_state=seed)
models = [
('LR', LogisticRegression()),
('CART', DecisionTreeClassifier()),
('KNN', KNeighborsClassifier()),
('SVM', SVC())
]
results, names = [], []
for name, model in models:
    seed = 32
    scoring = 'accuracy'
    kfold = model_selection.KFold(n_splits=10, random_state=seed)
    cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
Cross-validation is mostly used as a more robust validation scheme, to check whether your model is performing well. Once you are satisfied with your cross-validation score, you can train a final model on the whole dataset, or you can use
sklearn.model_selection.cross_val_predict
which returns cross-validated estimates for each data point. You can check out the documentation for more information.
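For example, a short sketch built on the OP's own variables (the cv=10 mirrors their KFold; the choice of DecisionTreeClassifier is arbitrary):
from sklearn.model_selection import cross_val_predict

# Each prediction comes from a model that never saw that row's fold during fitting
cv_predictions = cross_val_predict(DecisionTreeClassifier(), X_train, Y_train, cv=10)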
I'm trying to use a gradient-boosting model to predict future scores in fantasy football - for now only looking at the 2 previous rounds. Currently, if a player is expected to score more than 6 points, the model returns '1', otherwise '0' - indicating whether the player would be a good captain choice or not.
In my original table I have player-name and round information to give context, but I removed these when training the algorithm. My question is: once the model makes a prediction, how can I show this prediction in combination with the player name, for example:
PlayerA - captain prediction = 1
etc.
y = ds.isCaptain
GB_table = ds.drop(['Player', 'Round', 'isCaptain', 'Points'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(GB_table, y, test_size=0.2)
baseline = GradientBoostingClassifier(learning_rate=0.01,n_estimators=1500,max_depth=4, min_samples_split=40, min_samples_leaf=7,max_features=4 , subsample=0.95, random_state=10)
baseline.fit(X_train,y_train)
predictors=list(X_train)
feat_imp = pd.Series(baseline.feature_importances_, predictors).sort_values(ascending=False)
feat_imp.plot(kind='bar', title='Importance of Features')
plt.ylabel('Feature Importance Score')
print('Accuracy of GBM on test set: {:.3f}'.format(baseline.score(X_test, y_test)))
pred=baseline.predict(X_test)
print(classification_report(y_test, pred))
The above shows me the predicted results, but unfortunately, since I removed the player name and round information from GB_table, I can no longer tell which player and which round each prediction belongs to.
I'm assuming you are using pandas DataFrames, in which case it's quite straightforward.
The index numbers in your X_train and X_test DataFrames will correspond to the index in your original 'ds' DataFrame.
Try:
pred = baseline.predict(X_test)
pred_original_data = ds.loc[X_test.index].copy()  # .loc selects by index label; .copy() avoids a SettingWithCopyWarning
pred_original_data['prediction'] = pred
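With the original columns back in view, you can then print just the context you need ('Player' and 'Round' are the column names from the question):
# show each test-set player, round, and the model's captain prediction
print(pred_original_data[['Player', 'Round', 'prediction']])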
You could drop the player column and other fields after train_test_split.
Here is my suggestion
y = ds.isCaptain
X_train, X_test, y_train, y_test = train_test_split(ds, y, test_size=0.2)
baseline = GradientBoostingClassifier(learning_rate=0.01, n_estimators=1500,max_depth=4, min_samples_split=40, min_samples_leaf=7,max_features=4 , subsample=0.95, random_state=10)
baseline.fit(X_train.drop(['Player', 'Round', 'isCaptain', 'Points'], axis=1),y_train)
X_test_input = X_test.drop(['Player', 'Round', 'isCaptain', 'Points'], axis=1)
score = baseline.score(X_test_input, y_test)
print('Accuracy of GBM on test set: {:.3f}'.format(score))
X_test['prediction'] = baseline.predict(X_test_input)
print(classification_report(y_test, X_test['prediction']))
I have a simple fitted model like this:
lm = linear_model.LinearRegression()
model = lm.fit(X_train, y_train)
predictions = lm.predict(X_test)
print accuracy_score(y_test, predictions)
and using cross-validation I have this:
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = model, X = X_train, y = y_train, cv = 7)
From cross-validation, how can I get an accuracy that is the same kind of measure as print accuracy_score(y_test, predictions)? Is it accuracies.mean()?
print accuracies will give an array with the accuracy of each cross-validation fold,
print "Train set score :: {} ".format(accuracies.mean()) will give the mean accuracy over the cross-validation folds, and
print "Train set score :: {} +/-{}".format(accuracies.mean(), accuracies.std()*2) will give you the mean accuracy along with twice its standard deviation.
Using Python and scikit-learn, I'd like to do a grid search. But some of my models end up being empty. How can I make the grid search function ignore those models?
I guess I can use a scoring function which returns 0 if the model is empty, but I'm not sure how.
predictor = sklearn.svm.LinearSVC(penalty='l1', dual=False, class_weight='auto')
param_dist = {'C': pow(2.0, np.arange(-10, 11))}
learner = sklearn.grid_search.GridSearchCV(estimator=predictor,
param_grid=param_dist,
n_jobs=self.n_jobs, cv=5,
verbose=0)
learner.fit(X, y)
My data's in a way that this learner object will choose a C corresponding to an empty model. Any idea how I can make sure the model's not empty?
EDIT: by an "empty model" I mean a model that has selected 0 features to use. Especially with an l1-regularized model, this can easily happen: if the C in the SVM is small enough, the optimization problem will find the 0 vector as the optimal solution for the coefficients. Therefore predictor.coef_ will be a vector of 0s.
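To see what "empty" looks like concretely, here is a small sketch; the synthetic data via make_classification is an assumption, not part of the question:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.svm import LinearSVC

X, y = make_classification(n_samples=200, n_features=20, n_informative=3, random_state=0)
for C in (1.0, 0.01, 0.0001):
    clf = LinearSVC(penalty='l1', dual=False, C=C).fit(X, y)
    # with l1 and a sufficiently small C, every coefficient is driven to exactly 0
    print(C, np.count_nonzero(clf.coef_))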
Try to implement a custom scorer, something similar to:
import numpy as np
def scorer_(estimator, X, y):
    # Your criterion here
    if np.allclose(estimator.coef_, np.zeros_like(estimator.coef_)):
        return 0
    else:
        return estimator.score(X, y)

learner = sklearn.grid_search.GridSearchCV(...
                                           scoring=scorer_)
I don't think there is such a built-in function; it's easy, however, to make a custom gridsearcher:
from sklearn.cross_validation import KFold
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import cross_val_score
from sklearn.datasets import make_classification  # needed for the synthetic data below
from sklearn import metrics
from sklearn import svm                           # needed for svm.LinearSVC below
import numpy as np                                # needed for np.nonzero / np.array below
import itertools
import operator
def model_eval(X, y, model, cv):
    scores = []
    for train_idx, test_idx in cv:
        X_train, y_train = X[train_idx], y[train_idx]
        X_test, y_test = X[test_idx], y[test_idx]
        model.fit(X_train, y_train)
        nonzero_coefs = len(np.nonzero(model.coef_)[0]) #check for nonzero coefs
        if nonzero_coefs == 0: #if they're all zero, don't evaluate any further; move to next hyperparameter combo
            return 0
        predictions = model.predict(X_test)
        score = metrics.accuracy_score(y_test, predictions)
        scores.append(score)
    return np.array(scores).mean()
X, y = make_classification(n_samples=1000, n_features=10, n_informative=3,
                           n_redundant=0, n_repeated=0, n_classes=2,
                           random_state=0, shuffle=False)
C = pow(2.0, np.arange(-20, 11))
penalty = {'l1', 'l2'}
parameter_grid = itertools.product(C, penalty)
kf = KFold(X.shape[0], n_folds=5) #use the same folds to evaluate each hyperparameter combo
hyperparameter_scores = {}
for C, penalty in parameter_grid:
    model = svm.LinearSVC(dual=False, C=C, penalty=penalty)
    result = model_eval(X, y, model, kf)
    hyperparameter_scores[(C, penalty)] = result
sorted_scores = sorted(hyperparameter_scores.items(), key=operator.itemgetter(1))
best_parameters, best_score = sorted_scores[-1]
print best_parameters
print best_score