I've trained a simple random forest and a bagging classifier (n_estimators=100). Is it possible to plot the history of accuracy of the BaggingClassifier? And how can I calculate the variance of the accuracy across the 100 samples?
I've just printed the accuracy value for both algorithms:
# DecisionTree
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.90)
clf2 = tree.DecisionTreeClassifier()
clf2.fit(X_train, y_train)
pred2 = clf2.predict(X_test)
acc2 = clf2.score(X_test, y_test)
acc2 # 0.6983930778739185
# Bagging
clf3 = BaggingClassifier(tree.DecisionTreeClassifier(), max_samples=0.5, max_features=0.5,
                         n_estimators=100, verbose=2)
clf3.fit(X_train, y_train)
pred3 = clf3.predict(X_test)
acc3=clf3.score(X_test,y_test)
acc3 # 0.911619283065513
I don't think that you can get this information from the fitted BaggingClassifier. But you can create such a plot by fitting for different n_estimators:
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn import datasets
from sklearn.model_selection import train_test_split
iris = datasets.load_iris()
X, X_test, y, y_test = train_test_split(iris.data,
                                        iris.target,
                                        test_size=0.20)
estimators = list(range(1, 20))
accuracy = []
for n_estimators in estimators:
    clf = BaggingClassifier(DecisionTreeClassifier(max_depth=1),
                            max_samples=0.2,
                            n_estimators=n_estimators)
    clf.fit(X, y)
    acc = clf.score(X_test, y_test)
    accuracy.append(acc)
plt.plot(estimators, accuracy)
plt.xlabel("Number of estimators")
plt.ylabel("Accuracy")
plt.show()
(Of course, the iris dataset is easily fit with just a single DecisionTreeClassifier, so I set max_depth=1 in this example.)
For a statistically meaningful result, you should fit a BaggingClassifier multiple times for each n_estimators and take the average of the obtained accuracies.
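For example, a minimal sketch of that averaging, reusing the variables from the snippet above (the number of repeats and the error-bar plot are my own choices, not part of the original), could look like this:
import numpy as np
n_repeats = 10  # arbitrary number of repeated fits per n_estimators
mean_accuracy = []
std_accuracy = []
for n_estimators in estimators:
    scores = []
    for _ in range(n_repeats):
        clf = BaggingClassifier(DecisionTreeClassifier(max_depth=1),
                                max_samples=0.2,
                                n_estimators=n_estimators)
        clf.fit(X, y)
        scores.append(clf.score(X_test, y_test))
    mean_accuracy.append(np.mean(scores))
    std_accuracy.append(np.std(scores))  # spread of the accuracy across repeats
plt.errorbar(estimators, mean_accuracy, yerr=std_accuracy)
plt.xlabel("Number of estimators")
plt.ylabel("Mean accuracy over repeats")
plt.show()
The per-point standard deviation also gives a rough measure of how much the accuracy varies between repeated fits.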
I am using a decision tree classifier to predict the winner of a chess game. In my dataset I have a 'Result' column that gives 0, 1, or 2 for a win, a loss, or a draw respectively. How do I ensure that the final leaves of my decision tree classifier are the 'Result' and show its name and value, if this is even possible?
Dataset
from sklearn.model_selection import train_test_split
x = chess.drop(columns = ['Result'])
print(x.head())
y = chess['Result']
print(y.head())
print(x.shape, y.shape)
x_train, x_test, y_train, y_test = train_test_split(x, y)
print(x_train.shape, x_test.shape)
#%%
#Decision Tree Classifier
from sklearn import tree
from sklearn import metrics
import matplotlib.pyplot as plt
clf = tree.DecisionTreeClassifier(max_depth=3, min_samples_leaf=5, random_state=0)
clf.fit(x_train, y_train)
y_train_pred = clf.predict(x_train)
y_pred = clf.predict(x_test)
print(y_pred)
print(metrics.confusion_matrix (y_test, y_pred))
print(metrics.accuracy_score(y_test, y_pred))
print(metrics.precision_score(y_test, y_pred, average = None))
print(metrics.recall_score(y_test, y_pred, average = None))
tree.plot_tree(clf, feature_names = cols, class_names = cols)
plt.savefig('out.pdf')
(Image: the plotted decision tree)
I have tried switching my x and y values, adding class_names, and dropping different columns, to no avail. I need the final leaves to show their name and value, and that name and value should be the 'Result'.
I am currently attempting to train a neural network that predicts a 1kHz sine wave.
While the model itself has an accuracy score of 0.89, it does not accurately predict my test data.
import matplotlib.pyplot as plt
import numpy as np
import sklearn as sk
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor
#Generate 1kHz sine wave
pi = np.pi
X = np.arange(0,2*pi,0.05)
y = np.sin(1000*X)
tscv = TimeSeriesSplit(n_splits=3, test_size=30)
for train_index, test_index in tscv.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
plt.plot(X_train, y_train)
plt.plot(X_test, y_test)
#Training using train samples
sc = StandardScaler()
X_train = sc.fit_transform(X_train.reshape(-1,1))
X_test = sc.fit_transform(X_test.reshape(-1,1))
regr = MLPRegressor(random_state=1, max_iter=1000,hidden_layer_sizes=(32, 32)).fit(X_train, y_train)
regr.fit(X_train, y_train)
plt.plot(X_train, regr.predict(X_train), color = 'red')
plt.scatter(X_train, y_train)
regr.score(X_train, y_train)
(Image: result of training)
plt.scatter(X_test, y_test, color = 'red')
plt.plot(X_test, regr.predict(X_test))
(Image: result of test)
As you can see, the test data is far less periodic than the ML model. Why is this the case?
After writing a decision tree function, I decided to check how accurate the tree is, and to confirm that at least the first split stays the same if I build other trees with the same data.
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import os
from sklearn import tree
from sklearn import preprocessing
import sys
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
.....
def desicion_tree(data_set: pd.DataFrame, val_1: str, val_2: str):
    # Encoder --> fit doesn't accept strings
    feature_cols = data_set.columns[0:-1]
    X = data_set[feature_cols]  # Independent variables
    y = data_set.Mut  # class
    y = y.to_list()
    le = preprocessing.LabelBinarizer()
    y = le.fit_transform(y)
    # Split data set into training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)  # 75%
    # Create Decision Tree classifier object
    clf = DecisionTreeClassifier(max_depth=4, criterion='entropy')
    # Train Decision Tree classifier
    clf.fit(X_train, y_train)
    # Predict the response for test dataset
    y_pred = clf.predict(X_test)
    # Perform cross validation
    for i in range(2, 8):
        plt.figure(figsize=(14, 7))
        # Perform KFold cross validation
        # cv = ShuffleSplit(test_size=0.25, random_state=0)
        kf = KFold(n_splits=5, shuffle=True)
        scores = cross_val_score(estimator=clf, X=X, y=y, n_jobs=4, cv=kf)
        print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))
        tree.plot_tree(clf, filled=True, feature_names=feature_cols, class_names=[val_1, val_2])
        plt.show()

desicion_tree(car_rep_sep_20, 'Categorial', 'Non categorial')
At the bottom, I wrote a loop in order to recreate the tree with the split values using KFold. The accuracy changes (around 90%), but the tree stays the same. Where did I go wrong?
cross_val_score clones the estimator in order to fit-and-score on the various folds, so the clf object remains the same as when you fit it to the entire dataset before the loop, and so the plotted tree is that one rather than any of the cross-validated ones.
To get what you're after, I think you can use cross_validate with option return_estimator=True. You also shouldn't need the loop, if your cv object has the number of splits desired:
kf = KFold(n_splits=5, shuffle=True)
cv_results = cross_validate(
    estimator=clf,
    X=X,
    y=y,
    n_jobs=4,
    cv=kf,
    return_estimator=True,
)
print("%0.2f accuracy with a standard deviation of %0.2f" % (
    cv_results['test_score'].mean(),
    cv_results['test_score'].std(),
))
for est in cv_results['estimator']:
    tree.plot_tree(est, filled=True, feature_names=feature_cols, class_names=[val_1, val_2])
    plt.show()
Alternatively, loop manually over the folds (or other cv iteration), fitting the model and plotting its tree in the loop.
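For instance, a minimal sketch of that manual loop, reusing clf, X, y, feature_cols, val_1 and val_2 from the question (the clone call and the fold object are my own additions, not from the original code), might look like:
from sklearn.base import clone
import numpy as np
kf = KFold(n_splits=5, shuffle=True)
fold_scores = []
for train_index, test_index in kf.split(X):
    # fit a fresh copy of the classifier on this fold's training data
    fold_clf = clone(clf)
    fold_clf.fit(X.iloc[train_index], y[train_index])
    fold_scores.append(fold_clf.score(X.iloc[test_index], y[test_index]))
    # plot the tree fitted on this particular fold
    plt.figure(figsize=(14, 7))
    tree.plot_tree(fold_clf, filled=True, feature_names=feature_cols, class_names=[val_1, val_2])
    plt.show()
print("%0.2f accuracy with a standard deviation of %0.2f" % (np.mean(fold_scores), np.std(fold_scores)))
Each plotted tree then corresponds to one fold's fitted model, which is what cross_validate with return_estimator=True gives you as well.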
I wrote a program that uses support vector regression (SVR) for prediction. First, I split the dataset into a training set and a test set, with the test set being 20% of the data (case 1); second, I used cross-validation, splitting the dataset into 5 folds (case 2). However, using the same evaluation metrics (R2, MAE, MSE) for both methods, the results are quite different.
The program is as follows:
dataset = pd.read_csv('Dataset/allGlassStraightThroughTube.csv')
tube_par = dataset.iloc[:, 3:8].values
tube_eff = dataset.iloc[:, -1:].values
# # form train dataset , test dataset
tube_par_X_train, tube_par_X_test, tube_eff_Y_train, tube_eff_Y_test = train_test_split(tube_par, tube_eff, random_state=33, test_size=0.2)
# normalize the data
sc_X = StandardScaler()
sc_Y = StandardScaler()
sc_tube_par_X_train = sc_X.fit_transform(tube_par_X_train)
sc_tube_par_X_test = sc_X.transform(tube_par_X_test)
sc_tube_eff_Y_train = sc_Y.fit_transform(tube_eff_Y_train)
sc_tube_eff_Y_test = sc_Y.transform(tube_eff_Y_test)
# fit rbf SVR to the sc_tube_par_X dataset
support_vector_regressor = SVR(kernel='rbf')
support_vector_regressor.fit(sc_tube_par_X_train, sc_tube_eff_Y_train)
#
# # predict new result according to the sc_tube_par_X Dataset
pre_sc_tube_eff_Y_test = support_vector_regressor.predict(sc_tube_par_X_test)
pre_tube_eff_Y_test = sc_Y.inverse_transform(pre_sc_tube_eff_Y_test)
# calculate the predict quality
print('R2-score value rbf SVR')
print(r2_score(sc_Y.inverse_transform(sc_tube_eff_Y_test), sc_Y.inverse_transform(pre_sc_tube_eff_Y_test)))
print('The mean squared error of rbf SVR is')
print(mean_squared_error(sc_Y.inverse_transform(sc_tube_eff_Y_test), sc_Y.inverse_transform(pre_sc_tube_eff_Y_test)))
print('The mean absolute error of rbf SVR is')
print(mean_absolute_error(sc_Y.inverse_transform(sc_tube_eff_Y_test), sc_Y.inverse_transform(pre_sc_tube_eff_Y_test)))
# normalize
sc_tube_par_X = sc_X.fit_transform(tube_par)
sc_tube_eff_Y = sc_Y.fit_transform(tube_eff)
scoring = ['r2','neg_mean_squared_error', 'neg_mean_absolute_error']
rbf_svr_regressor = SVR(kernel='rbf')
scores = cross_validate(rbf_svr_regressor, sc_tube_par_X, sc_tube_eff_Y, cv=5, scoring=scoring, return_train_score=False)
In case 1, the evaluation metrics are:
R2-score value rbf SVR
0.6486074476528559
The mean squared error of rbf SVR is
0.00013501023459497165
The mean absolute error of rbf SVR is
0.007196636233830076
In case 2, the evaluation metrics are:
R2-score
0.2621779727614816
test_neg_mean_squared_error
-0.6497292887710239
test_neg_mean_absolute_error
-0.5629408849740231
The difference between case 1 and case 2 is big. Could you please tell me the reason and how to correct it?
Bin.
I have prepared a little example to see how the results change when using cross-validation. I recommend you try splitting the data without a seed and see how the results change.
You will see that the cross-validation results stay almost constant regardless of how the data is split.
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split,cross_validate
#from sklearn.cross_validation import train_test_split
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error
import matplotlib.pyplot as plt
def print_metrics(real_y, predicted_y):
    # calculate the predict quality
    print('R2-score value {:>8.4f}'.format(r2_score(real_y, predicted_y)))
    print('Mean squared error is {:>8.4f}'.format(mean_squared_error(real_y, predicted_y)))
    print('Mean absolute error is {:>8.4f}\n\n'.format(mean_absolute_error(real_y, predicted_y)))

def show_plot(real_y, predicted_y):
    fig, ax = plt.subplots()
    ax.scatter(real_y, predicted_y, edgecolors=(0, 0, 0))
    ax.plot([real_y.min(), real_y.max()], [real_y.min(), real_y.max()], "k--", lw=4)
    ax.set_xlabel("Measured")
    ax.set_ylabel("Predicted")
    plt.show()
# dataset load
boston = datasets.load_boston()
#dataset info
# print(boston.keys())
# print(boston.DESCR)
# print(boston.data.shape)
# print(boston.feature_names)
# numpy_arrays
X = boston.data
Y = boston.target
# # form train dataset , test dataset
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)
#X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=33, test_size=0.2)
#X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=5, test_size=0.2)
# fit scalers
sc_X = StandardScaler().fit(X_train)
# standarizes X (train and test)
X_train = sc_X.transform(X_train)
X_test = sc_X.transform(X_test)
######################################################################
############################### SVR ##################################
######################################################################
support_vector_regressor = SVR(kernel='rbf')
support_vector_regressor.fit(X_train, Y_train)
predicted_Y = support_vector_regressor.predict(X_test)
print_metrics(Y_test, predicted_Y)
show_plot(Y_test, predicted_Y)
######################################################################
########################### LINEAR REGRESSOR #########################
######################################################################
lin_model = LinearRegression()
lin_model.fit(X_train, Y_train)
predicted_Y = lin_model.predict(X_test)
print_metrics(Y_test, predicted_Y)
show_plot(Y_test, predicted_Y)
######################################################################
######################### SVR + CROSS VALIDATION #####################
######################################################################
sc = StandardScaler().fit(X)
standarized_X = sc.transform(X)
scoring = ['r2','neg_mean_squared_error', 'neg_mean_absolute_error']
rbf_svr_regressor = SVR(kernel='rbf')
scores = cross_validate(rbf_svr_regressor, standarized_X, Y, cv=10, scoring=scoring, return_train_score=False)
print(scores["test_r2"].mean())
print(-1*(scores["test_neg_mean_squared_error"].mean()))
print(-1*(scores["test_neg_mean_absolute_error"].mean()))
I want to have metrics per class label and an aggregate confusion matrix from a cross validation in scikit learn.
I wrote a method that performs a cross-validation for scikit learn that sums the confusion matrices and also stores all the predicted labels. Then, it calls scikit learn methods to print out the metrics.
The code below should run with any recent scikit learn installation, you can test it out with any dataset.
Is the code below the correct way to gather an aggregate confusion matrix and a classification_report when doing StratifiedKFold cross-validation?
from sklearn import metrics
from sklearn.cross_validation import StratifiedKFold
import numpy as np
def customCrossValidation(self, X, y, classifier, n_folds=10, shuffle=True, random_state=0):
    ''' Perform a cross validation and print out the metrics '''
    skf = StratifiedKFold(y, n_folds=n_folds, shuffle=shuffle, random_state=random_state)
    cm = None
    y_predicted_overall = None
    y_test_overall = None
    for train_index, test_index in skf:
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        classifier.fit(X_train, y_train)
        y_predicted = classifier.predict(X_test)
        # collect the y_predicted per fold
        if y_predicted_overall is None:
            y_predicted_overall = y_predicted
            y_test_overall = y_test
        else:
            y_predicted_overall = np.concatenate([y_predicted_overall, y_predicted])
            y_test_overall = np.concatenate([y_test_overall, y_test])
        cv_cm = metrics.confusion_matrix(y_test, y_predicted)
        # sum the cm per fold
        if cm is None:
            cm = cv_cm
        else:
            cm += cv_cm
    print(metrics.classification_report(y_test_overall, y_predicted_overall, digits=3))
    print(cm)