Comparing the accuracy of different models in a graph - Python

What's wrong with the code below? I don't understand the errors. Please help!
First I imported the models, then I tried to compute the accuracy scores and compare them with a box plot, but I got many errors.
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import model_selection
models = []
models.append(('SVM', SVC()))
models.append(('RFC', RandomForestClassifier()))  # must be an instance; appending the bare class makes cross_val_score fail
models.append(('CART', DecisionTreeClassifier()))
models.append(('LR', LogisticRegression()))
results = []
names = []
scoring = 'accuracy'
for name, model in models:
    kfold = model_selection.KFold(n_splits=10, random_state=None)
    cv_results = model_selection.cross_val_score(model, X_cow_train, Y_cow_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
# boxplot algorithm comparison
fig = plt.figure()
fig.suptitle('Comparison between different MLAs')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()
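As an aside, boxplot can take its tick labels directly, which avoids the separate set_xticklabels call. A minimal sketch, assuming results and names have been filled by the loop above:

import matplotlib.pyplot as plt

fig = plt.figure()
fig.suptitle('Comparison between different MLAs')
plt.boxplot(results, labels=names)  # label each box in one call
plt.show()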

Related

ValueError: Found input variables with inconsistent numbers of samples: [658448, 5879]

I have tried to evaluate different machine learning models and am facing this error. It is raised by cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring='accuracy') with kfold = StratifiedShuffleSplit(n_splits=2, random_state=2, test_size=.25).
If anyone knows where the problem is, please let me know.
# comparing algorithms and training models
from pandas import read_csv
from matplotlib import pyplot
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold, StratifiedKFold, StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
# Spot Check Algorithms
models = []
models.append(('KNN', KNeighborsClassifier(n_neighbors=19)))
models.append(('Decision Tree', DecisionTreeClassifier(min_samples_leaf=60)))
models.append(('Naive Bayes', GaussianNB()))
models.append(('Random Forest', RandomForestClassifier(n_estimators=80, max_depth=3,random_state=0,min_samples_leaf=9)))
# cross_val_score requires X and y to have the same number of rows.
# Reshaping X_train to (658448, -1) while Y_train has 5879 rows triggers the
# ValueError above; since 658448 = 5879 * 112 (and 219520 = 1960 * 112), each
# sample most likely flattens to 112 values, so reshape by sample count instead:
X_train = X_train.reshape(5879, -1)
Y_train = Y_train.reshape(5879, -1).ravel()   # classifiers expect 1-D targets
X_test = X_test.reshape(1960, -1)
Y_test = Y_test.reshape(1960, 1).ravel()
X_train = X_train.astype('int')
Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')
print(type(X_train), type(Y_train))
# evaluate each model in turn
results = []
names = []
for name, model in models:
    kfold = StratifiedKFold(n_splits=3, random_state=1, shuffle=True)
    #kfold = StratifiedShuffleSplit(n_splits=2, random_state=2, test_size=.25)
    #kf = KFold(n_splits=5, random_state=3, shuffle=True)
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    print('%s: %f (%f)' % (name, cv_results.mean(), cv_results.std()))
# Compare Algorithms
pyplot.boxplot(results, labels=names)
pyplot.title('Algorithm Comparison')
pyplot.show()
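The error message names the mismatch directly: after the reshapes, X ends up with 658448 rows while y has 5879, and cross_val_score requires X and y to agree on the number of samples. A quick sanity check before cross-validating catches this early; a minimal sketch:

# X and y must have the same number of rows (samples)
assert X_train.shape[0] == Y_train.shape[0], (X_train.shape, Y_train.shape)
assert X_test.shape[0] == Y_test.shape[0], (X_test.shape, Y_test.shape)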

How to plot dates on the x-axis for actual vs. predicted values

I have a time-series dataset that looks like this:
[figure: time-series dataset preview]
I am using linear regression, and I need to plot the original values against the predicted ones. [figure: the plot I got]
Below is my code.
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
tss = TimeSeriesSplit(n_splits=5)
df = df.sort_index()
fold = 0
preds = []
scores = []
for train_idx, val_idx in tss.split(df):
    train = df.iloc[train_idx].dropna()
    test = df.iloc[val_idx].dropna()
    FEATURES = ['7day_rolling_avg', 'Lag_1']
    TARGET = 'Liquid Lvl % C'
    X_train = train[FEATURES]
    y_train = train[TARGET]
    X_test = test[FEATURES]
    y_test = test[TARGET]
    model = LinearRegression()
    model.fit(X_train, y_train)
    score = model.score(X_train, y_train)
    print("Training score: ", score)
    scores = cross_val_score(model, X_train, y_train, cv=3)
    print("Mean cross-validation score: %.2f" % scores.mean())
    ypred = model.predict(X_test)
    mse = mean_squared_error(y_test, ypred)
    print("MSE: %.2f" % mse)
    print("RMSE: %.2f" % (mse**(1/2.0)))
    #x_ax = range(len(y_test))
    #plt.plot(x_ax, y_test, label="original")
    #plt.plot(x_ax, ypred, color='red', label="Linear/predicted")
    #plt.title("Data Prediction")
    #plt.legend()
    #plt.show()
I want dates plotted on my x-axis. There is a similar Stack Overflow question, "How to plot predicted values with actual values when we have multi-index",
but it didn't work for me. This is what I tried:
fig = plt.figure()
c = df.to_list()      # AttributeError: a DataFrame has no to_list(); only a Series does
plt.plot(c, y_test)   # c spans the whole frame while y_test is one fold, so the lengths disagree
plt.plot(c, y_pred)   # NameError: the prediction above is named ypred, not y_pred
plt.show()
ax.plot(df)           # NameError: no ax was ever created
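Since test is a slice of df, it keeps the same DatetimeIndex, and that index can serve directly as the x values. A minimal sketch, assuming df has a DatetimeIndex and reusing y_test and ypred from the loop above:

import matplotlib.pyplot as plt

fig, ax = plt.subplots()
ax.plot(test.index, y_test, label="original")            # dates on the x-axis
ax.plot(test.index, ypred, color='red', label="predicted")
ax.set_xlabel("Date")
ax.legend()
fig.autofmt_xdate()  # rotate the date labels so they do not overlap
plt.show()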

How to plot computational time for multiple models?

I'd like to compare the computation time of multiple models using a bar chart or something similar, so I can see at a glance which model is fastest and which is slowest, instead of reading the raw numbers.
The full code is from here:
from pandas import read_csv
from matplotlib import pyplot
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
import time
# Load dataset
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/iris.csv"
names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class']
dataset = read_csv(url, names=names)
# Split-out validation dataset
array = dataset.values
X = array[:,0:4]
y = array[:,4]
X_train, X_validation, Y_train, Y_validation = train_test_split(X, y, test_size=0.20, random_state=1, shuffle=True)
# Spot Check Algorithms
models = []
models.append(('LR', LogisticRegression(solver='liblinear', multi_class='ovr')))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC(gamma='auto')))
# evaluate each model in turn
results = []
names = []
time_model = []
for name, model in models:
    start = time.time()
    kfold = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring='accuracy')
    results.append(cv_results)
    com_time = time.time() - start
    time_model.append(com_time)
    names.append(name)
    print('%s: %f (%f) ' % (name, cv_results.mean(), cv_results.std()))
    print('time', time.time() - start)
# print time.mean
# Compare Algorithms
# pyplot.boxplot(results, labels=names)
# pyplot.title('Algorithm Comparison')
# pyplot.show()
# print time_model, names
pyplot.figure()
pyplot.title('Algorithm Comparison')
pyplot.bar(names, time_model)  # bar(x, height); pyplot.bar has no labels= keyword
pyplot.show()
How can I make it look like the figure below, with the bars in ascending order?
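To get the bars in ascending order, sort the (time, name) pairs by time before plotting. A minimal sketch, reusing time_model and names from the loop above:

from matplotlib import pyplot

# sort models by measured time, ascending
pairs = sorted(zip(time_model, names))
sorted_times = [t for t, _ in pairs]
sorted_names = [n for _, n in pairs]

pyplot.figure()
pyplot.title('Algorithm Comparison')
pyplot.bar(sorted_names, sorted_times)   # bar(x, height)
pyplot.ylabel('computation time (s)')
pyplot.show()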

Changing the evaluation metric of a model_selection

I'm trying to modify the example from this tutorial to use my own data.
In the tutorial the Y data can only take 3 different values, but in my case it can be anywhere between 0 and 200. I consider an estimate successful if the prediction is within ±3 of the true value.
I suspect I have to modify the scoring variable, but I'm not sure how to proceed.
import pandas
from pandas.plotting import scatter_matrix  # pandas.tools.plotting was removed in newer pandas
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
url = "testdata2.csv"
dataset = pandas.read_csv(url)
# Test options and evaluation metric
seed = 7
scoring = 'accuracy'
# Split-out validation dataset
array = dataset.values
X = array[:,0:6]
Y = array[:,6]
validation_size = 0.20
seed = 7
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(X, Y, test_size=validation_size, random_state=seed)
# Spot Check Algorithms
models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))
# evaluate each model in turn
results = []
names = []
for name, model in models:
    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)  # shuffle=True is required when random_state is set
    cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
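One way to encode the ±3 rule is a custom scorer: count a prediction as correct when it lands within 3 units of the true value, then pass the scorer to cross_val_score instead of the 'accuracy' string. A minimal sketch, assuming the tolerance of 3 described above:

import numpy as np
from sklearn.metrics import make_scorer

def within_tolerance(y_true, y_pred, tol=3):
    # fraction of predictions within +/- tol of the true value
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs(y_true - y_pred) <= tol)

tolerance_scorer = make_scorer(within_tolerance)
# then, inside the loop:
# cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=tolerance_scorer)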

I am having out-of-bounds error issues with the yeast data in Python. Why?

# -*- coding: utf-8 -*-
"""
Created on Wed Apr 26 21:28:31 2017
#author: Chirantan
"""
import pandas
from pandas.plotting import scatter_matrix  # pandas.tools.plotting was removed in newer pandas
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
# Load dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/yeast/yeast.data"
names = ['Sequence Name','mcg', 'gvh', 'alm', 'mit', 'erl','pox','vac','nuc']
dataset = pandas.read_csv(url, names=names, delim_whitespace=True)
# shape
print(dataset.shape)
# head
print(dataset.head(20))
# descriptions
print(dataset.describe())
# class distribution
#print(dataset.groupby('').size())
# box and whisker plots
dataset.plot(kind='box', subplots=True, layout=(10,10), sharex=False, sharey=False)
plt.show()
# histograms
dataset.hist()
plt.show()
# scatter plot matrix
scatter_matrix(dataset)
plt.show()
# Split-out validation dataset
array = dataset.values
X = array[:,0:9]
Y = array[:,9]#HERE IS THE ERROR
validation_size = 0.20
seed = 7
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(X, Y, test_size=validation_size, random_state=seed)
# Test options and evaluation metric
seed = 7
scoring = 'accuracy'
# Spot Check Algorithms
models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))
# evaluate each model in turn
results = []
names = []
for name, model in models:
    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)  # shuffle=True is required when random_state is set
    cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
# Compare Algorithms
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()
# Make predictions on validation dataset
#knn = KNeighborsClassifier()
svm = SVC()
svm.fit(X_train, Y_train)
predictions = svm.predict(X_validation)
#knn.fit(X_train, Y_train)
#predictions = knn.predict(X_validation)
print(accuracy_score(Y_validation, predictions))
print(confusion_matrix(Y_validation, predictions))
print(classification_report(Y_validation, predictions))
I am trying to use different classifiers on the multi-class yeast dataset from the UCI repository. Everything works fine with the above code on the Iris dataset, with only the following change:
# Split-out validation dataset
array = dataset.values
X = array[:,0:4]
Y = array[:,4]
validation_size = 0.20
But it does not work with the yeast dataset when I do this:
# Split-out validation dataset
array = dataset.values
X = array[:,0:9]
Y = array[:,9]
validation_size = 0.20
Here is the error message:
File "<ipython-input-40-707d4eef8576>", line 55, in <module>
Y = array[:,9]
IndexError: index 9 is out of bounds for axis 1 with size 9
I do not understand this. array stores the values of the dataset, so array[:,9] should give me the last column. Where am I wrong? Please help.
array does not have column with index 9. It has 9 columns, and the last one has index 8. (Because the first column has index 0.)
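The likely root cause is that the names list has 9 entries while each row of yeast.data has 10 whitespace-separated fields, so pandas uses the first field (the sequence name) as the index and only 9 value columns survive into array. A minimal sketch of a fix, where the extra 'class' column name is my addition:

names = ['Sequence Name', 'mcg', 'gvh', 'alm', 'mit', 'erl', 'pox', 'vac', 'nuc', 'class']
dataset = pandas.read_csv(url, names=names, delim_whitespace=True)
array = dataset.values
X = array[:, 1:9]   # the 8 numeric features; column 0 is the sequence name
Y = array[:, 9]     # the class label, now at index 9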
