What's wrong in the below code. I did not understand the errors. Please Help!!
First I imported our models. after that tried to find out accuracy score with comparison graph by box plot. Got so many errors. Please Help!!
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import model_selection
models = []
models.append(('SVM', SVC()))
models.append(('RFC', RandomForestClassifier))
models.append(('CART', DecisionTreeClassifier()))
models.append(('LR', LogisticRegression()))
results = []
names = []
scoring = 'accuracy'
for name, model in models:
kfold = model_selection.KFold(n_splits=10, random_state=None)
cv_results = model_selection.cross_val_score(model, X_cow_train, Y_cow_train, cv=kfold, scoring=scoring)
msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
# boxplot algorithm comparison
fig = plt.figure()
fig.suptitle('Comparison between different MLAs')
ax = fig.add_subplot(111)
I have tried to evalute the different Machine Learning model and facing this error. The error shows that cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring='accuracy') and the kfold = StratifiedShuffleSplit(n_splits=2, random_state=2, test_size=.25)
If anyone know where is the problem please let me know about it.
# comparing algorithms and training models
from pandas import read_csv
from matplotlib import pyplot
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold, StratifiedKFold, StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
# Spot Check Algorithms
models = []
models.append(('KNN', KNeighborsClassifier(n_neighbors=19)))
models.append(('Decision Tree', DecisionTreeClassifier(min_samples_leaf=60)))
models.append(('Naive Bayes', GaussianNB()))
models.append(('Random Forest', RandomForestClassifier(n_estimators=80, max_depth=3,random_state=0,min_samples_leaf=9)))
X_train = X_train.reshape(658448,-1)
Y_train = Y_train.reshape(5879,-1)
Y_test = Y_test.reshape(1960,1)
X_test = X_test.reshape(219520,-1)
Y_test = Y_test.astype('int')
X_train = X_train.astype('int')
Y_train = Y_train.astype('int')
type(Y_train )
# evaluate each model in turn
results = []
names = []
for name, model in models:
kfold = StratifiedKFold(n_splits=3, random_state=1, shuffle=True)
#kfold = StratifiedShuffleSplit(n_splits=2, random_state=2, test_size=.25)
#kf = KFold(n_splits=5, random_state=3,shuffle=True )
cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring='accuracy')
print('%s: %f (%f)' % (name, cv_results.mean(), cv_results.std()))
# Compare Algorithms
pyplot.boxplot(results, labels=names)
pyplot.title('Algorithm Comparison')
I am having timeseries dataset which looks like below:
Time series Dataset
I am using linear regression for which i have to do plot of original and prediction. This was the plot i got
The below is my code.
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
tss = TimeSeriesSplit(n_splits=5)
df = df.sort_index()
fold = 0
preds = []
scores = []
for train_idx, val_idx in tss.split(df):
train = df.iloc[train_idx].dropna()
test = df.iloc[val_idx].dropna()
FEATURES = ['7day_rolling_avg','Lag_1']
TARGET = 'Liquid Lvl % C'
X_train = train[FEATURES]
y_train = train[TARGET]
X_test = test[FEATURES]
y_test = test[TARGET]
model = LinearRegression()
model.fit(X_train, y_train)
score = model.score(X_train, y_train)
print("Training score: ", score)
scores = cross_val_score(model, X_train, y_train,cv=3)
print("Mean cross-validation score: %.2f" % scores.mean())
ypred = model.predict(X_test)
mse = mean_squared_error(y_test, ypred)
print("MSE: %.2f" % mse)
print("RMSE: %.2f" % (mse**(1/2.0)))
#x_ax = range(len(y_test))
#plt.plot(x_ax, y_test, label="original")
#plt.plot(x_ax, ypred,color='red', label="Linear/predicted")
#plt.title("Data Prediction")
I want date to be plotted on my X-axis. There was a similar question in stack overflow How to plot predicted values with actual values when we have multi-index
but it didnt work for me. This is what i tried.
I'd like to compare the computational time for multiple models using bar or something esle. So, I need to know which one is fastest model and also slowest one easly using the figure instead of numbers.
This full code from here:
from pandas import read_csv
from matplotlib import pyplot
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
import time
# Load dataset
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/iris.csv"
names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class']
dataset = read_csv(url, names=names)
# Split-out validation dataset
array = dataset.values
X = array[:,0:4]
y = array[:,4]
X_train, X_validation, Y_train, Y_validation = train_test_split(X, y, test_size=0.20, random_state=1, shuffle=True)
# Spot Check Algorithms
models = []
models.append(('LR', LogisticRegression(solver='liblinear', multi_class='ovr')))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC(gamma='auto')))
# evaluate each model in turn
results = []
names = []
time_model = []
for name, model in models:
start = time.time()
kfold = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)
cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring='accuracy')
com_time = time.time() - start
print('%s: %f (%f) ' % (name, cv_results.mean(), cv_results.std()))
print ('time', time.time() - start)
# print time.mean
# Compare Algorithms
# pyplot.boxplot(results, labels=names)
# pyplot.title('Algorithm Comparison')
# pyplot.show()
# print time_model, names
pyplot.title('Algorithm Comparison')
pyplot.bar(time_model, labels=names)
How to do so that it will be similar this figure below and the same order (Ascending)?
I'm trying to modify the example from this tutorial to use my own data.
In the tutorial Y-data can only have 3 different values, but in my case it can be between 0 and 200. I consider it a successful estimate if the prediction gets to +-3.
I suspect I have to make some modification to the scoring variable, but I'm not sure how to proceed.
import pandas
from pandas.tools.plotting import scatter_matrix
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
url = "testdata2.csv"
dataset = pandas.read_csv(url)
# Test options and evaluation metric
seed = 7
scoring = 'accuracy'
# Split-out validation dataset
array = dataset.values
X = array[:,0:6]
Y = array[:,6]
validation_size = 0.20
seed = 7
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(X, Y, test_size=validation_size, random_state=seed)
# Spot Check Algorithms
models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))
# evaluate each model in turn
results = []
names = []
for name, model in models:
kfold = model_selection.KFold(n_splits=10, random_state=seed)
cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
# -*- coding: utf-8 -*-
Created on Wed Apr 26 21:28:31 2017
#author: Chirantan
import pandas
from pandas.tools.plotting import scatter_matrix
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
# Load dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/yeast/yeast.data"
names = ['Sequence Name','mcg', 'gvh', 'alm', 'mit', 'erl','pox','vac','nuc']
dataset = pandas.read_csv(url, names=names, delim_whitespace=True)
# shape
# head
# descriptions
# class distribution
# box and whisker plots
dataset.plot(kind='box', subplots=True, layout=(10,10), sharex=False, sharey=False)
# histograms
# scatter plot matrix
# histograms
# scatter plot matrix
# Split-out validation dataset
array = dataset.values
X = array[:,0:9]
Y = array[:,9]#HERE IS THE ERROR
validation_size = 0.20
seed = 7
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(X, Y, test_size=validation_size, random_state=seed)
# Test options and evaluation metric
seed = 7
scoring = 'accuracy'
# Spot Check Algorithms
models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))
# evaluate each model in turn
results = []
names = []
for name, model in models:
kfold = model_selection.KFold(n_splits=10, random_state=seed)
cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
# Compare Algorithms
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
# Make predictions on validation dataset
#knn = KNeighborsClassifier()
svm = SVC()
svm.fit(X_train, Y_train)
predictions = svm.predict(X_validation)
#knn.fit(X_train, Y_train)
#predictions = knn.predict(X_validation)
print(accuracy_score(Y_validation, predictions))
print(confusion_matrix(Y_validation, predictions))
print(classification_report(Y_validation, predictions))
I am trying to use different classifiers for the multi class yeast dataset from UCI repository.Everything works fine with the above code with the Iris dataset with the following change only
# Split-out validation dataset
array = dataset.values
X = array[:,0:4]
Y = array[:,4]
validation_size = 0.20
But it is not working with the Yeast dataset when I do this
# Split-out validation dataset
array = dataset.values
X = array[:,0:9]
Y = array[:,9]
validation_size = 0.20
Here is the error messaage
File "<ipython-input-40-707d4eef8576>", line 55, in <module>
Y = array[:,9]
IndexError: index 9 is out of bounds for axis 1 with size 9
I do not understand this .array stores the values for the dataset and now array[:,9] would give me the last column.Where am I wrong?Please help.
array does not have column with index 9. It has 9 columns, and the last one has index 8. (Because the first column has index 0.)