Predicting a Sine Wave Using ML in Python

I am currently attempting to train a neural network that predicts a 1 kHz sine wave.
While the model itself has a score of 0.89 (from regr.score, i.e. R² for a regressor), it does not accurately predict my test data.
import matplotlib.pyplot as plt
import numpy as np
import sklearn as sk
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor
#Generate 1kHz sine wave
pi = np.pi
X = np.arange(0,2*pi,0.05)
y = np.sin(1000*X)
tscv = TimeSeriesSplit(n_splits=3, test_size=30)
for train_index, test_index in tscv.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
plt.plot(X_train, y_train)
plt.plot(X_test, y_test)
#Training using train samples
sc = StandardScaler()
X_train = sc.fit_transform(X_train.reshape(-1,1))
X_test = sc.fit_transform(X_test.reshape(-1,1))
regr = MLPRegressor(random_state=1, max_iter=1000, hidden_layer_sizes=(32, 32))
regr.fit(X_train, y_train)
plt.plot(X_train, regr.predict(X_train), color = 'red')
plt.scatter(X_train, y_train)
regr.score(X_train, y_train)
[figure: result of training]
plt.scatter(X_test, y_test, color = 'red')
plt.plot(X_test, regr.predict(X_test))
[figure: result of test]
As you can see, the model's prediction on the test data is far less periodic than its fit on the training data. Why is this the case?
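One thing worth checking (an editor's note, not part of the original post): sin(1000*X) has a period of 2π/1000 ≈ 0.0063, while np.arange(0, 2*pi, 0.05) samples every 0.05, i.e. roughly eight full periods between consecutive samples. The training points are therefore heavily aliased and carry almost no periodic structure for the network to learn. A minimal sketch of the sampling mismatch, assuming only numpy and matplotlib:
import numpy as np
import matplotlib.pyplot as plt

pi = np.pi
period = 2 * pi / 1000                            # ~0.0063, period of sin(1000*x)
step = 0.05                                       # grid step used in the question
print("periods per sample step:", step / period) # ~7.96, far beyond Nyquist

# One coarse step contains ~8 full oscillations of the true signal.
fine_X = np.arange(0, step, period / 50)          # densely sample a single coarse step
plt.plot(fine_X, np.sin(1000 * fine_X), label="true signal within one 0.05 step")
plt.scatter([0, step], np.sin(1000 * np.array([0, step])), color="red",
            label="the two neighbouring training samples")
plt.legend()
plt.show()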

Related

ConvergenceWarning: Liblinear failed to converge, increase the number of iterations

I am trying to use grid-search cross-validation to find the best value of the hyperparameter C. I split the dataset into two subsets, each containing 50% of MNIST 784, and used only one of the two subsets, with 60% for training and 40% for testing.
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
import numpy as np
mnist = fetch_openml('mnist_784')
X, y = mnist['data'], mnist['target']
X_1, X_2, y_1, y_2 = train_test_split(X, y, test_size=0.5, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X_1, y_1, test_size=0.4)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
svm = LinearSVC(dual=False, max_iter=10000)
param_grid = {'C': [10, 5, 1, 0.5, 0.1, 0.05, 0.01, 0.005, 0.001]}
grid = GridSearchCV(svm, param_grid, scoring='accuracy')
grid.fit(X_train_scaled, y_train)
print("Best value of C:", grid.best_params_['C'])
accuracy = grid.score(X_test_scaled, y_test)
print("Test accuracy:", accuracy)
I have tried everything without any progress.
I tried reducing the dataset and then normalizing it, and I also tried increasing max_iter to 10000 and 15000.
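No answer is quoted here, but one commonly suggested direction (a sketch under my own assumptions, not a verified fix for this dataset) is to put the scaler and the classifier into a Pipeline, so scaling is re-fit inside each CV fold, and to give liblinear more headroom via max_iter and a looser tol; the step names pipe and svm below are arbitrary:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV

# Scaling happens inside each CV fold, so folds do not leak into each other.
pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("svm", LinearSVC(dual=False, max_iter=20000, tol=1e-3)),
])
param_grid = {"svm__C": [10, 5, 1, 0.5, 0.1, 0.05, 0.01, 0.005, 0.001]}
grid = GridSearchCV(pipe, param_grid, scoring="accuracy", n_jobs=-1)
grid.fit(X_train, y_train)  # note: raw, unscaled X_train
print("Best value of C:", grid.best_params_["svm__C"])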

What does the error mean and how to fix it - "ValueError: query data dimension must match training data dimension"

I am trying to write the code for k-NN.
Below is my code. I know that the issue is in predict(), but I am not able to figure out how to fix it.
# Importing the libraries
import numpy as np
import pandas as pd
# Importing the dataset
dataset = pd.read_csv('UniversalBank.csv')
X = dataset.iloc[:, [1,2,3,5,6,7,8,10,11,12,13]].values
y = dataset.iloc[:,9].values
#Splitting the dataset to training and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state= 0)
#Feature scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
#Fitting the classifier to training set
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
classifier.fit(X_train,y_train)
#Predicting the test results
y_pred = classifier.predict(X_test)
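For context (an editor's note; the thread above contains no answer): this ValueError is raised when the array passed to predict() has a different number of columns than the array passed to fit(). A tiny reproduction with made-up shapes; depending on the scikit-learn version the message is either the kd-tree's "query data dimension must match training data dimension" or a generic feature-count mismatch:
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=1, algorithm='kd_tree')
knn.fit(np.zeros((4, 3)), [0, 1, 0, 1])   # trained on 3 features
knn.predict(np.zeros((2, 2)))             # queried with 2 features -> ValueError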

The evaluation metric values differ greatly between the cross_validate and train_test_split cases

Write a program that uses support vector regression (SVR) to predict. First, split the dataset into a train set and a test set, with the test set being 20% (case 1). Second, use cross-validation, splitting the dataset into 5 groups (case 2). However, using the same evaluation metrics (R2, MAE, MSE) to evaluate the two methods, the results are quite different.
the program is as follows:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

dataset = pd.read_csv('Dataset/allGlassStraightThroughTube.csv')
tube_par = dataset.iloc[:, 3:8].values
tube_eff = dataset.iloc[:, -1:].values
# form train dataset, test dataset
tube_par_X_train, tube_par_X_test, tube_eff_Y_train, tube_eff_Y_test = train_test_split(tube_par, tube_eff, random_state=33, test_size=0.2)
# normalize the data
sc_X = StandardScaler()
sc_Y = StandardScaler()
sc_tube_par_X_train = sc_X.fit_transform(tube_par_X_train)
sc_tube_par_X_test = sc_X.transform(tube_par_X_test)
sc_tube_eff_Y_train = sc_Y.fit_transform(tube_eff_Y_train)
sc_tube_eff_Y_test = sc_Y.transform(tube_eff_Y_test)
# fit rbf SVR to the sc_tube_par_X dataset
support_vector_regressor = SVR(kernel='rbf')
support_vector_regressor.fit(sc_tube_par_X_train, sc_tube_eff_Y_train)
#
# # predict new result according to the sc_tube_par_X Dataset
pre_sc_tube_eff_Y_test = support_vector_regressor.predict(sc_tube_par_X_test)
pre_tube_eff_Y_test = sc_Y.inverse_transform(pre_sc_tube_eff_Y_test)
# calculate the predict quality
print('R2-score value rbf SVR')
print(r2_score(sc_Y.inverse_transform(sc_tube_eff_Y_test), sc_Y.inverse_transform(pre_sc_tube_eff_Y_test)))
print('The mean squared error of rbf SVR is')
print(mean_squared_error(sc_Y.inverse_transform(sc_tube_eff_Y_test), sc_Y.inverse_transform(pre_sc_tube_eff_Y_test)))
print('The mean absolute error of rbf SVR is')
print(mean_absolute_error(sc_Y.inverse_transform(sc_tube_eff_Y_test), sc_Y.inverse_transform(pre_sc_tube_eff_Y_test)))
# normalize
sc_tube_par_X = sc_X.fit_transform(tube_par)
sc_tube_eff_Y = sc_Y.fit_transform(tube_eff)
scoring = ['r2','neg_mean_squared_error', 'neg_mean_absolute_error']
rbf_svr_regressor = SVR(kernel='rbf')
scores = cross_validate(rbf_svr_regressor, sc_tube_par_X, sc_tube_eff_Y, cv=5, scoring=scoring, return_train_score=False)
In case 1, the evaluation output is:
R2-score value rbf SVR
0.6486074476528559
The mean squared error of rbf SVR is
0.00013501023459497165
The mean absolute error of rbf SVR is
0.007196636233830076
In case 2, the evaluation output is:
R2-score
0.2621779727614816
test_neg_mean_squared_error
-0.6497292887710239
test_neg_mean_absolute_error
-0.5629408849740231
The difference between case 1 and case 2 is big. Could you please tell me the reason and how to correct it?
Bin.
I have prepared a little example to see how the results change using cross-validation. I recommend you try splitting the data without a seed and see how the results change.
You will see that the cross-validation results are almost constant, independently of the data split. (Note also that in your case 2 the metrics are computed on the standardized target, while in case 1 the predictions are inverse-transformed first, so the two scales are not directly comparable.)
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split,cross_validate
#from sklearn.cross_validation import train_test_split
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error
import matplotlib.pyplot as plt
def print_metrics(real_y, predicted_y):
    # calculate the prediction quality
    print('R2-score value {:>8.4f}'.format(r2_score(real_y, predicted_y)))
    print('Mean squared error is {:>8.4f}'.format(mean_squared_error(real_y, predicted_y)))
    print('Mean absolute error is {:>8.4f}\n\n'.format(mean_absolute_error(real_y, predicted_y)))
def show_plot(real_y, predicted_y):
    fig, ax = plt.subplots()
    ax.scatter(real_y, predicted_y, edgecolors=(0, 0, 0))
    ax.plot([real_y.min(), real_y.max()], [real_y.min(), real_y.max()], "k--", lw=4)
    ax.set_xlabel("Measured")
    ax.set_ylabel("Predicted")
    plt.show()
# dataset load
boston = datasets.load_boston()
#dataset info
# print(boston.keys())
# print(boston.DESCR)
# print(boston.data.shape)
# print(boston.feature_names)
# numpy_arrays
X = boston.data
Y = boston.target
# # form train dataset , test dataset
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)
#X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=33, test_size=0.2)
#X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=5, test_size=0.2)
# fit scalers
sc_X = StandardScaler().fit(X_train)
# standarizes X (train and test)
X_train = sc_X.transform(X_train)
X_test = sc_X.transform(X_test)
######################################################################
############################### SVR ##################################
######################################################################
support_vector_regressor = SVR(kernel='rbf')
support_vector_regressor.fit(X_train, Y_train)
predicted_Y = support_vector_regressor.predict(X_test)
print_metrics(Y_test, predicted_Y)
show_plot(Y_test, predicted_Y)
######################################################################
########################### LINEAR REGRESSOR #########################
######################################################################
lin_model = LinearRegression()
lin_model.fit(X_train, Y_train)
predicted_Y = lin_model.predict(X_test)
print_metrics(Y_test, predicted_Y)
show_plot(Y_test, predicted_Y)
######################################################################
######################### SVR + CROSS VALIDATION #####################
######################################################################
sc = StandardScaler().fit(X)
standarized_X = sc.transform(X)
scoring = ['r2','neg_mean_squared_error', 'neg_mean_absolute_error']
rbf_svr_regressor = SVR(kernel='rbf')
scores = cross_validate(rbf_svr_regressor, standarized_X, Y, cv=10, scoring=scoring, return_train_score=False)
print(scores["test_r2"].mean())
print(-1*(scores["test_neg_mean_squared_error"].mean()))
print(-1*(scores["test_neg_mean_absolute_error"].mean()))
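(A compatibility note from the editor: load_boston was removed in scikit-learn 1.2, so with a recent version the example needs another regression dataset, e.g.:)
from sklearn.datasets import fetch_california_housing

housing = fetch_california_housing()
X, Y = housing.data, housing.target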

Plotting history of accuracy in BaggingClassifier

I've trained a simple random forest algorithm and a bagging classifier (n_estimators = 100). Is it possible to plot the history of accuracy in BaggingClassifier? How can I calculate the variance over the 100 samples?
I've just printed the accuracy value for both algorithms:
# DecisionTree
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.90)
clf2 = tree.DecisionTreeClassifier()
clf2.fit(X_train, y_train)
pred2 = clf2.predict(X_test)
acc2 = clf2.score(X_test, y_test)
acc2 # 0.6983930778739185
# Bagging
clf3 = BaggingClassifier(tree.DecisionTreeClassifier(), max_samples=0.5, max_features=0.5,
                         n_estimators=100, verbose=2)
clf3.fit(X_train, y_train)
pred3 = clf3.predict(X_test)
acc3 = clf3.score(X_test, y_test)
acc3 # 0.911619283065513
I don't think that you can get this information from the fitted BaggingClassifier. But you can create such a plot by fitting for different n_estimators:
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn import datasets
from sklearn.model_selection import train_test_split
iris = datasets.load_iris()
X, X_test, y, y_test = train_test_split(iris.data, iris.target, test_size=0.20)
estimators = list(range(1, 20))
accuracy = []
for n_estimators in estimators:
    clf = BaggingClassifier(DecisionTreeClassifier(max_depth=1),
                            max_samples=0.2,
                            n_estimators=n_estimators)
    clf.fit(X, y)
    acc = clf.score(X_test, y_test)
    accuracy.append(acc)
plt.plot(estimators, accuracy)
plt.xlabel("Number of estimators")
plt.ylabel("Accuracy")
plt.show()
(Of course, the iris dataset is easily fit with just a single DecisionTreeClassifier, so I set max_depth=1 in this example.)
For a statistically meaningful result, you should fit a BaggingClassifier multiple times for each n_estimators and take the average of the obtained accuracies.
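A minimal sketch of that averaging (my own addition, continuing the snippet above; n_repeats is an arbitrary choice), which also yields the per-n_estimators standard deviation the question asks about:
import numpy as np

n_repeats = 10
mean_acc, std_acc = [], []
for n_estimators in estimators:
    accs = []
    for _ in range(n_repeats):
        clf = BaggingClassifier(DecisionTreeClassifier(max_depth=1),
                                max_samples=0.2,
                                n_estimators=n_estimators)
        clf.fit(X, y)
        accs.append(clf.score(X_test, y_test))
    mean_acc.append(np.mean(accs))
    std_acc.append(np.std(accs))

# Error bars show the spread of accuracy across the repeated fits.
plt.errorbar(estimators, mean_acc, yerr=std_acc)
plt.xlabel("Number of estimators")
plt.ylabel("Accuracy")
plt.show()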

LeaveOneOut to determine k of knn

I want to know the best k for k-nearest-neighbor. I am using LeaveOneOut to divide my data into train and test sets. In the code below I have 150 data entries, so I get 150 different train and test sets. k should be between 1 and 40.
I want to plot the cross-validation average classification error as a function of k, to see which k is the best for KNN.
Here is my code:
import scipy.io as sio
import seaborn as sn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import LeaveOneOut
error = []
array = np.array(range(1,41))
dataset = pd.read_excel('Data/iris.xls')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 4].values
loo = LeaveOneOut()
loo.get_n_splits(X)
for train_index, test_index in loo.split(X):
    #print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    #print(X_train, X_test, y_train, y_test)
    for i in range(1, 41):
        classifier = KNeighborsClassifier(n_neighbors=i)
        classifier.fit(X_train, y_train)
        y_pred = classifier.predict(X_test)
        error.append(np.mean(y_pred != y_test))
plt.figure(figsize=(12, 6))
plt.plot(range(1, 41), error, color='red', linestyle='dashed', marker='o', markerfacecolor='blue', markersize=10)
plt.title('Error Rate K Value')
plt.xlabel('K Value')
plt.ylabel('Mean Error')
You are calculating the error at each prediction; that's why you have 6000 points (150 splits × 40 values of k) in your error array. You need to collect the predictions of all points in the folds for a given n_neighbors and then calculate the error for that value.
You can do this:
# Loop over possible values of "n_neighbors"
for i in range(1, 41):
    # Collect the actual and predicted values over all splits for a single "n_neighbors"
    actual = []
    predicted = []
    for train_index, test_index in loo.split(X):
        #print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        classifier = KNeighborsClassifier(n_neighbors=i)
        classifier.fit(X_train, y_train)
        y_pred = classifier.predict(X_test)
        # Append the single prediction and actual value here.
        actual.append(y_test[0])
        predicted.append(y_pred[0])
    # Outside the inner loop, calculate the error for this value of k.
    error.append(np.mean(np.array(predicted) != np.array(actual)))
Rest of your code is okay.
There is a more compact way to do this if you use cross_val_predict:
from sklearn.model_selection import cross_val_predict
for i in range(1, 41):
    classifier = KNeighborsClassifier(n_neighbors=i)
    y_pred = cross_val_predict(classifier, X, y, cv=loo)
    error.append(np.mean(y_pred != y))
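With cross_val_predict and LeaveOneOut, each entry of y_pred is the prediction for that sample when it was held out, so np.mean(y_pred != y) is exactly the leave-one-out classification error; the plotting code from the question can then be reused unchanged.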
