To plot date on X-axis for actual vs predicted values - python

I have a time-series dataset which looks like below:
[image: time-series dataset]
I am using linear regression, for which I have to plot the original and predicted values. This was the plot I got (image omitted).
Below is my code:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

tss = TimeSeriesSplit(n_splits=5)
df = df.sort_index()
fold = 0
preds = []
scores = []
for train_idx, val_idx in tss.split(df):
    train = df.iloc[train_idx].dropna()
    test = df.iloc[val_idx].dropna()
    FEATURES = ['7day_rolling_avg','Lag_1']
    TARGET = 'Liquid Lvl % C'
    X_train = train[FEATURES]
    y_train = train[TARGET]
    X_test = test[FEATURES]
    y_test = test[TARGET]
    ################################################################
    model = LinearRegression()
    model.fit(X_train, y_train)
    score = model.score(X_train, y_train)
    print("Training score: ", score)
    scores = cross_val_score(model, X_train, y_train, cv=3)
    print("Mean cross-validation score: %.2f" % scores.mean())
    ypred = model.predict(X_test)
    mse = mean_squared_error(y_test, ypred)
    print("MSE: %.2f" % mse)
    print("RMSE: %.2f" % (mse**(1/2.0)))
    #x_ax = range(len(y_test))
    #plt.plot(x_ax, y_test, label="original")
    #plt.plot(x_ax, ypred, color='red', label="Linear/predicted")
    #plt.title("Data Prediction")
    #plt.legend()
    #plt.show()
I want the date to be plotted on my X-axis. There was a similar Stack Overflow question, How to plot predicted values with actual values when we have a multi-index, but it didn't work for me. This is what I tried:
fig=plt.figure()
c=df.to_list()
plt.plot(c,y_test)
plt.plot(c,y_pred)
plt.show()
ax.plot(df)
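For what it's worth, a minimal sketch of one way to get dates on the X-axis, assuming df has a DatetimeIndex (so the y_test slice above keeps its dates) and using the ypred array from the last fold:

fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(y_test.index, y_test, label="original")
ax.plot(y_test.index, ypred, color="red", label="Linear/predicted")
ax.set_xlabel("Date")
ax.set_ylabel("Liquid Lvl % C")
ax.legend()
fig.autofmt_xdate()  # tilt the date labels so they do not overlap
plt.show()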

Related

How to visualize cluster boundaries

I generated several datasets, and using classifiers, I predicted the distribution of clusters. I need to draw boundaries between clusters on the chart. In the form of lines or in the form of filled areas - it does not matter. Please let me know if there is any way to do this.
My code:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import make_moons, make_circles
from sklearn.model_selection import train_test_split

n_sample = 2000

def make_square(n_sample):
    # use a plain list instead of np.array([0, []]), which builds a ragged
    # object array and fails on recent numpy versions
    data = [np.random.sample((n_sample, 2)), []]
    for i in range(n_sample):
        if data[0][i][0] > 0.5 and data[0][i][1] > 0.5 or data[0][i][0] < 0.5 and data[0][i][1] < 0.5:
            data[1].append(1)
        else:
            data[1].append(0)
    return data

datasets = [
    make_circles(n_samples=n_sample, noise=0.09, factor=0.5),
    make_square(n_sample),
    make_moons(n_samples=n_sample, noise=0.12),
]

ks = []
for data in datasets:
    X, y = data[0], data[1]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=33)
    classifier = KNeighborsClassifier(n_neighbors=1)
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    acc = classifier.score(X_test, y_test)
    accs = []
    for i in range(1, 8):
        knn = KNeighborsClassifier(n_neighbors=i)
        knn.fit(X_train, y_train)
        pred_i = knn.predict(X_test)
        acc0 = knn.score(X_test, y_test)
        accs.append(acc0)
    plt.figure(figsize=(12, 6))
    plt.plot(range(1, 8), accs, color='red', linestyle='dashed', marker='o',
             markerfacecolor='blue', markersize=10)
    plt.title('accs Score K Value')
    plt.xlabel('K Value')
    plt.ylabel('accs Score')
    print("Max Score:", max(accs), "k=", accs.index(max(accs)) + 1)
    ks.append(accs.index(max(accs)) + 1)

for i in range(3):
    data = datasets[i]
    k = ks[i]
    X, y = data[0], data[1]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=33)
    classifier = KNeighborsClassifier(n_neighbors=k)
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    plt.figure(figsize=(9, 9))
    plt.title("Test")
    plt.scatter(X_test[:, 0], X_test[:, 1], c=y_test)
    plt.figure(figsize=(9, 9))
    plt.title("Predict")
    plt.scatter(X_test[:, 0], X_test[:, 1], c=y_pred)
Example output: (images omitted)
scikit-learn 1.1 introduced the DecisionBoundaryDisplay to assist with this sort of task.
Following the use of make_moons and the KNeighborsClassifier in the question, we can fit the classifier on the dataset, invoke the DecisionBoundaryDisplay.from_estimator() method, then scatter the X data on the returned axis:
import matplotlib.pyplot as plt
from sklearn.datasets import make_moons
from sklearn.neighbors import KNeighborsClassifier
from sklearn.inspection import DecisionBoundaryDisplay
X, y = make_moons(noise=0.2)
clf = KNeighborsClassifier().fit(X, y)
disp = DecisionBoundaryDisplay.from_estimator(clf, X, response_method="predict", alpha=0.3)
disp.ax_.scatter(X[:, 0], X[:, 1], c=y)
plt.show()
Resulting in something like this (image omitted).

Forecasting that can predict future data (for example, day x)

I have a time-series dataset. I have used cross-validation and the XGBRegressor model. Now I want to forecast my prediction for a particular day x.
As per my understanding, any fundamental ML prediction model can be expressed as y = a·f(x) + b, where x is the input, b the bias, and y the prediction.
I have already trained the model, so in my case x will be the time vector. The predicted forecast output, say Y, already contains all the forecasts for the x days/months, which include everything.
So the only thing left for me is to filter out of the data what I need.
I am trying to write a function that takes Y as its argument and filters out, for example, everything below 20%, or the forecast on day 36. Can someone explain how I can write this?
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import xgboost as xgb

tss = TimeSeriesSplit(n_splits=3, test_size=24*365*1, gap=24)
df3 = df3.sort_index()
fold = 0
preds = []
scores = []
for train_idx, val_idx in tss.split(df3):
    train = df3.iloc[train_idx].dropna()
    test = df3.iloc[val_idx].dropna()
    FEATURES = ['7day_rolling_avg','Lag_1']
    TARGET = 'Liquid Lvl % C'
    X_train = train[FEATURES]
    y_train = train[TARGET]
    X_test = test[FEATURES]
    y_test = test[TARGET]
    ################################################################
    xgbr = xgb.XGBRegressor(verbosity=0)
    print(xgbr)
    # printed output:
    # XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
    #              colsample_bynode=1, colsample_bytree=1, gamma=0,
    #              importance_type='gain', learning_rate=0.1, max_delta_step=0,
    #              max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
    #              n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
    #              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
    #              silent=None, subsample=1, verbosity=1)
    xgbr.fit(X_train, y_train)
    score = xgbr.score(X_train, y_train)
    print("Training score: ", score)
    scores = cross_val_score(xgbr, X_train, y_train, cv=3)
    print("Mean cross-validation score: %.2f" % scores.mean())
    ypred = xgbr.predict(X_test)
    mse = mean_squared_error(y_test, ypred)
    print("MSE: %.2f" % mse)
    print("RMSE: %.2f" % (mse**(1/2.0)))
    x_ax = range(len(y_test))
    plt.plot(x_ax, y_test, label="original")
    plt.plot(x_ax, ypred, label="predicted")
    plt.title("Data Prediction")
    plt.legend()
    plt.show()
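A minimal sketch of such a filter, assuming the forecasts are collected in a pandas Series indexed by timestamp (for example, built from ypred and y_test.index in the loop above); the 20% threshold and day 36 are the examples from the question:

import pandas as pd

def filter_forecast(Y, threshold=None, day=None):
    # Y: pandas Series of forecasts with a DatetimeIndex
    result = Y
    if threshold is not None:
        result = result[result < threshold]  # e.g. keep forecasts below 20
    if day is not None:
        start = Y.index.min().normalize()
        wanted = start + pd.Timedelta(days=day - 1)  # day 36 of the forecast
        result = result[result.index.normalize() == wanted]
    return result

# Example usage with the loop variables from the question:
# Y = pd.Series(ypred, index=y_test.index)
# print(filter_forecast(Y, threshold=20.0))  # everything below 20%
# print(filter_forecast(Y, day=36))          # the forecast on day 36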

How to add outliers to a Linear Regression dataset?

I am trying to play around to see how outliers in a dataset might affect a Linear Regression model. The issue I'm having is I don't exactly know how to add outliers to a dataset, I've only found loads of articles online about how to detect and remove them.
This is the code I have so far:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.datasets import make_regression
# Generate regression dataset
X, y = make_regression(
n_samples=1000,
n_features=1,
noise=0.0,
bias=0.0,
random_state=42,
)
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42
)
regressor = LinearRegression()
regressor.fit(X_train, y_train) # Training the algorithm
y_pred = regressor.predict(X_test)
print("R2 Score:", metrics.r2_score(y_test, y_pred))
print("Mean Absolute Error:", metrics.mean_absolute_error(y_test, y_pred))
print("Mean Squared Error:", metrics.mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error:", np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
plt.scatter(X_test, y_test)
plt.plot(X_test, y_pred, color="red", linewidth=1)
plt.show()
And this is the output:
My question is how can I add outliers to this clean dataset in order to see the effects outliers will have on the resulting model?
Any help would be appreciated, thanks!
You can add values directly to X and y. Since the slope is significant enough, this will end up giving you outliers. You could use any method you want, really.
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.datasets import make_regression
# Generate regression dataset
X, y = make_regression(
n_samples=1000,
n_features=1,
noise=0.0,
bias=0.0,
random_state=42,
)
for x in range(20):
    X = np.append(X, np.random.choice(X.flatten()))
    y = np.append(y, np.random.choice(y.flatten()))
X = X.reshape(-1, 1)
y = y.reshape(-1, 1)
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42
)
regressor = LinearRegression()
regressor.fit(X_train, y_train) # Training the algorithm
y_pred = regressor.predict(X_test)
print("R2 Score:", metrics.r2_score(y_test, y_pred))
print("Mean Absolute Error:", metrics.mean_absolute_error(y_test, y_pred))
print("Mean Squared Error:", metrics.mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error:", np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
plt.scatter(X_test, y_test)
plt.plot(X_test, y_pred, color="red", linewidth=1)
plt.show()
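If you want more control over how extreme the outliers are, an alternative sketch (my own variation, not part of the answer above) is to push a handful of target values far off the regression line before splitting; it assumes the 1-D y returned by make_regression, before any reshape:

rng = np.random.default_rng(0)
idx = rng.choice(len(y), size=10, replace=False)          # pick 10 random points
y[idx] += rng.choice([-1.0, 1.0], size=10) * 5 * y.std()  # shift them ~5 std devs off the line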

ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?)

Trying to get a prediction using my decision tree model gives the titular error on the final line of code.
X=BTC_cleanData[-1:]
---> print(regressor.predict(X))
ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),
(k,m?)->(n?,m?) (size 145 is different from 146)
As far as I can tell, I've successfully trained and tested the model, but I'm doing something wrong when I attempt to output a prediction. I think the way I am defining the target to be predicted is adding a column to a matrix somewhere, hence the matmul error. How do I write a prediction function that works?
Here's the full code; I've left out the feature selection as it's very long:
import pandas as pd
import numpy as np
import talib
import matplotlib.pyplot as plt
%matplotlib inline
import investpy
from investpy import data
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
#Import open, high, low, close, volume and Return data from csv using investpy
BTC = data = investpy.get_crypto_historical_data(crypto='bitcoin', from_date='01/01/2014', to_date='06/08/2020')
#Convert Data from Int to Float
BTC.Volume = BTC.Volume.astype(float)
BTC.High = BTC.High.astype(float)
BTC.Low = BTC.Low.astype(float)
BTC.Close = BTC.Close.astype(float)
#Drop Unnecessary Columns
del BTC['Currency']
#Select Indicators as Features
BTC['AD'] = talib.AD(BTC['High'].values, BTC['Low'].values, BTC['Close'].values, BTC['Volume'].values)
...(there is a long list here)
#Create forward looking columns using shift
BTC['NextDayPrice'] = BTC['Close'].shift(-1)
#Copy dataframe and clean data
BTC_cleanData = BTC.copy()
BTC_cleanData.dropna(inplace=True)
BTC_cleanData.to_csv('C:/Users/Admin/Desktop/BTCdata.csv')
#Split Data into Training and Testing Set
#separate the features and targets into separate datasets.
#split the data into training and testing sets using a 70/30 split
#Using splicing, separate the features from the target into individual data sets.
X_all = BTC_cleanData.iloc[:, BTC_cleanData.columns != 'NextDayPrice'] # feature values for all days
y_all = BTC_cleanData['NextDayPrice'] # corresponding targets/labels
print (X_all.head()) # print the first 5 rows
#Split the data into training and testing sets using the given feature as the target
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.30, random_state=42)
from sklearn.linear_model import LinearRegression
#Create a linear regression model and fit it to the training set
regressor = LinearRegression()
regressor.fit(X_train,y_train)
print ("Training set: {} samples".format(X_train.shape[0]))
print ("Test set: {} samples".format(X_test.shape[0]))
#Evaluate Model (out of sample Accuracy and Mean Squared Error)
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
scores = cross_val_score(regressor, X_test, y_test, cv=10)
print ("accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() / 2))
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_test, regressor.predict(X_test))
print("MSE: %.4f" % mse)
#Evaluate Model (In sample Accuracy and Mean Squared Error)
trainscores = cross_val_score(regressor, X_train, y_train, cv=10)
print ("accuracy: %0.2f (+/- %0.2f)" % (trainscores.mean(), trainscores.std() / 2))
mse = mean_squared_error(y_train, regressor.predict(X_train))
print("MSE: %.4f" % mse)
print(regressor.predict(X_train))
#Predict Next Day Price
X=BTC_cleanData[-1:]
print(regressor.predict(X))
You have trained your model using the X_train data. To predict on unseen data, you just need print(regressor.predict(X_test)).
Before, you had:
X=BTC_cleanData[-1:] # this has one more column compared to X_train and X_test
print(regressor.predict(X))
But BTC_cleanData[-1:] has one more column (the NextDayPrice target) compared to X_train and X_test. The model was trained on X_train, which does not have this additional column, and that leads to the error.
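If the goal is specifically a next-day prediction from the most recent row, a minimal sketch is to drop the target column from that row first, mirroring how X_all is built in the code below:

# keep only the feature columns of the last row, as X_all does
last_row = BTC_cleanData.iloc[-1:, BTC_cleanData.columns != 'NextDayPrice']
print(regressor.predict(last_row))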
Clean working code:
import pandas as pd
import numpy as np
import talib
import matplotlib.pyplot as plt
%matplotlib inline
import investpy
from investpy.crypto import get_crypto_historical_data
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
#Import open, high, low, close, volume and Return data from csv using investpy
BTC = get_crypto_historical_data(crypto='bitcoin', from_date='01/01/2014', to_date='06/08/2020')
#Convert Data from Int to Float
BTC.Volume = BTC.Volume.astype(float)
BTC.High = BTC.High.astype(float)
BTC.Low = BTC.Low.astype(float)
BTC.Close = BTC.Close.astype(float)
#Drop Unnecessary Columns
del BTC['Currency']
#Select Indicators as Features
BTC['AD'] = talib.AD(BTC['High'].values, BTC['Low'].values, BTC['Close'].values, BTC['Volume'].values)
#Create forward looking columns using shift
BTC['NextDayPrice'] = BTC['Close'].shift(-1)
#Copy dataframe and clean data
BTC_cleanData = BTC.copy()
BTC_cleanData.dropna(inplace=True)
#BTC_cleanData.to_csv('C:/Users/Admin/Desktop/BTCdata.csv')
#Split Data into Training and Testing Set
#separate the features and targets into separate datasets.
#split the data into training and testing sets using a 70/30 split
#Using splicing, separate the features from the target into individual data sets.
X_all = BTC_cleanData.iloc[:, BTC_cleanData.columns != 'NextDayPrice'] # feature values for all days
y_all = BTC_cleanData['NextDayPrice'] # corresponding targets/labels
print (X_all.head()) # print the first 5 rows
#Split the data into training and testing sets using the given feature as the target
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.30, random_state=42)
#Create a linear regression model and fit it to the training set
regressor = LinearRegression()
regressor.fit(X_train,y_train)
print ("Training set: {} samples".format(X_train.shape[0]))
print ("Test set: {} samples".format(X_test.shape[0]))
#Evaluate Model (out of sample Accuracy and Mean Squared Error)
scores = cross_val_score(regressor, X_test, y_test, cv=10)
print ("accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() / 2))
mse = mean_squared_error(y_test, regressor.predict(X_test))
print("MSE: %.4f" % mse)
#Evaluate Model (In sample Accuracy and Mean Squared Error)
trainscores = cross_val_score(regressor, X_train, y_train, cv=10)
print ("accuracy: %0.2f (+/- %0.2f)" % (trainscores.mean(), trainscores.std() / 2))
mse = mean_squared_error(y_train, regressor.predict(X_train))
print("MSE: %.4f" % mse)
print(regressor.predict(X_train))
#Predict Next Day Price
print(regressor.predict(X_test))

I am having out-of-bounds error issues with Yeast Data in Python. Why?

# -*- coding: utf-8 -*-
"""
Created on Wed Apr 26 21:28:31 2017
#author: Chirantan
"""
import pandas
from pandas.tools.plotting import scatter_matrix
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
# Load dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/yeast/yeast.data"
names = ['Sequence Name','mcg', 'gvh', 'alm', 'mit', 'erl','pox','vac','nuc']
dataset = pandas.read_csv(url, names=names, delim_whitespace=True)
# shape
print(dataset.shape)
# head
print(dataset.head(20))
# descriptions
print(dataset.describe())
# class distribution
#print(dataset.groupby('').size())
# box and whisker plots
dataset.plot(kind='box', subplots=True, layout=(10,10), sharex=False, sharey=False)
plt.show()
# histograms
dataset.hist()
plt.show()
# scatter plot matrix
scatter_matrix(dataset)
plt.show()
# Split-out validation dataset
array = dataset.values
X = array[:,0:9]
Y = array[:,9]#HERE IS THE ERROR
validation_size = 0.20
seed = 7
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(X, Y, test_size=validation_size, random_state=seed)
# Test options and evaluation metric
seed = 7
scoring = 'accuracy'
# Spot Check Algorithms
models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))
# evaluate each model in turn
results = []
names = []
for name, model in models:
    kfold = model_selection.KFold(n_splits=10, random_state=seed)
    cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
# Compare Algorithms
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()
# Make predictions on validation dataset
#knn = KNeighborsClassifier()
svm = SVC()
svm.fit(X_train, Y_train)
predictions = svm.predict(X_validation)
#knn.fit(X_train, Y_train)
#predictions = knn.predict(X_validation)
print(accuracy_score(Y_validation, predictions))
print(confusion_matrix(Y_validation, predictions))
print(classification_report(Y_validation, predictions))
I am trying to use different classifiers for the multi-class yeast dataset from the UCI repository. Everything works fine with the above code on the Iris dataset, with only the following change:
# Split-out validation dataset
array = dataset.values
X = array[:,0:4]
Y = array[:,4]
validation_size = 0.20
But it is not working with the Yeast dataset when I do this
# Split-out validation dataset
array = dataset.values
X = array[:,0:9]
Y = array[:,9]
validation_size = 0.20
Here is the error messaage
File "<ipython-input-40-707d4eef8576>", line 55, in <module>
Y = array[:,9]
IndexError: index 9 is out of bounds for axis 1 with size 9
I do not understand this. array stores the values of the dataset, and array[:,9] should give me the last column. Where am I wrong? Please help.
array does not have a column with index 9. It has 9 columns, and the last one has index 8 (because the first column has index 0).
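A minimal fix sketch: index the last column negatively, so the slice no longer depends on the exact column count:

array = dataset.values
X = array[:, :-1]  # all columns except the last
Y = array[:, -1]   # the last column (index 8 here)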
