How to use a fitted model to predict tomorrow's value? - python

I am learning time series forecasting. The code below works, but the prediction is always compared against the test data. Given today's values of the predictors X, how can I predict tomorrow's price of Y (MSFT)? That is the main goal of the prediction. Thanks
import numpy as np
import pandas as pd
import pandas_datareader.data as web
# Error Metrics
from sklearn.metrics import mean_squared_error
# Time series Models
from statsmodels.tsa.arima_model import ARIMA  # removed in newer statsmodels; there use statsmodels.tsa.arima.model.ARIMA
from matplotlib import pyplot
from datetime import datetime, timedelta
# Disable the warnings
import warnings
warnings.filterwarnings('ignore')
stk_tickers = ['MSFT', 'IBM', 'GOOGL']
ccy_tickers = ['DEXJPUS', 'DEXUSUK']
idx_tickers = ['SP500', 'DJIA', 'VIXCLS']
start_date, end_date = '2015-01-01', '2020-12-31'  # example range; define these before downloading
stk_data = web.DataReader(stk_tickers, 'yahoo', start_date, end_date)
ccy_data = web.DataReader(ccy_tickers, 'fred')
idx_data = web.DataReader(idx_tickers, 'fred')
return_period = 5
Y = np.log(stk_data.loc[:, ('Adj Close', 'MSFT')]).diff(return_period).shift(-return_period)
Y.name = Y.name[-1]+'_pred'
X1 = np.log(stk_data.loc[:, ('Adj Close', ('GOOGL', 'IBM'))]).diff(return_period)
X1.columns = X1.columns.droplevel()
X2 = np.log(ccy_data).diff(return_period)
X3 = np.log(idx_data).diff(return_period)
X4 = pd.concat([np.log(stk_data.loc[:, ('Adj Close', 'MSFT')]).diff(i) for i in [return_period, return_period*3, return_period*6, return_period*12]], axis=1).dropna()
X4.columns = ['MSFT_DT', 'MSFT_3DT', 'MSFT_6DT', 'MSFT_12DT']
X = pd.concat([X1, X2, X3, X4], axis=1)
dataset = pd.concat([Y, X], axis=1).dropna().iloc[::return_period, :]
Y = dataset.loc[:, Y.name]
X = dataset.loc[:, X.columns]
validation_size = 0.2
#In case the data does not depend on the time order, train and test can be split randomly
# seed = 7
# X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=validation_size, random_state=seed)
#In case the data is dependent on the time series, the train/test split should be done on sequential samples
#This can be done by selecting an arbitrary split point in the ordered list of observations and creating two new datasets.
train_size = int(len(X) * (1-validation_size))
X_train, X_test = X[0:train_size], X[train_size:len(X)]
Y_train, Y_test = Y[0:train_size], Y[train_size:len(X)]
X_train_ARIMA=X_train.loc[:, ['GOOGL', 'IBM', 'DEXJPUS', 'SP500', 'DJIA', 'VIXCLS']]
X_test_ARIMA=X_test.loc[:, ['GOOGL', 'IBM', 'DEXJPUS', 'SP500', 'DJIA', 'VIXCLS']]
tr_len = len(X_train_ARIMA)
te_len = len(X_test_ARIMA)
to_len = len (X)
modelARIMA=ARIMA(endog=Y_train,exog=X_train_ARIMA,order=[2,0,1])
model_fit = modelARIMA.fit()
error_Training_ARIMA = mean_squared_error(Y_train, model_fit.fittedvalues)
predicted = model_fit.predict(start = tr_len -1 ,end = to_len -1, exog = X_test_ARIMA)[1:]
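No answer was posted for this question here, so the following is only a minimal sketch of one way to get a genuine out-of-sample prediction, assuming the old statsmodels.tsa.arima_model API used above (where ARIMAResults.forecast returns a (forecast, stderr, conf_int) tuple). Note that Y in this code is the return_period-day forward log return of MSFT, not a price, so the forecast has to be turned back into a price level by multiplying today's price by exp(predicted return).
# minimal sketch: refit on all available rows, then forecast one step ahead
# using the most recent exogenous values (assumed to be "today's" observations)
X_all_ARIMA = X.loc[:, ['GOOGL', 'IBM', 'DEXJPUS', 'SP500', 'DJIA', 'VIXCLS']]
model_full = ARIMA(endog=Y, exog=X_all_ARIMA, order=[2, 0, 1]).fit()
next_exog = X_all_ARIMA.iloc[[-1]]                                    # latest available predictor row
fc, stderr, conf_int = model_full.forecast(steps=1, exog=next_exog)   # old-API return signature
predicted_return = fc[0]                                              # forecast forward log return for MSFT
today_price = stk_data.loc[:, ('Adj Close', 'MSFT')].iloc[-1]
print('implied MSFT price in', return_period, 'days:', today_price * np.exp(predicted_return))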

Why am I getting negative SCORE even if i am using scoring = 'neg_mean_squared_error'?

I tried the following line, apparently taken from the scikit-learn source code:
neg_mean_squared_error_scorer = make_scorer(mean_squared_error, greater_is_better=False)
However it doesn't work, and I don't see the point of using it if we are supposed to use scoring = 'neg_mean_squared_error'.
Here is the code I used:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from math import sqrt
from sklearn.metrics import \
r2_score, get_scorer, make_scorer, mean_squared_error
from sklearn.linear_model import \
Lasso, Ridge, LassoCV,LinearRegression
from sklearn.preprocessing import \
StandardScaler, PolynomialFeatures
from sklearn.model_selection import \
KFold, RepeatedKFold, GridSearchCV, \
cross_validate, train_test_split
# Features
x1 = np.linspace(-20,20,100)
x1 = np.array(x1).reshape(-1,1)
x2 = pow(x1,2)
x3 = pow(x1,3)
x4 = pow(x1,4)
x5 = pow(x1,5)
# Parameters
beta_0 = 1.75
beta_1 = 5
beta_3 = 0.05
beta_5 = -10.3
eps_mu = 0 # epsilon mean
eps_sigma = sqrt(4) # epsilon standard deviation
eps_size = 100 # epsilon size
np.random.seed(1) # Fixing a seed
eps = np.random.normal(eps_mu, eps_sigma, eps_size)
eps = np.array(eps).reshape(-1,1)
y = beta_0 + beta_1*x1 + beta_3*x3 + beta_5*x5 + eps
data = np.concatenate((y,x1,x2,x3,x4,x5), axis = 1)
X = data[:,1:6]
y = data[:,0]
alphas_to_try = np.linspace(0.00000000000000000000000001,0.002,10) ######## To modify #######
scoring = 'neg_mean_squared_error'
#scoring = (mean_squared_error, greater_is_better=False)
scorer = get_scorer(scoring)
k = 5
cv = KFold(n_splits = k)
for train_index, test_index in cv.split(data):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    sc = StandardScaler()
    X_train_scaled = sc.fit_transform(X_train)
    X_test_scaled = sc.transform(X_test)
validation_scores = []
train_scores = []
results_list = []
test_scores = []
for curr_alpha in alphas_to_try:
    regmodel = Lasso(alpha = curr_alpha)
    results = cross_validate(
        regmodel, X, y, scoring=scoring, cv=cv,
        return_train_score = True)
    validation_scores.append(np.mean(results['test_score']))
    train_scores.append(np.mean(results['train_score']))
    results_list.append(results)
    regmodel.fit(X,y)
    y_pred = regmodel.predict(X_test)
    test_scores.append(scorer(regmodel, X_test, y_test))
chosen_alpha_id = np.argmax(validation_scores)
chosen_alpha = alphas_to_try[chosen_alpha_id]
max_validation_score = np.max(validation_scores)
test_score_at_chosen_alpha = test_scores[chosen_alpha_id]
print('chosen_alpha:', chosen_alpha)
print('max_validation_score:', max_validation_score)
print('test_score_at_chosen_alpha:', test_score_at_chosen_alpha)
plt.figure(figsize = (8,8))
sns.lineplot(y = validation_scores, x = alphas_to_try, label = 'validation_data')
sns.lineplot(y = train_scores, x = alphas_to_try, label = 'training_data')
plt.axvline(x=chosen_alpha, linestyle='--')
sns.lineplot(y = test_scores, x = alphas_to_try, label = 'test_data')
plt.xlabel('alpha_parameter')
plt.ylabel(scoring)
plt.title('LASSO Regularisation')
plt.legend()
plt.show()
Why is the code not working? Why am I getting negative scores?
Output: (plot omitted)
What I am supposed to get: (reference plot omitted)
I am supposed to get something like the plot referenced above, but with MSE instead of r2 on the y axis.
As the name suggests, neg_mean_squared_error is the negative of the mean squared error, so negative scores are expected (in fact, it is positive scores that are impossible).
As to the plots, there's a bigger problem. Your train and validation scores are obtained using cross_validate, and are fine. But your test scores are obtained by fitting the regressor to the entire X, y and then scoring that on X_test, y_test, a subset of the training set! So those scores are quite optimistically biased.
A quick check on the scale of the errors: you have a degree-5 polynomial with the original feature taking values between -20 and 20. So the target takes values on the order of 10^6, and so squared errors may be expected on the order of 10^12.
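A minimal sketch of the fix described above, reusing the variable names from the question (X_train, X_test, cv, scoring, scorer and the score lists are assumed to exist as defined there): cross-validate and fit on the training split only, score on the held-out split, and remember that every score the scorer produces is -MSE.
for curr_alpha in alphas_to_try:
    regmodel = Lasso(alpha=curr_alpha)
    # cross-validate within the training data only
    results = cross_validate(regmodel, X_train, y_train, scoring=scoring,
                             cv=cv, return_train_score=True)
    validation_scores.append(np.mean(results['test_score']))   # these are -MSE values
    train_scores.append(np.mean(results['train_score']))
    regmodel.fit(X_train, y_train)                # fit on the training split, not on all of X, y
    test_scores.append(scorer(regmodel, X_test, y_test))        # unbiased held-out score, also -MSE
mse_validation = [-s for s in validation_scores]                 # flip the sign to plot plain MSE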

I'm having problems converting my forecast_out from daily to a minute forecast

Hi, I'm working on a learning model and I seem to be stuck on the forecast_out part of my scikit-learn script.
I need help creating forecasts not just daily, but hourly and even by the minute as well.
Thanks!
import pandas as pd
import datetime as dt
import pandas_datareader as reader
import fbprophet
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
end = dt.datetime.now()
start = dt.datetime(end.year-1,end.month,end.day,end.hour,end.minute)
start
df = reader.get_data_yahoo('BTC-USD',start,end)
print(df.head())
df = df[['Adj Close']]
print(df.head())
forecast_out = 30  # I want to be able to predict hourly and even by the minute as well
df['Prediction'] = df[['Adj Close']].shift(-forecast_out)
print(df.tail())
X = np.array (df.drop(['Prediction'],1))
X = X[:-forecast_out]
print (X)
y = np.array (df['Prediction'])
y = y[:-forecast_out]
print(y)
x_train, x_test, y_train, y_test = train_test_split (X,y, test_size=0.2)
svr_rbf = SVR(kernel='rbf' , C=1e3, gamma=0.1)
svr_rbf.fit(x_train, y_train)
svm_confidence = svr_rbf.score(x_test, y_test)
print ("svm confidence: ", svm_confidence)
lr = LinearRegression()
lr.fit(x_train, y_train)
lr_confidence = lr.score(x_test, y_test)
print ("lr confidence: ", lr_confidence)
x_forecast = np.array(df.drop(['Prediction'],1))[-forecast_out:]
print(x_forecast)
lr_prediction = lr.predict (x_forecast)
print(lr_prediction)
svm_prediction = svr_rbf.predict (x_forecast)
print(svm_prediction)
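No answer was posted here, but the granularity of the forecast is set by the granularity of the rows: shifting by forecast_out rows means forecast_out bars ahead, so forecasting by the minute requires minute bars rather than daily bars. pandas_datareader's Yahoo reader only returns daily data, so the sketch below assumes the yfinance package instead (not in the original code); column names and the allowed period/interval combinations depend on the yfinance version.
import yfinance as yf

# 1-minute bars; Yahoo only serves roughly the most recent week of minute data
df_min = yf.download('BTC-USD', period='7d', interval='1m')
# hourly bars are available for a longer window, e.g.:
# df_hour = yf.download('BTC-USD', period='60d', interval='60m')
df_min = df_min[['Close']]                       # 'Adj Close' may be absent for intraday data
forecast_out = 30                                # now means 30 minutes ahead, not 30 days
df_min['Prediction'] = df_min[['Close']].shift(-forecast_out)
# from here on the original train/test code applies unchanged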

What value to offer to the future dataframe when doing predictions?

I have a dataset "group_by_df" which has a "day" column and an "o3" column. I want to make predictions for the future, for example for the next 5 days. I have managed to extend the dataframe's "day" column with the following 5 days using this code:
days_predicted = 5
rng = pd.date_range(group_by_df['day'].min(), periods=len(group_by_df) + days_predicted, freq='D')
df = pd.DataFrame({'day': rng})
df[sensor_name] = group_by_df["o3"] #the current existent values for o3
df[sensor_name][len(group_by_df):] = "" #the future values
The problem is that I cannot fit the linear regression with the "o3" column in this format, since the future values are either NaN or "". Linear regression then throws this error: ValueError: could not convert string to float:
What values should I give the "o3" column so that I can make a future prediction? Here is the updated code; I have replaced the unknown "o3" values with the average of the known days. Is this a good approach?
import datetime
import datetime as dt
import numpy as np
import pandas as pd
import plotly.graph_objs as go
from distributed.deploy.ssh import bcolors
from flask_babel import _
from pandas.plotting import register_matplotlib_converters
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
pd.options.mode.chained_assignment = None # default='warn'
register_matplotlib_converters()
def create_columns(data):
    data["readable time"] = ""
    data["day"] = ""
    for i in range(0, len(data)):
        data.loc[i, ['readable time']] = datetime.datetime.fromtimestamp(
            data["time"][i]).strftime('%d/%m/%Y %H:%M:%S')
        data.loc[i, ['day']] = datetime.datetime.fromtimestamp(
            data["time"][i]).strftime('%d/%m/%Y')
def calculate_linear_regression(data, sensor_name):
    create_columns(data)  # from timeseries
    data['day'] = pd.to_datetime(data['day'], dayfirst=True)
    data = data.sort_values(by=['readable time'])
    group_by_df = pd.DataFrame([name, group.mean()[sensor_name]] for name, group in data.groupby('day'))
    group_by_df.columns = ['day', sensor_name]
    print("group by df ", group_by_df)
    group_by_df['day'] = pd.to_datetime(group_by_df['day'])
    # initial length of dataframe (before future prediction)
    initial_len_df = len(group_by_df)
    days_predicted = 5
    rng = pd.date_range(group_by_df['day'].min(), periods=len(group_by_df) + days_predicted, freq='D')
    df = pd.DataFrame({'day': rng})
    df[sensor_name] = group_by_df[sensor_name]
    df[sensor_name][len(group_by_df):] = group_by_df[sensor_name].mean()  # ""
    print("after... \n", df)
    group_by_df = df
    print("GROUP BY DF\n", group_by_df)
    group_by_df['day'] = group_by_df['day'].map(dt.datetime.toordinal)

    def split(group_by_df):
        X = group_by_df[['day']].values
        y = group_by_df[[sensor_name]].values
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=False)
        return X_train, X_test, y_train, y_test

    X_train, X_test, y_train, y_test = split(group_by_df)

    def analyse_forecast():
        print("MSE linear regression(mean squared error)",
              mean_squared_error(group_by_df[sensor_name], group_by_df['predicted']))
        print("r2 score ", r2_score(group_by_df[sensor_name], group_by_df['predicted']))
        rmse = np.sqrt(mean_squared_error(group_by_df[sensor_name], group_by_df['predicted']))
        print("RMSE for linear regression=", rmse)
        print("MSE TEST ", mean_squared_error(y_test, group_by_df['predicted'][len(X_train):]))
        print("MSE TRAIN ", mean_squared_error(y_train, group_by_df['predicted'][:len(X_train)]))
        print("r2 score TEST", r2_score(y_test, group_by_df['predicted'][len(X_train):]))
        return mean_squared_error(group_by_df[sensor_name], group_by_df['predicted'])

    def calculate_linear_reg():
        group_by_df.reset_index(inplace=True)
        mse_list = []
        model = LinearRegression()
        model.fit(X_train, y_train)
        y_pred = model.predict(group_by_df[['day']])
        group_by_df['predicted'] = y_pred
        mse_list.append(analyse_forecast())
        print(group_by_df)  # print predicted values

    calculate_linear_reg()
    return group_by_df, X_train, sensor_name, initial_len_df
def create_figure(group_by_df, X_train, sensor_name, initial_len_df):
    print("INITIAL LEN DF IS", initial_len_df)
    linear_regression_fig = go.Figure()
    # plot predicted values
    linear_regression_fig.add_trace(go.Scatter(
        x=group_by_df['day'].map(dt.datetime.fromordinal),
        y=group_by_df['predicted'],
        name=_("Linear Regression"),
        mode='lines+markers',
        marker=dict(
            color=np.where(group_by_df['day'].index < len(X_train), 'red', 'green'))))
    # plot actual values
    linear_regression_fig.add_trace(go.Scatter(
        x=group_by_df['day'].map(dt.datetime.fromordinal)[:initial_len_df],
        y=group_by_df[sensor_name][:initial_len_df],
        name=_('Actual values'),
        mode='lines+markers'))
    linear_regression_fig.update_layout(
        height=700,
        font=dict(color="grey"),
        paper_bgcolor='rgba(0,0,0,0)',
        title=_('Linear Regression for ') + _(sensor_name),
        yaxis_title=_(sensor_name),
        xaxis_title=_('Day'),
        showlegend=True)
    linear_regression_fig.show()
data="https://raw.githubusercontent.com/iulianastroia/csv_data/master/final_dataframe.csv"
data = pd.read_csv(data)
group_by_df, X_train, sensor_name, initial_len_df = calculate_linear_regression(data, "o3")
linear_reg_fig = create_figure(group_by_df, X_train, sensor_name, initial_len_df)
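No answer appears for this question here, but as a hedged alternative to filling the future rows with the mean: the future "o3" values are exactly what the regression is supposed to produce, so they do not need a placeholder at all; only the future "day" features are required. A minimal sketch, reusing the names from the code above (group_by_df with the 'day' column already converted to ordinals, sensor_name, and initial_len_df):
known = group_by_df.iloc[:initial_len_df]                      # rows with real o3 readings
model = LinearRegression()
model.fit(known[['day']].values, known[[sensor_name]].values)  # fit only on observed data
future_days = group_by_df[['day']].values[initial_len_df:]     # ordinals of the 5 future dates
future_pred = model.predict(future_days)                       # predicted o3, no placeholder needed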
``

LinearRegression given the last n rows

I'm currently working on a time-series model. It's very simple: I'm feeding in the last row's OHLC (open, high, low, close) values and trying to predict the next close. Simple and not very useful. What I want to do is give the model the last 10 days to predict tomorrow's price. I know it's not going to be accurate, but this is what I am trying to do.
Here is how I compute NextClose and apply it to the LinearRegression model:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
df = pd.read_csv("./EURUSD.csv")
days = 1
df['NextClose'] = df['Close'].shift(-days)
df = df.dropna()
total = len(df)
test_ratio = 0.30
test_size = int(total * test_ratio)
X = df[['Open', 'High', 'Low', 'Close']]
y = df[['NextClose']]
#build test and train data
X_train = X[:-test_size]
y_train = y[:-test_size]
X_test = X[-test_size:]
y_test = y[-test_size:]
# build model
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
plt.scatter(y_pred, y_test)
plt.show()
In this case, I am giving the model only the final row. What I want to do is feed it the last 10-20 rows.
I believe this is a similar data transformation to the one described in the function create_dataset on this MachineLearningMastery page (see the section LSTM for Regression Using the Window Method).
The goal is to use the data from rows t:(t+days) to predict the closing price at row (t+days)+1.
The X_train matrix will have days * X.shape[1] columns in each row, which in the example below represents the flattened data from 10 days worth of data.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
# generate random data to test
df = pd.DataFrame(np.random.normal(size=(2000, 4)))
df.columns = ['Open', 'High', 'Low', 'Close']
days = 10
df = df.dropna()
total = len(df)
test_ratio = 0.30
test_size = int(total * test_ratio)
X = df[['Open', 'High', 'Low', 'Close']]
y = df['Close'].shift(-days)
# this function based on the MachineLearningMastery page mentioned
def create_dataset(X, y, look_back=1):
    dataX, dataY = [], []
    for i in range(X.shape[0]-look_back):
        a = X.iloc[i:(i+look_back), :].values.flatten()
        dataX.append(a)
        dataY.append(y.iloc[i])
    return np.array(dataX), np.array(dataY)
#build test and train data
X_train, y_train = create_dataset(X[:-test_size], y[:-test_size], look_back=days)
X_test, y_test = create_dataset(X[-test_size:], y[-test_size:], look_back=days)
# build model
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
plt.scatter(y_pred, y_test)
plt.show()
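As a follow-up sketch for the original goal (tomorrow's close from the most recent window), once lr is fitted you can flatten the last `days` rows exactly the way create_dataset flattens each training window; this reuses the X, days, and lr names from the answer above.
latest_window = X.iloc[-days:].values.flatten().reshape(1, -1)  # last `days` rows as one sample
next_close = lr.predict(latest_window)[0]                       # model's estimate of the next close
print('predicted next close:', next_close)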

Why is my output dataframe shape not 1459 x 2 but 1460 x 2

Below is what I have done so far.
#importing the necessary modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.linear_model import ElasticNetCV
from sklearn.ensemble import RandomForestRegressor
filepath = r"C:\Users...Kaggle data\house prediction iowa\house_predtrain (3).csv"
train = pd.read_csv(filepath)
print(train.shape)
filepath2 = r"C:\Users...Kaggle data\house prediction iowa\house_predtest (1).csv"
test = pd.read_csv (filepath2)
print(test.shape)
#first we replace all the NaNs with 0 in both the train and test data
train = train.fillna(0)
test = test.fillna(0) #error one
train.dtypes.value_counts()
#isolating all the object/categorical feature and converting them to numeric features
encode_cols = train.dtypes[train.dtypes == np.object]
encode_cols2 = test.dtypes[test.dtypes == np.object]
#print(encode_cols)
encode_cols = encode_cols.index.tolist()
encode_cols2 = encode_cols2.index.tolist()
print(encode_cols2)
# Do the one hot encoding
train_dummies = pd.get_dummies(train, columns=encode_cols)
test_dummies = pd.get_dummies(test, columns=encode_cols2)
#align your test and train data (error2)
train, test = train_dummies.align(test_dummies, join = 'left', axis = 1)
print(train.shape)
print(test.shape)
#Now working with Floats features
numericals_floats = train.dtypes == np.float
numericals = train.columns[numericals_floats]
print(numericals)
#we check for skewness in the float data
skew_limit = 0.35
skew_vals = train[numericals].skew()
skew_cols = (skew_vals
.sort_values(ascending=False)
.to_frame()
.rename(columns={0:'Skewness'}))
skew_cols
#Visualising them above data before and after log transforming
%matplotlib inline
field = 'GarageYrBlt'
fig, (ax_before, ax_after) = plt.subplots(1, 2, figsize=(10,5))
train[field].hist(ax=ax_before)
train[field].apply(np.log1p).hist(ax=ax_after)
ax_before.set (title = 'Before np.log1p', ylabel = 'frequency', xlabel = 'Value')
ax_after.set (title = 'After np.log1p', ylabel = 'frequency', xlabel = 'Value')
fig.suptitle('Field: "{}"'.format (field));
#note how applying log transformation on GarageYrBuilt does not do much
print(skew_cols.index.tolist()) #returns a list of the values
for i in skew_cols.index.tolist():
    if i == "SalePrice":  # we do not want to transform the feature to be predicted
        continue
    train[i] = train[i].apply(np.log1p)
    test[i] = test[i].apply(np.log1p)
feature_cols = [x for x in train.columns if x != ('SalePrice')]
X_train = train[feature_cols]
y_train = train['SalePrice']
X_test = test[feature_cols]
y_test = train['SalePrice']
print(X_test.shape)
print(y_train.shape)
print(X_train.shape)
#now to the most fun part. Feature engineering is over!!!
#i am going to use linear regression, L1 regularization, L2 regularization and ElasticNet(blend of L1 and L2)
#first up, Linear Regression
alphas = [0.00005, 0.0005, 0.005, 0.05, 0.5, 0.1, 0.3, 1, 3, 5, 10, 25, 50, 100]  # I chose these
l1_ratios = np.linspace(0.1, 0.9, 9)
#LinearRegression
linearRegression = LinearRegression().fit(X_train, y_train)
prediction1 = linearRegression.predict(X_test)
LR_score = linearRegression.score(X_train, y_train)
print(LR_score)
#ridge
ridgeCV = RidgeCV(alphas=alphas).fit(X_train, y_train)
prediction2 = ridgeCV.predict(X_test)
R_score = ridgeCV.score(X_train, y_train)
print(R_score)
#lasso
lassoCV = LassoCV(alphas=alphas, max_iter=1e2).fit(X_train, y_train)
prediction3 = lassoCV.predict(X_test)
L_score = lassoCV.score(X_train, y_train)
print(L_score)
#elasticNetCV
elasticnetCV = ElasticNetCV(alphas=alphas, l1_ratio=l1_ratios, max_iter=1e2).fit(X_train, y_train)
prediction4 = elasticnetCV.predict(X_test)
EN_score = elasticnetCV.score(X_train, y_train)
print(EN_score)
from sklearn.ensemble import RandomForestRegressor
randfr = RandomForestRegressor()
randfr = randfr.fit(X_train, y_train)
prediction5 = randfr.predict(X_test)
print(prediction5.shape)
RF_score = randfr.score(X_train, y_train)
print(RF_score)
#putting it all together
rmse_vals = [LR_score, R_score, L_score, EN_score, RF_score]
labels = ['Linear', 'Ridge', 'Lasso', 'ElasticNet', 'RandomForest']
rmse_df = pd.Series(rmse_vals, index=labels).to_frame()
rmse_df.rename(columns={0: 'SCORES'}, inplace=1)
rmse_df
KaggleHouse_submission_1 = pd.DataFrame({'Id': test.Id, 'SalePrice': prediction5})
KaggleHouse_submission_1 = KaggleHouse_submission_1
print(KaggleHouse_submission_1.shape)
In the Kaggle house prediction challenge there is a train dataset and a test dataset; here is the link to the actual data. The output dataframe shape should be 1459 x 2, but mine is 1460 x 2 for some reason. I am not sure why this is happening. Any feedback is highly appreciated.
In the following line:
test = train.fillna(0)
you are assigning (overwriting) the test variable with the train data, which is why the output ends up with the train set's 1460 rows instead of 1459.
Scikit-learn is also very sensitive to the ordering of columns, so if your train data set and test data set are misaligned, you may have a problem similar to the one above. You first need to ensure that the test data is encoded the same way as the train data by using the following align command:
train, test = train_dummies.align(test_dummies, join='left', axis = 1)
see changes in my code above
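A minimal sketch of the two fixes together, reusing the names from the question (the fill_value argument to align is an addition so that dummy columns missing from one frame become 0 rather than NaN):
train = train.fillna(0)
test = test.fillna(0)                  # keep test as the test data, do not overwrite it with train
train_dummies = pd.get_dummies(train, columns=encode_cols)
test_dummies = pd.get_dummies(test, columns=encode_cols2)
# left-align on the train columns; columns absent from test are filled with 0
train, test = train_dummies.align(test_dummies, join='left', axis=1, fill_value=0)
print(train.shape)                     # train keeps its 1460 rows
print(test.shape)                      # test keeps its 1459 rows, matching the expected submission size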
