LSTM neural network for multi-step time series prediction - Python

I am trying to develop a model that forecasts two time steps ahead.
To that end I modified GitHub code for a single-step forecast: I wrote a load_data function that takes n steps backward in the X_train/test series and sets them against a y_train/test 2-array, I set the neurons list so that the final Dense layer outputs a 2-vector, and finally I wrote a predict function and a plot function for the 2-step forecast.
I have not normalized the features, labels, and forecasts separately yet; I will do that in the future.
After a bit of hyperparameter tuning it returns a good score for the MSE and RMSE:
Train Score: 0.00000 MSE (0.00 RMSE)
Test Score: 0.00153 MSE (0.04 RMSE)
It captures the trend quite well, but all of its forecasts have a negative direction.
Does anyone have a suggestion?
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt2
import pandas as pd
import math, time
import itertools
from sklearn import preprocessing
import datetime
from sklearn.metrics import mean_squared_error
from math import sqrt
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.recurrent import LSTM
from keras.models import load_model
import keras
from numpy import newaxis
pd.core.common.is_list_like = pd.api.types.is_list_like  # compatibility shim for older pandas_datareader releases
import pandas_datareader.data as web
import h5py
from keras import backend as K
import quandl
quandl.ApiConfig.api_key = 'myQuandlKey'
seq_len = 2
shape = [seq_len, 9, 2]
neurons = [256, 256, 64, 2]
dropout = 0.2
decay = 0.5
epochs = 100
stock_name = 'AAPL'
global_start_time = time.time()
def get_stock_data(stock_name, normalize=True, ma=[]):
    """
    Return a dataframe of that stock and normalize all the values.
    (Optional: create moving averages)
    """
    df = quandl.get_table('WIKI/PRICES', ticker=stock_name)
    df.drop(['ticker', 'open', 'high', 'low', 'close', 'ex-dividend', 'volume', 'split_ratio'], axis=1, inplace=True)
    df.set_index('date', inplace=True)

    # Rename the columns so that we can reuse the old version of the code
    df.rename(columns={'adj_open': 'Open', 'adj_high': 'High', 'adj_low': 'Low', 'adj_volume': 'Volume', 'adj_close': 'Adj Close'}, inplace=True)

    # Percentage change
    df['Pct'] = df['Adj Close'].pct_change()
    df.dropna(inplace=True)

    # Moving averages
    if ma != []:
        for moving in ma:
            df['{}ma'.format(moving)] = df['Adj Close'].rolling(window=moving).mean()
    df.dropna(inplace=True)

    if normalize:
        min_max_scaler = preprocessing.MinMaxScaler()
        df['Open'] = min_max_scaler.fit_transform(df.Open.values.reshape(-1, 1))
        df['High'] = min_max_scaler.fit_transform(df.High.values.reshape(-1, 1))
        df['Low'] = min_max_scaler.fit_transform(df.Low.values.reshape(-1, 1))
        df['Volume'] = min_max_scaler.fit_transform(df.Volume.values.reshape(-1, 1))
        df['Adj Close'] = min_max_scaler.fit_transform(df['Adj Close'].values.reshape(-1, 1))
        df['Pct'] = min_max_scaler.fit_transform(df['Pct'].values.reshape(-1, 1))
        if ma != []:
            for moving in ma:
                df['{}ma'.format(moving)] = min_max_scaler.fit_transform(df['{}ma'.format(moving)].values.reshape(-1, 1))

    # Move Adj Close to the rightmost column for the ease of training
    adj_close = df['Adj Close']
    df.drop(labels=['Adj Close'], axis=1, inplace=True)
    df = pd.concat([df, adj_close], axis=1)
    #df.to_csv('aap.csv')
    return df
df = get_stock_data(stock_name, ma=[50, 100, 200])
def plot_stock(df):
    print(df.head())
    plt.subplot(211)
    plt.plot(df['Adj Close'], color='red', label='Adj Close')
    plt.legend(loc='best')
    plt.subplot(212)
    plt.plot(df['Pct'], color='blue', label='Percentage change')
    plt.legend(loc='best')
    plt.show()
#plot_stock(df)
def load_data(stock, seq_len):
    amount_of_features = len(stock.columns)
    data = stock.values
    sequence_length = seq_len + 2  # each window holds seq_len inputs plus the 2 target steps
    result = []
    for index in range(len(data) - sequence_length):  # maximum date = latest date - sequence length
        result.append(data[index: index + sequence_length])
    result = np.array(result)
    row = round(0.8 * result.shape[0])  # 80% split

    train = result[:int(row), :, :]  # 80% of the dates
    X_train = train[:, :-2, :]  # all features up to day m
    y_train = train[:, -2:, :][:, :, -1]  # adjusted close of days m+1 and m+2
    X_test = result[int(row):, :-2, :]
    y_test = result[int(row):, -2:, :][:, :, -1]

    X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], amount_of_features))
    X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], amount_of_features))

    print("________________________________________________________________")
    print("X_train shape = {}".format(X_train.shape))
    print("y_train shape = {}".format(y_train.shape))
    print("")
    print("X_test shape = {}".format(X_test.shape))
    print("y_test shape = {}".format(y_test.shape))
    print("________________________________________________________________")
    return [X_train, y_train, X_test, y_test]
X_train, y_train, X_test, y_test = load_data(df, seq_len)
def build_model(shape, neurons, dropout, decay):
    model = Sequential()
    model.add(LSTM(neurons[0], input_shape=(shape[0], shape[1]), return_sequences=True))
    model.add(Dropout(dropout))
    model.add(LSTM(neurons[1], input_shape=(shape[0], shape[1]), return_sequences=False))
    model.add(Dropout(dropout))
    model.add(Dense(neurons[2], kernel_initializer="uniform", activation='relu'))
    model.add(Dense(neurons[3], kernel_initializer="uniform", activation='linear'))
    # model = load_model('my_LSTM_stock_model1000.h5')
    adam = keras.optimizers.Adam(decay=decay)
    # pass the Adam instance itself (not the string 'adam'), otherwise the decay setting is ignored;
    # note also that 'accuracy' is not a meaningful metric for regression
    model.compile(loss='mse', optimizer=adam, metrics=['accuracy'])
    model.summary()
    return model
model = build_model(shape, neurons, dropout, decay)
model.fit(
    X_train,
    y_train,
    batch_size=512,
    epochs=epochs,
    validation_split=0.01,
    verbose=1)
def model_score(model, X_train, y_train, X_test, y_test):
    trainScore = model.evaluate(X_train, y_train, verbose=0)
    print('Train Score: %.5f MSE (%.2f RMSE)' % (trainScore[0], math.sqrt(trainScore[0])))
    testScore = model.evaluate(X_test, y_test, verbose=0)
    print('Test Score: %.5f MSE (%.2f RMSE)' % (testScore[0], math.sqrt(testScore[0])))
    return trainScore[0], testScore[0]
model_score(model, X_train, y_train, X_test, y_test)
def percentage_difference(model, X_test, y_test):
    percentage_diff = []
    p = model.predict(X_test)
    for u in range(len(y_test)):  # for each index in the test data
        pr = p[u][0]  # pr = first-step prediction for window u
        percentage_diff.append(((pr - y_test[u]) / pr) * 100)  # percentage difference between prediction and actual
    print('Prediction duration: ', str(datetime.timedelta(seconds=(time.time() - global_start_time))))
    return p
def denormalize(stock_name, normalized_value):
    """
    Refit the scaler on the stock's original 'Adj Close' series and use it
    to map normalized values back to the price scale.
    """
    df = quandl.get_table('WIKI/PRICES', ticker=stock_name)
    df.drop(['ticker', 'open', 'high', 'low', 'close', 'ex-dividend', 'volume', 'split_ratio'], axis=1, inplace=True)
    df.set_index('date', inplace=True)
    # Rename the columns so that we can reuse the old version of the code
    df.rename(columns={'adj_open': 'Open', 'adj_high': 'High', 'adj_low': 'Low', 'adj_volume': 'Volume', 'adj_close': 'Adj Close'}, inplace=True)
    df.dropna(inplace=True)

    df = df['Adj Close'].values.reshape(-1, 1)
    normalized_value = normalized_value.reshape(-1, 1)
    #return df.shape, p.shape
    min_max_scaler = preprocessing.MinMaxScaler()
    a = min_max_scaler.fit_transform(df)
    new = min_max_scaler.inverse_transform(normalized_value)
    return new
def plot_result(stock_name, normalized_value_p, normalized_value_y_test):
    newp = denormalize(stock_name, normalized_value_p)
    newy_test = denormalize(stock_name, normalized_value_y_test)
    #newy_test = np.roll(newy_test, 1, 0)
    plt2.plot(newp, color='red', label='Prediction')
    plt2.plot(newy_test, color='blue', label='Actual')
    plt2.legend(loc='best')
    plt2.title('Global run time {}'.format(str(datetime.timedelta(seconds=(time.time() - global_start_time)))))
    plt2.xlabel('Days')
    plt2.ylabel('Adjusted Close')
    plt2.show()
def predict_sequences_multiple(model, data, window_size, prediction_len):
    # Predict prediction_len steps, then shift the prediction run forward by prediction_len steps
    prediction_seqs = []
    for i in range(int(len(data) / prediction_len)):
        curr_frame = data[i * prediction_len]
        predicted = []
        for j in range(prediction_len):
            predicted.append(model.predict(curr_frame[newaxis, :, :])[0, 0])
            curr_frame = curr_frame[1:]
            curr_frame = np.insert(curr_frame, [window_size - 1], predicted[-1], axis=0)
        prediction_seqs.append(predicted)
    print('Prediction duration: ', str(datetime.timedelta(seconds=(time.time() - global_start_time))))
    return prediction_seqs
def plot_results_multiple(predicted_data, true_data, prediction_len):
    fig = plt.figure(facecolor='white')
    ax = fig.add_subplot(111)
    ax.plot(true_data, label='True Data')
    # Pad the list of predictions to shift it in the graph to its correct start
    for i, data in enumerate(predicted_data):
        padding = [None for p in range(i * prediction_len)]
        plt.plot(padding + data, label='Prediction')
        plt.legend()
    plt.title('Global run time {}'.format(str(datetime.timedelta(seconds=(time.time() - global_start_time)))))
    plt.show()
#Single step prediction
#p = percentage_difference(model, X_test, y_test)
#plot_result(stock_name, p, y_test)
#Multiple step prediction
predictions = predict_sequences_multiple(model, X_test, seq_len, 2)
plot_results_multiple(predictions, y_test, 2)
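For reference, a minimal sketch of reading the two-step forecast directly from the Dense output (this reuses the model, X_test, y_test, stock_name, and denormalize defined above; it is not a fix for the negative-direction issue, only the plain way to consume the 2-vector):
# The final Dense layer already emits both steps, so model.predict
# returns an array of shape (samples, 2); no recursive window loop is needed.
p = model.predict(X_test)
step1, step2 = p[:, 0], p[:, 1]  # t+1 and t+2 forecasts
plt.plot(denormalize(stock_name, y_test[:, 0]), color='blue', label='Actual t+1')
plt.plot(denormalize(stock_name, step1), color='red', label='Predicted t+1')
plt.plot(denormalize(stock_name, step2), color='green', label='Predicted t+2')
plt.legend(loc='best')
plt.show()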

Related

Predict data outside existing data

I've made a basic prediction to learn this complicated area. The prediction works, but only as long as I already have data; the result I want is to predict further than my existing data, to predict what hasn't happened yet.
Like I said, I'm new, so if someone can show me how, or tell me what I don't understand well enough in the code to do this, I would be grateful.
Here's the code:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 20,10
from keras.models import Sequential
from keras.layers import LSTM, Dropout, Dense
from sklearn.preprocessing import MinMaxScaler
df = pd.read_csv('./Data/market-price-3y.csv')
# Keep only the Date and Close columns
df = df[['Date', 'Close']]
# Change type of date from object to datetime
df['Date'] = pd.to_datetime(df.Date, format='%Y-%m-%d %H:%M:%S')
# Set Date as index
df.index = df['Date']
print(df.head())
# Show the data as a graph
# plt.plot(df['Close'], label='Close Price History', color='red')
# plt.show()
df = df.sort_index(ascending=True, axis=0)
data = pd.DataFrame(index=range(0, len(df)), columns=['Date', 'Close'])
for i in range(0, len(data)):
    data['Date'][i] = df['Date'][i]
    data['Close'][i] = df['Close'][i]
data.index = data.Date
data.drop('Date', axis=1, inplace=True)
# Split data into training and testing datasets
final_data = data.values
train_data = final_data[0:900,:]
valid_data = final_data[900:,:]
scaler = MinMaxScaler(feature_range=(0,1))
scaled_data = scaler.fit_transform(final_data)
x_train_data, y_train_data = [], []
for i in range(60, len(train_data)):
    x_train_data.append(scaled_data[i-60:i,0])
    y_train_data.append(scaled_data[i,0])
x_train_data = np.asarray(x_train_data)
y_train_data = np.asarray(y_train_data)
x_train_data = np.reshape(x_train_data, (x_train_data.shape[0], x_train_data.shape[1], 1))
# Create the LSTM Model
lstm_model = Sequential()
lstm_model.add(LSTM(units=50, return_sequences=True, input_shape=(np.shape(x_train_data)[1], 1)))
lstm_model.add(LSTM(units=50))
lstm_model.add(Dense(1))
model_data = data[len(data) - len(valid_data)-60:].values
model_data = model_data.reshape(-1,1)
model_data = scaler.transform(model_data)
# Train and test the data
lstm_model.compile(loss='mean_squared_error', optimizer='adam')
lstm_model.fit(x_train_data, y_train_data, epochs=1, batch_size=1, verbose=2)
# Test Data
X_test = []
for i in range(60, model_data.shape[0]):
    X_test.append(model_data[i-60:i,0])
X_test = np.array(X_test)
X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))
# Result
predicted_price = lstm_model.predict(X_test)
predicted_price = scaler.inverse_transform(predicted_price)
train_data = data[:900]
valid_data = data[900:]
valid_data['Predictions'] = predicted_price
plt.plot(train_data['Close'])
plt.plot(valid_data[['Close', 'Predictions']])
plt.show()
(Figure: Matplotlib output of the train, actual, and predicted series - https://i.stack.imgur.com/7j0Yf.png)
So I want the prediction to keep going for, let's say, 15 more points instead of ending when the test data ends.
You can increase the data length at this phase of the code:
# Test Data
X_test = []
for i in range(60, model_data.shape[0]):
    X_test.append(model_data[i-60:i,0])
X_test = np.array(X_test)
X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))
Here, instead of 60, you can give 60+15. Or you can create a new test dataset similar to the above code; it depends on your data set. :)
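Alternatively, here is a minimal sketch of predicting truly beyond the existing data by feeding each prediction back into the input window (this reuses lstm_model, scaler, and model_data from the question; the 15-step horizon is only an example):
# Start from the last 60 scaled observations and roll the window forward,
# appending each prediction so the next step can use it as an input.
window = model_data[-60:, 0].tolist()
future_scaled = []
for _ in range(15):
    x = np.array(window[-60:]).reshape(1, 60, 1)
    next_scaled = lstm_model.predict(x)[0, 0]
    future_scaled.append(next_scaled)
    window.append(next_scaled)
future_prices = scaler.inverse_transform(np.array(future_scaled).reshape(-1, 1))
print(future_prices)  # 15 points past the end of the existing data
Each step reuses the model's own previous outputs, so the forecast degrades with the horizon; that is inherent to recursive multi-step prediction, not a bug.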

How do I use custom CSV in my code instead of Yahoo Finance data?

I'm building a stock prediction neural network. The tutorial I was watching imports the stock data from Yahoo Finance. I want to improve the code by making it fetch the data from a CSV file, so the code can be used even if you are not connected to the internet.
What do I need to change in my code to have it use custom data from a CSV file?
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pandas_datareader as web
import datetime as dt
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM
company = '^GDAXI'
start = dt.datetime(2012,1,1)
end = dt.datetime(2021,1,1)
data = web.DataReader(company, 'yahoo', start, end)
scaler = MinMaxScaler(feature_range=(0,1))
scaled_data = scaler.fit_transform(data['Close'].values.reshape(-1, 1))
prediction_days = 60
x_train = []
y_train = []
for x in range(prediction_days, len(scaled_data)):
    x_train.append(scaled_data[x-prediction_days:x, 0])
    y_train.append(scaled_data[x, 0])
x_train, y_train = np.array(x_train), np.array(y_train)
x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))
#BUILD MODEL
model = Sequential()
model.add(LSTM(units=50, return_sequences=True, input_shape=(x_train.shape[1], 1)))
model.add(Dropout(0.2))
model.add(LSTM(units=50, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(units=50))
model.add(Dropout(0.2))
model.add(Dense(units=1)) #next day prediction
model.compile(optimizer='adam', loss='mean_squared_error')
model.fit(x_train, y_train, epochs=25, batch_size=32)
#TEST ON EXISTING DATA
test_start = dt.datetime(2020,1,1)
test_end = dt.datetime.now()
test_dataset = web.DataReader(company, 'yahoo', test_start, test_end)
actual_prices = test_dataset['Close'].values
total_dataset = pd.concat((data['Close'], test_dataset['Close']), axis=0)
model_inputs = total_dataset[len(total_dataset)-len(test_dataset)-prediction_days:].values
model_inputs = model_inputs.reshape(-1,1)
model_inputs = scaler.transform(model_inputs)
#PREDICTIONS ON TEST DATA
x_test = []
for x in range(prediction_days, len(model_inputs)):
    x_test.append(model_inputs[x-prediction_days:x, 0])
x_test = np.array(x_test)
x_test = np.reshape(x_test,(x_test.shape[0], x_test.shape[1],1))
predicted_prices = model.predict(x_test)
predicted_prices = scaler.inverse_transform(predicted_prices)
#PLOT
plt.plot(actual_prices, color="green", label="Actual Price")
plt.plot(predicted_prices, color="blue", label="Predicted Price")
plt.title("GER40 Share Price")
plt.xlabel('Time')
plt.ylabel('GER40 Price')
plt.legend()
plt.show()
#Predict Next Day
real_dataset = [model_inputs[len(model_inputs) - prediction_days:len(model_inputs), 0]]  # the most recent prediction_days-long window
real_dataset = np.array(real_dataset)
real_dataset = np.reshape(real_dataset, (real_dataset.shape[0], real_dataset.shape[1], 1))
prediction = model.predict(real_dataset)
prediction = scaler.inverse_transform(prediction)
print(f"Close: {prediction}")
The CSV file I'm using doesn't have headings, but I think I can add those using Excel.
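For the CSV substitution itself, a minimal sketch (the file name and column names below are assumptions about your file; drop the names= argument if the CSV already has a header row): replace the web.DataReader calls with pd.read_csv and keep everything downstream unchanged.
# Hypothetical CSV layout: Date, Open, High, Low, Close
data = pd.read_csv('my_stock_data.csv',
                   names=['Date', 'Open', 'High', 'Low', 'Close'],
                   parse_dates=['Date'], index_col='Date')
data = data.sort_index()
# Slice the same date ranges the two DataReader calls used
train_part = data.loc['2012-01-01':'2021-01-01']
test_part = data.loc['2020-01-01':]
From there, train_part['Close'] and test_part['Close'] play the roles of the two downloaded 'Close' series, since the rest of the script only needs that column.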
I think you should consider doing it this way.
from pandas_datareader import data as wb
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pylab import rcParams
from sklearn.preprocessing import MinMaxScaler
start = '2019-06-30'
end = '2020-06-30'
tickers = ['GOOG']
thelen = len(tickers)
price_data = []
for ticker in tickers:
    prices = wb.DataReader(ticker, start=start, end=end, data_source='yahoo')[['Open', 'Adj Close']]
    price_data.append(prices.assign(ticker=ticker)[['ticker', 'Open', 'Adj Close']])
#names = np.reshape(price_data, (len(price_data), 1))
df = pd.concat(price_data)
df.reset_index(inplace=True)
for col in df.columns:
    print(col)
#used for setting the output figure size
rcParams['figure.figsize'] = 20,10
#to normalize the given input data
scaler = MinMaxScaler(feature_range=(0, 1))
#to read input data set (place the file name inside ' ') as shown below
df['Adj Close'].plot()
plt.legend(loc=2)
plt.xlabel('Date')
plt.ylabel('Price')
plt.show()
ntrain = 80
df_train = df.head(int(len(df)*(ntrain/100)))
ntest = -80
df_test = df.tail(int(len(df)*(ntest/100)))  # tail with a negative count keeps everything after the first 80%, i.e. the last 20%
#importing the packages
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
#dataframe creation
seriesdata = df.sort_index(ascending=True, axis=0)
new_seriesdata = pd.DataFrame(index=range(0,len(df)),columns=['Date','Adj Close'])
length_of_data=len(seriesdata)
for i in range(0, length_of_data):
    new_seriesdata['Date'][i] = seriesdata['Date'][i]
    new_seriesdata['Adj Close'][i] = seriesdata['Adj Close'][i]
#setting the index again
new_seriesdata.index = new_seriesdata.Date
new_seriesdata.drop('Date', axis=1, inplace=True)
#creating train and test sets this comprises the entire data’s present in the dataset
myseriesdataset = new_seriesdata.values
totrain = myseriesdataset[0:255,:]
tovalid = myseriesdataset[255:,:]
#converting dataset into x_train and y_train
scalerdata = MinMaxScaler(feature_range=(0, 1))
scale_data = scalerdata.fit_transform(myseriesdataset)
x_totrain, y_totrain = [], []
length_of_totrain=len(totrain)
for i in range(60, length_of_totrain):
    x_totrain.append(scale_data[i-60:i,0])
    y_totrain.append(scale_data[i,0])
x_totrain, y_totrain = np.array(x_totrain), np.array(y_totrain)
x_totrain = np.reshape(x_totrain, (x_totrain.shape[0],x_totrain.shape[1],1))
#LSTM neural network
lstm_model = Sequential()
lstm_model.add(LSTM(units=50, return_sequences=True, input_shape=(x_totrain.shape[1],1)))
lstm_model.add(LSTM(units=50))
lstm_model.add(Dense(1))
lstm_model.compile(loss='mean_squared_error', optimizer='adadelta')
lstm_model.fit(x_totrain, y_totrain, epochs=10, batch_size=1, verbose=2)
#predicting next data stock price
myinputs = new_seriesdata[len(new_seriesdata) - (len(tovalid)+1) - 60:].values
myinputs = myinputs.reshape(-1,1)
myinputs = scalerdata.transform(myinputs)
tostore_test_result = []
for i in range(60, myinputs.shape[0]):
    tostore_test_result.append(myinputs[i-60:i,0])
tostore_test_result = np.array(tostore_test_result)
tostore_test_result = np.reshape(tostore_test_result,(tostore_test_result.shape[0],tostore_test_result.shape[1],1))
myclosing_priceresult = lstm_model.predict(tostore_test_result)
myclosing_priceresult = scalerdata.inverse_transform(myclosing_priceresult)
totrain = df_train
tovalid = df_test
#predicting next data stock price
myinputs = new_seriesdata[len(new_seriesdata) - (len(tovalid)+1) - 60:].values
# Printing the next day’s predicted stock price.
print(len(tostore_test_result))
print(myclosing_priceresult)
Final Result:
[[1396.532]]

What value to offer to the future dataframe when doing predictions?

I have a dataset group_by_df which has a "day" column and an "o3" column. I want to make some predictions for the future, for example for the next 5 days. I have managed to extend the dataframe's "day" column with the following 5 days using this code:
days_predicted = 5
rng = pd.date_range(group_by_df['day'].min(), periods=len(group_by_df) + days_predicted, freq='D')
df = pd.DataFrame({'day': rng})
df[sensor_name] = group_by_df["o3"]  # the currently existing values for o3
df[sensor_name][len(group_by_df):] = ""  # the future values
The thing is that I cannot do the linear regression with this format for the "o3" column, whether I leave it NaN or "": linear regression throws this error: ValueError: could not convert string to float.
What values should I give to the "o3" column so that I can do a future prediction? Here is the updated code; I have replaced the unknown "o3" values with the average over the known days. Is this a good approach?
import datetime
import datetime as dt
import numpy as np
import pandas as pd
import plotly.graph_objs as go
from distributed.deploy.ssh import bcolors
from flask_babel import _
from pandas.plotting import register_matplotlib_converters
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

pd.options.mode.chained_assignment = None  # default='warn'
register_matplotlib_converters()

def create_columns(data):
    data["readable time"] = ""
    data["day"] = ""
    for i in range(0, len(data)):
        data.loc[i, ['readable time']] = datetime.datetime.fromtimestamp(
            data["time"][i]).strftime('%d/%m/%Y %H:%M:%S')
        data.loc[i, ['day']] = datetime.datetime.fromtimestamp(
            data["time"][i]).strftime('%d/%m/%Y')

def calculate_linear_regression(data, sensor_name):
    create_columns(data)  # from time series
    data['day'] = pd.to_datetime(data['day'], dayfirst=True)
    data = data.sort_values(by=['readable time'])
    group_by_df = pd.DataFrame([name, group.mean()[sensor_name]] for name, group in data.groupby('day'))
    group_by_df.columns = ['day', sensor_name]
    print("group by df ", group_by_df)
    group_by_df['day'] = pd.to_datetime(group_by_df['day'])
    # initial length of dataframe (before future prediction)
    initial_len_df = len(group_by_df)
    days_predicted = 5
    rng = pd.date_range(group_by_df['day'].min(), periods=len(group_by_df) + days_predicted, freq='D')
    df = pd.DataFrame({'day': rng})
    df[sensor_name] = group_by_df[sensor_name]
    df[sensor_name][len(group_by_df):] = group_by_df[sensor_name].mean()  # ""
    print("after... \n", df)
    group_by_df = df
    print("GROUP BY DF\n", group_by_df)
    group_by_df['day'] = group_by_df['day'].map(dt.datetime.toordinal)

    def split(group_by_df):
        X = group_by_df[['day']].values
        y = group_by_df[[sensor_name]].values
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=False)
        return X_train, X_test, y_train, y_test

    X_train, X_test, y_train, y_test = split(group_by_df)

    def analyse_forecast():
        print("MSE linear regression (mean squared error)",
              mean_squared_error(group_by_df[sensor_name], group_by_df['predicted']))
        print("r2 score ", r2_score(group_by_df[sensor_name], group_by_df['predicted']))
        rmse = np.sqrt(mean_squared_error(group_by_df[sensor_name], group_by_df['predicted']))
        print("RMSE for linear regression=", rmse)
        print("MSE TEST ", mean_squared_error(y_test, group_by_df['predicted'][len(X_train):]))
        print("MSE TRAIN ", mean_squared_error(y_train, group_by_df['predicted'][:len(X_train)]))
        print("r2 score TEST", r2_score(y_test, group_by_df['predicted'][len(X_train):]))
        return mean_squared_error(group_by_df[sensor_name], group_by_df['predicted'])

    def calculate_linear_reg():
        group_by_df.reset_index(inplace=True)
        mse_list = []
        model = LinearRegression()
        model.fit(X_train, y_train)
        y_pred = model.predict(group_by_df[['day']])
        group_by_df['predicted'] = y_pred
        mse_list.append(analyse_forecast())
        print(group_by_df)  # print predicted values

    calculate_linear_reg()
    return group_by_df, X_train, sensor_name, initial_len_df

def create_figure(group_by_df, X_train, sensor_name, initial_len_df):
    print("INITIAL LEN DF IS", initial_len_df)
    linear_regression_fig = go.Figure()
    # plot predicted values
    linear_regression_fig.add_trace(go.Scatter(
        x=group_by_df['day'].map(dt.datetime.fromordinal),
        y=group_by_df['predicted'],
        name=_("Linear Regression"),
        mode='lines+markers',
        marker=dict(
            color=np.where(group_by_df['day'].index < len(X_train), 'red', 'green'))))
    # plot actual values
    linear_regression_fig.add_trace(go.Scatter(
        x=group_by_df['day'].map(dt.datetime.fromordinal)[:initial_len_df],
        y=group_by_df[sensor_name][:initial_len_df],
        name=_('Actual values'),
        mode='lines+markers'))
    linear_regression_fig.update_layout(
        height=700,
        font=dict(color="grey"),
        paper_bgcolor='rgba(0,0,0,0)',
        title=_('Linear Regression for ') + _(sensor_name),
        yaxis_title=_(sensor_name),
        xaxis_title=_('Day'),
        showlegend=True)
    linear_regression_fig.show()
data="https://raw.githubusercontent.com/iulianastroia/csv_data/master/final_dataframe.csv"
data = pd.read_csv(data)
group_by_df, X_train, sensor_name, initial_len_df = calculate_linear_regression(data, "o3")
linear_reg_fig = create_figure(group_by_df, X_train, sensor_name, initial_len_df)
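An alternative sketch that avoids inventing placeholder values at all (reusing group_by_df, sensor_name, and initial_len_df as returned by the code above, where 'day' already holds ordinal dates): fit the regression only on the observed rows and predict the appended future ordinals, so the unknown "o3" cells never enter the fit.
# Observed rows are the first initial_len_df entries; the rest are the appended future days.
known = group_by_df.iloc[:initial_len_df]
future = group_by_df.iloc[initial_len_df:]

model = LinearRegression()
model.fit(known[['day']].values, known[[sensor_name]].values)

# Predict the appended days directly from their ordinal dates.
future_pred = model.predict(future[['day']].values)
print(future_pred)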

LinearRegression using the last n rows

I'm currently working on a time-series model. It's very simple: I'm deploying the last row's OHLC (open, high, low, close) values and trying to predict the next close. Simple and useless. But what I want to do is to give the last 10 days to predict tomorrow's price. I know it's not going to be accurate, but this is what I am trying to do.
Here is how I get the NextClose and apply it to the LinearRegression model:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
df = pd.read_csv("./EURUSD.csv")
days = 1
df['NextClose'] = df['Close'].shift(-days)
df = df.dropna()
total = len(df)
test_ratio = 0.30
test_size = int(total * test_ratio)
X = df[['Open', 'High', 'Low', 'Close']]
y = df[['NextClose']]
#build test and train data
X_train = X[:-test_size]
y_train = y[:-test_size]
X_test = X[-test_size:]
y_test = y[-test_size:]
# build model
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
plt.scatter(y_pred, y_test)
plt.show()
In this case, I am giving only the final row. What I want to do is to feed the last 10-20 rows.
I believe this is a similar data transformation to the one described in the create_dataset function on this MachineLearningMastery page (see the section "LSTM for Regression Using the Window Method").
The goal is to use the data from rows t:(t+days) to predict the closing price at row (t+days)+1.
The X_train matrix will have days * X.shape[1] columns in each row, which in the example below represents the flattened data from 10 days' worth of rows.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
# generate random data to test
df = pd.DataFrame(np.random.normal(size=(2000, 4)))
df.columns = ['Open', 'High', 'Low', 'Close']
days = 10
df = df.dropna()
total = len(df)
test_ratio = 0.30
test_size = int(total * test_ratio)
X = df[['Open', 'High', 'Low', 'Close']]
y = df['Close'].shift(-days)
# this function based on the MachineLearningMastery page mentioned
def create_dataset(X, y, look_back=1):
    dataX, dataY = [], []
    for i in range(X.shape[0] - look_back):
        a = X.iloc[i:(i + look_back), :].values.flatten()
        dataX.append(a)
        dataY.append(y.iloc[i])
    return np.array(dataX), np.array(dataY)
#build test and train data
X_train, y_train = create_dataset(X[:-test_size], y[:-test_size], look_back=days)
X_test, y_test = create_dataset(X[-test_size:], y[-test_size:], look_back=days)
# build model
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
plt.scatter(y_pred, y_test)
plt.show()
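As a follow-up usage sketch (assuming the fitted lr above, applied to real OHLC data rather than this random test frame): tomorrow's close estimate comes from flattening the most recent `days` rows into a single input row, matching the days * X.shape[1] training columns.
# The newest `days`-row window, flattened to the same 40-column layout used in training
latest_window = X.iloc[-days:, :].values.flatten().reshape(1, -1)
next_close = lr.predict(latest_window)
print(next_close)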

Change prediction range in Python from tutorial

I've probably jumped into Python at the deep end here, but I have followed a tutorial from start to finish which works fine, and I generally (I think) understand it.
I'm trying to recreate what I learnt using my own data, and I've got this working OK as well. However, the tutorial showed the prediction line on the plot graph drawn over the actual data.
What do I need to change for it to predict, say, 28 days ahead rather than on top of the data I already have?
Here is my code (I say mine... mostly from the tutorial!):
from connectionstring import conn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.offline as pyoff
import plotly.graph_objs as go
#import Keras
import keras
from keras.layers import Dense
from keras.models import Sequential
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping
from keras.utils import np_utils
from keras.layers import LSTM
from sklearn.model_selection import KFold, cross_val_score, train_test_split
params = ("20180801","20191002")
sqlString = "SELECT ODCDAT AS DATE, SUM(ODORDQ) AS DROPIN FROM mytable.mydb WHERE ODCDAT BETWEEN %s AND %s GROUP BY ODCDAT ORDER BY ODCDAT"
command = (sqlString % params)
SQL_Query = pd.read_sql_query(command, conn)
df = pd.DataFrame(SQL_Query, columns=['DATE','DROPIN'])
df['DATE'] = pd.to_datetime(df['DATE'], format='%Y%m%d')
print(df.head(10))
#new dataframe
df_diff = df.copy()
df_diff['prev_day'] = df_diff['DROPIN'].shift(1)
df_diff = df_diff.dropna()
df_diff['diff'] = (df_diff['DROPIN'] - df_diff['prev_day'])
df_diff.head(10)
print(df_diff)
#plot monthly sales diff
plot_data = [
    go.Scatter(
        x=df_diff['DATE'],
        y=df_diff['diff'],
    )
]
plot_layout = go.Layout(
    title='Daily Drop In Diff'
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.plot(fig)
#create dataframe for transformation from time series to supervised
df_supervised = df_diff.drop(['prev_day'],axis=1)
#adding lags
for inc in range(1, 31):
    field_name = 'lag_' + str(inc)
    df_supervised[field_name] = df_supervised['diff'].shift(inc)
#drop null values
df_supervised = df_supervised.dropna().reset_index(drop=True)
print(df_supervised)
# Import statsmodels.formula.api
import statsmodels.formula.api as smf
# Define the regression formula
model = smf.ols(formula='diff ~ lag_1 + lag_2 + lag_3 + lag_4 + lag_5 + lag_6 + lag_7 + lag_8 + lag_9 + lag_10 + lag_11 + lag_12 + lag_13 + lag_14 + lag_15 + lag_16 + lag_17 + lag_18 + lag_19 + lag_20 + lag_21 + lag_22 + lag_23 + lag_24 + lag_25 + lag_26 + lag_27 + lag_28 + lag_29 + lag_30', data=df_supervised)
# Fit the regression
model_fit = model.fit()
# Extract the adjusted r-squared
regression_adj_rsq = model_fit.rsquared_adj
print(regression_adj_rsq)
#import MinMaxScaler and create a new dataframe for LSTM model
from sklearn.preprocessing import MinMaxScaler
df_model = df_supervised.drop(['DROPIN','DATE'],axis=1)
#split train and test set
train_set, test_set = df_model[0:-28].values, df_model[-28:].values
#apply Min Max Scaler
scaler = MinMaxScaler(feature_range=(-1, 1))
scaler = scaler.fit(train_set)
# reshape training set
train_set = train_set.reshape(train_set.shape[0], train_set.shape[1])
train_set_scaled = scaler.transform(train_set)
# reshape test set
test_set = test_set.reshape(test_set.shape[0], test_set.shape[1])
test_set_scaled = scaler.transform(test_set)
X_train, y_train = train_set_scaled[:, 1:], train_set_scaled[:, 0:1]
X_train = X_train.reshape(X_train.shape[0], 1, X_train.shape[1])
X_test, y_test = test_set_scaled[:, 1:], test_set_scaled[:, 0:1]
X_test = X_test.reshape(X_test.shape[0], 1, X_test.shape[1])
model = Sequential()
model.add(LSTM(4, batch_input_shape=(1, X_train.shape[1], X_train.shape[2]), stateful=True))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(X_train, y_train, epochs=100, batch_size=1, verbose=1, shuffle=False)
y_pred = model.predict(X_test,batch_size=1)
#for multistep prediction, you need to replace X_test values with the predictions coming from t-1
#reshape y_pred
y_pred = y_pred.reshape(y_pred.shape[0], 1, y_pred.shape[1])
#rebuild test set for inverse transform
pred_test_set = []
for index in range(0, len(y_pred)):
    #print(np.concatenate([y_pred[index], X_test[index]], axis=1))
    pred_test_set.append(np.concatenate([y_pred[index], X_test[index]], axis=1))
#reshape pred_test_set
pred_test_set = np.array(pred_test_set)
pred_test_set = pred_test_set.reshape(pred_test_set.shape[0], pred_test_set.shape[2])
#inverse transform
pred_test_set_inverted = scaler.inverse_transform(pred_test_set)
#create dataframe that shows the predicted sales
result_list = []
sales_dates = list(df[-29:].DATE)
act_sales = list(df[-29:].DROPIN)
for index in range(0, len(pred_test_set_inverted)):
    result_dict = {}
    result_dict['pred_value'] = int(pred_test_set_inverted[index][0] + act_sales[index])
    result_dict['DATE'] = sales_dates[index+1]
    result_list.append(result_dict)
df_result = pd.DataFrame(result_list)
#for multistep prediction, replace act_sales with the predicted sales
print(df_result)
#merge with actual sales dataframe
df_sales_pred = pd.merge(df,df_result,on='DATE',how='left')
#plot actual and predicted
plot_data = [
    go.Scatter(
        x=df_sales_pred['DATE'],
        y=df_sales_pred['DROPIN'],
        name='actual'
    ),
    go.Scatter(
        x=df_sales_pred['DATE'],
        y=df_sales_pred['pred_value'],
        name='predicted'
    )
]
plot_layout = go.Layout(
    title='Sales Prediction'
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.plot(fig)
And here is the second graph I plot, which is the one I want to predict ahead on.
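Following the code's own comment ("for multistep prediction, you need to replace X_test values with the predictions coming from t-1"), here is a minimal sketch of rolling the model 28 days past the data. It reuses model, scaler, test_set, and df from above, and assumes the tutorial's df_model column order: column 0 is diff, columns 1-30 are lag_1..lag_30.
# Build tomorrow's lag vector from the last known row:
# lag_1 becomes today's diff, and the old lag_1..lag_29 shift to lag_2..lag_30.
last_row = test_set[-1]                    # unscaled [diff, lag_1, ..., lag_30]
lags = [last_row[0]] + list(last_row[1:30])
future_diffs = []
for _ in range(28):
    row = np.array([[0.0] + lags])         # dummy 0 in the diff slot, needed only for scaling
    row_scaled = scaler.transform(row)
    x = row_scaled[:, 1:].reshape(1, 1, 30)
    yhat_scaled = model.predict(x, batch_size=1)[0, 0]
    row_scaled[0, 0] = yhat_scaled         # put the prediction back to undo its scaling
    yhat = scaler.inverse_transform(row_scaled)[0, 0]
    future_diffs.append(yhat)
    lags = [yhat] + lags[:29]
# Turn the predicted day-on-day diffs back into absolute values
future_values = df['DROPIN'].values[-1] + np.cumsum(future_diffs)
print(future_values)
Because each step feeds on the previous prediction, expect the 28-day path to flatten or drift; that is the usual behaviour of recursive forecasts, not an error in the loop.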
