Related
This code predicts the values of a specified stock up to the current date but not a date beyond the training dataset. This code is from an earlier question I had asked and so my understanding of it is rather low. I assume the solution would be a simple variable change to add the extra time but I am unaware as to which value needs to be manipulated.
import pandas as pd
import numpy as np
import yfinance as yf
import os
import matplotlib.pyplot as plt
from IPython.display import display
from keras.models import Sequential
from keras.layers import LSTM, Dense
from sklearn.preprocessing import MinMaxScaler
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
pd.options.mode.chained_assignment = None
# download the data
df = yf.download(tickers=['AAPL'], period='2y')
# split the data
train_data = df[['Close']].iloc[: - 200, :]
valid_data = df[['Close']].iloc[- 200:, :]
# scale the data
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(train_data)
train_data = scaler.transform(train_data)
valid_data = scaler.transform(valid_data)
# extract the training sequences
x_train, y_train = [], []
for i in range(60, train_data.shape[0]):
x_train.append(train_data[i - 60: i, 0])
y_train.append(train_data[i, 0])
x_train = np.array(x_train)
y_train = np.array(y_train)
# extract the validation sequences
x_valid = []
for i in range(60, valid_data.shape[0]):
x_valid.append(valid_data[i - 60: i, 0])
x_valid = np.array(x_valid)
# reshape the sequences
x_train = x_train.reshape(x_train.shape[0],
x_train.shape[1], 1)
x_valid = x_valid.reshape(x_valid.shape[0],
x_valid.shape[1], 1)
# train the model
model = Sequential()
model.add(LSTM(units=50, return_sequences=True,
input_shape=x_train.shape[1:]))
model.add(LSTM(units=50))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(x_train, y_train, epochs=50, batch_size=128, verbose=1)
# generate the model predictions
y_pred = model.predict(x_valid)
y_pred = scaler.inverse_transform(y_pred)
y_pred = y_pred.flatten()
# plot the model predictions
df.rename(columns={'Close': 'Actual'}, inplace=True)
df['Predicted'] = np.nan
df['Predicted'].iloc[- y_pred.shape[0]:] = y_pred
df[['Actual', 'Predicted']].plot(title='AAPL')
display(df)
plt.show()
You could train your model to predict a future sequence (e.g. the next 30 days) instead of predicting the next value (the next day) as it is currently the case.
In order to do that, you need to define the outputs as y[t: t + H] (instead of y[t] as in the current code) where y is the time series and H is the length of the forecast period (i.e. the number of days ahead that you want to forecast). You also need to set the number of outputs of the last layer equal to H (instead of equal to 1 as in the current code).
You can still define the inputs as y[t - T: t] where T is the length of the lookback period (or number of timesteps), and therefore the model's input shape is still (T, 1). The lookback period T is usually longer than the forecast period H (i.e. T > H) and it's often set equal to a multiple of H (i.e. T = m * H where m > 1 is an integer.).
import numpy as np
import pandas as pd
import yfinance as yf
import tensorflow as tf
from tensorflow.keras.layers import Dense, LSTM
from tensorflow.keras.models import Sequential
from sklearn.preprocessing import MinMaxScaler
pd.options.mode.chained_assignment = None
tf.random.set_seed(0)
# download the data
df = yf.download(tickers=['AAPL'], period='1y')
y = df['Close'].fillna(method='ffill')
y = y.values.reshape(-1, 1)
# scale the data
scaler = MinMaxScaler(feature_range=(0, 1))
scaler = scaler.fit(y)
y = scaler.transform(y)
# generate the input and output sequences
n_lookback = 60 # length of input sequences (lookback period)
n_forecast = 30 # length of output sequences (forecast period)
X = []
Y = []
for i in range(n_lookback, len(y) - n_forecast + 1):
X.append(y[i - n_lookback: i])
Y.append(y[i: i + n_forecast])
X = np.array(X)
Y = np.array(Y)
# fit the model
model = Sequential()
model.add(LSTM(units=50, return_sequences=True, input_shape=(n_lookback, 1)))
model.add(LSTM(units=50))
model.add(Dense(n_forecast))
model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(X, Y, epochs=100, batch_size=32, verbose=0)
# generate the forecasts
X_ = y[- n_lookback:] # last available input sequence
X_ = X_.reshape(1, n_lookback, 1)
Y_ = model.predict(X_).reshape(-1, 1)
Y_ = scaler.inverse_transform(Y_)
# organize the results in a data frame
df_past = df[['Close']].reset_index()
df_past.rename(columns={'index': 'Date', 'Close': 'Actual'}, inplace=True)
df_past['Date'] = pd.to_datetime(df_past['Date'])
df_past['Forecast'] = np.nan
df_past['Forecast'].iloc[-1] = df_past['Actual'].iloc[-1]
df_future = pd.DataFrame(columns=['Date', 'Actual', 'Forecast'])
df_future['Date'] = pd.date_range(start=df_past['Date'].iloc[-1] + pd.Timedelta(days=1), periods=n_forecast)
df_future['Forecast'] = Y_.flatten()
df_future['Actual'] = np.nan
results = df_past.append(df_future).set_index('Date')
# plot the results
results.plot(title='AAPL')
I'm building a stock prediction neural network. The tutorial i was watching was importing the stock data from yahoo finance. I want to improve the code by making it fetch the data from a CSV file so the code can be used even if you are not connected to the internet.
What do I need to change In my code to have it use custom data from a CSV file?
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pandas_datareader as web
import datetime as dt
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM
company = '^GDAXI'
start = dt.datetime(2012,1,1)
end = dt.datetime(2021,1,1)
data = web.DataReader(company, 'yahoo', start, end)
scaler = MinMaxScaler(feature_range=(0,1))
scaled_data = scaler.fit_transform(data['Close'].values.reshape(-1, 1))
prediction_days = 60
x_train = []
y_train = []
for x in range(prediction_days, len(scaled_data)):
x_train.append(scaled_data[x-prediction_days:x, 0])
y_train.append(scaled_data[x, 0])
x_train, y_train = np.array(x_train), np.array(y_train)
x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))
#BUILD MODEL
model = Sequential()
model.add(LSTM(units=50, return_sequences=True, input_shape=(x_train.shape[1], 1)))
model.add(Dropout(0.2))
model.add(LSTM(units=50, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(units=50))
model.add(Dropout(0.2))
model.add(Dense(units=1)) #next day prediction
model.compile(optimizer='adam', loss='mean_squared_error')
model.fit(x_train, y_train, epochs=25, batch_size=32)
#TEST ON EXISTING DATA
test_start = dt.datetime(2020,1,1)
test_end = dt.datetime.now()
test_dataset = web.DataReader(company, 'yahoo', test_start, test_end)
actual_prices = test_dataset['Close'].values
total_dataset = pd.concat((data['Close'], test_dataset['Close']), axis=0)
model_inputs = total_dataset[len(total_dataset)-len(test_dataset)-prediction_days:].values
model_inputs = model_inputs.reshape(-1,1)
model_inputs = scaler.transform(model_inputs)
#PREDICTIONS ON TEST DATA
x_test = []
for x in range(prediction_days, len(model_inputs)):
x_test.append(model_inputs[x-prediction_days:x, 0])
x_test = np.array(x_test)
x_test = np.reshape(x_test,(x_test.shape[0], x_test.shape[1],1))
predicted_prices = model.predict(x_test)
predicted_prices = scaler.inverse_transform(predicted_prices)
#PLOT
plt.plot(actual_prices, color="green", label="Actual Price")
plt.plot(predicted_prices, color="blue", label="Predicted Price")
plt.title("GER40 Share Price")
plt.xlabel('Time')
plt.ylabel('GER40 Price')
plt.legend()
plt.show()
#Predict Next Day
real_dataset = [model_inputs[len(model_inputs)+1-prediction_days:len(model_inputs+1), 0]]
real_dataset = np.array(real_dataset)
real_dataset = np.reshape(real_dataset, (real_dataset.shape[0], real_dataset.shape[1], 1))
prediction = model.predict(real_dataset)
prediction = scaler.inverse_transform(prediction)
print(f"Close: {prediction}")
The CSV file i'm using doesn't have headings, but i think i can add those using excel
I think you should consider doing it this way.
from pandas_datareader import data as wb
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pylab import rcParams
from sklearn.preprocessing import MinMaxScaler
start = '2019-06-30'
end = '2020-06-30'
tickers = ['GOOG']
thelen = len(tickers)
price_data = []
for ticker in tickers:
prices = wb.DataReader(ticker, start = start, end = end, data_source='yahoo')[['Open','Adj Close']]
price_data.append(prices.assign(ticker=ticker)[['ticker', 'Open', 'Adj Close']])
#names = np.reshape(price_data, (len(price_data), 1))
df = pd.concat(price_data)
df.reset_index(inplace=True)
for col in df.columns:
print(col)
#used for setting the output figure size
rcParams['figure.figsize'] = 20,10
#to normalize the given input data
scaler = MinMaxScaler(feature_range=(0, 1))
#to read input data set (place the file name inside ' ') as shown below
df['Adj Close'].plot()
plt.legend(loc=2)
plt.xlabel('Date')
plt.ylabel('Price')
plt.show()
ntrain = 80
df_train = df.head(int(len(df)*(ntrain/100)))
ntest = -80
df_test = df.tail(int(len(df)*(ntest/100)))
#importing the packages
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
#dataframe creation
seriesdata = df.sort_index(ascending=True, axis=0)
new_seriesdata = pd.DataFrame(index=range(0,len(df)),columns=['Date','Adj Close'])
length_of_data=len(seriesdata)
for i in range(0,length_of_data):
new_seriesdata['Date'][i] = seriesdata['Date'][i]
new_seriesdata['Adj Close'][i] = seriesdata['Adj Close'][i]
#setting the index again
new_seriesdata.index = new_seriesdata.Date
new_seriesdata.drop('Date', axis=1, inplace=True)
#creating train and test sets this comprises the entire data’s present in the dataset
myseriesdataset = new_seriesdata.values
totrain = myseriesdataset[0:255,:]
tovalid = myseriesdataset[255:,:]
#converting dataset into x_train and y_train
scalerdata = MinMaxScaler(feature_range=(0, 1))
scale_data = scalerdata.fit_transform(myseriesdataset)
x_totrain, y_totrain = [], []
length_of_totrain=len(totrain)
for i in range(60,length_of_totrain):
x_totrain.append(scale_data[i-60:i,0])
y_totrain.append(scale_data[i,0])
x_totrain, y_totrain = np.array(x_totrain), np.array(y_totrain)
x_totrain = np.reshape(x_totrain, (x_totrain.shape[0],x_totrain.shape[1],1))
#LSTM neural network
lstm_model = Sequential()
lstm_model.add(LSTM(units=50, return_sequences=True, input_shape=(x_totrain.shape[1],1)))
lstm_model.add(LSTM(units=50))
lstm_model.add(Dense(1))
lstm_model.compile(loss='mean_squared_error', optimizer='adadelta')
lstm_model.fit(x_totrain, y_totrain, epochs=10, batch_size=1, verbose=2)
#predicting next data stock price
myinputs = new_seriesdata[len(new_seriesdata) - (len(tovalid)+1) - 60:].values
myinputs = myinputs.reshape(-1,1)
myinputs = scalerdata.transform(myinputs)
tostore_test_result = []
for i in range(60,myinputs.shape[0]):
tostore_test_result.append(myinputs[i-60:i,0])
tostore_test_result = np.array(tostore_test_result)
tostore_test_result = np.reshape(tostore_test_result,(tostore_test_result.shape[0],tostore_test_result.shape[1],1))
myclosing_priceresult = lstm_model.predict(tostore_test_result)
myclosing_priceresult = scalerdata.inverse_transform(myclosing_priceresult)
totrain = df_train
tovalid = df_test
#predicting next data stock price
myinputs = new_seriesdata[len(new_seriesdata) - (len(tovalid)+1) - 60:].values
# Printing the next day’s predicted stock price.
print(len(tostore_test_result));
print(myclosing_priceresult);
Final Result:
[[1396.532]]
I tried to develop a model that foresees two time-steps forward
In this regard I modified a GitHub code for the single step forecast coding a data_load function that takes n steps backward in the X_train/test series and set it against a y_train/test 2-array.
I set the neurons list to output in Dense a 2-vector object.
And last I wrote a predict function and a plot function for the 2-step-forecast.
I do not normalized features lables and forecasts I will do in the future.
After a bit of hyperfine tuning it returns a good score for the mse and rmse:
Train Score: 0.00000 MSE (0.00 RMSE)
Test Score: 0.00153 MSE (0.04 RMSE)
It can find quite well the trend, but it returns all forecasts with negative directions.
Does anyone have a suggestion?
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt2
import pandas as pd
from pandas import datetime
import math, time
import itertools
from sklearn import preprocessing
import datetime
from sklearn.metrics import mean_squared_error
from math import sqrt
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.recurrent import LSTM
from keras.models import load_model
import keras
from numpy import newaxis
pd.core.common.is_list_like = pd.api.types.is_list_like
import pandas_datareader.data as web
import h5py
from keras import backend as K
import quandl
quandl.ApiConfig.api_key = 'myQuandlKey'
seq_len = 2
shape = [seq_len, 9, 2]
neurons = [256, 256, 64, 2]
dropout = 0.2
decay = 0.5
epochs = 100
stock_name = 'AAPL'
global_start_time = time.time()
def get_stock_data(stock_name, normalize=True, ma=[]):
"""
Return a dataframe of that stock and normalize all the values.
(Optional: create moving average)
"""
df = quandl.get_table('WIKI/PRICES', ticker = stock_name)
df.drop(['ticker', 'open', 'high', 'low', 'close', 'ex-dividend', 'volume', 'split_ratio'], 1, inplace=True)
df.set_index('date', inplace=True)
# Renaming all the columns so that we can use the old version code
df.rename(columns={'adj_open': 'Open', 'adj_high': 'High', 'adj_low': 'Low', 'adj_volume': 'Volume', 'adj_close': 'Adj Close'}, inplace=True)
# Percentage change
df['Pct'] = df['Adj Close'].pct_change()
df.dropna(inplace=True)
# Moving Average
if ma != []:
for moving in ma:
df['{}ma'.format(moving)] = df['Adj Close'].rolling(window=moving).mean()
df.dropna(inplace=True)
if normalize:
min_max_scaler = preprocessing.MinMaxScaler()
df['Open'] = min_max_scaler.fit_transform(df.Open.values.reshape(-1,1))
df['High'] = min_max_scaler.fit_transform(df.High.values.reshape(-1,1))
df['Low'] = min_max_scaler.fit_transform(df.Low.values.reshape(-1,1))
df['Volume'] = min_max_scaler.fit_transform(df.Volume.values.reshape(-1,1))
df['Adj Close'] = min_max_scaler.fit_transform(df['Adj Close'].values.reshape(-1,1))
df['Pct'] = min_max_scaler.fit_transform(df['Pct'].values.reshape(-1,1))
if ma != []:
for moving in ma:
df['{}ma'.format(moving)] = min_max_scaler.fit_transform(df['{}ma'.format(moving)].values.reshape(-1,1))
# Move Adj Close to the rightmost for the ease of training
adj_close = df['Adj Close']
df.drop(labels=['Adj Close'], axis=1, inplace=True)
df = pd.concat([df, adj_close], axis=1)
#df.to_csv('aap.csv')
return df
df = get_stock_data(stock_name, ma=[50, 100, 200])
def plot_stock(df):
print(df.head())
plt.subplot(211)
plt.plot(df['Adj Close'], color='red', label='Adj Close')
plt.legend(loc='best')
plt.subplot(212)
plt.plot(df['Pct'], color='blue', label='Percentage change')
plt.legend(loc='best')
plt.show()
#plot_stock(df)
def load_data(stock, seq_len):
amount_of_features = len(stock.columns)
data = stock.values
sequence_length = seq_len + 2 # index starting from 0
result = []
for index in range(len(data) - sequence_length): # maxmimum date = lastest date - sequence length
result.append(data[index: index + sequence_length]) # index : index + 22days
result = np.array(result)
row = round(0.8 * result.shape[0]) # 80% split
train = result[:int(row), :,:] # 80% date
X_train = train[:, :-2,:] # all data until day m
y_train = train[:, -2:,:][:,:,-1] # day m + 1 adjusted close price
X_test = result[int(row):, :-2,:]
y_test = result[int(row):, -2:,:][:,:,-1]
X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], amount_of_features))
X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], amount_of_features))
print ("________________________________________________________________")
print ("X_train shape = {}".format(X_train.shape))
print ("y_train shape = {}".format(y_train.shape))
print ("")
print ("X_test shape = {}".format(X_test.shape))
print ("y_test shape = {}".format(y_test.shape))
print ("________________________________________________________________")
return [X_train, y_train, X_test, y_test]
X_train, y_train, X_test, y_test = load_data(df, seq_len)
def build_model(shape, neurons, dropout, decay):
model = Sequential()
model.add(LSTM(neurons[0], input_shape=(shape[0], shape[1]), return_sequences=True))
model.add(Dropout(dropout))
model.add(LSTM(neurons[1], input_shape=(shape[0], shape[1]), return_sequences=False))
model.add(Dropout(dropout))
model.add(Dense(neurons[2],kernel_initializer="uniform",activation='relu'))
model.add(Dense(neurons[3],kernel_initializer="uniform",activation='linear'))
# model = load_model('my_LSTM_stock_model1000.h5')
adam = keras.optimizers.Adam(decay=decay)
model.compile(loss='mse',optimizer='adam', metrics=['accuracy'])
model.summary()
return model
model = build_model(shape, neurons, dropout, decay)
model.fit(
X_train,
y_train,
batch_size=512,
epochs=epochs,
validation_split=0.01,
verbose=1)
def model_score(model, X_train, y_train, X_test, y_test):
trainScore = model.evaluate(X_train, y_train, verbose=0)
print('Train Score: %.5f MSE (%.2f RMSE)' % (trainScore[0], math.sqrt(trainScore[0])))
testScore = model.evaluate(X_test, y_test, verbose=0)
print('Test Score: %.5f MSE (%.2f RMSE)' % (testScore[0], math.sqrt(testScore[0])))
return trainScore[0], testScore[0]
model_score(model, X_train, y_train, X_test, y_test)
def percentage_difference(model, X_test, y_test):
percentage_diff=[]
p = model.predict(X_test)
for u in range(len(y_test)): # for each data index in test data
pr = p[u][0] # pr = prediction on day u
percentage_diff.append((pr-y_test[u]/pr)*100)
print('Prediction duration: ', str(datetime.timedelta(seconds=(time.time() - global_start_time))) )
return p
def denormalize(stock_name, normalized_value):
"""
Return a dataframe of that stock and normalize all the values.
(Optional: create moving average)
"""
df = quandl.get_table('WIKI/PRICES', ticker = stock_name)
df.drop(['ticker', 'open', 'high', 'low', 'close', 'ex-dividend', 'volume', 'split_ratio'], 1, inplace=True)
df.set_index('date', inplace=True)
# Renaming all the columns so that we can use the old version code
df.rename(columns={'adj_open': 'Open', 'adj_high': 'High', 'adj_low': 'Low', 'adj_volume': 'Volume', 'adj_close': 'Adj Close'}, inplace=True)
df.dropna(inplace=True)
df = df['Adj Close'].values.reshape(-1,1)
normalized_value = normalized_value.reshape(-1,1)
#return df.shape, p.shape
min_max_scaler = preprocessing.MinMaxScaler()
a = min_max_scaler.fit_transform(df)
new = min_max_scaler.inverse_transform(normalized_value)
return new
def plot_result(stock_name, normalized_value_p, normalized_value_y_test):
newp = denormalize(stock_name, normalized_value_p)
newy_test = denormalize(stock_name, normalized_value_y_test)
#newy_test = np.roll(newy_test,1,0)
plt2.plot(newp, color='red', label='Prediction')
plt2.plot(newy_test,color='blue', label='Actual')
plt2.legend(loc='best')
plt2.title('Global run time {}'.format(str(datetime.timedelta(seconds=(time.time() - global_start_time))) ) )
plt2.xlabel('Days')
plt2.ylabel('Adjusted Close')
plt2.show()
def predict_sequences_multiple(model, data, window_size, prediction_len):
#Predict sequence of 50 steps before shifting prediction run forward by 50 steps
prediction_seqs = []
for i in range(int(len(data)/prediction_len)):
curr_frame = data[i*prediction_len]
predicted = []
for j in range(prediction_len):
predicted.append(model.predict(curr_frame[newaxis,:,:])[0,0])
curr_frame = curr_frame[1:]
curr_frame = np.insert(curr_frame, [window_size-1], predicted[-1], axis=0)
prediction_seqs.append(predicted)
print('Prediction duration: ', str(datetime.timedelta(seconds=(time.time() - global_start_time))) )
return prediction_seqs
def plot_results_multiple(predicted_data, true_data, prediction_len):
fig = plt.figure(facecolor='white')
ax = fig.add_subplot(111)
ax.plot(true_data, label='True Data')
#Pad the list of predictions to shift it in the graph to it's correct start
for i, data in enumerate(predicted_data):
padding = [None for p in range(i * prediction_len)]
plt.plot(padding + data, label='Prediction')
plt.legend()
plt.title('Global run time {}'.format(str(datetime.timedelta(seconds=(time.time() - global_start_time))) ) )
plt.show()
#Single step prediction
#p = percentage_difference(model, X_test, y_test)
#plot_result(stock_name, p, y_test)
#Multiple step prediction
predictions = predict_sequences_multiple(model, X_test, seq_len, 2)
plot_results_multiple(predictions, y_test, 2)
I am predicting Y based on X from past values. Our formatted CSV dataset has three columns (time_stamp, X and Y - where Y is the actual value) whose sample format is
time,X,Y
0.000561,0,10
0.000584,0,10
0.040411,5,10
0.040437,10,10
0.041638,12,10
0.041668,14,10
0.041895,15,10
0.041906,19,10
... ... ...
Before training the prediction model, here is how the plots of X and Y respectively look like the following.
Here is how I approached the problem with LSTM Recurrent Neural Networks in Python with Keras.
import numpy as np
from keras.models import Sequential
from keras.layers import LSTM, Dense
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
np.random.seed(7)
# Load data
df = pd.read_csv('test32_C_data.csv')
n_features = 100
def create_sequences(data, window=15, step=1, prediction_distance=15):
x = []
y = []
for i in range(0, len(data) - window - prediction_distance, step):
x.append(data[i:i + window])
y.append(data[i + window + prediction_distance][1])
x, y = np.asarray(x), np.asarray(y)
return x, y
# Scaling prior to splitting
scaler = MinMaxScaler(feature_range=(0.01, 0.99))
scaled_data = scaler.fit_transform(df.loc[:, ["X", "Y"]].values)
# Build sequences
x_sequence, y_sequence = create_sequences(scaled_data)
# Create test/train split
test_len = int(len(x_sequence) * 0.90)
valid_len = int(len(x_sequence) * 0.90)
train_end = len(x_sequence) - (test_len + valid_len)
x_train, y_train = x_sequence[:train_end], y_sequence[:train_end]
x_valid, y_valid = x_sequence[train_end:train_end + valid_len], y_sequence[train_end:train_end + valid_len]
x_test, y_test = x_sequence[train_end + valid_len:], y_sequence[train_end + valid_len:]
# Initialising the RNN
model = Sequential()
# Adding the input layerand the LSTM layer
model.add(LSTM(15, input_shape=(15, 2)))
# Adding the output layer
model.add(Dense(1))
# Compiling the RNN
model.compile(loss='mse', optimizer='rmsprop')
# Fitting the RNN to the Training set
model.fit(x_train, y_train, epochs=5)
# Getting the predicted values
y_pred = model.predict(x_test)
#y_pred = scaler.inverse_transform(y_pred)
plot_colors = ['#332288', '#3cb44b']
# Plot the results
pd.DataFrame({"Actual": y_test, "Predicted": np.squeeze(y_pred)}).plot(color=plot_colors)
plt.xlabel('Time [Index]')
plt.ylabel('Values')
Finally, when I run the code - the neural model seems to capture the pattern of the signal well as it is shown below.
However, one problem that I encountered in this output is the ranges of Y. As it is shown in the first two plots, the ranges should be 0-400 as shown above and to solve that I tried to use the scaler to inverse_transform as y_pred = scaler.inverse_transform(y_pred) but this throws an error: ValueError: non-broadcastable output operand with shape (7625,1) doesn't match the broadcast shape (7625,2). How can we solve this broadcast shape error?
Basically, the scaler has remembered that it was fed 2 features(/columns). So it is expecting 2 features to invert the transformation.
Two options here.
1) You make two different scalers: scaler_x and scaler_y like this :
# Scaling prior to splitting
scaler_x = MinMaxScaler(feature_range=(0.01, 0.99))
scaler_y = MinMaxScaler(feature_range=(0.01, 0.99))
scaled_x = scaler_x.fit_transform(df.loc[:, "X"].reshape([-1, 1]))
scaled_y = scaler_y.fit_transform(df.loc[:, "Y"].reshape([-1, 1]))
scaled_data = np.column_stack((scaled_x, scaled_y))
Then you will be able to do :
y_pred = scaler_y.inverse_transform(y_pred)
2) You fake the X column in your output like this :
y_pred_reshaped = np.zeros((len(y_pred), 2))
y_pred_reshaped[:,1] = y_pred
y_pred = scaler.inverse_transform(y_pred_reshaped)[:,1]
Does that help?
EDIT
here is the full code as required
import numpy as np
from keras.models import Sequential
from keras.layers import LSTM, Dense
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
np.random.seed(7)
# Load data
#df = pd.read_csv('test32_C_data.csv')
df = pd.DataFrame(np.random.randint(0,100, size=(100,3)), columns = ['time', 'X', 'Y'])
n_features = 100
def create_sequences(data, window=15, step=1, prediction_distance=15):
x = []
y = []
for i in range(0, len(data) - window - prediction_distance, step):
x.append(data[i:i + window])
y.append(data[i + window + prediction_distance][1])
x, y = np.asarray(x), np.asarray(y)
return x, y
# Scaling prior to splitting
scaler_x = MinMaxScaler(feature_range=(0.01, 0.99))
scaler_y = MinMaxScaler(feature_range=(0.01, 0.99))
scaled_x = scaler_x.fit_transform(df.loc[:, "X"].reshape([-1,1]))
scaled_y = scaler_y.fit_transform(df.loc[:, "Y"].reshape([-1,1]))
scaled_data = np.column_stack((scaled_x, scaled_y))
# Build sequences
x_sequence, y_sequence = create_sequences(scaled_data)
test_len = int(len(x_sequence) * 0.90)
valid_len = int(len(x_sequence) * 0.90)
train_end = len(x_sequence) - (test_len + valid_len)
x_train, y_train = x_sequence[:train_end], y_sequence[:train_end]
x_valid, y_valid = x_sequence[train_end:train_end + valid_len], y_sequence[train_end:train_end + valid_len]
x_test, y_test = x_sequence[train_end + valid_len:], y_sequence[train_end + valid_len:]
# Initialising the RNN
model = Sequential()
# Adding the input layerand the LSTM layer
model.add(LSTM(15, input_shape=(15, 2)))
# Adding the output layer
model.add(Dense(1))
# Compiling the RNN
model.compile(loss='mse', optimizer='rmsprop')
# Fitting the RNN to the Training set
model.fit(x_train, y_train, epochs=5)
# Getting the predicted values
y_pred = model.predict(x_test)
y_pred = scaler_y.inverse_transform(y_pred)
I am getting this error:
ValueError: Items of feature_columns must be a _FeatureColumn. Given
(type ): Index(['CreditScore',
'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
'IsActiveMember', 'EstimatedSalary', 'Exited'],
dtype='object').
I am using tensorFlow lib. I want to get prediction results but I can not run m.train(input_fn=get_input_fn ,steps=5000) code. I always got the same error whatever I did. I used these input functions in the following but nothing changed.
def input_fn_train():
x=tf.constant(df_train.astype(np.float64)),
y=tf.constant(df_train[LABEL].astype(np.float64))
return x, y
and
def get_input_fn(data_set, num_epochs=None, shuffle=False):
return tf.estimator.inputs.pandas_input_fn(
x=pd.DataFrame({k: data_set[k].values for k in data_set.columns}),
y=pd.Series(data_set[LABEL].values), num_epochs=num_epochs,
shuffle=shuffle)
I can not understand what should I do. What the error is about? I've been googling but never found useful thing. How can I handle this error. The code is below. Thanks!
import pandas as pd
import tensorflow as tf
import numpy as np
import tempfile
COLS= ["RowNumber","CustomerId","Surname","CreditScore","Geography",
"Gender","Age","Tenure","Balance","NumOfProducts","HasCrCard",
"IsActiveMember","EstimatedSalary","Exited"]
FEATURES = ["CreditScore","Age","Tenure","Balance","NumOfProducts",
"HasCrCard","IsActiveMember", "EstimatedSalary"]
LABEL="Exited"
df_train = pd.read_csv("Churn_Modelling.csv", skipinitialspace=True,
header=0)
df_test = pd.read_csv("Churn_Modelling.csv", skipinitialspace=True,
header=0)
test_label = df_test[LABEL].astype(float)
df_test.drop("Surname", axis = 1, inplace=True)
df_test.drop("RowNumber", axis = 1, inplace=True)
df_test.drop("CustomerId", axis = 1, inplace=True)
df_train.drop("CustomerId", axis = 1, inplace=True)
df_train.drop("Surname", axis = 1, inplace=True)
df_train.drop("RowNumber", axis = 1, inplace=True)
df_train.drop("Geography", axis = 1, inplace=True)
df_train.drop("Gender", axis = 1, inplace=True)
def get_input_fn():
return {'x': tf.constant(df_train[FEATURES].as_matrix(), tf.float32,
df_train.shape),
'y': tf.constant(df_train[LABEL].as_matrix(), tf.float32,
df_train.shape)
}
df=df_train.select_dtypes(exclude=['object'])
numeric_cols=df.columns
m = tf.estimator.LinearClassifier(model_dir=model_dir, feature_columns=
[numeric_cols])
m.train(input_fn=get_input_fn ,steps=5000)
results = m.evaluate(input_fn= get_input_fn(df_test, num_epochs=1,
shuffle=False),steps=None)
y = m.predict(input_fn=get_input_fn(df_test, num_epochs=1, shuffle=False))
pred = list(y)
rowNumber=0
for i in pred:
print(str(rowNumber)+': '+str(pred[i]))
rowNumber=rowNumber+1
Your first mistake is how you create tf.estimator.LinearClassifier. You're passing the dataframe index df.columns into feature_columns, but should pass the list of tensorflow feature columns. The columns should define if it's numerical or categorical and in the later case the encoding type.
Secondly, the input function can be simplified a lot, since you're reading pandas dataframe. Just use tf.estimator.inputs.pandas_input_fn.
Your .csv is most likely different, I've made a dummy one with some values. So here's a way to read the input and fit the model correctly:
import pandas as pd
import tensorflow as tf
FEATURES = ["CreditScore", "Age", "Tenure", "Balance", "NumOfProducts",
"HasCrCard", "IsActiveMember", "EstimatedSalary", "Exited"]
credit_score = tf.feature_column.numeric_column("CreditScore")
age = tf.feature_column.numeric_column("Age")
tenure = tf.feature_column.numeric_column("Tenure")
balance = tf.feature_column.numeric_column("Balance")
num_of_products = tf.feature_column.numeric_column("NumOfProducts")
has_card = tf.feature_column.categorical_column_with_vocabulary_list("HasCrCard", ["True", "False"])
is_active_member = tf.feature_column.categorical_column_with_vocabulary_list("IsActiveMember", ["True", "False"])
estimated_salary = tf.feature_column.numeric_column("EstimatedSalary")
feature_columns = [credit_score, age, tenure, balance, num_of_products, has_card, is_active_member, estimated_salary]
def input_fn(num_epochs=None, shuffle=True, batch_size=100):
df = pd.read_csv('Churn_Modelling.csv',
names=FEATURES,
dtype={'HasCrCard': str, 'IsActiveMember': str},
skipinitialspace=True,
header=0)
df = df.dropna(how='any', axis=0) # remove NaN elements
labels = df["Exited"]
return tf.estimator.inputs.pandas_input_fn(x=df,
y=labels,
batch_size=batch_size,
num_epochs=num_epochs,
shuffle=shuffle,
num_threads=5)
model = tf.estimator.LinearClassifier(model_dir=None,
feature_columns=feature_columns)
model.train(input_fn=input_fn(), steps=100)
It is working clearly.
import pandas as pd
import tensorflow as tf
import tempfile
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
def split_data(data, rate, label):
data = data.dropna()
train_data, test_data = train_test_split(data, test_size=rate)
train_label = train_data[label]
train_data = train_data.drop(label, 1)
test_label = test_data[label]
test_data = test_data.drop(label, 1)
return train_data, train_label, test_data, test_label
LABEL = "Exited"
data = pd.read_csv("Churn_Modelling.csv", skipinitialspace=True,
header=0)
data.drop("Surname", axis=1, inplace=True)
data.drop("RowNumber", axis=1, inplace=True)
data.drop("CustomerId", axis=1, inplace=True)
data.drop("Geography", axis=1, inplace=True)
data.drop("Gender", axis=1, inplace=True)
x_train, y_train, x_test, y_test = split_data(data, 0.20, LABEL)
def get_input_fn_train():
input_fn = tf.estimator.inputs.pandas_input_fn(
x=x_train,
y=y_train,
shuffle=False
)
return input_fn
def get_input_fn_test():
input_fn = tf.estimator.inputs.pandas_input_fn(
x=x_test,
y=y_test,
shuffle=False
)
return input_fn
feature_columns = tf.contrib.learn.infer_real_valued_columns_from_input_fn
(get_input_fn_train())
model_dir = tempfile.mkdtemp()
m = tf.estimator.LinearClassifier(model_dir=model_dir,
feature_columns=feature_columns)
# train data
m.train(input_fn=get_input_fn_train(), steps=5000)
# you can get accuracy, accuracy_baseline, auc, auc_precision_recall,
#average_loss, global_step, label/mean, lossprediction/mean
results = m.evaluate(input_fn=get_input_fn_test(), steps=None)
print("model directory = %s" % model_dir)
for key in sorted(results):
print("%s: %s" % (key, results[key]))
# get prediction results
y = m.predict(input_fn=get_input_fn_test())
predictions = list(y)
pred1=pd.DataFrame(data=predictions)
prediction=pd.DataFrame(data=pred1['class_ids'])
pred=[]
for row in prediction["class_ids"]:
pred.append(row[0])
rowNumber = 0
for i in pred:
print(str(rowNumber) + ': ' + str(i))
rowNumber = rowNumber + 1
def calculate(prediction, LABEL):
arr = {"accuracy": accuracy_score(prediction, LABEL),
"report": classification_report(prediction, LABEL),
"Confusion_Matrix": confusion_matrix(prediction, LABEL),
"F1 score": f1_score(prediction, LABEL),
"Recall Score": recall_score(prediction, LABEL),
"cohen_kappa": cohen_kappa_score(prediction, LABEL)
}
return arr
pred2 = pd.DataFrame(data=pred)
print(calculate(pred2.round(), y_test))
I'm going to make some small changes to #Maxim's answer (thanks, btw) and post a minimum working example with random numpy data. This seems to run fine on my windows machine. Note the suppressed warning due to my particular hardware.
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import pandas as pd
import numpy as np
import tensorflow as tf
FEATURES = ["CreditScore", "Age", "Tenure", "Balance", "NumOfProducts", "EstimatedSalary", "Exited"]
credit_score = tf.feature_column.numeric_column("CreditScore")
age = tf.feature_column.numeric_column("Age")
tenure = tf.feature_column.numeric_column("Tenure")
balance = tf.feature_column.numeric_column("Balance")
num_of_products = tf.feature_column.numeric_column("NumOfProducts")
estimated_salary = tf.feature_column.numeric_column("EstimatedSalary")
feature_columns = [credit_score, age, tenure, balance, num_of_products, estimated_salary]
def input_fn(num_epochs=None, shuffle=True, batch_size=100):
N_features = len(FEATURES)
print(N_features)
N_examples = 5000
X_train = np.random.rand(N_examples,N_features)
Y_train = np.random.rand(N_examples)
columns = [str(i) for i in range(N_features)]
columns = FEATURES
df = pd.DataFrame(data = X_train, columns = columns)
labels = df["Exited"]
return tf.estimator.inputs.pandas_input_fn(x=df,
y=labels,
batch_size=batch_size,
num_epochs=num_epochs,
shuffle=shuffle,
num_threads=5)
model = tf.estimator.LinearClassifier(model_dir='model_dir',
feature_columns=feature_columns)
model.train(input_fn=input_fn(), steps=100)