What changes should I make to this code so that it predicts the next day's opening stock price from all the parameters in the dataset?
When I tried to run it, it showed an out-of-shape error. The code works fine with only one parameter.
My code is as follows:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

dataset_train = pd.read_csv('ongc_train.csv')
dataset_train = dataset_train.dropna()
training_set = dataset_train.iloc[:, 1:2].values

# Scale to [0, 1] (the post uses `sc` and `training_set_scaled` without defining them)
sc = MinMaxScaler(feature_range=(0, 1))
training_set_scaled = sc.fit_transform(training_set)

# Creating a dataset with 60 timesteps and 1 output
X_train = []
Y_train = []
for i in range(60, 2493):
    X_train.append(training_set_scaled[i-60:i, 0])
    Y_train.append(training_set_scaled[i, 0])
X_train, Y_train = np.array(X_train), np.array(Y_train)

# Reshaping to (samples, timesteps, features)
X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))

# Fitting the RNN to the training set (`regressor` is the compiled LSTM model,
# whose definition is omitted from the post)
regressor.fit(X_train, Y_train, epochs=100, batch_size=32)

# Getting the predicted stock price of 2017
dataset_test = pd.read_csv('ongc_test.csv')  # test file name assumed
# Vertical concatenation of the original training and test open stock prices
dataset_total = pd.concat((dataset_train['Open'], dataset_test['Open']), axis=0)
inputs = dataset_total[len(dataset_total) - len(dataset_test) - 60:].values
inputs = inputs.reshape(-1, 1)
inputs = sc.transform(inputs)

X_test = []
for i in range(60, 61):
    X_test.append(inputs[i-60:i, 0])
X_test = np.array(X_test)
X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))

predicted_stock_price = regressor.predict(X_test)
predicted_stock_price = sc.inverse_transform(predicted_stock_price)
actual = dataset_test.iloc[:, 1:2].values
print("Predicted Stock Price:", predicted_stock_price)
Thank you.
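For reference, the out-of-shape error typically appears as soon as more than one column is windowed while the reshape and the model's input_shape still assume a single feature. A minimal sketch of the multivariate variant, reusing `dataset_train` from above (the 1:5 column slice is an assumption; adjust it to your CSV):

import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Take several feature columns instead of one (the 1:5 slice is hypothetical)
training_set = dataset_train.iloc[:, 1:5].values
n_features = training_set.shape[1]

sc = MinMaxScaler(feature_range=(0, 1))
training_set_scaled = sc.fit_transform(training_set)

X_train, Y_train = [], []
for i in range(60, len(training_set_scaled)):
    X_train.append(training_set_scaled[i-60:i, :])  # all features in each window
    Y_train.append(training_set_scaled[i, 0])       # target: the 'Open' column
X_train, Y_train = np.array(X_train), np.array(Y_train)
# X_train already has shape (samples, 60, n_features), so no reshape is needed,
# and the first LSTM layer must be built with input_shape=(60, n_features).

Note that `sc` is then fitted on n_features columns, so a single-column prediction cannot be passed to sc.inverse_transform directly; fit a separate scaler on the target column for that (the same issue comes up in the yfinance question further down).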
I'm trying to follow the YouTube tutorial from the link below, but I'm getting the error "Tuple index out of range". I've compared my code to the video and it seems to match. It's the very last line of code that raises the error. I did create and compile the model, but had to remove that code in order to post. Any ideas on how to correct this?
# https://www.youtube.com/watch?v=PuZY9q-aKLw
import datetime as dt
import numpy as np
import pandas as pd
import pandas_datareader.data as web
from sklearn.preprocessing import MinMaxScaler

# Load data
company = 'FB'
start = dt.datetime(2012, 1, 1)
end = dt.datetime(2020, 1, 1)
data = web.DataReader(company, 'yahoo', start, end)

# Prepare data
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_data = scaler.fit_transform(data['Close'].values.reshape(-1, 1))
prediction_days = 60

x_train = []
y_train = []
for x in range(prediction_days, len(scaled_data)):
    x_train.append(scaled_data[x-prediction_days:x, 0])
    y_train.append(scaled_data[x, 0])
x_train, y_train = np.array(x_train), np.array(y_train)
x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))

# Test the model accuracy
test_start = dt.datetime(2020, 1, 1)
test_end = dt.datetime.now()
test_data = web.DataReader(company, 'yahoo', test_start, test_end)
actual_prices = test_data['Close'].values

total_dataset = pd.concat((data['Close'], test_data['Close']), axis=0)
model_inputs = total_dataset[len(total_dataset) - len(test_data) - prediction_days:].values
model_inputs = model_inputs.reshape(-1, 1)
model_inputs = scaler.transform(model_inputs)

# Make predictions on test data
x_test = []
# NOTE: the original loop read `len(model.inputs)`; that is the Keras model's
# list of input tensors (length 1), so the range was empty, x_test stayed
# 1-dimensional, and x_test.shape[1] below raised "tuple index out of range".
for x in range(prediction_days, len(model_inputs)):
    x_test.append(model_inputs[x-prediction_days:x, 0])
x_test = np.array(x_test)
x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 1))
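For completeness, the remaining steps of this flow would mirror the other snippets in this section (a sketch; `model` is the compiled LSTM that was removed from the post):

predicted_prices = model.predict(x_test)                       # predict on the prepared windows
predicted_prices = scaler.inverse_transform(predicted_prices)  # map back to price scale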
I have built an LSTM model to estimate the next day's stock price, using TensorFlow and Keras.
However, I do not understand why my model's predicted price is almost always a factor of 2 or 3 higher than the current stock price. Does anybody know what I am doing wrong?
The code is shown below:
import math
from datetime import datetime

import numpy as np
import pandas_datareader.data as web
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import Dense, LSTM

def StockPredictor(stock, startdate, enddate, pricetype):
    # Get the stock quote
    df = web.DataReader(stock, data_source='yahoo', start=startdate, end=enddate)
    #df = pd.read_csv('StockData/TATA.csv')
    # Create a new dataframe with only the chosen price type
    data = df.filter([pricetype])
    dataset = data.values  # convert the dataframe into a numpy array
    # Use 80% of the dataset to train the LSTM model (rounded up with math.ceil)
    training_data_len = math.ceil(len(dataset) * 0.80)

    # Scale (normalize) the input data; this helps the model
    scaler = MinMaxScaler(feature_range=(0, 1))  # scaled_data holds values between 0 and 1
    scaler = scaler  # computes min and max for scaling and transforms the data based on them
    scaled_data = scaler.fit_transform(dataset)

    # Create the scaled training data set
    train_data = scaled_data[0:training_data_len, :]
    # Split the data into x_train and y_train (x holds the independent training features, y the dependent values)
    x_train, y_train = [], []
    for i in range(60, len(train_data)):
        x_train.append(train_data[i-60:i, 0])  # the values of the previous 60 periods
        y_train.append(train_data[i, 0])       # the 61st value, which we want the model to predict
    # Convert x_train and y_train to numpy arrays
    x_train, y_train = np.array(x_train), np.array(y_train)
    # Reshape the data: an LSTM expects 3D input of the form (no. of samples, no. of timesteps, no. of features); x_train is currently 2D
    x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))

    # Build the LSTM model
    model = Sequential()
    model.add(LSTM(50, return_sequences=True, input_shape=(x_train.shape[1], 1)))
    model.add(LSTM(50, return_sequences=False))
    model.add(Dense(25))
    model.add(Dense(1))
    model.compile(optimizer='adam', loss='mean_squared_error')  # the model has a loss function and an optimizer

    # Train the model with the fit function
    model.fit(x_train, y_train, batch_size=1, epochs=1)  # an epoch is one pass of the dataset forward and backward through the network

    # Create the testing data set: a new array containing scaled values
    test_data = scaled_data[training_data_len - 60:, :]
    # Create the datasets x_test and y_test
    x_test = []
    y_test = dataset[training_data_len:, :]
    for i in range(60, len(test_data)):
        x_test.append(test_data[i-60:i, 0])
    # Convert the data into a numpy array and reshape it to 3D (same reasoning as for x_train)
    x_test = np.array(x_test)
    x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 1))

    # Get the model's predicted price values; the predictions for x_test should match y_test
    predictions = model.predict(x_test)
    predictions = scaler.inverse_transform(predictions)  # unscale the values

    # Get the RMSE (to evaluate the model)
    rmse = np.sqrt(np.mean((predictions - y_test) ** 2))

    # Plot the data
    train = data[:training_data_len]
    valid = data[training_data_len:]
    valid['Predictions'] = predictions
    print('The RMSE for the training model =', rmse)

    new_df = df.filter([pricetype])
    # Get the last 60 days and scale them
    last_60_days = new_df[-60:].values
    last_60_days_scaled = scaler.transform(last_60_days)
    # Append the past 60 (scaled!) days to the list; the original appended the
    # unscaled last_60_days, which is itself a likely source of inflated predictions
    X_test = [last_60_days_scaled]
    # Convert X_test to a numpy array and reshape it to 3D
    X_test = np.array(X_test)
    X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))
    # Get the predicted scaled price and undo the scaling
    pred_price = model.predict(X_test)
    pred_price = scaler.inverse_transform(pred_price)
    print('Predicted price for the next day is:', pred_price)
    return pred_price
allprices = []
for i in range(10):
    # Example arguments (hypothetical values; the original called StockPredictor() without the four required arguments)
    pred_price = StockPredictor('AAPL', datetime(2015, 1, 1), datetime(2020, 1, 1), 'Close')
    allprices.append(pred_price)
average_pred_price = sum(allprices) / len(allprices)
You are using a min-max scaler fitted between 0 and 1, where the defined highs are historical. Your LSTM model will predict a new high, and when you inverse_transform the prediction it will likely land above the min and max the scaler was fitted on.
The scaler is therefore the likely culprit behind predictions that come out a factor of 2x higher. Using a standard scaler might help, or you could skip scaling altogether.
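A minimal sketch of that suggestion, swapping in StandardScaler as a drop-in replacement for the MinMaxScaler lines in the question (`dataset` as defined there):

from sklearn.preprocessing import StandardScaler

# Standardize to zero mean / unit variance instead of squashing into [0, 1],
# so predictions above the historical maximum extrapolate less violently
scaler = StandardScaler()
scaled_data = scaler.fit_transform(dataset)

# ... window, train, and predict exactly as before ...

# inverse_transform maps the standardized predictions back to price units
predictions = scaler.inverse_transform(model.predict(x_test))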
Side Note
LSTMs that predict stock prices from price data alone will not work.
What is likely to happen is that your LSTM model will predict prices with a T+1 lag, i.e. it effectively repeats the price from one day earlier.
Price data inherently contains noise, brought about by retail traders and, especially now, by social-sentiment trading. An LSTM is likely to overfit on historical noise, which is unrepresentative of future noise.
For more information on the problem of noise, see https://www.investopedia.com/articles/trading/06/marketnoise.asp
I am trying to predict the temperature for a given area (an integer from 1 to 142) at a given date and time.
The problem is that my CSV has the following columns:
DateTime,AreaID,Temperature
How do I reframe the dataframe for an LSTM? (Apologies, I am new to LSTMs.)
For reference, I have two months of data, measured at 5-minute intervals.
I have coded an LSTM that takes DateTime as input, but I want to include AreaID as well to predict Temperature.
The training and testing sets are created with the following code block:
import numpy as np
from sklearn.preprocessing import MinMaxScaler

dataset = dataset.temperature.values  # numpy.ndarray
dataset = dataset.astype('float32')
dataset = np.reshape(dataset, (-1, 1))
scaler = MinMaxScaler(feature_range=(0, 1))
dataset = scaler.fit_transform(dataset)
train_size = int(len(dataset) * 0.80)
test_size = len(dataset) - train_size
train, test = dataset[0:train_size, :], dataset[train_size:len(dataset), :]

def create_dataset(dataset, look_back=1):
    X, Y = [], []
    for i in range(len(dataset) - look_back - 1):
        a = dataset[i:(i + look_back), 0]
        X.append(a)
        Y.append(dataset[i + look_back, 0])
    return np.array(X), np.array(Y)

look_back = 30
X_train, Y_train = create_dataset(train, look_back)
X_test, Y_test = create_dataset(test, look_back)

# reshape the input to be [samples, time steps, features]
X_train = np.reshape(X_train, (X_train.shape[0], 1, X_train.shape[1]))
X_test = np.reshape(X_test, (X_test.shape[0], 1, X_test.shape[1]))
Before this, the sample code sorts the dataframe by DateTime:
dataset.sort_values('timestamp', inplace=True, ascending=True)
I want to change the LSTM to take two inputs:
1. DateTime
2. AreaID
and one output:
1. Temperature
How do I code the LSTM for these requirements? (Please help; I am new to neural networks.)
Just a hint:
Prepare the new dataset as X_train and y_train.
Take the first 60 days to train the model and predict the 61st day; that's my logic.
X_train = []
y_train = []
for i in range(60, train.shape[0]):
    X_train.append(train[i-60:i])          # the previous 60 rows, all feature columns
    y_train.append(train['targetcol'][i])  # the value to predict
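To make this concrete for the two-input temperature question: one common approach is to window all feature columns together and predict only the temperature column. A sketch, under the assumptions that the file is named temperature.csv, that DateTime is encoded numerically (here crudely as hour-of-day), and that AreaID can be fed as a numeric feature; with 142 interleaved areas you would in practice build the windows per area:

import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import LSTM, Dense

df = pd.read_csv('temperature.csv', parse_dates=['DateTime'])  # file name assumed
df = df.sort_values('DateTime')
df['hour'] = df['DateTime'].dt.hour  # crude numeric encoding of DateTime
features = df[['hour', 'AreaID', 'Temperature']].values.astype('float32')

scaler = MinMaxScaler(feature_range=(0, 1))
scaled = scaler.fit_transform(features)

look_back = 30
X, Y = [], []
for i in range(len(scaled) - look_back):
    X.append(scaled[i:i + look_back, :])  # all three features per timestep
    Y.append(scaled[i + look_back, 2])    # the temperature column only
X, Y = np.array(X), np.array(Y)           # X has shape (samples, look_back, 3)

model = Sequential()
model.add(LSTM(50, input_shape=(look_back, X.shape[2])))
model.add(Dense(1))  # one output: temperature
model.compile(optimizer='adam', loss='mean_squared_error')
model.fit(X, Y, epochs=5, batch_size=32)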
I have a problem: I don't know how to fix the transform when adding new features in order to make a better forecast. The code below predicts stock prices from the Close value. Data:
Open High Low Close Adj Close Volume
Datetime
2020-03-10 09:30:00+03:00 5033.0 5033.0 4690.0 4840.0 4840.0 702508
2020-03-10 10:30:00+03:00 4840.0 4870.0 4700.0 4746.5 4746.5 1300648
2020-03-10 11:30:00+03:00 4746.5 4783.0 4706.0 4745.5 4745.5 1156482
2020-03-10 12:30:00+03:00 4745.5 4884.0 4730.0 4870.0 4870.0 1213268
2020-03-10 13:30:00+03:00 4874.0 4990.5 4867.5 4886.5 4886.5 1958028
... ... ... ... ... ... ...
2020-04-03 14:30:00+03:00 5177.0 5217.0 5164.0 5211.5 5211.5 385696
2020-04-03 15:30:00+03:00 5212.0 5364.0 5191.0 5269.5 5269.5 1091066
2020-04-03 16:30:00+03:00 5270.0 5297.0 5209.0 5220.5 5220.5 518686
2020-04-03 17:30:00+03:00 5222.0 5271.0 5184.0 5220.5 5220.5 665096
2020-04-03 18:30:00+03:00 5217.5 5223.5 5197.0 5204.5 5204.5 261400
I want to add the Volume and Open features, but I get this error:
predictions = scaler.inverse_transform(predictions)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/sklearn/preprocessing/_data.py", line 436, in inverse_transform
X -= self.min_
ValueError: non-broadcastable output operand with shape (40,1) doesn't match the broadcast shape (40,3)
Q1: How do I change inverse_transform, and what else do I need to change (the input_shape argument, maybe?), to get correct results?
Q2: The result will be a prediction of the Close value. But how do I predict the Volume value as well? I guess I need to set model.add(Dense(2)), but can I do two predictions correctly in one script, or do I need to run a separate script? And with model.add(Dense(2)), how do I tell which output is Volume and which is Open?
Full code:
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
import yfinance as yf
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import Dense, LSTM

start = (datetime.now() - timedelta(days=30))
end = (datetime.now() - timedelta(days=0))
df = yf.download(tickers="LKOH.ME", start=start.strftime("%Y-%m-%d"), end=end.strftime("%Y-%m-%d"), interval="60m")
df = df.loc[start.strftime("%Y-%m-%d"):end.strftime("%Y-%m-%d")]

# I need to add the other features here
# df.filter(['Close', 'Open', 'Volume']) <-- this will cause the shape error further down
data = df.filter(['Close'])
dataset = data.values

# Get the number of rows to train the model on, keeping 40 rows for test
training_data_len = len(dataset) - 40
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_data = scaler.fit_transform(dataset)
train_data = scaled_data[0:int(training_data_len), :]

x_train = []
y_train = []
for i in range(60, len(train_data)):
    x_train.append(train_data[i-60:i, 0])
    y_train.append(train_data[i, 0])
x_train, y_train = np.array(x_train), np.array(y_train)
x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))

model = Sequential()
# should I change this to input_shape=(x_train.shape[1], 3)?
model.add(LSTM(50, return_sequences=True, input_shape=(x_train.shape[1], 1)))
model.add(LSTM(50, return_sequences=False))
model.add(Dense(25))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mean_squared_error')
model.fit(x_train, y_train, batch_size=1, epochs=1)

test_data = scaled_data[training_data_len - 60:, :]
x_test = []
y_test = dataset[training_data_len:, :]
for i in range(60, len(test_data)):
    x_test.append(test_data[i-60:i, 0])
x_test = np.array(x_test)
x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 1))

predictions = model.predict(x_test)
predictions = scaler.inverse_transform(predictions)  # error here
The problem is that you are fitting the MinMaxScaler on dataset, splitting dataset into x_train and y_train, and then later trying to use the inverse_transform method on the predictions, which have the same shape as y_train. I suggest you create x_train and y_train first and fit the MinMaxScaler only on x_train. y_train doesn't need to be scaled for the model, and that saves you from having to inverse_transform the predictions at all.
So instead of
#Get the number of rows to train the model on, 40 rows for test
training_data_len = len(dataset) - 40
scaler = MinMaxScaler(feature_range=(0,1))
scaled_data = scaler.fit_transform(dataset)
train_data = scaled_data[0:int(training_data_len), :]
x_train = []
y_train = []
for i in range(60, len(train_data)):
    x_train.append(train_data[i-60:i, 0])
    y_train.append(train_data[i, 0])
x_train, y_train = np.array(x_train), np.array(y_train)
x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))
Use
#Get the number of rows to train the model on, 40 rows for test
training_data_len = len(dataset) - 40
train_data = dataset[0:int(training_data_len), :]  # unscaled here; scaling happens below
x_train = []
y_train = []
for i in range(60, len(train_data)):
    x_train.append(train_data[i-60:i, 0])
    y_train.append(train_data[i, 0])
x_train, y_train = np.array(x_train), np.array(y_train)
scaler = MinMaxScaler(feature_range=(0,1))
x_train = scaler.fit_transform(x_train)  # Only scaling x_train
x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))
and just delete the line predictions = scaler.inverse_transform(predictions).
Updates relating to additional questions in the comments
The definition of y_test is inconsistent with y_train. Specifically, y_test is defined as y_test = dataset[training_data_len:, :], which uses all of the columns of dataset. To be consistent with y_train, it should instead be dataset[training_data_len:, 0].
Splitting the data is often clearer and less error-prone when done in pandas:
# Starting with the dataframe 'data'
data = df.filter(['Close', 'Open', 'Volume'])
# Create x/y test/train directly from 'data'
training_data_len = len(data) - 40
x_train = data[['Open', 'Volume']][:training_data_len]
y_train = data.Close[:training_data_len]
x_test = data[['Open', 'Volume']][training_data_len:]
y_test = data.Close[training_data_len:]
# Then confirm you have the expected subsets by checking things like
# shape (and info(), describe(), etc.)
x_train.shape, x_test.shape
> ((160, 2), (40, 2))
y_train.shape, y_test.shape
> ((160,), (40,))
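If you would rather keep the target scaled (so that inverse_transform still applies), another common option is to fit a second scaler on the target column alone; a sketch, using the three-column data frame from above:

from sklearn.preprocessing import MinMaxScaler

# One scaler for the inputs and a separate one for the target, so that
# predictions of shape (n, 1) can be inverse-transformed without a broadcast error
x_scaler = MinMaxScaler(feature_range=(0, 1))
y_scaler = MinMaxScaler(feature_range=(0, 1))
x_scaled = x_scaler.fit_transform(data[['Open', 'Volume']].values)
y_scaled = y_scaler.fit_transform(data[['Close']].values)

# ... window, train, and predict as before ...
# predictions = model.predict(x_test)                    # shape (n, 1)
# predictions = y_scaler.inverse_transform(predictions)  # no shape mismatch

As for Q2: a single model with model.add(Dense(2)) can predict Close and Volume together if y is built with two columns; the outputs come back in whatever column order you used when building y.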
I'm trying to do a sentiment-analysis prediction using the text and the scores of random IMDB reviews. I turned all the words into a bag of words and fed everything into a neural network. The prediction, however, does not seem to be correct: it always shows a 50% positive and a 50% negative prediction for anything I type as a review.
import numpy as np
import pandas as pd
import tensorflow
import tflearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from tflearn.data_utils import pad_sequences, to_categorical

reviews = pd.read_csv('reviews.txt', header=None)
labels = pd.read_csv('labels.txt', header=None)
Y = (labels == 'positive').astype(np.int_)
print(type(reviews))
print(reviews.head())
print(labels.head())

# Split into train/validation/test
x_train, x_test, y_train, y_test = train_test_split(reviews, Y)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train)

# min_df=19 seems to be the first value that fills all 10,000 entries - i.e. the 10,000 most commonly used words
vect = CountVectorizer(min_df=19, max_features=10000)
fitter = vect.fit(x_train[0])
X_train = fitter.transform(x_train[0])
X_test = fitter.transform(x_test[0])
X_val = fitter.transform(x_val[0])

print("Vocabulary size: {}".format(len(vect.vocabulary_)))
feature_names = vect.get_feature_names()
print("Number of features: {}".format(len(feature_names)))
print("Vocabulary content:\n {}".format(fitter.vocabulary_))

X_train = pad_sequences(X_train.toarray(), maxlen=100, value=0.)
X_test = pad_sequences(X_test.toarray(), maxlen=100, value=0.)
X_val = pad_sequences(X_val.toarray(), maxlen=100, value=0.)
Y_train = to_categorical(y_train, 2)
Y_test = to_categorical(y_test, 2)
Y_val = to_categorical(y_val, 2)

tensorflow.reset_default_graph()
input_layer = tflearn.input_data(shape=[None, 100])
net = tflearn.embedding(input_layer, input_dim=10000, output_dim=128)
# NOTE: the hidden layer below is wired to input_layer, not to net,
# so the embedding defined above is never actually used
hid = tflearn.fully_connected(input_layer, 10, activation='tanh')  # a hidden layer with 10 neurons
output_layer = tflearn.fully_connected(hid, 2, activation='softmax')
sgd = tflearn.SGD(learning_rate=0.04, lr_decay=0.96, decay_step=1000)
net = tflearn.regression(output_layer, optimizer=sgd, loss='categorical_crossentropy')
model = tflearn.DNN(net, tensorboard_verbose=3, tensorboard_dir='tfdir')

try:
    model.fit(X_train, Y_train, n_epoch=5, validation_set=(X_val, Y_val), batch_size=100, show_metric=True, run_id="Imdb")
except KeyboardInterrupt:
    print("Stopped by user")
The training, validation, and test accuracy never exceed roughly 0.65, no matter how much I tune the hyperparameters.
my_review = "This movie sucks"
my_review_enc = fitter.transform([my_review])
my_review_enc_pad = pad_sequences(my_review_enc.toarray(), maxlen=100, value=0.)
prediction = model.predict(my_review_enc_pad)
prediction
As you can see, the positive and negative predictions always sit at 50%.
What am I doing wrong?
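Given the note in the code above that the embedding layer is never actually wired in, two hedged directions for a fix (sketches, not a confirmed solution):

# Option A (hypothetical): actually consume the embedding
net = tflearn.embedding(input_layer, input_dim=10000, output_dim=128)
hid = tflearn.fully_connected(net, 10, activation='tanh')  # was wired to input_layer

# Option B (hypothetical): CountVectorizer produces word counts, not word-index
# sequences, so an embedding layer is arguably the wrong tool here; drop it and
# feed the 10,000-dimensional count vectors directly
input_layer = tflearn.input_data(shape=[None, 10000])
hid = tflearn.fully_connected(input_layer, 10, activation='tanh')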