Extremely poor prediction: LSTM time-series - python

I tried to implement an LSTM model for time-series prediction. Below is my trial code. It runs without error and has no unusual dependencies, so you can try it yourself.
import numpy as np, pandas as pd, matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import LSTM, Dense, TimeDistributed, Bidirectional
from sklearn.metrics import mean_squared_error, accuracy_score
from scipy.stats import linregress
from sklearn.utils import shuffle
fi = 'pollution.csv'
raw = pd.read_csv(fi, delimiter=',')
raw = raw.drop('Dates', axis=1)
print (raw.shape)
scaler = MinMaxScaler(feature_range=(-1, 1))
raw = scaler.fit_transform(raw)
time_steps = 7
def create_ds(data, t_steps):
    data = pd.DataFrame(data)
    data_s = data.copy()
    for i in range(t_steps):
        data = pd.concat([data, data_s.shift(-(i+1))], axis=1)
    data.dropna(axis=0, inplace=True)
    return data.values
ds = create_ds(raw, time_steps)
print (ds.shape)
n_feats = raw.shape[1]
n_obs = time_steps * n_feats
n_rows = ds.shape[0]
train_size = int(n_rows * 0.8)
train_data = ds[:train_size, :]
train_data = shuffle(train_data)
test_data = ds[train_size:, :]
x_train = train_data[:, :n_obs]
y_train = train_data[:, n_obs:]
x_test = test_data[:, :n_obs]
y_test = test_data[:, n_obs:]
x_train = x_train.reshape(1, x_train.shape[0], x_train.shape[1])
y_train = y_train.reshape(1, y_train.shape[0], y_train.shape[1])
x_test = x_test.reshape(1, x_test.shape[0], x_test.shape[1])
print (x_train.shape)
print (y_train.shape)
print (x_test.shape)
print (y_test.shape)
model = Sequential()
model.add(LSTM(64, return_sequences=True, input_shape=(None, x_train.shape[2]), stateful=True, batch_size=1))
model.add(LSTM(32, return_sequences=True, stateful=True))
model.add(LSTM(n_feats, return_sequences=True, stateful=True))
model.compile(loss='mse', optimizer='rmsprop')
model.fit(x_train, y_train, epochs=10, batch_size=1, verbose=2)
y_predict = model.predict(x_test)
y_predict = y_predict.reshape(y_predict.shape[1], y_predict.shape[2])
y_predict = scaler.inverse_transform(y_predict)
y_test = scaler.inverse_transform(y_test)
y_test = y_test[:,0]
y_predict = y_predict[:,0]
print (y_test.shape)
print (y_predict.shape)
plt.plot(y_test, label='True')
plt.plot(y_predict, label='Predict')
plt.legend()
plt.show()
However, the prediction is extremely poor. How can I improve it? Do you have any ideas, for example by re-designing the architecture and/or the layers?

If you want to use the model in my code (the link you passed), you need to have the data correctly shaped: (1 sequence, total_time_steps, 5 features)
Important: I don't know if this is the best way or the best model to do this, but this model is predicting 7 time steps ahead of the input (time_shift=7)
Data and initial vars
fi = 'pollution.csv'
raw = pd.read_csv(fi, delimiter=',')
raw = raw.drop('Dates', axis=1)
print("raw shape:")
print (raw.shape)
#(1789,5) - 1789 time steps / 5 features
scaler = MinMaxScaler(feature_range=(-1, 1))
raw = scaler.fit_transform(raw)
time_shift = 7 #shift is the number of steps we are predicting ahead
n_rows = raw.shape[0] #n_rows is the number of time steps of our sequence
n_feats = raw.shape[1]
train_size = int(n_rows * 0.8)
#I couldn't understand how "ds" worked, so I simply removed it because in the code below it's not necessary
#getting the train part of the sequence
train_data = raw[:train_size, :] #first train_size steps, all 5 features
test_data = raw[train_size:, :] #I'll use the beginning of the data as state adjuster
#train_data = shuffle(train_data) !!!!!! we cannot shuffle time steps!!! we lose the sequence doing this
x_train = train_data[:-time_shift, :] #the entire train data, except the last shift steps
x_test = test_data[:-time_shift,:] #the entire test data, except the last shift steps
x_predict = raw[:-time_shift,:] #the entire raw data, except the last shift steps
y_train = train_data[time_shift:, :]
y_test = test_data[time_shift:,:]
y_predict_true = raw[time_shift:,:]
x_train = x_train.reshape(1, x_train.shape[0], x_train.shape[1]) #ok shape (1,steps,5) - 1 sequence, many steps, 5 features
y_train = y_train.reshape(1, y_train.shape[0], y_train.shape[1])
x_test = x_test.reshape(1, x_test.shape[0], x_test.shape[1])
y_test = y_test.reshape(1, y_test.shape[0], y_test.shape[1])
x_predict = x_predict.reshape(1, x_predict.shape[0], x_predict.shape[1])
y_predict_true = y_predict_true.reshape(1, y_predict_true.shape[0], y_predict_true.shape[1])
print("\nx_train:")
print (x_train.shape)
print("y_train")
print (y_train.shape)
print("x_test")
print (x_test.shape)
print("y_test")
print (y_test.shape)
Model
Your model wasn't very powerful for this task, so I tried a bigger one (this one, on the other hand, is too powerful).
model = Sequential()
model.add(LSTM(64, return_sequences=True, input_shape=(None, x_train.shape[2])))
model.add(LSTM(128, return_sequences=True))
model.add(LSTM(256, return_sequences=True))
model.add(LSTM(128, return_sequences=True))
model.add(LSTM(64, return_sequences=True))
model.add(LSTM(n_feats, return_sequences=True))
model.compile(loss='mse', optimizer='adam')
Fitting
Notice that I had to train 2000+ epochs for the model to have good results.
I added the validation data so we can compare the loss for train and test.
#notice that I'm predicting from the ENTIRE sequence, including x_train
#it's important for the model to adjust its states before predicting the end
model.fit(x_train, y_train, epochs=1000, batch_size=1, verbose=2, validation_data=(x_test,y_test))
Predicting
Important: when predicting the end of a sequence based on its beginning, it's important that the model sees the beginning so it can adjust its internal states, so I'm predicting on the entire data (x_predict), not only the test data.
y_predict_model = model.predict(x_predict)
print("\ny_predict_true:")
print (y_predict_true.shape)
print("y_predict_model: ")
print (y_predict_model.shape)
def plot(true, predicted, divider):
    predict_plot = scaler.inverse_transform(predicted[0])
    true_plot = scaler.inverse_transform(true[0])
    predict_plot = predict_plot[:,0]
    true_plot = true_plot[:,0]
    plt.figure(figsize=(16,6))
    plt.plot(true_plot, label='True', linewidth=5)
    plt.plot(predict_plot, label='Predict', color='y')
    if divider > 0:
        maxVal = max(true_plot.max(), predict_plot.max())
        minVal = min(true_plot.min(), predict_plot.min())
        plt.plot([divider, divider], [minVal, maxVal], label='train/test limit', color='k')
    plt.legend()
    plt.show()
test_size = n_rows - train_size
print("test length: " + str(test_size))
plot(y_predict_true,y_predict_model,train_size)
plot(y_predict_true[:,-2*test_size:],y_predict_model[:,-2*test_size:],test_size)
The first plot shows the entire data; the second shows the end portion of it in more detail.
Please notice that this model is overfitting: it can learn the training data, but it gets bad results on the test data.
To solve this you must experiment: try smaller models, use dropout layers, and apply other techniques to prevent overfitting.
Notice also that this data very probably contains A LOT of random factors, meaning the models will not be able to learn anything useful from it. As you make smaller models to avoid overfitting, you may also find that they give worse predictions for the training data.
Finding the perfect model is not an easy task; it's an open question and you must experiment. Maybe LSTM models simply aren't the solution, maybe your data is simply not predictable, etc. There isn't a definitive answer for this.
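For example, a minimal sketch (my assumption, not a tested model for this data) of how dropout could be added to a smaller stacked LSTM:
from keras.models import Sequential
from keras.layers import LSTM, Dropout
n_feats = 5  # 5 features, as in the pollution data
model = Sequential()
model.add(LSTM(64, return_sequences=True, input_shape=(None, n_feats),
               dropout=0.2, recurrent_dropout=0.2))  # dropout on inputs and on the recurrent state
model.add(LSTM(32, return_sequences=True))
model.add(Dropout(0.2))                              # dropout between layers
model.add(LSTM(n_feats, return_sequences=True))
model.compile(loss='mse', optimizer='adam')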
How to know the model is good
With the validation data in training, you can compare loss for train and test data.
Train on 1 samples, validate on 1 samples
Epoch 1/1000
9s - loss: 0.4040 - val_loss: 0.3348
Epoch 2/1000
4s - loss: 0.3332 - val_loss: 0.2651
Epoch 3/1000
4s - loss: 0.2656 - val_loss: 0.2035
Epoch 4/1000
4s - loss: 0.2061 - val_loss: 0.1696
Epoch 5/1000
4s - loss: 0.1761 - val_loss: 0.1601
Epoch 6/1000
4s - loss: 0.1697 - val_loss: 0.1476
Epoch 7/1000
4s - loss: 0.1536 - val_loss: 0.1287
Epoch 8/1000
.....
Both should go down together. When the test loss stops going down but the train loss continues to improve, your model is starting to overfit.
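If you don't want to watch the log by hand, here is a minimal sketch using Keras' EarlyStopping callback, assuming the model and arrays defined above (restore_best_weights requires a newer Keras):
from keras.callbacks import EarlyStopping
early_stop = EarlyStopping(monitor='val_loss', patience=20,
                           restore_best_weights=True)  # stop when val_loss stops improving
model.fit(x_train, y_train, epochs=2000, batch_size=1, verbose=2,
          validation_data=(x_test, y_test), callbacks=[early_stop])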
Trying another model
The best I could do (but I didn't really try much) was using this model:
model = Sequential()
model.add(LSTM(64, return_sequences=True, input_shape=(None, x_train.shape[2])))
model.add(LSTM(128, return_sequences=True))
model.add(LSTM(128, return_sequences=True))
model.add(LSTM(64, return_sequences=True))
model.add(LSTM(n_feats, return_sequences=True))
model.compile(loss='mse', optimizer='adam')
When the losses were about:
loss: 0.0389 - val_loss: 0.0437
After this point, the validation loss started going up (so training beyond this point is totally useless)
Result:
This shows that all this model could learn was the very general behaviour, such as zones with higher values.
But the high-frequency variation was either too random, or the model wasn't good enough for it...

You may consider changing your model:
import numpy as np, pandas as pd, matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import LSTM, Dense, TimeDistributed, Bidirectional
from sklearn.metrics import mean_squared_error, accuracy_score
from scipy.stats import linregress
from sklearn.utils import shuffle
fi = 'pollution.csv'
raw = pd.read_csv(fi, delimiter=',')
raw = raw.drop('Dates', axis=1)
print (raw.shape)
scaler = MinMaxScaler(feature_range=(-1, 1))
raw = scaler.fit_transform(raw)
time_steps = 7
def create_ds(data, t_steps):
    data = pd.DataFrame(data)
    data_s = data.copy()
    for i in range(t_steps):
        data = pd.concat([data, data_s.shift(-(i+1))], axis=1)
    data.dropna(axis=0, inplace=True)
    return data.values
ds = create_ds(raw, time_steps)
print (ds.shape)
n_feats = raw.shape[1]
n_obs = time_steps * n_feats
n_rows = ds.shape[0]
train_size = int(n_rows * 0.8)
train_data = ds[:train_size, :]
train_data = shuffle(train_data)
test_data = ds[train_size:, :]
x_train = train_data[:, :n_obs]
y_train = train_data[:, n_obs:]
x_test = test_data[:, :n_obs]
y_test = test_data[:, n_obs:]
print (x_train.shape)
print (x_test.shape)
print (y_train.shape)
print (y_test.shape)
x_train = x_train.reshape(x_train.shape[0], time_steps, n_feats)
x_test = x_test.reshape(x_test.shape[0], time_steps, n_feats)
print (x_train.shape)
print (x_test.shape)
print (y_train.shape)
print (y_test.shape)
model = Sequential()
model.add(LSTM(64, input_shape=(time_steps, n_feats), return_sequences=True))
model.add(LSTM(32, return_sequences=False))
model.add(Dense(n_feats))
model.compile(loss='mse', optimizer='rmsprop')
model.fit(x_train, y_train, epochs=10, batch_size=1, verbose=1, shuffle=False)
y_predict = model.predict(x_test)
print (y_predict.shape)
y_predict = scaler.inverse_transform(y_predict)
y_test = scaler.inverse_transform(y_test)
y_test = y_test[:,0]
y_predict = y_predict[:,0]
print (y_test.shape)
print (y_predict.shape)
plt.plot(y_test, label='True')
plt.plot(y_predict, label='Predict')
plt.legend()
plt.show()
But I am really not sure about the merits of your implementation:
* both x and y are 3D (1, steps, features) rather than x in 3D (samples, time_steps, features) and y in 2D (samples, features)
* input_shape=(None, x_train.shape[2])
* last layer - model.add(LSTM(n_feats, return_sequences=True, stateful=True))
Someone may provide a better answer.

Reading the original code, it seems the author first scales the dataset and then splits it up into Training and Testing subsets. This means that information about the Testing subset (e.g., volatility etc.) has "leaked" into the Training subset.
The recommended approach is to first perform the Training/Testing split, calculate the scaling parameters using only the Training subset, and then use these parameters to scale the Training and Testing subsets separately.
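A minimal sketch of that order, assuming raw still holds the unscaled values:
from sklearn.preprocessing import MinMaxScaler
train_size = int(raw.shape[0] * 0.8)
train_raw = raw[:train_size, :]
test_raw = raw[train_size:, :]
scaler = MinMaxScaler(feature_range=(-1, 1))
train_scaled = scaler.fit_transform(train_raw)   # scaling parameters estimated on train only
test_scaled = scaler.transform(test_raw)         # same parameters applied to test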

I'm not exactly sure what you could do; that data looks as if it has no discernible pattern. If I can't see one, I doubt an LSTM could find one. Your prediction does look like a good regression line, though.

I am at a similar point myself, creating a model that predicts data like this. I created a SMOTErnn solution to add as past data, and I have found that using TimeseriesGenerator with a higher batch_size and higher stride performs much better.
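For reference, a rough, self-contained sketch of what I mean by TimeseriesGenerator with a larger batch_size and stride (the dummy series, window length and tiny model are my assumptions):
import numpy as np
from keras.preprocessing.sequence import TimeseriesGenerator
from keras.models import Sequential
from keras.layers import LSTM, Dense
data = np.random.rand(1000, 5)                # placeholder series: 1000 steps, 5 features
targets = data[:, 0]                          # target = first feature, one step ahead of each window
length = 7                                    # look-back window
gen = TimeseriesGenerator(data, targets,
                          length=length,
                          stride=3,           # larger stride between windows
                          batch_size=64)      # larger batches
model = Sequential()
model.add(LSTM(32, input_shape=(length, 5)))
model.add(Dense(1))
model.compile(loss='mse', optimizer='adam')
model.fit_generator(gen, epochs=2)            # model.fit(gen, ...) in newer Keras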

Related

How to predict next X values in my model?

I have this program that is used to predict the outflow of a reservoir, and I cannot seem to get it to predict the next day's worth of data. The data is in 30-minute increments, so I would expect 48 data points to represent the next 24 hours. Is there something I am doing incorrectly or have missed?
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import datetime as dt
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout
# Prepare data
data = pd.read_csv('data.csv')
data["Date"] = pd.to_datetime(data["Date"])
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_data = scaler.fit_transform(data["Outlet"].values.reshape(-1, 1))
prediction_days = 30
x_train, y_train = [], []
for x in range(prediction_days, len(scaled_data)):
    x_train.append(scaled_data[x-prediction_days:x, 0])
    y_train.append(scaled_data[x, 0])
x_train, y_train = np.array(x_train), np.array(y_train)
x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))
# Create neural network
model = Sequential()
model.add(LSTM(units=50, return_sequences=True, input_shape=(x_train.shape[1], 1)))
model.add(Dropout(0.2))
model.add(LSTM(units=50, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(units=50))
model.add(Dropout(0.2))
model.add(Dense(units=1))
model.compile(optimizer='adam', loss='mean_squared_error')
model.fit(x_train, y_train, epochs=25)
# Test the model
test_start = dt.datetime(2022, 1, 1)
test_end = dt.datetime.now()
actual_outlet = data[(data['Date'] >= test_start) & (data['Date'] <= test_end)]['Outlet']
total_dataset = pd.concat((data['Outlet'], actual_outlet), axis=0)
model_inputs = total_dataset[len(total_dataset) - len(actual_outlet) - prediction_days:].values
model_inputs = model_inputs.reshape(-1, 1)
model_inputs = scaler.transform(model_inputs)
x_test = []
for x in range(prediction_days, len(model_inputs)):
    x_test.append(model_inputs[x-prediction_days:x, 0])
x_test = np.array(x_test)
x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 1))
prediction_outlet = model.predict(x_test)
prediction_outlet = scaler.inverse_transform(prediction_outlet)
print(len(prediction_outlet))
The length of my prediction_outlet comes to 1056. Have I missed some steps?
Your test data has 22 days of data, since you are taking the slice from test_start to today. Since you have 48 data points for each day, that adds up to 1056 data points to predict, so you are making the right number of predictions. On the other hand, since the data is not ordered, some of the dates are mixed up. In one instance you have half of the data points from Feb-02 and the rest from Jan-13, which will be very confusing for the network. You might want to order them and make sure you don't have missing dates in your slices.
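A small sketch of the ordering fix, reusing the variables from your code (data, test_start, test_end):
# sort by Date before slicing, so consecutive rows really are consecutive 30-minute steps
data = data.sort_values('Date').reset_index(drop=True)
actual_outlet = data[(data['Date'] >= test_start) & (data['Date'] <= test_end)]['Outlet']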

TensorFlow model for text classification doesn't predict as expected?

I am trying to train a model for sentiment analysis, and it shows an accuracy of 90% when splitting the data into training and testing! But whenever I test it on a new phrase, it gives pretty much the same result (usually in the range 0.86 - 0.95)!
Here is the code:
sentences = data['text'].values.astype('U')
y = data['label'].values
sentences_train, sentences_test, y_train, y_test = train_test_split(sentences, y, test_size=0.2, random_state=1000)
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(sentences_train)
X_train = tokenizer.texts_to_sequences(sentences_train)
X_test = tokenizer.texts_to_sequences(sentences_test)
vocab_size = len(tokenizer.word_index) + 1
maxlen = 100
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)
embedding_dim = 50
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim,input_length=maxlen))
model.add(layers.Flatten())
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
loss='binary_crossentropy',
metrics=['accuracy'])
model.summary()
history = model.fit(X_train, y_train,
epochs=5,
verbose=True,
validation_data=(X_test, y_test),
batch_size=10)
loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy: {:.4f}".format(accuracy))
The training data is a CSV file with 3 columns: (id, text, labels(0,1)), where 0 is positive and 1 is negative.
Training Accuracy: 0.9855
Testing Accuracy: 0.9013
Testing it on new sentences like 'This is just a text!' and 'hate preachers!' predicts nearly the same results: [0.85], [0.83].
It seems that you're a victim of overfitting. In other words, your model is overfitting to the training data. Although it's often possible to achieve high accuracy on the training set, as in your case, what we really want is to develop models that generalize well to testing data (or data they haven't seen before).
You can follow these steps in order to prevent overfitting.
Also, in order to improve performance on new data, I suggest you increase the number of neurons in the Dense layer and train for more epochs.
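As a hedged sketch of one variant to try (assuming vocab_size, embedding_dim and maxlen from your code above), with a larger Dense layer and Dropout:
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense, Dropout
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=maxlen))
model.add(Flatten())
model.add(Dense(64, activation='relu'))   # more units than the original 10
model.add(Dropout(0.5))                   # dropout to limit overfitting
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])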

Does this correctly predict the next value in Keras?

here is my code
...
look_back = 20
train_size = int(len(data) * 0.80)
test_size = len(data) - train_size
train = data[0:train_size]
test = data[train_size:len(data)]
x_train, y_train = create_dataset(train, look_back)
x_test, y_test = create_dataset(test, look_back)
x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))
x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 1))
y_train=np.repeat(y_train.reshape(-1,1), 20, axis=1).reshape(-1,20,1)
y_test=np.repeat(y_test.reshape(-1,1), 20, axis=1).reshape(-1,20,1)
...
model = Sequential()
model.add(LSTM(512, return_sequences=True))
model.add(Dropout(0.3))
model.add(LSTM(512, return_sequences=True))
model.add(Dropout(0.3))
model.add(LSTM(1, return_sequences=True))
model.compile(loss='mean_squared_error', optimizer='rmsprop', metrics=['accuracy'])
model.summary()
model.fit(x_train, y_train, epochs=10, batch_size=64)
p = model.predict(x_test)
and I want to predict the next value. So,
predictions = model.predict(x_train), whose shape is (62796, 20, 1),
and I followed the code from this question: how to use the Keras model to forecast for future dates or events?
future = []
currentStep = predictions[-20:, :, :] # -20 is last look_back number
for i in range(10):
    currentStep = model.predict(currentStep)
    future.append(currentStep)
In this code, future's result is (plot omitted),
but the [:4000] slice of p = model.predict(x_test) gives (plot omitted).
The difference between the two results is very large.
Is this the right way to predict the next value?
I don't know where the code went wrong.
I hope for your opinion.
full source is https://gist.github.com/Lay4U/654f70bd1fb9c4f7d5bdb21ddcb588ab
According to your code, you are trying to predict the next value using an LSTM. Here you have to reshape your input data correctly to reflect the time steps and features.
model.add(LSTM(512, return_sequences=True))
Instead of this code you have to write:
model.add(LSTM(512, input_shape=(look_back, x)))
where x is the number of input features in your training data.
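A minimal sketch of that input shape, assuming a univariate series so that x = 1 (the layer sizes are just an example):
from keras.models import Sequential
from keras.layers import LSTM, Dropout, Dense
look_back = 20
n_features = 1                      # "x" in the text above
model = Sequential()
model.add(LSTM(512, return_sequences=True, input_shape=(look_back, n_features)))
model.add(Dropout(0.3))
model.add(LSTM(512))                # last LSTM returns a single vector per window
model.add(Dense(1))                 # one predicted value per window
model.compile(loss='mean_squared_error', optimizer='rmsprop')
# x_train must then have shape (samples, look_back, n_features) and y_train shape (samples, 1)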
I guess this article will help you adapt your code and predict the future value:
enter link description here
This article will help you understand more about how to predict future values:
enter link description here
Thank you
There are multiple methods you can try. There is no one right way at the moment. You can train a separate model for predicting t+1, t+2 ... t+n: one LSTM model predicts t+1 while another predicts t+n. That is called a DIRMO strategy.
Your strategy (recursive strategy) is particularly risky because the model can propagate the error through multiple time horizons.
You can find a good comparison of alternative strategies in this paper.
https://www.sciencedirect.com/science/article/pii/S0957417412000528?via%3Dihub
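For illustration, a rough, self-contained sketch of the one-model-per-horizon idea on a dummy univariate series (make_xy, the window length and the tiny architecture are my own assumptions, not from your code):
import numpy as np
from keras.models import Sequential
from keras.layers import LSTM, Dense
series = np.random.rand(2000)        # placeholder for your univariate series
look_back = 20
horizons = range(1, 4)               # separate models for t+1, t+2, t+3
def make_xy(series, look_back, shift):
    # windows of look_back steps, target shifted `shift` steps past the window
    X, y = [], []
    for i in range(len(series) - look_back - shift + 1):
        X.append(series[i:i + look_back])
        y.append(series[i + look_back + shift - 1])
    return np.array(X)[..., None], np.array(y)
models = {}
for h in horizons:
    X, y = make_xy(series, look_back, h)
    m = Sequential([LSTM(32, input_shape=(look_back, 1)), Dense(1)])
    m.compile(loss='mse', optimizer='adam')
    m.fit(X, y, epochs=2, batch_size=64, verbose=0)
    models[h] = m
# each horizon is predicted directly by its own model (no recursive feedback of predictions)
last_window = series[-look_back:].reshape(1, look_back, 1)
forecast = [models[h].predict(last_window)[0, 0] for h in horizons]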

Question about setting up my input feature array for training an LSTM classifier to take past observations into account

I am trying to understand how to set up an LSTM with keras for a time series binary classification problem. I have set up a sample LSTM example, but it doesn't seem to be picking up information from prior observations. I think that my current approach is only using the feature data from the current observation.
Below is my standalone demo code.
My question is this: for an LSTM to pick up the pattern from previous observations, do I need to define a sliding window so that each observation actually includes the data from the previous observations that comprise the sliding-window period, or does Keras get those itself from the features array?
import random
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from sklearn.model_selection import train_test_split
from keras.layers.recurrent import LSTM
from sklearn.preprocessing import LabelEncoder
# this section just generates some sample data
# the pattern we are trying to pick up on is that
# shift_value number of observations prior to a True
# label, the features are always [.5, .5, .5]
shift_value = 5
n_examples = 10000
features = []
labels = []
random.seed(1)
# create the labels
for i in range(n_examples + shift_value):
    labels.append(random.choice([True, False]))
# create the features
for label in labels:
    if label:
        features.append([.5, .5, .5])
    else:
        feature_1 = random.random()
        feature_2 = random.random()
        feature_3 = random.random()
        features.append([feature_1, feature_2, feature_3])
df = pd.DataFrame(features)
df['label'] = labels
df.columns = ['A', 'B', 'C', 'label']
df['label'] = df['label'].shift(5)
df = df.dropna()
features_array = df[['A', 'B', 'C']].values
labels_array = df[['label']].values
# reshape the data
X_train, X_test, Y_train, Y_test = train_test_split(features_array, labels_array, test_size = .1, shuffle=False)
X_train_reshaped = np.reshape(X_train, (len(X_train), 1, X_train.shape[1]))
X_test_reshaped = np.reshape(X_test, (len(X_test), 1, X_train.shape[1]))
encoder = LabelEncoder()
Y_train_encoded = encoder.fit_transform(Y_train)
Y_test_encoded = encoder.fit_transform(Y_test)
# define and run the model
neurons = 10
batch_size = 100
model = Sequential()
model.add(LSTM(neurons,
batch_input_shape=(batch_size,
X_train_reshaped.shape[1],
X_train_reshaped.shape[2]
),
activation = 'sigmoid',
stateful = False)
)
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train_reshaped,
Y_train_encoded,
validation_data=(X_test_reshaped, Y_test_encoded),
epochs=10,
batch_size=batch_size)
The above example never converges, and I don't think it's taking prior observations into account at all. It should be able to find the basic pattern: 5 observations prior to a True label, the features are always [.5, .5, .5].
It is a sequence-to-sequence problem. Think of the learning problem as follows:
Given a sequence of length seq_length, if the input at time step t is [0.5, 0.5, 0.5], then the output at t+shift_value is 1; otherwise the output at t+shift_value is 0.
To model this learning problem, you will use an LSTM which unrolls seq_length times, and each time step takes an input of size 3. Each time step also has a corresponding output of size 1 (corresponding to True or False).
Code:
import random
import numpy as np
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.layers.recurrent import LSTM
shift_value = 5
seq_length = 50
def generate_data(n, shift_value, seq_length):
    X = np.random.rand(n, seq_length, 3)
    Y = np.random.randint(0, 2, size=(n, seq_length))
    for j in range(len(Y)):
        for i in range(shift_value, len(Y[j])):
            if Y[j][i] == 1:
                X[j][i-shift_value] = np.array([0.5, 0.5, 0.5])
    return X, Y.reshape(n, seq_length, 1)
# Generate Train and Test Data
X_train, Y_train = generate_data(9000,shift_value,seq_length)
X_test, Y_test = generate_data(100,shift_value,seq_length)
# Train the model
neurons = 32
batch_size = 100
model = Sequential()
model.add(LSTM(neurons,
batch_input_shape=(batch_size, seq_length, 3),
activation = 'relu',
stateful = False,
return_sequences = True))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train,
Y_train,
validation_data=(X_test, Y_test),
epochs=30,
batch_size=batch_size)
Output (Filtered):
...
Epoch 30/30
9000/9000 [=========] - loss: 0.1650 - acc: 0.9206 - val_loss: 0.1362 - val_acc: 0.9324
In 30 epochs it reached validation acc of 93%. Even though it is a deterministic function, the model will never be 100% accurate because of ambiguity within the first shift_value labels.

keras BLSTM for sequence labeling

I'm relatively new to neural nets, so please excuse my ignorance. I'm trying to adapt the keras BLSTM example here. The example reads in texts and classifies them as 0 or 1. I want a BLSTM that does something very much like POS tagging; extras like lemmatizing or other advanced features are not necessary, I just want a basic model. My data is a list of sentences, and each word is given a category 1-8. I want to train a BLSTM that can use this data to predict the category for each word in an unseen sentence.
e.g. input = ['The', 'dog', 'is', 'red'] gives output = [2, 4, 3, 7]
If the keras example is not the best route, I'm open to other suggestions.
I currently have this:
'''Train a Bidirectional LSTM.'''
from __future__ import print_function
import numpy as np
from keras.preprocessing import sequence
from keras.models import Model
from keras.layers import Dense, Dropout, Embedding, LSTM, Input, merge
from prep_nn import prep_scan
np.random.seed(1337) # for reproducibility
max_features = 20000
batch_size = 16
maxlen = 18
print('Loading data...')
(X_train, y_train), (X_test, y_test) = prep_scan(nb_words=max_features,
test_split=0.2)
print(len(X_train), 'train sequences')
print(len(X_test), 'test sequences')
print("Pad sequences (samples x time)")
# type issues here? float/int?
X_train = sequence.pad_sequences(X_train, value=0.)
X_test = sequence.pad_sequences(X_test, value=0.) # pad with zeros
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)
# need to pad y too, because more than 1 output value, not classification?
y_train = sequence.pad_sequences(np.array(y_train), value=0.)
y_test = sequence.pad_sequences(np.array(y_test), value=0.)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)
# this is the placeholder tensor for the input sequences
sequence = Input(shape=(maxlen,), dtype='int32')
# this embedding layer will transform the sequences of integers
# into vectors of size 128
embedded = Embedding(max_features, 128, input_length=maxlen)(sequence)
# apply forwards LSTM
forwards = LSTM(64)(embedded)
# apply backwards LSTM
backwards = LSTM(64, go_backwards=True)(embedded)
# concatenate the outputs of the 2 LSTMs
merged = merge([forwards, backwards], mode='concat', concat_axis=-1)
after_dp = Dropout(0.5)(merged)
# number after Dense has to correspond to the output matrix?
output = Dense(17, activation='sigmoid')(after_dp)
model = Model(input=sequence, output=output)
# try using different optimizers and different optimizer configs
model.compile('adam', 'categorical_crossentropy', metrics=['accuracy'])
print('Train...')
model.fit(X_train, y_train,
batch_size=batch_size,
nb_epoch=4,
validation_data=[X_test, y_test])
X_test_new = np.array([[0,0,0,0,0,0,0,0,0,12,3,55,4,34,5,45,3,9],[0,0,0,0,0,0,0,1,7,65,34,67,34,23,24,67,54,43,]])
classes = model.predict(X_test_new, batch_size=16)
print(classes)
My output has the right dimensions, but it is giving me floats between 0 and 1. I think this is because it's still looking for a binary classification. Does anyone know how to fix this?
SOLVED
Just make sure the labels are each binary arrays:
(X_train, y_train), (X_test, y_test), maxlen, word_ids, tags_ids = prep_model(
nb_words=nb_words, test_len=75)
W = (y_train > 0).astype('float')
print(len(X_train), 'train sequences')
print(int(len(X_train)*val_split), 'validation sequences')
print(len(X_test), 'heldout sequences')
# this is the placeholder tensor for the input sequences
sequence = Input(shape=(maxlen,), dtype='int32')
# this embedding layer will transform the sequences of integers
# into vectors of size 256
embedded = Embedding(nb_words, output_dim=hidden,
input_length=maxlen, mask_zero=True)(sequence)
# apply forwards LSTM
forwards = LSTM(output_dim=hidden, return_sequences=True)(embedded)
# apply backwards LSTM
backwards = LSTM(output_dim=hidden, return_sequences=True,
go_backwards=True)(embedded)
# concatenate the outputs of the 2 LSTMs
merged = merge([forwards, backwards], mode='concat', concat_axis=-1)
after_dp = Dropout(0.15)(merged)
# TimeDistributed for sequence
# change activation to sigmoid?
output = TimeDistributed(
Dense(output_dim=nb_classes,
activation='softmax'))(after_dp)
model = Model(input=sequence, output=output)
# try using different optimizers and different optimizer configs
# loss=binary_crossentropy, optimizer=rmsprop
model.compile(loss='categorical_crossentropy',
metrics=['accuracy'], optimizer='adam',
sample_weight_mode='temporal')
print('Train...')
model.fit(X_train, y_train,
batch_size=batch_size,
nb_epoch=epochs,
shuffle=True,
validation_split=val_split,
sample_weight=W)
Solved. The main issue was reshaping the data for the classification categories as binary arrays. Also used TimeDistributed and set return_sequences to True.
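For anyone reading later, a small sketch of that label reshaping, assuming integer tags 1-8 plus a padding value of 0 as in the question (the dummy shapes are just for illustration):
import numpy as np
from keras.utils import to_categorical
y_int = np.random.randint(0, 9, size=(4, 18))     # (samples, maxlen) integer tags, 0 = padding
y_onehot = to_categorical(y_int, num_classes=9)   # -> (4, 18, 9): one binary array per time step
print(y_onehot.shape)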
I know that this thread is very old, but I hope I can help.
I modified the model into a binary model:
sequence = Input(shape=(X_train.shape[1],), dtype='int32')
embedded = Embedding(max_fatures,embed_dim,input_length=X_train.shape[1], mask_zero=True)(sequence)
# apply forwards LSTM
forwards = LSTM(output_dim=hidden, return_sequences=True)(embedded)
# apply backwards LSTM
backwards = LSTM(output_dim=hidden, return_sequences=True,go_backwards=True)(embedded)
# concatenate the outputs of the 2 LSTMs
merged = concatenate([forwards, backwards])
after_dp = Dropout(0.15)(merged)
# now add an LSTM layer without return_sequences
lstm_normal = LSTM(hidden)(merged)
# TimeDistributed for sequence
# change activation to sigmoid?
#output = TimeDistributed(Dense(output_dim=2,activation='sigmoid'))(after_dp)
# I changed the TimeDistributed output layer to a Dense layer because of the dimensionality problem, with output_dim=1 (binary output)
output = Dense(output_dim=1,activation='sigmoid')(lstm_normal)
model = Model(input=sequence, output=output)
# try using different optimizers and different optimizer configs
# loss=binary_crossentropy, optimizer=rmsprop
# I changed the model compile to binary cross-entropy and removed the sample_weight_mode parameter
model.compile(loss='binary_crossentropy',
metrics=['accuracy'], optimizer='adam',
)
print(model.summary())
###################################
# this is the training call
model.fit(X_train, Y_train,
batch_size=128,
epochs=10,
shuffle=True,
validation_split=0.2,
#sample_weight=W
)
# At this point it works fine...
Train on 536000 samples, validate on 134000 samples
Epoch 1/10
536000/536000 [==============================] - 1814s 3ms/step - loss: 0.4794 - acc: 0.7679 - val_loss: 0.4624 - val_acc: 0.7784
Epoch 2/10
536000/536000 [==============================] - 1829s 3ms/step - loss: 0.4502 - acc: 0.7857 - val_loss: 0.4551 - val_acc: 0.7837
Epoch 3/10
99584/536000 [====>.........................] - ETA: 23:10 - loss: 0.4291 - acc: 0.7980
