# input_shape = (137861, 21, 1)
# output_sequence_length = 21
# english_vocab_size = 199
# french_vocab_size = 344
def embed_model(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
    '''
    Assemble and compile a GRU sequence model that embeds its integer input.

    :param input_shape: Tuple of input shape; everything after the first
        axis is forwarded unchanged to the Embedding layer as ``input_shape``
    :param output_sequence_length: Length of output sequence, also passed to
        the Embedding layer as ``input_length``
    :param english_vocab_size: Number of unique English words in the dataset
    :param french_vocab_size: Number of unique French words in the dataset
    :return: Compiled Keras model that has not been fitted
    '''
    learning_rate = 1e-3
    per_sample_shape = input_shape[1:]

    # Layers are created in the order in which they are stacked.
    embedding = Embedding(
        english_vocab_size,
        128,
        input_length=output_sequence_length,
        input_shape=per_sample_shape,
    )
    recurrent = GRU(units=128, return_sequences=True)
    per_step_scores = TimeDistributed(Dense(french_vocab_size))
    to_probabilities = Activation('softmax')

    model = Sequential()
    for layer in (embedding, recurrent, per_step_scores, to_probabilities):
        model.add(layer)
    model.summary()

    model.compile(
        loss=sparse_categorical_crossentropy,
        optimizer=Adam(learning_rate),
        metrics=['accuracy'],
    )
    return model
When invoking this method to train a model, it gets the error:
ValueError: Input 0 is incompatible with layer gru_1: expected ndim=3, found ndim=4
How to fix the shape error between Embedding Layer and GRU Layer?
The problem is that the Embedding layer takes a 2D array as the input. However, the shape of the input array is (137861, 21, 1) which makes it a 3D array. Simply remove the last axis using squeeze() method from numpy:
data = np.squeeze(data, axis=-1)
As a side note, there is no need to use the TimeDistributed layer here, since the Dense layer is applied on the last axis by default.
Related
I am trying to fine-tune a pre-trained time-series model that takes 12-dimensional physiological signals as input; however, my dataset has only one dimension. Therefore, I built the first Conv1D layer myself and set its weights from the pre-trained model. After that, I added the remaining parts of the pre-trained model. However, an error pops up saying that the dimensions do not match.
Here are the details of my code and error:
# Load the whole pre-trained model and read the first weight array of layer 1.
BaseModel = keras.models.load_model('model.hdf5')
input_w = BaseModel.layers[1].get_weights()[0]
# Keep index 1 of the middle axis and re-insert that axis with length one.
input_w_1_lead = np.expand_dims(input_w[:, 1, :], axis=1)

# Create the input and the first Conv1D layer for the one-channel signal.
model = tf.keras.Sequential(
    [
        keras.Input(shape=(4096, 1), dtype=np.float32, name='signal'),
        keras.layers.Conv1D(
            64,
            16,
            padding='same',
            use_bias=False,
            name='conv1d_1',
        ),
    ]
)

# Seed the new Conv1D layer with the sliced pre-trained weights.
model.layers[0].set_weights([input_w_1_lead])

# Append the remaining layers of the pre-trained model one at a time.
for layer in BaseModel.layers[2:]:
    model.add(layer)
#And here is the error:
ValueError: Exception encountered when calling layer "add_1" (type Add).
A merge layer should be called on a list of inputs. Received: inputs=Tensor("Placeholder:0", shape=(None, 256, 128), dtype=float32) (not a list of tensors)
Call arguments received by layer "add_1" (type Add):
• inputs=tf.Tensor(shape=(None, 256, 128), dtype=float32)
Then, I looked into the details of my code and found out that I could only add layers until the 11th one.
# Stacking works for the first 11 layers; widening the slice to
# BaseModel.layers[:12] does not work.
n_stackable_layers = 11
test_model = tf.keras.Sequential()
for layer in BaseModel.layers[:n_stackable_layers]:
    test_model.add(layer)
With model.summary(), dimension information seems missing after the pooling layer. Here are the outputs for both BaseModel.summary() and test_model.summary():
BaseModel
test_model
However, I couldn't find the solution.
Dear All,
I figured out an alternative solution for my task. I modified the model's input and output layers in the source code, and then set the weights from the source model.
Here is the solution code:
'Load Pre-Trained Model to Get Weight'
BaseModel = keras.models.load_model('model.hdf5')
input_w = BaseModel.layers[1].get_weights()[0]
# Keep index 1 of the middle axis, stored again as an axis of length one.
input_w_1_lead = input_w[:,1,:].reshape(input_w.shape[0], 1, input_w.shape[2])
BaseModel.summary()

'Build the Fine-Tuned Model'
# get_model is defined outside this snippet; presumably it rebuilds the
# architecture for the given number of input leads -- TODO confirm.
model = get_model(1)
model.layers[1].set_weights([input_w_1_lead])

'Load Weight from Source Model'
# Layer indices 0, 1 and the last index are deliberately left out of the copy.
for i in range(2, len(BaseModel.layers) - 1):
    model.layers[i].set_weights(BaseModel.layers[i].get_weights())

'Check the Weight is Equal to Source Code'
# get_weights() returns a list of arrays whose shapes can differ within one
# layer, so the lists are compared array by array instead of being handed to
# np.allclose as a whole.
for i in range(2, len(BaseModel.layers) - 1):
    source_weights = BaseModel.layers[i].get_weights()
    copied_weights = model.layers[i].get_weights()
    weights_match = len(source_weights) == len(copied_weights) and all(
        src.shape == dst.shape and np.allclose(src, dst)
        for src, dst in zip(source_weights, copied_weights)
    )
    if not weights_match:
        print('this way is no correct')
Ta-da! It works, and nothing is printed in the console.
I have a keras model that is trained on a sequence of data with a single label. I'm assuming a categorically encoded feature which passes through an embedding layer before a GRU layer.
# Random integer codes in [1, 50) as float32, plus one binary label per sample.
samples, timesteps, features = 2000, 10, 1
inputs_1 = np.random.randint(
    1, 50, [samples, timesteps, features]
).astype(np.float32)
labels = np.random.randint(0, 2, [samples, 1])

# Input
input_ = Input(shape=(None,))

# Embeddings
embedding_layer = Embedding(
    input_dim=int(50),
    output_dim=20,
    input_length=(None,),
    mask_zero=False,
    name="cat_feat_0" + "_emb",
)
emb = embedding_layer(input_)

# Recurrent layer; return_sequences=False yields one vector per sample.
recurrent_layer = GRU(
    32,
    activation="tanh",
    dropout=0,
    recurrent_dropout=0,
    go_backwards=False,
    return_sequences=False,
    name="gru_cat",
)
gru = recurrent_layer(emb)

# Classification head ending in a single sigmoid unit.
y = Dense(10, activation="tanh")(gru)
y = Dropout(0.4)(y)
y = Dense(1, activation="sigmoid")(y)

model = Model(inputs=input_, outputs=y)
model.compile(
    loss=BCE_Last_Event,
    optimizer=Adam(beta_1=0.9, beta_2=0.999),
    metrics=["accuracy"],
)
model.predict(inputs_1).shape
When I predict my data, the output shape is (2000,1) given that it predicts a single label for the sequence. Would it be possible to output the scores for every event in the sequence such that the model returns predictions of shape (2000, 10, 1)?
I know I can return the sequence in the GRU layer which will be propagated. However, I still only have a single label so the loss function would be erroneous. My current thinking is either:
Create a new model which returns the sequences using the same weights as the trained model
Wrap the model in a TimeDistributed layer such that it predicts every event in the sequence.
I am concerned that the second solution will be error-prone as it will only take as input a single event throughout the entire length of the sequence, rather than the entire sequence for its prediction. Is this thinking correct?
What are the best solutions?
I have a data set with the shape (3340, 6). I want to use a CNN-LSTM to read a sequence of 30 rows and predict the next row's (6) elements. From what I have read, this is considered a multi-parallel time series. I have been primarily following this machine learning mastery tutorial and am having trouble implementing the CNN-LSTM architecture for a multi-parallel time series.
I have used this function to split the data into 30 day time step frames
# split a multivariate sequence into samples
def split_sequences(sequences, n_steps):
    """Cut a 2-D sequence into sliding input windows and next-row targets.

    Window ``i`` is ``sequences[i:i + n_steps, :]`` and its target is the row
    immediately after it, ``sequences[i + n_steps, :]``.

    :param sequences: Array supporting ``len()`` and 2-D indexing of the form
        ``sequences[a:b, :]`` (rows are time steps)
    :param n_steps: Number of consecutive rows in each input window
    :return: Tuple ``(X, y)`` of arrays holding the windows and their targets;
        both are empty when ``sequences`` has no more than ``n_steps`` rows
    :raises ValueError: If ``n_steps`` is smaller than 1
    """
    if n_steps < 1:
        raise ValueError("n_steps must be a positive integer")

    # A window is usable only if the row right after it still exists.
    n_samples = max(len(sequences) - n_steps, 0)
    X = [sequences[start:start + n_steps, :] for start in range(n_samples)]
    y = [sequences[start + n_steps, :] for start in range(n_samples)]
    return array(X), array(y)
Here is a sample of the data frames produced by the function above.
# 30 Time Step Input Frame X[0], X.shape = (3310, 30, 6)
[4.951e-02, 8.585e-02, 5.941e-02, 8.584e-02, 8.584e-02, 5.000e+00],
[8.584e-02, 9.307e-02, 7.723e-02, 8.080e-02, 8.080e-02, 4.900e+01],
[8.080e-02, 8.181e-02, 7.426e-02, 7.474e-02, 7.474e-02, 2.000e+01],
[7.474e-02, 7.921e-02, 6.634e-02, 7.921e-02, 7.921e-02, 4.200e+01],
...
# 1 Time Step Output Array y[0], y.shape = (3310, 6)
[6.550e-02, 7.690e-02, 6.243e-02, 7.000e-02, 7.000e-02, 9.150e+02]
Here is the following model that I am using:
# CNN-LSTM stack; the first layer declares a per-sample input of
# (None, 30, 6), so this model expects 4-D batches.
model = Sequential(
    [
        TimeDistributed(
            Conv1D(64, 1, activation='relu'),
            input_shape=(None, 30, 6),
        ),
        TimeDistributed(MaxPooling1D(pool_size=2)),
        TimeDistributed(Flatten()),
        LSTM(50, activation='relu', return_sequences=True),
        Dense(6),
    ]
)
model.compile(optimizer='adam', loss='mse')
When I run model.fit, I receive the following error:
ValueError: Error when checking input: expected time_distributed_59_input to have
4 dimensions, but got array with shape (3310, 30, 6)
I am at a loss at how to properly shape my input layer so that I can get this model learning. I have done several Conv2D nets in the past but this is my first time series model so I apologize if there's an obvious answer here that I am missing.
Remove TimeDistributed from Conv1D and MaxPooling1D; 3D inputs are supported
Remove Flatten(), as it destroys timesteps-channels relationships
Add TimeDistributed to the last Dense layer, as Dense does not support 3D inputs (returned by LSTM(return_sequences=True)); alternatively, use return_sequences=False
The problem is the following. I have a categorical prediction task of vocabulary size 25K. On one of them (input vocab 10K, output dim i.e. embedding 50), I want to introduce a trainable weight matrix for a matrix multiplication between the input embedding (shape 1,50) and the weights (shape(50,128)) (no bias) and the resulting vector score is an input for a prediction task along with other features.
The crux is, I think that the trainable weight matrix varies for each input, if I simply add it in. I want this weight matrix to be common across all inputs.
I should clarify - by input here I mean training examples. So all examples would learn some example specific embedding and be multiplied by a shared weight matrix.
After every so many epochs, I intend to do a batch update to learn these common weights (or use other target variables to do multiple output prediction)
LSTM? Is that something I should look into here?
With the exception of an Embedding layer, layers apply to all examples in the batch.
Take as an example a very simple network:
# Functional model: 4 input features -> 2 ReLU units without bias -> 1 output.
inp = Input(shape=(4,))
hidden_layer = Dense(2, activation='relu', use_bias=False)
h1 = hidden_layer(inp)
output_layer = Dense(1)
out = output_layer(h1)
model = Model(inputs=inp, outputs=out)
This is a simple network with 1 input layer, 1 hidden layer and an output layer. Take the hidden layer as an example: this layer has a weight matrix of shape (4, 2). At each iteration the input data, which is a matrix of shape (batch_size, 4), is multiplied by the hidden layer weights (feed-forward phase). Thus the h1 activation is dependent on all samples. The loss is also computed on a per-batch basis. The output layer has a shape of (batch_size, 1). Given that in the forward phase all the batch samples affected the values of the weights, the same is true for backprop and gradient updates.
When one is dealing with text, often the problem is specified as predicting a specific label from a sequence of words. This is modelled as a shape of (batch_size, sequence_length, word_index). Lets take a very basic example:
from tensorflow import keras
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model
sequence_length = 80
emb_vec_size = 100
vocab_size = 10_000
def make_model():
    """Build and compile a small text model that predicts one value per sequence.

    Reads the module-level ``sequence_length``, ``emb_vec_size`` and
    ``vocab_size`` settings.

    :return: Keras model compiled with the 'adam' optimizer and 'mse' loss
    """
    token_ids = Input(shape=(sequence_length, 1))

    # Look up an embedding vector for every word index.
    embedded = Embedding(vocab_size, emb_vec_size)(token_ids)
    # Reshape each sample to (sequence_length, emb_vec_size).
    embedded = Reshape((sequence_length, emb_vec_size))(embedded)

    # h1 maps every embedding vector in the sequence to 64 units.
    h1 = Dense(64)(embedded)
    sentence_vector = LSTM(32)(h1)
    prediction = Dense(1)(sentence_vector)

    model = Model(token_ids, prediction)
    model.compile('adam', 'mse')
    return model
model = make_model()
model.summary()
You can copy and paste this into colab and see the summary.
What this example is doing is:
Transform a sequence of word indices into a sequence of word embedding vectors.
Applying a Dense layer called h1 to all the batches (and all the elements in the sequence); this layer reduces the dimensions of the embedding vector. It is not a typical element of a network to process text (in isolation). But this seemed to match your question.
Using a recurrent layer to reduce the sequence into a single vector per example.
Predicting a single label from the "sentence" vector.
If I get the problem correctly you can reuse layers or even models inside another model.
Example with a Dense layer. Let's say you have 10 Inputs
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
# Ten separate inputs that all use the per-sample shape (X,)
inputs = []
for k in range(10):
    inputs.append(Input(shape=(X,), name='input_{}'.format(k)))

# One Dense instance that is called on every input
D = Dense(64, name='one_layer_to_rule_them_all')
nets = list(map(D, inputs))

model = Model(inputs=inputs, outputs=nets)
model.compile(optimizer='adam', loss='categorical_crossentropy')
This code is not going to work if the inputs have different shapes. The first call to D defines its properties. In this example, outputs are set directly to nets. But of course you can concatenate, stack, or whatever you want.
Now if you have some trainable model you can use it instead of the D:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
# Ten separate inputs that all use the per-sample shape (X,)
inputs = [
    Input(shape=(X,), name='input_{}'.format(k))
    for k in range(10)
]

# The same special_model object is called on every input
nets = []
for inp in inputs:
    nets.append(special_model(inp))

model = Model(inputs=inputs, outputs=nets)
model.compile(optimizer='adam', loss='categorical_crossentropy')
The weights of this model are shared among all inputs.
I have created a stacked keras decoder model using the following loop:
# Create the encoder
# Define an input sequence.
encoder_inputs = keras.layers.Input(shape=(None, num_input_features))

# One GRU cell per configured hidden size; the RNN layer below combines the
# list of cells into a single layer.
encoder_cells = [
    keras.layers.GRUCell(
        hidden_neurons,
        kernel_regularizer=regulariser,
        recurrent_regularizer=regulariser,
        bias_regularizer=regulariser,
    )
    for hidden_neurons in hparams['encoder_hidden_layers']
]
encoder = keras.layers.RNN(encoder_cells, return_state=True)
encoder_outputs_and_states = encoder(encoder_inputs)

# Drop the encoder output (element 0) and keep only the states. The output is
# of no interest here; the encoder's job is to create a state describing the
# input sequence.
encoder_states = encoder_outputs_and_states[1:]
print(encoder_states)

# When the last encoder size differs from the first decoder size, pass the
# last encoder state through a Dense layer of the first decoder size.
last_encoder_size = hparams['encoder_hidden_layers'][-1]
first_decoder_size = hparams['decoder_hidden_layers'][0]
if last_encoder_size != first_decoder_size:
    encoder_states = Dense(first_decoder_size)(encoder_states[-1])

# Create the decoder, the decoder input will be set to zero
decoder_inputs = keras.layers.Input(shape=(None, 1))
decoder_cells = [
    keras.layers.GRUCell(
        hidden_neurons,
        kernel_regularizer=regulariser,
        recurrent_regularizer=regulariser,
        bias_regularizer=regulariser,
    )
    for hidden_neurons in hparams['decoder_hidden_layers']
]
decoder = keras.layers.RNN(
    decoder_cells,
    return_sequences=True,
    return_state=True,
)

# Set the initial state of the decoder to be the output state of the encoder.
# This is the fundamental part of the encoder-decoder.
decoder_outputs_and_states = decoder(
    decoder_inputs,
    initial_state=encoder_states,
)

# Only select the output of the decoder (not the states)
decoder_outputs = decoder_outputs_and_states[0]

# Apply a dense layer with linear activation to set the output to the correct
# dimension and scale (tanh is the default activation for GRU in Keras).
decoder_dense = keras.layers.Dense(
    num_output_features,
    activation='linear',
    kernel_regularizer=regulariser,
    bias_regularizer=regulariser,
)
decoder_outputs = decoder_dense(decoder_outputs)

model = keras.models.Model(
    inputs=[encoder_inputs, decoder_inputs],
    outputs=decoder_outputs,
)
model.compile(optimizer=optimiser, loss=loss)
model.summary()
This setup works when I have a single layer encoder and a single layer decoder where the number of neurons is the same. However it does not work when the number of layers of the decoder is more than one.
I get the following error message:
ValueError: An `initial_state` was passed that is not compatible with `cell.state_size`. Received `state_spec`=[InputSpec(shape=(None, 48), ndim=2)]; however `cell.state_size` is (48, 58)
My decoder_layers list contains the entries [48, 58]. Therefore the RNN layer that the decoder is comprised of is a stacked GRU where the first GRU contains 48 neurons and the second contains 58. I would like to set the initial state of the first GRU. I run the states through a Dense layer so that the shape is compatible with the first layer of the decoder. The error message indicates that I am trying to set the initial state of both the first layer and the second layer when I pass the initial_state keyword to the decoder RNN layer. Is this correct behaviour? Normally I would set the initial state of the first decoder layer (not built using a cell structure like this), which would then just feed its outputs into subsequent layers. Is there a way to achieve such behaviour in Keras by default when creating a keras.layers.RNN from a list of GRUCells or LSTMCells?
In my own experiments, your initial_state should have batch_size as its first dimension. In other words, each element in a batch may have a different initial state. From your code, I think you missed this dimension.