I have trained an RNN model with pytorch. I need to use the model for prediction in an environment where I'm unable to install pytorch because of some strange dependency issue with glibc. However, I can install numpy and scipy and other libraries. So, I want to use the trained model, with the network definition, without pytorch.
I have the weights of the model as I save the model with its state dict and weights in the standard way, but I can also save it using just json/pickle files or similar.
I also have the network definition, which depends on pytorch in a number of ways. This is my RNN network definition.
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import random
# Seed both PyTorch's and Python's random number generators with the same
# fixed value so that repeated runs draw the same random numbers.
torch.manual_seed(1)
random.seed(1)
# All tensors in this script are placed on the CPU.
device = torch.device('cpu')
class RNN(nn.Module):
    """LSTM stack followed by a linear read-out layer.

    With ``matching_in_out`` set, the whole sequence is pushed through the
    LSTM in one call and one sigmoid-squashed output row is produced per
    input row.  Otherwise the rows are fed one at a time, carrying
    ``self.hidden`` from step to step, and only the read-out of the final
    step is returned.
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers, matching_in_out=False, batch_size=1):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.num_layers = num_layers
        self.batch_size = batch_size
        # True when the output has one row per input row.
        self.matching_in_out = matching_in_out
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers)
        self.hidden2out = nn.Linear(hidden_size, output_size)
        self.hidden = self.init_hidden()

    def forward(self, feature_list):
        """Run the network over ``feature_list`` (converted to a tensor first)."""
        feature_list = torch.tensor(feature_list)
        if not self.matching_in_out:
            # Step-by-step mode: the recurrent state persists on the instance.
            for row in feature_list:
                step_input = row.view([1, 1, self.input_size])
                lstm_out, self.hidden = self.lstm(step_input, self.hidden)
                outs = self.hidden2out(lstm_out)
            return outs
        seq_len = len(feature_list)
        lstm_out, _ = self.lstm(feature_list.view(seq_len, 1, -1))
        output_space = self.hidden2out(lstm_out.view(seq_len, -1))
        # NOTE: it is still open whether this sigmoid is actually needed.
        return torch.sigmoid(output_space)

    def init_hidden(self):
        """Return a pair of uniformly random state tensors on ``device``."""
        state_shape = (self.num_layers, self.batch_size, self.hidden_size)
        first = torch.rand(*state_shape).to(device)
        second = torch.rand(*state_shape).to(device)
        return (first, second)
I am aware of this question, but I'm willing to go as low level as possible. I can work with numpy array instead of tensors, and reshape instead of view, and I don't need a device setting.
Based on the class definition above, what I can see here is that I only need the following components from torch to get an output from the forward function:
nn.LSTM
nn.Linear
torch.sigmoid
I think I can easily implement the sigmoid function using numpy. However, can I have some implementation for the nn.LSTM and nn.Linear using something not involving pytorch? Also, how will I use the weights from the state dict into the new class?
So, the question is, how can I "translate" this RNN definition into a class that doesn't need pytorch, and how to use the state dict weights for it?
Alternatively, is there a "light" version of pytorch, that I can use just to run the model and yield a result?
EDIT
I think it might be useful to include the numpy/scipy equivalent for both nn.LSTM and nn.linear. It would help us compare the numpy output to torch output for the same code, and give us some modular code/functions to use. Specifically, a numpy equivalent for the following would be great:
# LSTM built with arguments (10, 20, 2): input size 10, hidden size 20, 2 layers.
rnn = nn.LSTM(10, 20, 2)
# presumably laid out as (seq_len=5, batch=3, features=10) — confirm against the nn.LSTM docs
input = torch.randn(5, 3, 10)
# Random initial states passed alongside the input; both have shape (2, 3, 20).
h0 = torch.randn(2, 3, 20)
c0 = torch.randn(2, 3, 20)
# The call returns the output plus the updated (hn, cn) state pair.
output, (hn, cn) = rnn(input, (h0, c0))
and also for linear:
# Linear layer constructed with (20, 30): 20 input features, 30 output features.
m = nn.Linear(20, 30)
# A random batch of 128 rows with 20 features each.
input = torch.randn(128, 20)
output = m(input)
You should try to export the model using torch.onnx. The page gives you an example that you can start with.
An alternative is to use TorchScript, but that requires torch libraries.
Both of these can be run without python. You can load torchscript in a C++ application https://pytorch.org/tutorials/advanced/cpp_export.html
ONNX is much more portable and you can use in languages such as C#, Java, or Javascript
https://onnxruntime.ai/ (even on the browser)
A running example
I only modified your example slightly, to get past the errors I found.
Note that when the model is exported via tracing, any if/elif/else, for, or while is unrolled: only the path taken for the example inputs is recorded.
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import random
# Fix the seeds of PyTorch's and Python's random number generators so the
# example below is reproducible.
torch.manual_seed(1)
random.seed(1)
# Everything in this example runs on the CPU.
device = torch.device('cpu')
class RNN(nn.Module):
    """LSTM stack plus linear read-out whose recurrent state is passed in explicitly.

    Unlike a module that stores its state on the instance, ``forward`` takes
    the initial state as arguments and hands the final state back.
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers, matching_in_out=False, batch_size=1):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.num_layers = num_layers
        self.batch_size = batch_size
        # True when the output has one row per input row (not used by forward).
        self.matching_in_out = matching_in_out
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers)
        self.hidden2out = nn.Linear(hidden_size, output_size)

    def forward(self, x, h0, c0):
        """Return ``(outs, (hidden_a, hidden_b))`` for input ``x`` and state ``(h0, c0)``."""
        initial_state = (h0, c0)
        lstm_out, final_state = self.lstm(x, initial_state)
        hidden_a, hidden_b = final_state
        return self.hidden2out(lstm_out), (hidden_a, hidden_b)

    def init_hidden(self):
        """Return a pair of detached, uniformly random state tensors on ``device``."""
        state_shape = (self.num_layers, self.batch_size, self.hidden_size)
        first = torch.rand(*state_shape).to(device).detach()
        second = torch.rand(*state_shape).to(device).detach()
        return (first, second)
# convert the arguments passed during onnx.export call
class MWrapper(nn.Module):
    """Adapter that lets a model be called with a single dict of keyword arguments."""

    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, kwargs):
        """Unpack ``kwargs`` and forward it to the wrapped model as keyword arguments."""
        wrapped = self.model
        return wrapped(**kwargs)
Run an example
# Build the model with input, hidden and output size 10 and 3 LSTM layers.
rnn = RNN(10, 10, 10, 3)
# Random input of shape (3, 1, 10).
X = torch.randn(3,1,10)
# Random initial state pair produced by the model itself.
h0,c0 = rnn.init_hidden()
# Print only the first element of the returned tuple (the read-out output).
print(rnn(X, h0, c0)[0])
Use the same input to trace the model and export an onnx file
# Export the wrapped model to 'rnn.onnx', using X, h0 and c0 as example inputs.
# Axis 1 of each input is declared dynamic under the symbolic name 'N'; the
# graph inputs are named x/h0/c0 and the outputs y/hn/cn.
torch.onnx.export(MWrapper(rnn), {'x':X,'h0':h0,'c0':c0}, 'rnn.onnx',
                  dynamic_axes={'x':{1:'N'},
                                'c0':{1: 'N'},
                                'h0':{1: 'N'}
                                },
                  input_names=['x', 'h0', 'c0'],
                  output_names=['y', 'hn', 'cn']
                  )
Notice that you can use symbolic values for the dimensions of some axes of some inputs. Unspecified dimensions will be fixed with the values from the traced inputs. By default LSTM uses dimension 1 as batch.
Next we load the ONNX model and pass the same inputs
import onnxruntime
# Load the exported file into an ONNX Runtime session.
ort_model = onnxruntime.InferenceSession('rnn.onnx')
# Request only output 'y', feeding the same tensors converted to numpy arrays.
print(ort_model.run(['y'], {'x':X.numpy(), 'c0':c0.numpy(), 'h0':h0.numpy()}))
Basically, implementing the network in numpy and copying the weights from your pytorch model does the trick. For your use case you only need a forward pass, so that is all we have to implement.
#Set Parameters for a small LSTM network
input_size = 2 # size of one 'event', or sample, in our batch of data
hidden_dim = 3 # 3 cells in the LSTM layer
output_size = 1 # desired model output
num_layers=3
# Build the PyTorch model whose weights will be copied into the numpy version.
torch_lstm = RNN( input_size,
                  hidden_dim ,
                  output_size,
                  num_layers,
                  matching_in_out=True
                  )
state = torch_lstm.state_dict() # state will capture the weights of your model
Now for LSTM in numpy these functions will be used:
I got the code below from this link: https://towardsdatascience.com/the-lstm-reference-card-6163ca98ae87
### NOT MY CODE
import numpy as np
from scipy.special import expit as sigmoid
def forget_gate(x, h, Weights_hf, Bias_hf, Weights_xf, Bias_xf, prev_cell_state):
    """Scale the previous cell state by the forget-gate activation.

    The activation is the sigmoid of the summed affine transforms of the
    hidden state ``h`` and the current event ``x``.
    """
    hidden_term = np.dot(Weights_hf, h) + Bias_hf
    event_term = np.dot(Weights_xf, x) + Bias_xf
    keep_fraction = sigmoid(hidden_term + event_term)
    return np.multiply(keep_fraction, prev_cell_state)
def input_gate(x, h, Weights_hi, Bias_hi, Weights_xi, Bias_xi, Weights_hl, Bias_hl, Weights_xl, Bias_xl):
    """Return the new candidate content, gated by the input ("ignore") gate.

    The result is ``sigmoid(ignore pre-activation) * tanh(learn pre-activation)``,
    where each pre-activation sums an affine transform of ``x`` and one of ``h``.
    """
    ignore_pre = (np.dot(Weights_xi, x) + Bias_xi) + (np.dot(Weights_hi, h) + Bias_hi)
    learn_pre = (np.dot(Weights_xl, x) + Bias_xl) + (np.dot(Weights_hl, h) + Bias_hl)
    gate = sigmoid(ignore_pre)
    candidate = np.tanh(learn_pre)
    return np.multiply(gate, candidate)
def cell_state(forget_gate_output, input_gate_output):
    """Combine the retained old state and the gated new content with ``+``."""
    updated = forget_gate_output + input_gate_output
    return updated
def output_gate(x, h, Weights_ho, Bias_ho, Weights_xo, Bias_xo, cell_state):
    """Return the new hidden state: output-gate activation times ``tanh(cell_state)``."""
    hidden_term = np.dot(Weights_ho, h) + Bias_ho
    event_term = np.dot(Weights_xo, x) + Bias_xo
    gate = sigmoid(event_term + hidden_term)
    squashed_cell = np.tanh(cell_state)
    return np.multiply(gate, squashed_cell)
We need the sigmoid function as well, so:
def sigmoid(x):
    """Return the logistic function ``1 / (1 + exp(-x))`` element-wise.

    The naive formula overflows in ``exp`` for large negative ``x``.  This
    version only ever exponentiates a non-positive number, so it emits no
    overflow warning and still returns the exact limits 0 and 1.

    Args:
        x: a scalar or array-like of real numbers.

    Returns:
        A numpy scalar for scalar input, otherwise an array shaped like ``x``.
    """
    z = np.asarray(x)
    if not np.issubdtype(z.dtype, np.floating):
        # Integer/bool input: do the arithmetic in floating point.
        z = z.astype(np.float64)
    decay = np.exp(-np.abs(z))  # always within [0, 1]
    # For z >= 0 use 1 / (1 + e^-z); for z < 0 the equivalent e^z / (1 + e^z).
    result = np.where(z >= 0, 1 / (1 + decay), decay / (1 + decay))
    return result[()]  # unwrap 0-d results into a numpy scalar
PyTorch stores the weights of all gates stacked in a single matrix, so we need to split them up. For that we need the function below:
def get_slices(hidden_dim):
    """Return the four ``[start, stop]`` row ranges of a stacked LSTM parameter.

    A stacked parameter with ``4 * hidden_dim`` rows is split into four
    consecutive chunks of ``hidden_dim`` rows each.  The chunk width used to
    be hard-coded to 3, which was only correct for ``hidden_dim == 3``.

    Args:
        hidden_dim: number of rows per gate; must be a positive integer.

    Returns:
        A list of four ``[start, stop]`` pairs (``stop`` exclusive).

    Raises:
        ValueError: if ``hidden_dim`` is smaller than 1.
    """
    if hidden_dim < 1:
        raise ValueError("hidden_dim must be a positive integer")
    total_rows = hidden_dim * 4
    return [[start, start + hidden_dim] for start in range(0, total_rows, hidden_dim)]
Now that the LSTM functions are ready, we create an LSTM class that copies the weights from the pytorch class and computes the output from them.
class numpy_lstm:
    """One LSTM layer evaluated with numpy, using weights copied from a PyTorch state dict."""

    def __init__(self, layer_num=0, hidden_dim=1, matching_in_out=False):
        self.matching_in_out = matching_in_out
        self.layer_num = layer_num
        self.hidden_dim = hidden_dim

    def init_weights_from_pytorch(self, state):
        """Split this layer's stacked parameters in ``state`` into per-gate arrays.

        The four consecutive chunks are stored under the suffixes ``i``, ``f``,
        ``l`` and ``o`` (in that order) as ``Weights_x*``/``Bias_x*`` for the
        input-side parameters and ``Weights_h*``/``Bias_h*`` for the
        hidden-side parameters.
        """
        bounds = get_slices(self.hidden_dim)
        print (bounds)
        layer_suffix = str(self.layer_num)
        # Event (x) weights and biases for all gates
        event_weights = state['lstm.weight_ih_l' + layer_suffix]
        event_biases = state['lstm.bias_ih_l' + layer_suffix]
        # Hidden state (h) weights and biases for all gates
        hidden_weights = state['lstm.weight_hh_l' + layer_suffix]
        hidden_biases = state['lstm.bias_hh_l' + layer_suffix]
        for gate, (start, stop) in zip(('i', 'f', 'l', 'o'), bounds):
            setattr(self, 'Weights_x' + gate, event_weights[start:stop].numpy())
            setattr(self, 'Bias_x' + gate, event_biases[start:stop].numpy())
            setattr(self, 'Weights_h' + gate, hidden_weights[start:stop].numpy())
            setattr(self, 'Bias_h' + gate, hidden_biases[start:stop].numpy())

    def forward_lstm_pass(self,input_data):
        """Run the layer over ``input_data``, starting from all-zero states.

        Returns the list of per-step hidden states when ``matching_in_out`` is
        set, otherwise only the hidden state after the last step.
        """
        hidden = np.zeros(self.hidden_dim)
        cell = np.zeros(self.hidden_dim)
        per_step_hidden = []
        for event in input_data:
            retained = forget_gate(event, hidden, self.Weights_hf, self.Bias_hf,
                                   self.Weights_xf, self.Bias_xf, cell)
            written = input_gate(event, hidden, self.Weights_hi, self.Bias_hi,
                                 self.Weights_xi, self.Bias_xi,
                                 self.Weights_hl, self.Bias_hl,
                                 self.Weights_xl, self.Bias_xl)
            cell = cell_state(retained, written)
            hidden = output_gate(event, hidden, self.Weights_ho, self.Bias_ho,
                                 self.Weights_xo, self.Bias_xo, cell)
            if self.matching_in_out:
                # Mirrors the flag handling of the original torch code.
                per_step_hidden.append(hidden)
        return per_step_hidden if self.matching_in_out else hidden
Similarly for fully connected layer,
class fully_connected_layer:
    """Numpy version of a linear layer whose parameters come from a PyTorch state dict.

    The full weight matrix and bias vector are kept.  Earlier only row 0 of
    each was used, so only a single output unit could work, and the product
    contracted the wrong axis when given a whole sequence.
    """

    def __init__(self, state, dict_name='fc'):
        """Read ``<dict_name>.weight`` and ``<dict_name>.bias`` from ``state``.

        Args:
            state: mapping whose values expose ``.numpy()`` (e.g. a state dict).
            dict_name: prefix of the layer's entries in ``state``.
        """
        # nn.Linear stores its weight as [output_size, input_size].
        self.fc_Weight = state[dict_name + '.weight'].numpy()
        self.fc_Bias = state[dict_name + '.bias'].numpy()  # shape is [output_size]

    def forward(self, lstm_output, is_sigmoid=True):
        """Apply ``x @ W.T + b`` to a vector or to every row of a sequence.

        Args:
            lstm_output: array-like of shape [input_size] or [steps, input_size].
            is_sigmoid: when true, pass the result through ``sigmoid``.

        Returns:
            Array of shape [output_size] or [steps, output_size].
        """
        features = np.asarray(lstm_output)
        res = np.dot(features, self.fc_Weight.T) + self.fc_Bias
        if is_sigmoid:
            return sigmoid(res)
        return res
Now we would need one class to call all of them together and generalise them with respect to multiple layers
You can modify the below class if you need more Fully connected layers or want to set false condition for sigmoid etc.
class RNN_model_Numpy:
    """Stack of ``numpy_lstm`` layers followed by the ``hidden2out`` linear layer.

    ``input_size``, ``output_size`` and ``matching_in_out`` are accepted but
    not used: every LSTM layer is built with ``matching_in_out=True``.
    """

    def __init__(self, state, input_size, hidden_dim, output_size, num_layers, matching_in_out=True):
        self.lstm_layers = []
        for layer_index in range(num_layers):
            layer = numpy_lstm(layer_num=layer_index, hidden_dim=hidden_dim, matching_in_out=True)
            layer.init_weights_from_pytorch(state)
            self.lstm_layers.append(layer)
        self.hidden2out = fully_connected_layer(state, dict_name='hidden2out')

    def forward(self, feature_list):
        """Feed ``feature_list`` through every LSTM layer, then the linear layer (no sigmoid)."""
        sequence = feature_list
        for layer in self.lstm_layers:
            # The output of one layer is the input of the next.
            sequence = layer.forward_lstm_pass(sequence)
        return self.hidden2out.forward(sequence, is_sigmoid=False)
Sanity check on a numpy variable:
# Three events with two features each (matches input_size = 2 above).
data = np.array(
    [[1,1],
     [2,2],
     [3,3]])
# Build the numpy model from the PyTorch state dict and run one forward pass.
check=RNN_model_Numpy(state, input_size, hidden_dim, output_size, num_layers)
check.forward(data)
EXPLANATION:
Since we just need forward pass, we would need certain functions that are required in LSTM, for that we have the forget gate, input gate, cell gate and output gate. They are just some operations that are done on the input that you give.
The get_slices function is used to break down the weight matrix that we get from the pytorch state dictionary. The state dictionary is the dictionary that contains the weights of all the layers in our network.
For an LSTM in particular, the weights are stacked in this order: ignore, forget, learn, output. So we need to split them up for the different LSTM gates.
For numpy_lstm class, we have init_weights_from_pytorch function which must be called, what it will do is that it will extract the weights from state dictionary which we got earlier from pytorch model object and then populate the numpy array weights with the pytorch weights. You can first train your model and then save the state dictionary through pickle and then use it.
The fully connected layer class just implements the hidden2out neural network.
Finally our rnn_model_numpy class is there to ensure that if you have multiple layers then it is able to send the output of one layer of lstm to other layer of lstm.
Lastly there is a small sanity check on data variable.
IMPORTANT NOTE: YOU MIGHT GET A DIMENSION ERROR, AS PYTORCH'S WAY OF HANDLING INPUT IS COMPLETELY DIFFERENT, SO PLEASE ENSURE THAT YOUR INPUT NUMPY ARRAY HAS A SHAPE SIMILAR TO THAT OF THE DATA VARIABLE.
Important references:
https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html
https://christinakouridi.blog/2019/06/19/backpropagation-lstm/
I am new to TensorFlow and I am trying to make a custom RNN cell that behaves like a stacked LSTM cell, with residual connections between the LSTM layers. Each LSTM layer also has dropout implemented. So far I've tried to subclass keras.layers.AbstractRNNCell, but during training the dropout mask of the cell is not reset after every batch when I iterate the cell with keras.layers.RNN. I've read the source code of keras.layers.RNN and found out that it only resets the dropout mask for cells that are instances of DropoutRNNCellMixin, via the method _maybe_reset_cell_dropout_mask.
My question is: How do I reset the drop-out mask of my custom cell after every calls of keras.layers.RNN?
The following is the code that I wrote and its behavior. I'm using Tensorflow version 2.5.0:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
class StackedLSTMCell(layers.AbstractRNNCell):
    """Stack of ``LSTMCell`` units with residual connections between them.

    ``params`` must provide ``depth``, ``hidden_dim``, ``dropout_rate_input``
    and ``dropout_rate_output``.
    """

    def __init__(self, params, **kwargs):
        super(StackedLSTMCell, self).__init__(**kwargs)
        self.params = params
        self.depth = params['depth']
        self.units = [layers.LSTMCell(params['hidden_dim'],
                                      dropout=params['dropout_rate_input'],
                                      recurrent_dropout=params['dropout_rate_output'],
                                      unit_forget_bias=True,
                                      name=f'LSTM_{i}')
                      for i in range(params['depth'])]

    # state_size and output_size must be properties, not plain methods: the
    # decorators had been reduced to "#property" comments.
    @property
    def state_size(self):
        """Tuple holding the state size of every stacked unit."""
        return tuple(self.units[i].state_size for i in range(self.depth))

    @property
    def output_size(self):
        """Output size of the last unit in the stack."""
        return self.units[-1].output_size

    def get_config(self):
        """Return the constructor parameters for serialization."""
        return {'params': self.params}

    def get_initial_state(self, inputs=None, batch_size=None, dtype=None):
        """Return one initial state per unit, each obtained from that unit."""
        return tuple(self.units[i].get_initial_state(inputs, batch_size, dtype)
                     for i in range(self.depth))

    def call(self, token, inp_state):
        """Run one time step through all units.

        Returns the output of the last unit and a tuple with every unit's
        new state.
        """
        input_sum = token
        out_state = []
        for i in range(self.depth):
            h, h_and_c = self.units[i](token, inp_state[i])
            if i+1<self.depth:
                # Residual connection: the next unit receives the running sum
                # plus this unit's output, and the running sum is updated.
                token = layers.Add()([input_sum, h])
                input_sum = layers.Add()([input_sum, token])
            out_state.append(h_and_c)
        return h, tuple(out_state)
# Configuration consumed by StackedLSTMCell.
params = {
    'hidden_dim':128,
    'dropout_rate_input':0.2,
    'dropout_rate_output':0.2,
    'depth':3}
cell = StackedLSTMCell(params)
# Wrap the custom cell so it is iterated over the time axis.
RNN = layers.RNN(cell, return_sequences=True)
inputs = tf.ones((1, 1, 128)) #input shape format is (batch_size, time_steps, features)
# Call the layer five times in training mode and collect the distinct values
# of one output element.
print(set([RNN(inputs, training=True).numpy()[0, 0, 0] for _ in range(5)]))
#return: {-0.19621503}
#RNN returns the same output on every call.
Here is the desired behavior, taking the example of layers.LSTM:
# Built-in LSTM layer configured with the same size and dropout rates.
RNN = layers.LSTM(
    params['hidden_dim'],
    dropout=params['dropout_rate_input'],
    recurrent_dropout=params['dropout_rate_output'],
    return_sequences=True)
# Same experiment as above: five training-mode calls on identical input.
print(set([RNN(inputs, training=True).numpy()[0, 0, 0] for _ in range(5)]))
#return: {-0.053168092, -0.016183555, -0.024903715, -0.040428974, 0.025961103}
#RNN returns a different output on each call.
This is the source code of keras.layers.RNN:
https://github.com/tensorflow/tensorflow/blob/v2.5.0/tensorflow/python/keras/layers/recurrent.py#L198-L1002
Thank you for spending time for my problem. Any idea is appreciated.
I’m creating an Artificial Neural Network (ANN) using Keras’s Functional API. Link to the data csv file: https://github.com/dpintof/SPX_Options_ANN/blob/master/MLP3/call_df.csv. Relevant part of the code that reproduces the problem:
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow import keras
import tensorflow as tf
from tensorflow.keras import layers
# Data
# "Option_Average_Price" is the target column; all other columns are features.
# 1% of the rows are held out as the test split.
call_df = pd.read_csv("call_df.csv")
call_X_train, call_X_test, call_y_train, call_y_test = train_test_split(call_df.drop(["Option_Average_Price"],
    axis = 1), call_df.Option_Average_Price, test_size = 0.01)
# Hyperparameters
n_hidden_layers = 2 # Number of hidden layers.
n_units = 128 # Number of neurons of the hidden layers.
# Create input layer
# The input width equals the number of feature columns.
inputs = keras.Input(shape = (call_X_train.shape[1],))
# NOTE(review): with alpha = 1 the negative slope equals the positive one —
# presumably meant as a pass-through; confirm.
x = layers.LeakyReLU(alpha = 1)(inputs)
"""
Function that creates a hidden layer by taking a tensor as input and applying a
modified ELU (MELU) activation function.
"""
def hl(tensor):
    # Builds one Dense layer of n_units neurons that uses melu as activation.
    # Create custom MELU activation function
    def melu(z):
        # NOTE: this is the version that raises the reported error — the
        # predicate `z > 0` is a whole tensor, not a scalar.
        return tf.cond(z > 0, lambda: ((z**2)/2 + 0.02*z) / (z - 2 + 1/0.49),
                       lambda: 0.49*(keras.activations.exponential(z)-1))
    y = layers.Dense(n_units, activation = melu)(tensor)
    return y
# Create hidden layers
# Each iteration stacks one more hidden layer on top of the previous output.
for _ in range(n_hidden_layers):
    x = hl(x)
# Create output layer
# Single unit with softplus activation.
outputs = layers.Dense(1, activation = keras.activations.softplus)(x)
# Actually create the model
model = keras.Model(inputs=inputs, outputs=outputs)
# QUICK TEST
model.compile(loss = "mse", optimizer = keras.optimizers.Adam())
# One epoch with 1% of the training rows used for validation.
history = model.fit(call_X_train, call_y_train,
                    batch_size = 4096, epochs = 1,
                    validation_split = 0.01, verbose = 1)
This is the error I get when I do model.fit(…) (notice that 4096 is my batch size and 128 is the number of neurons of the hidden layers):
InvalidArgumentError: The second input must be a scalar, but it has shape [4096,128]
[[{{node dense/cond/dense/BiasAdd/_5}}]] [Op:__inference_keras_scratch_graph_1074]
Function call stack:
keras_scratch_graph
I know the problem has to do with the custom activation function because the program runs fine if I use the following hl function instead:
def hl(tensor):
    """Return a Dense layer of ``n_units`` neurons with LeakyReLU activation applied to ``tensor``."""
    return layers.Dense(n_units, activation = layers.LeakyReLU())(tensor)
I got the same error when trying to define melu(z) like this:
# The decorator had been reduced to a "#tf.function" comment; restored here.
@tf.function
def melu(z):
    """MELU activation written with a plain Python ``if``.

    NOTE: this is the variant reported above to fail with the same error as
    the ``tf.cond`` version; it is kept to illustrate the problem.
    """
    if z > 0:
        return ((z**2)/2 + 0.02*z) / (z - 2 + 1/0.49)
    else:
        return 0.49*(keras.activations.exponential(z)-1)
From How do you create a custom activation function with Keras? I also tried the following, but without success:
def hl(tensor):
    # Variant that registers melu as a custom object and applies it through a
    # separate Activation layer instead of the Dense layer's activation argument.
    # Create custom MELU activation function
    def melu(z):
        # Still uses tf.cond with a tensor-valued predicate.
        return tf.cond(z > 0, lambda: ((z**2)/2 + 0.02*z) / (z - 2 + 1/0.49),
                       lambda: 0.49*(keras.activations.exponential(z)-1))
    from keras.utils.generic_utils import get_custom_objects
    get_custom_objects().update({'melu': layers.Activation(melu)})
    # Dense layer without activation, followed by the custom activation.
    x = layers.Dense(n_units)(tensor)
    y = layers.Activation(melu)(x)
    return y
This issue happens because tf.cond expects a scalar for the condition argument (instead of a multi-dimensional tensor). Instead, you can use tf.where to apply the conditional element-wise.
For example, you can define melu as follows:
def melu(z):
    """Element-wise MELU activation.

    For ``z > 0`` returns ``((z**2)/2 + 0.02*z) / (z - 2 + 1/0.49)``,
    otherwise ``0.49 * (exp(z) - 1)``.

    ``tf.where`` evaluates both branches for every element, so each branch
    is fed a harmless substitute wherever it is not selected.  Without this,
    the rational branch divides by zero near ``z = 2 - 1/0.49`` and the
    exponential branch can overflow for large positive ``z``, which turns
    gradients into NaN even though the bad value is never selected.
    """
    positive = z > 0
    # Substitute 1 (rational branch) / 0 (exponential branch) where unused.
    z_pos = tf.where(positive, z, tf.ones_like(z))
    z_neg = tf.where(positive, tf.zeros_like(z), z)
    return tf.where(positive, ((z_pos**2)/2 + 0.02*z_pos) / (z_pos - 2 + 1/0.49),
                    0.49*(keras.activations.exponential(z_neg)-1))
NOTE: Not tested.
In the keras doc, it says that if we want to pick the intermediate layer's output of the model (sequential and functional), all we need to do as follows:
model = ... # create the original model
# Name of the layer whose output should be exposed.
layer_name = 'my_layer'
# New model sharing the original input but ending at the chosen layer.
intermediate_layer_model = keras.Model(inputs=model.input,
                                       outputs=model.get_layer(layer_name).output)
# `data` is whatever input batch the original model accepts.
intermediate_output = intermediate_layer_model(data)
So, here we get two models, the intermediate_layer_model is the sub-model of its parent model. And they're independent as well. Likewise, if we get the intermediate layer's output feature maps of the parent model (or base model), and do some operation with it and get some output feature maps from this operation, then we can also impute this output feature maps back to the parent model.
# Symbolic input for square 3-channel images of side `size`.
input = tf.keras.Input(shape=(size,size,3))
model = tf.keras.applications.DenseNet121(input_tensor = input)
layer_name = "conv1_block1" # for example
# Apply some operation to the intermediate feature maps of the base model.
output_feat_maps = SomeOperationLayer()(model.get_layer(layer_name).output)
# assume, they're able to add up
base = Add()([model.output, output_feat_maps])
# bind all
imputed_model = tf.keras.Model(inputs=[model.input], outputs=base)
So, in this way we have one modified model. It's quite easy with functional API. All the keras imagenet models are written with functional API (mostly). In model subclassing API, we can use these models. My concern here is, what to do if we need the intermediate output feature maps of these functional API models' inside call function.
class Subclass(tf.keras.Model):
    """Subclassed model that adds processed mid-layer features to the base model's output."""

    def __init__(self, dim):
        super().__init__()
        self.dim = dim
        self.base = DenseNet121(input_shape=self.dim)
        # Second model that shares the base model's input but stops at the
        # desired intermediate layer.
        mid_output = self.base.get_layer(layer_name).output
        self.mid_layer_model = tf.keras.Model(self.base.inputs, mid_output)

    def call(self, inputs):
        """Return the sum of the base output and the transformed mid-layer features."""
        # Full forward pass through the base model.
        base_out = self.base(inputs)
        # Separate forward pass through the truncated model.
        mid_feat = self.mid_layer_model(inputs)
        # Apply some operation to the intermediate features.
        mid_x = SomeOperationLayer()(mid_feat)
        # Assumes the two tensors can be added.
        return tf.keras.layers.add([base_out, mid_x])
The issue is, here we've technically two models in a joint fashion. But unlike building a model like this, here we simply want the intermediate output feature maps (from some inputs) of the base model forward manner and use it somewhere else and get some output. Like this
# Uses the layer's symbolic output directly instead of running a sub-model.
mid_x = SomeOperationLayer()(self.base.get_layer(layer_name).output)
But it gives ValueError: Graph disconnected. So, currently, we have to build a new model from the base model based on our desired intermediate layer. In the init method we define a new self.mid_layer_model model that gives our desired output feature maps like this: mid_feat = self.mid_layer_model(inputs). Next, we take mid_feat, do some operation on it to get some output, and lastly add them with tf.keras.layers.add([x, mid_x]). So creating a new model with the desired intermediate output works, but at the same time we repeat the same operation twice, i.e. in the base model and in its subset model. Maybe I'm missing something obvious; please add anything you can. Is this just how it is, or are there strategies we can adopt? I've asked in the forum here, with no response yet.
Update
Here is a working example. Let's say we have a custom layer like this
import tensorflow as tf
from tensorflow.keras.applications import DenseNet121
from tensorflow.keras.layers import Add
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
class ConvBlock(tf.keras.layers.Layer):
    """Conv2D followed by batch normalization and a ReLU."""

    def __init__(self, kernel_num=32, kernel_size=(3,3), strides=(1,1), padding='same'):
        super().__init__()
        # Convolution stage.
        self.conv = tf.keras.layers.Conv2D(
            kernel_num,
            kernel_size=kernel_size,
            strides=strides,
            padding=padding,
        )
        # Normalization stage.
        self.bn = tf.keras.layers.BatchNormalization()

    def call(self, input_tensor, training=False):
        """Apply conv -> batch norm (honouring ``training``) -> ReLU."""
        convolved = self.conv(input_tensor)
        normalized = self.bn(convolved, training=training)
        return tf.nn.relu(normalized)
And we want to impute this layer into an ImageNet model and construct a model like this
# Symbolic 32x32 RGB input shared by the whole graph.
input = tf.keras.Input(shape=(32, 32, 3))
# Randomly initialised DenseNet121 built on that input.
base = DenseNet121(weights=None, input_tensor = input)
# get output feature maps of at certain layer, ie. conv2_block1_0_relu
cb = ConvBlock()(base.get_layer("conv2_block1_0_relu").output)
# Flatten and project to 1000 units so the result can be added to base.output.
flat = Flatten()(cb)
dense = Dense(1000)(flat)
# adding up
adding = Add()([base.output, dense])
model = tf.keras.Model(inputs=[base.input], outputs=adding)
from tensorflow.keras.utils import plot_model
# Draw the resulting graph.
plot_model(model,
           show_shapes=True, show_dtype=True,
           show_layer_names=True,expand_nested=False)
Here the computation from input to layer conv2_block1_0_relu is computed one time. Next, if we want to translate this functional API to subclassing API, we had to build a model from the base model's input to layer conv2_block1_0_relu first. Like
class ModelWithMidLayer(tf.keras.Model):
    """DenseNet121 whose output is summed with a projection of one of its mid-level feature maps.

    This variant runs two models over the input: the full base model and a
    sub-model that stops at ``conv2_block1_0_relu``.
    """

    def __init__(self, dim=(32, 32, 3)):
        super().__init__()
        self.dim = dim
        self.base = DenseNet121(input_shape=self.dim, weights=None)
        # Sub-model from the base input up to the desired feature maps,
        # i.e. conv2_block1_0_relu.
        mid_output = self.base.get_layer("conv2_block1_0_relu").output
        self.mid_layer = tf.keras.Model(self.base.inputs, mid_output)
        self.flat = Flatten()
        self.dense = Dense(1000)
        self.add = Add()
        self.cb = ConvBlock()

    def call(self, x):
        """Return base(x) plus the flattened, densely projected mid-layer features."""
        # Forward pass through the full base model.
        base_out = self.base(x)
        # Forward pass through the truncated model.
        mid_feat = self.mid_layer(x)
        # Bring the features to the same shape as the base output.
        projected = self.dense(self.flat(mid_feat))
        # Combine both paths.
        return self.add([base_out, projected])

    def build_graph(self):
        """Wrap ``call`` in a functional model so it can be plotted."""
        graph_input = tf.keras.layers.Input(shape=(self.dim))
        return tf.keras.Model(inputs=[graph_input], outputs=self.call(graph_input))
# Instantiate the subclassed model and plot its functional wrapper.
mwml = ModelWithMidLayer()
plot_model(mwml.build_graph(),
           show_shapes=True, show_dtype=True,
           show_layer_names=True,expand_nested=False)
Here model_1 is actually a sub-model of DenseNet, which probably leads the whole model (ModelWithMidLayer) to compute the same operations twice. If this observation is correct, then it is a cause for concern.
I thought it might be much more complex, but it's actually quite simple. We just need to build a model with the desired output layers in the __init__ method and use it normally in the call method.
import tensorflow as tf
from tensorflow.keras.applications import DenseNet121
from tensorflow.keras.layers import Add
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
class ConvBlock(tf.keras.layers.Layer):
    """Conv2D followed by batch normalization and a ReLU."""

    def __init__(self, kernel_num=32, kernel_size=(3,3), strides=(1,1), padding='same'):
        super().__init__()
        # Convolution stage.
        self.conv = tf.keras.layers.Conv2D(
            kernel_num,
            kernel_size=kernel_size,
            strides=strides,
            padding=padding,
        )
        # Normalization stage.
        self.bn = tf.keras.layers.BatchNormalization()

    def call(self, input_tensor, training=False):
        """Apply conv -> batch norm (honouring ``training``) -> ReLU."""
        convolved = self.conv(input_tensor)
        normalized = self.bn(convolved, training=training)
        return tf.nn.relu(normalized)
class ModelWithMidLayer(tf.keras.Model):
    """DenseNet121 whose output is summed with a projection of one of its mid-level feature maps.

    A single two-output model exposes both the intermediate feature maps and
    the final output, so one forward pass yields both tensors.
    """

    def __init__(self, dim=(32, 32, 3)):
        super().__init__()
        self.dim = dim
        self.base = DenseNet121(input_shape=self.dim, weights=None)
        # building sub-model from self.base which gives
        # desired output feature maps: ie. conv2_block1_0_relu
        self.mid_layer = tf.keras.Model(
            inputs=[self.base.inputs],
            outputs=[
                self.base.get_layer("conv2_block1_0_relu").output,
                self.base.output])
        self.flat = Flatten()
        self.dense = Dense(1000)
        self.add = Add()
        self.cb = ConvBlock()

    def call(self, x):
        """Return the base output plus the flattened, densely projected mid-layer features."""
        # Call the two-output model ONCE and unpack both results; calling it
        # twice (once per output) would repeat the whole forward pass.
        # Index 0: conv2_block1_0_relu feature maps, index 1: base output.
        mx, bx = self.mid_layer(x)
        # make same shape or do whatever
        mx = self.dense(self.flat(mx))
        # combine
        out = self.add([bx, mx])
        return out

    def build_graph(self):
        """Wrap ``call`` in a functional model so it can be plotted."""
        x = tf.keras.layers.Input(shape=(self.dim))
        return tf.keras.Model(inputs=[x], outputs=self.call(x))
# Instantiate the subclassed model and plot its functional wrapper.
mwml = ModelWithMidLayer()
tf.keras.utils.plot_model(mwml.build_graph(),
                          show_shapes=True, show_dtype=True,
                          show_layer_names=True,expand_nested=False)
I have a simple model with one custom layer which works fine in the normal case.
When I switched to eager execution via tf.enable_eager_execution(), I got stuck on a weird error.
Here is the code so far:
import numpy as np
import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Layer, Input
from tensorflow.keras.losses import kullback_leibler_divergence
# Switch TensorFlow to eager execution; the error described below only
# appears once this is enabled.
tf.enable_eager_execution()
class ClusteringLayer(Layer):
    """Soft cluster-assignment layer with one trainable centre per output unit."""

    def __init__(self, output_dim, input_dim=None, alpha=1.0, **kwargs):
        self.output_dim = output_dim
        self.input_dim = input_dim
        self.alpha = alpha
        super().__init__(**kwargs)

    def build(self, input_shape):
        """Create the ``(output_dim, n_features)`` kernel holding the centres."""
        kernel_shape = (self.output_dim, input_shape[1])
        self.W = self.add_weight(name='kernel', shape=kernel_shape, initializer='uniform', trainable=True)
        super().build(input_shape)

    def call(self, x, mask=None):
        """Return row-normalized similarities between each sample and each centre."""
        # Distance of every sample to every centre.
        offsets = K.expand_dims(x, 1) - self.W
        distance = K.sqrt(K.sum(K.square(offsets), axis=2))
        q = 1.0/(1.0 + distance**2 /self.alpha)
        q = q**((self.alpha+1.0)/2.0)
        # Normalize each row so it sums to one.
        row_totals = K.sum(q, axis=1)
        q = K.transpose(K.transpose(q)/row_totals)
        return q

    def compute_output_shape(self, input_shape):
        """Output keeps the leading dimension and has ``output_dim`` columns."""
        return (input_shape[0], self.output_dim)
def clustering_loss(y_true, y_pred):
    """KL divergence between a sharpened target built from ``y_pred`` and ``y_pred`` itself.

    ``y_true`` is ignored; the target is derived from the predictions alone.
    """
    # Square the predictions and divide by the per-column totals.
    weighted = K.square(y_pred) / K.sum(y_pred, axis=0)
    # Normalize each row of the result so it sums to one.
    row_totals = K.sum(weighted, axis=1)
    target = K.transpose(K.transpose(weighted) / row_totals)
    return kullback_leibler_divergence(target, y_pred)
# Functional model: 10 input features -> clustering layer with 5 outputs.
input1 = Input(shape=(10,), name="input")
out = ClusteringLayer(output_dim = 5, name='clustering')(input1)
model = Model(inputs=input1, outputs=out)
# The loss is keyed by the output layer's name.
model.compile(optimizer=tf.train.AdamOptimizer(1e-3), loss={'clustering' : clustering_loss})
# Random float32 data: 20 samples with 10 features and 5-wide targets.
X = np.random.random((20, 10)).astype(np.float32)
Y = np.random.random((20, 5)).astype(np.float32)
# Inputs and targets are passed as dicts keyed by layer name.
model.fit(x={'input' : X}, y={'clustering' : Y}, batch_size=1, epochs=10)
The error message is related to the "fit" function:
AssertionError: Could not compute output DeferredTensor('None', shape=(5,), dtype=float32)
When I tried to check the output of my custom layer, I was surprised to find that this layer is generating two outputs. The first one is ambiguous and undesired.
Code:
# Build the layer separately so its symbolic output can be inspected.
input1 = Input(shape=(10,), name="input")
layer = ClusteringLayer(output_dim = 5, name='clustering')
out = layer(input1)
# Prints what the layer returned for the symbolic input.
print(out)
Output:
[<DeferredTensor 'None' shape=(?,) dtype=float32>, <DeferredTensor 'None' shape=(5,) dtype=float32>]
Even when I changed my custom layer with the simplistic custom layer from the Keras documentation, I got the same error:
AssertionError: Could not compute output DeferredTensor('None', shape=(5,), dtype=float32)
I asked the question in GitHub since it seems more like a bug.
They have recommended using a workaround until they fix the internal problem.
I'm quoting from the GitHub issue:
As a workaround, you could wrap the output shape returned by
compute_output_shape in a TensorShape. For example:
TensorShape((input_shape[0], self.output_dim)). Let me know if this
works.