I recently created a TensorFlow/Keras model with Keras Transformers. To do this, I created custom PositionalEmbedding and TransformerEncoder classes and used them to build the model architecture. They are defined as follows:
class PositionalEmbedding(layers.Layer):
    def __init__(self, sequence_length, output_dim, **kwargs):
        super().__init__(**kwargs)
        self.position_embeddings = layers.Embedding(
            input_dim=sequence_length, output_dim=output_dim
        )
        self.sequence_length = sequence_length
        self.output_dim = output_dim

    def call(self, inputs):
        # The inputs are of shape: `(batch_size, frames, num_features)`
        length = tf.shape(inputs)[1]
        positions = tf.range(start=0, limit=length, delta=1)
        embedded_positions = self.position_embeddings(positions)
        return inputs + embedded_positions

    def compute_mask(self, inputs, mask=None):
        mask = tf.reduce_any(tf.cast(inputs, "bool"), axis=-1)
        return mask
class TransformerEncoder(layers.Layer):
    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim, dropout=0.3
        )
        self.dense_proj = keras.Sequential(
            [layers.Dense(dense_dim, activation=tf.nn.gelu), layers.Dense(embed_dim)]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()

    def call(self, inputs, mask=None):
        if mask is not None:
            mask = mask[:, tf.newaxis, :]
        attention_output = self.attention(inputs, inputs, attention_mask=mask)
        proj_input = self.layernorm_1(inputs + attention_output)
        proj_output = self.dense_proj(proj_input)
        return self.layernorm_2(proj_input + proj_output)
At first, I was unable to save this model at all using the typical model.save() method. However, I was able to get past that by adding a get_config() method to each class, like so:
### FOR THE PositionalEmbedding CLASS
def get_config(self):
    config = super().get_config().copy()
    config.update({
        'position_embeddings': self.position_embeddings,
        'sequence_length': self.sequence_length,
        'output_dim': self.output_dim
    })
    return config
### FOR THE TransformerEncoder CLASS
def get_config(self):
    config = super().get_config().copy()
    config.update({
        'embed_dim': self.embed_dim,
        'dense_dim': self.dense_dim,
        'num_heads': self.num_heads,
        'attention': self.attention,
        'dense_proj': self.dense_proj,
        'layernorm_1': self.layernorm_1,
        'layernorm_2': self.layernorm_2
    })
    return config
However, when I try to load the model using the keras load_model() method without the custom_objects argument, I get the following error:
ValueError: Unknown layer: PositionalEmbedding. Please ensure this object is passed to the `custom_objects` argument.
And if I use the load_model() method without having the class definitions in scope, passing the custom_objects argument for the two classes like so: load_model('my_model.h5', custom_objects={'PositionalEmbedding': PositionalEmbedding, 'TransformerEncoder': TransformerEncoder}), I get the following error:
NameError: name 'PositionalEmbedding' is not defined
And finally, if I do define the classes with the updated configs before loading, and use the load_model() method as shown in the previous example, I get the following error:
TypeError: ('Keyword argument not understood:', 'position_embeddings')
Does anyone know what might be causing these issues and how I can resolve them so I can load this model? Any help is appreciated!
Thanks!
Sam
So I was actually able to solve this problem with a workaround. Instead of saving the model and loading it the old-fashioned way, I saved a checkpoint of the model while training, then loaded it by creating a new model from scratch and loading the checkpoint as its weights.
The code for that is below:
### SAVING THE MODEL WITH CHECKPOINT
filepath = "/content/drive/MyDrive/tmp/model_checkpoint.ckpt"
checkpoint = keras.callbacks.ModelCheckpoint(
    filepath, save_weights_only=True, save_best_only=True, verbose=1
)
history = model.fit(
    train_data,
    train_labels,
    validation_split=0.3,
    epochs=250,
    batch_size=256,
    callbacks=[checkpoint],
)
### CREATING NEW MODEL & LOADING CHECKPOINT AS WEIGHTS
def get_compiled_model():
    sequence_length = MAX_SEQ_LENGTH
    embed_dim = NUM_FEATURES
    dense_dim = 4
    num_heads = 1
    classes = len(label_processor.get_vocabulary())

    inputs = keras.Input(shape=(None, None))
    x = PositionalEmbedding(
        sequence_length, embed_dim, name="frame_position_embedding"
    )(inputs)
    x = TransformerEncoder(embed_dim, dense_dim, num_heads, name="transformer_layer")(x)
    x = layers.GlobalMaxPooling1D()(x)
    x = layers.Dropout(0.5)(x)
    outputs = layers.Dense(classes, activation="softmax")(x)
    model = keras.Model(inputs, outputs)

    model.compile(
        optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
    )
    return model

model = get_compiled_model()
model.load_weights("/content/drive/MyDrive/tmp/model_checkpoint.ckpt")
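For reference, I now believe the root cause of the loading errors above is that my get_config() implementations put whole sub-layers (position_embeddings, attention, dense_proj, the layer norms) into the config. get_config() should only return the JSON-serializable constructor arguments; the sub-layers are rebuilt by __init__() when Keras reconstructs the layer, which is why Keras complained about the unexpected position_embeddings keyword. A minimal sketch of what the configs should probably look like instead:

### FOR THE PositionalEmbedding CLASS
def get_config(self):
    config = super().get_config()
    config.update({
        'sequence_length': self.sequence_length,
        'output_dim': self.output_dim,
    })
    return config

### FOR THE TransformerEncoder CLASS
def get_config(self):
    config = super().get_config()
    config.update({
        'embed_dim': self.embed_dim,
        'dense_dim': self.dense_dim,
        'num_heads': self.num_heads,
    })
    return config

With configs like these, load_model('my_model.h5', custom_objects={'PositionalEmbedding': PositionalEmbedding, 'TransformerEncoder': TransformerEncoder}) should work, provided both class definitions are imported or defined in the loading script (which is what the earlier NameError was about).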
Related
I am creating a custom PyTorch layer and training the model with the Trainer API on top of a Hugging Face model.
When I run on a single GPU, it trains fine. But when I train on multiple GPUs, it throws the following error:
TypeError: zip argument #1 must support iteration
Training Code
# (imports assumed but not shown in the question: torch, torch.nn as nn,
#  CRF from the pytorch-crf package, and log_soft = F.log_softmax)
bert_model = BertForTokenClassification.from_pretrained(model_checkpoint, id2label=id2label, label2id=label2id)
bert_model.config.output_hidden_states = True

class BERT_CUSTOM(nn.Module):
    def __init__(self, bert_model, id2label, num_labels):
        super(BERT_CUSTOM, self).__init__()
        self.bert = bert_model
        self.config = self.bert.config
        self.dropout = nn.Dropout(0.25)
        self.classifier = nn.Linear(768, num_labels)
        self.crf = CRF(num_labels, batch_first=True)

    def forward(self, input_ids, attention_mask, labels=None, token_type_ids=None):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        sequence_output = torch.stack((outputs[1][-1], outputs[1][-2], outputs[1][-3], outputs[1][-4])).mean(dim=0)
        sequence_output = self.dropout(sequence_output)
        emission = self.classifier(sequence_output)  # [32, 256, 21] logits

        if labels is not None:
            labels = labels.reshape(attention_mask.size()[0], attention_mask.size()[1])
            loss = -self.crf(log_soft(emission, 2), labels, mask=attention_mask.type(torch.uint8), reduction='mean')
            prediction = self.crf.decode(emission, mask=attention_mask.type(torch.uint8))
            return [loss, prediction]
        else:
            prediction = self.crf.decode(emission, mask=attention_mask.type(torch.uint8))
            prediction = [id2label[k] for k in prediction]
            return prediction
Training API
model = BERT_CUSTOM(bert_model, id2label, num_labels=len(label2id))
model.to(device)

args = TrainingArguments(
    "model",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=2,
    weight_decay=0.01,
    per_device_train_batch_size=32,
    fp16=True
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_data,
    tokenizer=tokenizer)

trainer.train()
I'm looking to save a regression model using TensorFlow/Keras, but upon doing so I am getting the following error:
ValueError: Cannot pickle Tensor -- its value is not known statically: Tensor("variational_gaussian_process_15/Softplus:0", shape=(), dtype=float64).
The model is:
tf.random.set_seed(11)

# For numeric stability, set the default floating-point dtype to float64
tf.keras.backend.set_floatx('float64')

x_range = np.array([0, 1])

# Build model.
num_inducing_points = 40
model = tf.keras.Sequential([
    tf.keras.layers.InputLayer(input_shape=[1078]),
    tf.keras.layers.Dense(1, activation='sigmoid'),
    tf.keras.layers.Dense(1, kernel_initializer='ones', use_bias=False),
    tfp.layers.VariationalGaussianProcess(
        num_inducing_points=num_inducing_points,
        kernel_provider=RBFKernelFn(),
        event_shape=[1],
        inducing_index_points_initializer=tf.constant_initializer(
            np.linspace(*x_range, num=num_inducing_points,
                        dtype=X_train.dtype)[..., np.newaxis]),
        unconstrained_observation_noise_variance_initializer=(
            tf.constant_initializer(np.array(0.54).astype(X_train.dtype))),
    ),
])

batch_size = 32
loss = lambda y_train, rv_y: rv_y.variational_loss(
    y_train, kl_weight=np.array(batch_size, X_train.dtype) / X_train.shape[0])

model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3), loss=loss)

print(model)
print("Start training..\n")
history = model.fit(X_train, y_train, batch_size=batch_size, epochs=50, validation_split=0.2)
print("Done.")

[print(np.squeeze(w.numpy())) for w in model.weights];
There's also a class included, which I think might be causing the issue, since above the error message when saving there is a warning:
WARNING:tensorflow:Skipping full serialization of Keras layer <__main__.RBFKernelFn object at 0x7f872a0299a0>, because it is not built.
The RBFKernelFn class looks as follows:
class RBFKernelFn(tf.keras.layers.Layer):
    def __init__(self, **kwargs):
        super(RBFKernelFn, self).__init__(**kwargs)
        dtype = kwargs.get('dtype', None)

        self._amplitude = self.add_variable(
            initializer=tf.constant_initializer(0),
            dtype=dtype,
            name='amplitude')

        self._length_scale = self.add_variable(
            initializer=tf.constant_initializer(0),
            dtype=dtype,
            name='length_scale')

    def call(self, x):
        # Never called -- this is just a layer so it can hold variables
        # in a way Keras understands.
        return x

    @property
    def kernel(self):
        return tfp.math.psd_kernels.ExponentiatedQuadratic(
            amplitude=tf.nn.softplus(0.1 * self._amplitude),
            length_scale=tf.nn.softplus(5. * self._length_scale)
        )
Lastly, I tried the following to save
I am trying to prune a model in TensorFlow, but I'm coming across an error I don't know how to tackle. The error is:
ValueError: Please initialize "Prune" with a supported layer. Layers should either be a "PrunableLayer" instance, or should be supported by the PruneRegistry. You passed: <class 'base_transformer_tf.TransformerEncoder'>
The model is created using the following:
def transformer_encoder(num_columns, num_labels, num_layers, d_model, num_heads, dff, window_size, dropout_rate, weight_decay, label_smoothing, learning_rate):
    inp = tf.keras.layers.Input(shape=(window_size, num_columns))
    x = tf.keras.layers.BatchNormalization()(inp)
    x = tf.keras.layers.Dense(d_model)(x)
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.layers.Activation('swish')(x)
    x = tf.keras.layers.SpatialDropout1D(dropout_rate)(x)
    x = TransformerEncoder(num_layers, d_model, num_heads, dff, window_size, dropout_rate)(x)
    out = tf.keras.layers.Dense(num_labels, activation='sigmoid', dtype=tf.float32)(x[:, -1, :])
    model = tf.keras.models.Model(inputs=inp, outputs=out)
    model.compile(optimizer=tfa.optimizers.AdamW(weight_decay=weight_decay, learning_rate=learning_rate),
                  loss=tf.keras.losses.BinaryCrossentropy(label_smoothing=label_smoothing),
                  metrics=tf.keras.metrics.AUC(name='AUC'),
                  )
    return model
The pruning portion of the code is the following:
pruning_params = {
    'pruning_schedule': tfmot.sparsity.keras.PolynomialDecay(initial_sparsity=0.00,
                                                             final_sparsity=0.50,
                                                             begin_step=0,
                                                             end_step=end_step)
}

model_for_pruning = prune_low_magnitude(model, **pruning_params)

# `prune_low_magnitude` requires a recompile.
model_for_pruning.compile(optimizer='adam',
                          loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                          metrics=['accuracy'])

logdir = tempfile.mkdtemp()

callbacks = [
    tfmot.sparsity.keras.UpdatePruningStep(),
    tfmot.sparsity.keras.PruningSummaries(log_dir=logdir),
]

model_for_pruning.fit(np.concatenate((X_tr2, X_val)), np.concatenate((y_tr2, y_val)),
                      batch_size=batch_size, epochs=epochs, validation_split=validation_split,
                      callbacks=callbacks)
Any help would be appreciated
TensorFlow does not know how to prune your custom TransformerEncoder Keras layer. You should specify which weights to sparsify, as in this example: Prune custom Keras layer or modify parts of layer to prune.
That would look like:
class TransformerEncoder(tf.keras.layers.Layer, tfmot.sparsity.keras.PrunableLayer):
    def get_prunable_weights(self):
        return [self.my_weight, ...]
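As a slightly fuller illustration (a hypothetical sketch: the sub-layer names here are made up, since the encoder's internals aren't shown in the question), a composite layer returns the concrete weight tensors it wants sparsified:

import tensorflow as tf
import tensorflow_model_optimization as tfmot

class TransformerEncoder(tf.keras.layers.Layer, tfmot.sparsity.keras.PrunableLayer):
    def __init__(self, d_model, dff, **kwargs):
        super().__init__(**kwargs)
        self.d_model = d_model
        self.dff = dff
        # Hypothetical sub-layers; your encoder's internals will differ.
        self.dense_in = tf.keras.layers.Dense(dff, activation='relu')
        self.dense_out = tf.keras.layers.Dense(d_model)

    def build(self, input_shape):
        # Build the sub-layers eagerly so their kernels already exist
        # when the pruning wrapper calls get_prunable_weights().
        self.dense_in.build(input_shape)
        self.dense_out.build(input_shape[:-1].concatenate([self.dff]))
        super().build(input_shape)

    def call(self, inputs):
        return self.dense_out(self.dense_in(inputs))

    def get_prunable_weights(self):
        # Only these kernels get sparsified by prune_low_magnitude.
        return [self.dense_in.kernel, self.dense_out.kernel]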
I use TensorFlow 2.0 and want to extract all weights and biases from a trained model. Here is what I did so far:
I create a model class:
class MyModel(Model):
    def __init__(self):
        super(MyModel, self).__init__()  # MyModel comes from a base class
        self.conv1 = Conv2D(filters=32, kernel_size=3, strides=[2,2], activation='relu')
        self.flatten = Flatten()
        self.d1 = Dense(units=64, activation="relu")
        self.d2 = Dense(units=10, activation="softmax")

    def call(self, x):
        x = self.conv1(x)
        x = self.flatten(x)
        x = self.d1(x)
        x = self.d2(x)
        return x
During and after the training, I save my model:
checkpoint_path = "./logs/model.ckpt"
checkpoint_dir = "./logs/"
self.model.save_weights(checkpoint_path)
self.model.save(checkpoint_dir)
At this point I already ask myself: how do I save the model correctly? Do I use save_weights or just save? I want to be able to:
retrain the model if necessary
extract the model's weights for further analysis
Currently I load my trained model (in a new file) by doing:
model = MyModel()
model.load_weights(checkpoint_path)
But how can I access the network's weights? I already tried tf.compat.v1.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES) which did not work.
I highly appreciate any help!
Firstly, the difference between the two saving methods:
model.save_weights(): saves only the weights. You therefore need the model code to reconstruct the model as model = MyModel() with initial weights, and you then replace the weights via .load_weights().
model.save(): saves the whole model, including the architecture, optimizer state and weights. You can therefore reproduce the entire model without the code that defines MyModel().
By the way, another option in TF2 is to use a checkpoint manager, as sketched below.
In your case, I would go with .save_weights() or the checkpoint manager.
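A minimal checkpoint-manager sketch (the directory ./ckpts and the optimizer choice are just placeholders):

import tensorflow as tf

model = MyModel()
optimizer = tf.keras.optimizers.Adam()

# Track whatever you want to restore later.
ckpt = tf.train.Checkpoint(model=model, optimizer=optimizer)
manager = tf.train.CheckpointManager(ckpt, directory="./ckpts", max_to_keep=3)

# During training, e.g. once per epoch:
manager.save()

# Later, restore the latest checkpoint:
ckpt.restore(manager.latest_checkpoint)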
Next, you can analyze the weights by:
import tensorflow as tf
from tensorflow.keras.layers import Conv2D, Flatten, Dense

class MyModel(tf.keras.Model):
    def __init__(self):
        super(MyModel, self).__init__()  # MyModel comes from a base class
        self.conv1 = Conv2D(filters=32, kernel_size=3, strides=[2,2], activation='relu')
        self.flatten = Flatten()
        self.d1 = Dense(units=64, activation="relu")
        self.d2 = Dense(units=10, activation="softmax")

    def call(self, x):
        x = self.conv1(x)
        x = self.flatten(x)
        x = self.d1(x)
        x = self.d2(x)
        return x

m = MyModel()
input_shape = tf.TensorShape([None, 64, 64, 1])  # For example, 64x64 images with arbitrary batch size
m.build(input_shape)

# Train
# Save weights
# Load weights

# Analyze weights
conv1_weights, conv1_bias = m.conv1.weights
d1_weights, d1_bias = m.d1.weights
d2_weights, d2_bias = m.d2.weights
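If you just want everything at once, you can also iterate over the model's weights; each entry is a tf.Variable with a name and a shape:

for w in m.weights:
    print(w.name, w.shape)

# Or pull them out as plain NumPy arrays for analysis:
weights_as_numpy = [w.numpy() for w in m.weights]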
I'm trying to follow this tutorial https://colab.research.google.com/github/tensorflow/examples/blob/master/community/en/transformer_chatbot.ipynb. However, when I tried to save the model in order to load it again without training, I got the error mentioned here: NotImplementedError: Layers with arguments in `__init__` must override `get_config`
I understood from the answer that I need to make the encoder and decoder classes and customise them (instead of leaving them as functions like in the colab tutorial), so I went back to the TensorFlow documentation of this model here: https://www.tensorflow.org/tutorials/text/transformer#encoder_layer and tried to edit it. I made the encoder layer as:
class EncoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, rate=0.1, **kwargs):
        #super(EncoderLayer, self).__init__()
        super().__init__(**kwargs)

        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ffn = point_wise_feed_forward_network(d_model, dff)

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def get_config(self):
        config = super().get_config().copy()
        config.update({
            #'vocab_size': self.vocab_size,
            #'num_layers': self.num_layers,
            #'units': self.units,
            'd_model': self.d_model,
            'num_heads': self.num_heads,
            'dropout': self.dropout,
        })
        return config

    def call(self, x, training, mask):
        attn_output, _ = self.mha(x, x, x, mask)  # (batch_size, input_seq_len, d_model)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(x + attn_output)  # (batch_size, input_seq_len, d_model)

        ffn_output = self.ffn(out1)  # (batch_size, input_seq_len, d_model)
        ffn_output = self.dropout2(ffn_output, training=training)
        out2 = self.layernorm2(out1 + ffn_output)  # (batch_size, input_seq_len, d_model)

        return out2
and the same for the decoder layer class. Then the same Encoder as in the TF documentation:
class Encoder(tf.keras.layers.Layer):
    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size,
                 maximum_position_encoding, rate=0.1):
        super(Encoder, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_model)
        self.pos_encoding = positional_encoding(maximum_position_encoding,
                                                self.d_model)

        self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate)
                           for _ in range(num_layers)]

        self.dropout = tf.keras.layers.Dropout(rate)

    def call(self, x, training, mask):
        seq_len = tf.shape(x)[1]

        # adding embedding and position encoding.
        x = self.embedding(x)  # (batch_size, input_seq_len, d_model)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.pos_encoding[:, :seq_len, :]

        x = self.dropout(x, training=training)

        for i in range(self.num_layers):
            x = self.enc_layers[i](x, training, mask)

        return x  # (batch_size, input_seq_len, d_model)
and the function that builds the model:
def transformer(vocab_size,
                num_layers,
                units,
                d_model,
                num_heads,
                dropout,
                name="transformer"):
    inputs = tf.keras.Input(shape=(None,), name="inputs")
    dec_inputs = tf.keras.Input(shape=(None,), name="dec_inputs")

    enc_padding_mask = tf.keras.layers.Lambda(
        create_padding_mask, output_shape=(1, 1, None),
        name='enc_padding_mask')(inputs)
    # mask the future tokens for decoder inputs at the 1st attention block
    look_ahead_mask = tf.keras.layers.Lambda(
        create_look_ahead_mask,
        output_shape=(1, None, None),
        name='look_ahead_mask')(dec_inputs)
    # mask the encoder outputs for the 2nd attention block
    dec_padding_mask = tf.keras.layers.Lambda(
        create_padding_mask, output_shape=(1, 1, None),
        name='dec_padding_mask')(inputs)

    enc_outputs = Encoder(
        num_layers=num_layers, d_model=d_model, num_heads=num_heads,
        input_vocab_size=vocab_size,
    )(inputs=[inputs, enc_padding_mask])

    dec_outputs = Decoder(
        num_layers=num_layers, d_model=d_model, num_heads=num_heads,
        target_vocab_size=vocab_size,
    )(inputs=[dec_inputs, enc_outputs, look_ahead_mask, dec_padding_mask])

    outputs = tf.keras.layers.Dense(units=vocab_size, name="outputs")(dec_outputs)

    return tf.keras.Model(inputs=[inputs, dec_inputs], outputs=outputs, name=name)
and calling the model:
#the model itself with its paramters:
# Hyper-parameters
NUM_LAYERS = 3
D_MODEL = 256
#D_MODEL=tf.cast(D_MODEL, tf.float32)
NUM_HEADS = 8
UNITS = 512
DROPOUT = 0.1
model = transformer(
vocab_size=VOCAB_SIZE,
num_layers=NUM_LAYERS,
units=UNITS,
d_model=D_MODEL,
num_heads=NUM_HEADS,
dropout=DROPOUT)
However, I got this error:
TypeError: __init__() missing 2 required positional arguments: 'dff' and 'maximum_position_encoding'
I am really confused: I don't understand what dff and maximum_position_encoding mean in the documentation. When I removed them from the encoder and decoder classes, I got another error, because the positional_encoding function takes the maximum position as input and dff is also used inside the class. I am not sure whether I am following the right steps or not.
If you get this error while calling transformer, then your problem is with creating the model, not saving it.
Other than that, I see several issues with your get_config:
You defined dropout instead of rate.
The attributes you address (self.d_model etc.) are not defined or assigned in __init__.
It doesn't exist for your Encoder class.
A fixed EncoderLayer could look like the sketch below.
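A minimal sketch, assuming dff is added as a constructor argument (the tutorial's point_wise_feed_forward_network needs it) and every constructor argument is stored on self so get_config can return it:

class EncoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, dff, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        # Store every constructor argument so get_config can serialize it.
        self.d_model = d_model
        self.num_heads = num_heads
        self.dff = dff
        self.rate = rate

        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ffn = point_wise_feed_forward_network(d_model, dff)
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def get_config(self):
        config = super().get_config()
        config.update({
            'd_model': self.d_model,
            'num_heads': self.num_heads,
            'dff': self.dff,
            'rate': self.rate,
        })
        return config

The same pattern (store the __init__ arguments, return exactly those from get_config) applies to your decoder layer and to the Encoder class itself.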