I'm fine-tuning Sentence-BERT in TensorFlow for a task like sentence cosine-similarity calculation. I set up an encoder, call it encoder1, using the code below:
from sentence_transformers import SentenceTransformer
sentences = ["This is an example sentence", "Each sentence is converted"]
model = SentenceTransformer('sentence-transformers/all-MiniLM-L12-v2')
embeddings = model.encode(sentences)
This uses the sentence-transformers API. I also set up another encoder, call it encoder2, using the code below:
from transformers import AutoTokenizer, TFAutoModel
import tensorflow as tf
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L12-v2', from_pt=True)
model_tf = TFAutoModel.from_pretrained('sentence-transformers/all-MiniLM-L12-v2', from_pt=True)
encoded_input = tokenizer(sentences, padding=True, truncation=True, max_length=128, return_tensors='tf')
outputs = model_tf(**encoded_input)
def mean_pooling(model_output, input_mask):
    # seq_output shape = [batch_size, max_seq_len, hidden_size]
    # input_mask shape = [batch_size, max_seq_len]
    # expand input_mask
    seq_output = model_output[0]
    input_mask_expanded = tf.cast(tf.broadcast_to(tf.expand_dims(input_mask, -1), seq_output.shape), tf.float32)
    # clamped variant that avoids division by zero for all-padding rows:
    # pooled = tf.reduce_sum(seq_output * input_mask_expanded, 1) / tf.clip_by_value(tf.reduce_sum(input_mask_expanded, 1), 1e-9, tf.float32.max)
    pooled = tf.reduce_sum(seq_output * input_mask_expanded, 1) / tf.reduce_sum(input_mask_expanded, 1)
    # shape = [batch_size, hidden_size]
    return pooled
sentence_embeddings = mean_pooling(outputs, encoded_input['attention_mask'])
pooled_output, _ = tf.linalg.normalize(sentence_embeddings, 2, axis=1)
This loads a pre-trained model from Hugging Face, which I have tested, and I'm sure it produces the same results (pooled_output and embeddings) as encoder1.
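A minimal sketch of such a check (assuming numpy is imported as np; embeddings comes from encoder1 and pooled_output from encoder2 above):

import numpy as np

# Both should hold row-normalized sentence vectors of shape [2, 384] for MiniLM.
print(np.allclose(embeddings, pooled_output.numpy(), atol=1e-5))  # expected: True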
However, the weird thing is that when I load this encoder2 into my tf.keras model and run a classifier to check whether two sentences are close, it gives different values with the same input, the same trainable weights, and the same model. Does the network randomly re-initialize everything after I load the model?
Here's my encoder code:
class ApplicationCLS(tf.keras.layers.Layer):
    def __init__(self, bert_encoder_path, batch_size):
        super().__init__()
        self.bert_encoder = TFAutoModel.from_pretrained(bert_encoder_path, from_pt=True)
        self.classifier = CLSlayer(256, 1)
        self.loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits=True)
        self.metric_fn = tf.keras.metrics.BinaryAccuracy(name="accuracy")
        self.auc_fn = tf.keras.metrics.AUC()
        self.batch_size = batch_size

    def mean_pooling(self, model_output, input_mask):
        seq_output = model_output[0]
        shape = [self.batch_size, seq_output.shape[1], seq_output.shape[2]]
        input_mask_expanded = tf.cast(tf.broadcast_to(tf.expand_dims(input_mask, -1), shape), tf.float32)
        pooled = tf.reduce_sum(seq_output * input_mask_expanded, 1) / tf.reduce_sum(input_mask_expanded, 1)
        return pooled

    def call(self, inputs, labels, training=True):
        sent1_inputs = inputs["sent1_inputs"]
        sent2_inputs = inputs["sent2_inputs"]
        # inputs: {"input_ids": input_ids, "input_mask": input_mask, "type_ids": type_id}
        sent1_outputs = self.bert_encoder(**sent1_inputs)
        tf.print("output:", sent1_outputs[0][0])
        sent1_embeddings = self.mean_pooling(sent1_outputs, sent1_inputs['attention_mask'])
        sent1_pooled_output, _ = tf.linalg.normalize(sent1_embeddings, 2, axis=1)
        sent2_outputs = self.bert_encoder(**sent2_inputs)
        sent2_embeddings = self.mean_pooling(sent2_outputs, sent2_inputs['attention_mask'])
        sent2_pooled_output, _ = tf.linalg.normalize(sent2_embeddings, 2, axis=1)
        # concat the two sentence embeddings
        interaction = tf.concat([sent1_pooled_output, sent2_pooled_output], 1)
        # classification
        logits = self.classifier(interaction)
        loss = self.loss_fn(labels, logits)
        self.add_loss(loss)
        acc = self.metric_fn(labels, logits)
        auc = self.auc_fn(labels, logits)
        self.add_metric(loss, name="loss")
        self.add_metric(acc, name="acc")
        self.add_metric(auc, name="auc")
        return tf.nn.softmax(logits, name="prediction")
Here sent1_inputs is the same across runs, but the printed outputs are different. What happened?
I want to know how much the fine-tuned model improves compared to the model without fine-tuning. That is, I want to compare the performance of the pre-trained model (BERT) and the model obtained by fine-tuning it (fine-tuned BERT) on text classification. I know how to fine-tune BERT for text classification, but I'm not very clear on how to use BERT directly for classification. What should I do? The following is my code for fine-tuning the model; how do I rewrite it to use the pre-trained model directly?
from transformers import BertTokenizer, BertModel
import torch
import torch.nn as nn
import torch.utils.data as Data
import torch.optim as optim
from sklearn.metrics import accuracy_score,matthews_corrcoef
from sklearn.model_selection import train_test_split
tokenizer_model = BertTokenizer.from_pretrained('bert-base-uncased')
pretrained_model = BertModel.from_pretrained("bert-base-uncased")
class MyDataSet(Data.Dataset):
    def __init__(self, data, label):
        self.data = data
        self.label = label
        self.tokenizer = tokenizer_model

    def __getitem__(self, idx):
        text = self.data[idx]
        label = self.label[idx]
        inputs = self.tokenizer(text, return_tensors="pt", padding='max_length', max_length=256, truncation=True)
        input_ids = inputs.input_ids.squeeze(0)
        #token_type_ids = inputs.token_type_ids.squeeze(0)
        attention_mask = inputs.attention_mask.squeeze(0)
        #return input_ids, token_type_ids, attention_mask, label
        return input_ids, attention_mask, label

    def __len__(self):
        return len(self.data)
data, label = [], []
with open(path) as f:
    for line in f.readlines():
        a, b = line.strip().split('\t')
        data.append(b)
        if a == 'LOW':
            label.append('0')
        elif a == 'MEDIUM':
            label.append('1')
        else:
            label.append('2')
label = [int(i) for i in label]
train_x,test_x,train_y,test_y = train_test_split(data, label, test_size = 0.15,random_state = 32, stratify=label)
dataset_train = MyDataSet(train_x,train_y)
dataset_test = MyDataSet(test_x,test_y)
dataloader_train = Data.DataLoader(dataset_train, batch_size=128, shuffle=True,num_workers=32,pin_memory=True)
dataloader_test = Data.DataLoader(dataset_test, batch_size=128, shuffle=True,num_workers=32,pin_memory=True)
class MyModel(nn.Module):
    def __init__(self):
        super(MyModel, self).__init__()
        self.bert = pretrained_model
        self.linear = nn.Linear(768, 3)

    def forward(self, input_ids, attention_mask):
        output = self.bert(input_ids, attention_mask).pooler_output
        print(output.shape)  # torch.Size([1, 768])
        output = self.linear(output)
        return output
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if torch.cuda.device_count() > 1:
    print("Use", torch.cuda.device_count(), 'gpus')
    model = MyModel()
    model = nn.DataParallel(model)
    model = model.to(device)
## model = MyModel().to(device)
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-5)
for epoch in range(10):
    for input_ids, attention_mask, label in dataloader_train:
        train_input_ids, train_attention_mask, train_label = input_ids.to(device), attention_mask.to(device), label.to(device)
        model.train()
        pred = model(train_input_ids, train_attention_mask)
        print('epoch:', epoch)
        #print('pred,label:', pred, label)
        loss = loss_fn(pred, train_label)
        print('Loss:', loss.item())
        pred = torch.argmax(pred, dim=1)
        acc = (pred == train_label).float().mean()
        print('acc:', acc)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    savename_train = str(path) + '_' + str(name) + '_train' + '.txt'
    with open(savename_train, 'a') as f:
        f.write(str(epoch) + '\t' + str(loss.item()) + '\t' + str(acc.item()) + '\n')
    model.eval()
    with torch.no_grad():
        for input_ids, attention_mask, label in dataloader_test:
            validation_input_ids, validation_attention_mask, validation_label = input_ids.to(device), attention_mask.to(device), label.to(device)
            pred = model(validation_input_ids, validation_attention_mask)
            loss = loss_fn(pred, validation_label)
            pred = torch.argmax(pred, dim=1)
            acc = (pred == validation_label).float().mean()
            print('acc:', acc)
        savename_eval = str(path) + '_' + str(name) + '_val' + '.txt'
        with open(savename_eval, 'a') as f:
            f.write(str(epoch) + '\t' + str(loss.item()) + '\t' + str(acc.item()) + '\n')
What you are trying to do does not quite make sense. The vanilla BERT model was pre-trained with a combination of the masked language modelling objective and next sentence prediction. So, out of the box, all it can do is predict masked tokens and predict whether a given pair of sentences can appear next to each other in a text. Most importantly, it can provide embeddings.
To use it for classification, you have to add a classification head on top of the model. Initially, the weights of that head are randomly initialised. If you do not fine-tune at least that last layer, what do you really expect from random weights?
If you really want to compare the fine-tuned model to a baseline, take the embedding vectors from BERT and use a traditional ML model such as an SVM or a tree-based classifier.
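For instance, a minimal sketch of that baseline, reusing tokenizer_model, pretrained_model, and the train_x/train_y, test_x/test_y split from the question, and taking BERT's pooler output as the sentence embedding for simplicity:

import torch
from sklearn.svm import SVC

@torch.no_grad()
def embed(texts, batch_size=32):
    # Encode texts with the frozen pre-trained BERT; no gradients needed.
    pretrained_model.eval()
    vecs = []
    for i in range(0, len(texts), batch_size):
        inputs = tokenizer_model(texts[i:i + batch_size], return_tensors="pt",
                                 padding=True, truncation=True, max_length=256)
        out = pretrained_model(**inputs)
        vecs.append(out.pooler_output)  # shape: [batch, 768]
    return torch.cat(vecs).numpy()

# train_x/train_y and test_x/test_y come from the question's split
clf = SVC().fit(embed(train_x), train_y)
print("baseline accuracy:", clf.score(embed(test_x), test_y))

Only the SVM is trained here; BERT stays frozen, so the comparison isolates what fine-tuning adds.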
I was training on the MNIST dataset from Keras, copying the conditional GAN example from Keras. I ran into the problem below; I tried to record each variable's shape when concatenating, but I didn't find any shape as strange as the one the error indicates. Here is the code:
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow as tf
import numpy as np
batch_size = 64
num_channels = 1
num_classes = 10
image_size = 28
latent_dim = 128
(x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()
all_digits = np.concatenate([x_train, x_test])
all_labels = np.concatenate([y_train, y_test])
all_digits = all_digits.astype("float32") / 255.0
# print(all_digits.shape)
all_digits = np.reshape(all_digits, (-1, 28, 28, 1))
# print(all_digits.shape)
all_labels = keras.utils.to_categorical(all_labels, 10)
dataset = tf.data.Dataset.from_tensor_slices((all_digits, all_labels))
dataset = dataset.shuffle(buffer_size=1024).batch(batch_size)
generator_in_channels = latent_dim + num_classes
discriminator_in_channels = num_channels + num_classes
discriminator = keras.Sequential(
    [
        keras.layers.InputLayer((28, 28, discriminator_in_channels)),
        layers.Conv2D(64, (3, 3), strides=(2, 2), padding='same'),
        layers.LeakyReLU(alpha=0.2),
        layers.Conv2D(128, (3, 3), strides=(2, 2), padding="same"),
        layers.LeakyReLU(alpha=0.2),
        layers.GlobalMaxPool2D(),
        layers.Dense(1)
    ],
    name='discriminator'
)
generator = keras.Sequential(
    [
        keras.layers.InputLayer((generator_in_channels,)),
        layers.Dense(7 * 7 * generator_in_channels),
        layers.LeakyReLU(alpha=0.2),
        layers.Reshape((7, 7, generator_in_channels)),
        layers.Conv2DTranspose(8, (4, 4), strides=(2, 2), padding='same'),
        layers.LeakyReLU(alpha=0.2),
        layers.Conv2DTranspose(8, (4, 4), strides=(2, 2), padding='same'),
        layers.LeakyReLU(alpha=0.2),
        layers.Conv2D(1, (3, 3), padding='same', activation='sigmoid')
    ],
    name='generator'
)
generator.summary()
discriminator.summary()
class ConditionalGan(keras.Model):
    def __init__(self, discriminator, generator, latent_dim):
        super(ConditionalGan, self).__init__()
        self.discriminator = discriminator
        self.generator = generator
        self.latent_dim = latent_dim
        self.gen_loss_tracker = keras.metrics.Mean(name='generator_loss')
        self.disc_loss_tracker = keras.metrics.Mean(name='discriminator_loss')

    @property
    def metrics(self):
        return [self.gen_loss_tracker, self.disc_loss_tracker]

    def compile(self, d_optimizer, g_optimizer, loss_fn):
        super(ConditionalGan, self).compile()
        self.d_optimizer = d_optimizer
        self.g_optimizer = g_optimizer
        self.loss_fn = loss_fn

    def train_step(self, data):
        real_image, one_hot_labels = data
        image_one_hot_labels = one_hot_labels[:, :, None, None]
        image_one_hot_labels = tf.repeat(image_one_hot_labels, repeats=[image_size * image_size])
        image_one_hot_labels = tf.reshape(image_one_hot_labels, shape=(-1, image_size, image_size, num_classes))
        # Discriminator
        random_latent_vector = tf.random.normal(shape=(batch_size, latent_dim))
        print(random_latent_vector.shape)
        print(one_hot_labels.shape)
        random_vector_labels = tf.concat((random_latent_vector, one_hot_labels), axis=1)
        generator_image = self.generator(random_vector_labels)
        fake_image_and_labels = tf.concat([generator_image, image_one_hot_labels], -1)
        real_image_and_labels = tf.concat([real_image, image_one_hot_labels], -1)
        print(generator_image.shape)
        print(real_image.shape)
        combine_images = tf.concat([real_image_and_labels, fake_image_and_labels], 0)
        labels = tf.concat([tf.ones((batch_size, 1)), tf.zeros((batch_size, 1))], 0)
        with tf.GradientTape() as tape:
            pred = self.discriminator(combine_images)
            d_loss = self.loss_fn(labels, pred)
        grads = tape.gradient(d_loss, self.discriminator.trainable_weights)
        self.d_optimizer.apply_gradients(zip(grads, self.discriminator.trainable_weights))
        # Generator
        random_latent_vector = tf.random.normal(shape=(batch_size, latent_dim))
        random_vector_labels = tf.concat((random_latent_vector, one_hot_labels), -1)
        print(random_latent_vector.shape)
        print(one_hot_labels.shape)
        misleading_labels = tf.zeros((batch_size, 1))
        with tf.GradientTape() as tape:
            fake_image = self.generator(random_vector_labels)
            print(fake_image.shape)
            print(image_one_hot_labels.shape)
            fake_image_and_labels = tf.concat([fake_image, image_one_hot_labels], -1)
            pred = self.discriminator(fake_image_and_labels)
            g_loss = self.loss_fn(misleading_labels, pred)
        grads = tape.gradient(g_loss, self.generator.trainable_weights)
        self.g_optimizer.apply_gradients(zip(grads, self.generator.trainable_weights))
        # Monitor loss.
        self.gen_loss_tracker.update_state(g_loss)
        self.disc_loss_tracker.update_state(d_loss)
        return {
            'g_loss': self.gen_loss_tracker.result(),
            'd_loss': self.disc_loss_tracker.result()
        }
cond_gan = ConditionalGan(
discriminator=discriminator, generator=generator, latent_dim=latent_dim
)
cond_gan.compile(
d_optimizer=keras.optimizers.Adam(learning_rate=0.0003),
g_optimizer=keras.optimizers.Adam(learning_rate=0.0003),
loss_fn=keras.losses.BinaryCrossentropy(from_logits=True),
)
cond_gan.fit(dataset, epochs=20)
The ERROR is below. I have searched for some answers; some solve this kind of problem by editing the batch_size. I tried that, but it failed. The error doesn't indicate the right line; this code runs in a Kaggle notebook, so the reported location may not be right.
InvalidArgumentError: ConcatOp : Dimensions of inputs should match: shape[0] = [64,128] vs. shape[1] = [48,10]
[[node concat (defined at tmp/ipykernel_34/2191800439.py:95) ]] [Op:__inference_train_function_38770]
Errors may have originated from an input operation.
Input Source operations connected to node concat:
IteratorGetNext (defined at tmp/ipykernel_34/2191800439.py:156)
random_normal (defined at tmp/ipykernel_34/2191800439.py:90)
Function call stack:
train_function
Thank you for answering!
The problem is here: random_vector_labels = tf.concat((random_latent_vector, one_hot_labels), -1)
It throws InvalidArgumentError because random_latent_vector has shape [64, 128] while one_hot_labels has shape [48, 10]: 64 rows versus 48 rows, and tf.concat along the last axis requires the same number of rows. The 48 comes from the final batch of your dataset: 70000 samples batched by 64 leaves a last batch of 48, while random_latent_vector is always built with the fixed batch_size of 64. Derive the batch size from the incoming batch, or drop the remainder batch, so the shapes always match.
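A short sketch of both options, reusing the names from the question's train_step (pick one):

# Option 1: derive the batch size from the incoming batch instead of the
# global constant (apply the same change to `labels` and `misleading_labels`).
actual_batch_size = tf.shape(real_image)[0]
random_latent_vector = tf.random.normal(shape=(actual_batch_size, self.latent_dim))

# Option 2: drop the short final batch when building the dataset.
dataset = dataset.shuffle(buffer_size=1024).batch(batch_size, drop_remainder=True)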
I'm trying to train TFBertForNextSentencePrediction on my own corpus: not from scratch, but rather taking the existing BERT model with only a next sentence prediction head and further training it on a specific corpus of text (pairs of sentences). Then I want to use the model I trained to extract sentence embeddings from the last hidden state for other texts.
Currently the problem I encounter is that after I train the keras model I am not able to extract the hidden states of the last layer before the next sentence prediction head.
Below is the code. Here I only train it on a few sentences just to make sure the code works.
Any help will be greatly appreciated.
Thanks,
Ayala
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from datetime import datetime
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.callbacks import ModelCheckpoint
from transformers import BertTokenizer, PreTrainedTokenizer, BertConfig, TFBertForNextSentencePrediction
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
PRETRAINED_MODEL = 'bert-base-uncased'
# set paths and file names
time_stamp = str(datetime.now().year) + "_" + str(datetime.now().month) + "_" + str(datetime.now().day) + "_" + \
str(datetime.now().hour) + "_" + str(datetime.now().minute)
model_name = "pretrained_nsp_model"
model_dir_data = model_name + "_" + time_stamp
model_fn = model_dir_data + ".h5"
base_path = os.path.dirname(__file__)
input_path = os.path.join(base_path, "input_data")
output_path = os.path.join(base_path, "output_models")
model_path = os.path.join(output_path, model_dir_data)
if not os.path.exists(model_path):
    os.makedirs(model_path)
# set model checkpoint
checkpoint = ModelCheckpoint(os.path.join(model_path, model_fn), monitor="val_loss", verbose=1, save_best_only=True,
save_weights_only=True, mode="min")
# read data
max_length = 512
def get_tokenizer(pretrained_model_name):
    tokenizer = BertTokenizer.from_pretrained(pretrained_model_name)
    return tokenizer

def tokenize_nsp_data(A, B, max_length):
    data_inputs = tokenizer(A, B, add_special_tokens=True, max_length=max_length, truncation=True,
                            pad_to_max_length=True, return_attention_mask=True,
                            return_tensors="tf")
    return data_inputs

def get_data_features(data_inputs, max_length):
    data_features = {}
    for key in data_inputs:
        data_features[key] = sequence.pad_sequences(data_inputs[key], maxlen=max_length, truncating="post",
                                                    padding="post", value=0)
    return data_features

def get_transformer_model(transformer_model_name):
    # get transformer model
    config = BertConfig(output_attentions=True)
    config.output_hidden_states = True
    config.return_dict = True
    transformer_model = TFBertForNextSentencePrediction.from_pretrained(transformer_model_name, config=config)
    return transformer_model

def get_keras_model(transformer_model):
    # get keras model
    input_ids = tf.keras.layers.Input(shape=(max_length,), name='input_ids', dtype='int32')
    input_masks_ids = tf.keras.layers.Input(shape=(max_length,), name='attention_mask', dtype='int32')
    token_type_ids = tf.keras.layers.Input(shape=(max_length,), name='token_type_ids', dtype='int32')
    X = transformer_model({'input_ids': input_ids, 'attention_mask': input_masks_ids, 'token_type_ids': token_type_ids})[0]
    model = tf.keras.Model(inputs=[input_ids, input_masks_ids, token_type_ids], outputs=X)
    model.summary()
    model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
                  optimizer=tf.optimizers.Adam(learning_rate=0.00005), metrics=['accuracy'])
    return model

def get_metrices(true_values, pred_values):
    cm = confusion_matrix(true_values, pred_values)
    acc_score = accuracy_score(true_values, pred_values)
    f1 = f1_score(true_values, pred_values, average="binary")
    precision = precision_score(true_values, pred_values, average="binary")
    recall = recall_score(true_values, pred_values, average="binary")
    metrices = {'confusion_matrix': cm,
                'acc_score': acc_score,
                'f1': f1,
                'precision': precision,
                'recall': recall
                }
    for k, v in metrices.items():
        print(k, ':\n', v)
    return metrices
# get tokenizer
tokenizer = get_tokenizer(PRETRAINED_MODEL)
# train
prompt = ["Hello", "Hello", "Hello", "Hello"]
next_sentence = ["How are you?", "Pizza", "How are you?", "Pizza"]
train_labels = [0, 1, 0, 1]
train_labels = to_categorical(train_labels)
train_inputs = tokenize_nsp_data(prompt, next_sentence, max_length)
train_data_features = get_data_features(train_inputs, max_length)
# val
prompt = ["Hello", "Hello", "Hello", "Hello"]
next_sentence = ["How are you?", "Pizza", "How are you?", "Pizza"]
val_labels = [0, 1, 0, 1]
val_labels = to_categorical(val_labels)
val_inputs = tokenize_nsp_data(prompt, next_sentence, max_length)
val_data_features = get_data_features(val_inputs, max_length)
# get transformer model
transformer_model = get_transformer_model(PRETRAINED_MODEL)
# get keras model
model = get_keras_model(transformer_model)
callback_list = []
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=4, min_delta=0.005, verbose=1)
callback_list.append(early_stop)
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2, epsilon=0.001)
callback_list.append(reduce_lr)
callback_list.append(checkpoint)
history = model.fit([train_data_features['input_ids'], train_data_features['attention_mask'],
train_data_features['token_type_ids']], np.array(train_labels), batch_size=2, epochs=3,
validation_data=([val_data_features['input_ids'], val_data_features['attention_mask'],
val_data_features['token_type_ids']], np.array(val_labels)), verbose=1,
callbacks=callback_list)
model.layers[3].save_pretrained(model_path) # need to save this and make sure i can get the hidden states
## predict
# load model
transformer_model = get_transformer_model(model_path)
model = get_keras_model(transformer_model)
model.summary()
model.load_weights(os.path.join(model_path, model_fn))
# test
prompt = ["Hello", "Hello"]
next_sentence = ["How are you?", "Pizza"]
test_labels = [0, 1]
test_df = pd.DataFrame({'A': prompt, 'B': next_sentence, 'label': test_labels})
test_labels = to_categorical(test_labels)
test_inputs = tokenize_nsp_data(prompt, next_sentence, max_length)
test_data_features = get_data_features(test_inputs, max_length)
# predict
pred_test = model.predict([test_data_features['input_ids'], test_data_features['attention_mask'], test_data_features['token_type_ids']])
preds = tf.keras.activations.softmax(tf.convert_to_tensor(pred_test)).numpy()
true_test = test_df['label'].to_list()
pred_test = [1 if p[1] > 0.5 else 0 for p in preds]
test_df['pred_val'] = pred_test
metrices = get_metrices(true_test, pred_test)
I am also attaching a picture from debugging mode in which I try (with no success) to view the hidden state. The problem is that I am not able to see and save the transformer model I trained, or view the embeddings of the last hidden state. I tried converting the KerasTensor to a numpy array, but without success.
The issue resides in your get_keras_model() function. You defined there that you are only interested in the first element of the output (i.e. the logits):
X = transformer_model({'input_ids': input_ids, 'attention_mask': input_masks_ids, 'token_type_ids': token_type_ids})[0]
Make the index selection conditional, like this, to get the whole output of the model:
def get_keras_model(transformer_model, is_training=True):
    ### your other code
    X = transformer_model({'input_ids': input_ids, 'attention_mask': input_masks_ids, 'token_type_ids': token_type_ids})
    if is_training:
        X = X[0]
    ### your other code
    return model
#predict
###your other code
model = get_keras_model(transformer_model, is_training=False)
###your other code
print(pred_test.keys())
Output:
odict_keys(['logits', 'hidden_states', 'attentions'])
P.S.: The BertTokenizer can truncate and add padding by itself (see the documentation).
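With is_training=False the prediction then contains the full output, so the embeddings the question asks for can be sliced out of it. A small sketch, assuming pred_test comes from model.predict as above:

# 'hidden_states' holds one array per layer (embeddings plus 12 encoder layers
# for bert-base); the last entry is the final encoder layer.
last_hidden_state = pred_test['hidden_states'][-1]  # (batch, max_length, 768)
cls_embeddings = last_hidden_state[:, 0, :]         # [CLS] vector per sentence pair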
I am using a Seq2Seq project from Google that uses an encoder/decoder. Here are the two encoder and decoder classes:
#ENCODER
class EncoderNetwork(tf.keras.Model):
    def __getstate__(self):
        d = self.__dict__.copy()
        d.pop('_parents', None)
        return d

    def __init__(self, input_vocab_size, embedding_dims, rnn_units):
        super().__init__()
        self.encoder_embedding = tf.keras.layers.Embedding(input_dim=input_vocab_size,
                                                           output_dim=embedding_dims)
        self.encoder_rnnlayer = tf.keras.layers.LSTM(rnn_units, return_sequences=True,
                                                     return_state=True)
        encoder_embedding = self.encoder_embedding
        encoder_rnnlayer = self.encoder_rnnlayer
#DECODER
class DecoderNetwork(tf.keras.Model):
    def __getstate__(self):
        d = self.__dict__.copy()
        d.pop('_parents', None)
        return d

    def __init__(self, output_vocab_size, embedding_dims, rnn_units):
        super().__init__()
        self.decoder_embedding = tf.keras.layers.Embedding(input_dim=output_vocab_size,
                                                           output_dim=embedding_dims)
        self.dense_layer = tf.keras.layers.Dense(output_vocab_size)
        self.decoder_rnncell = tf.keras.layers.LSTMCell(rnn_units)
        # Sampler
        self.sampler = tfa.seq2seq.sampler.TrainingSampler()
        # Create attention mechanism with memory = None
        self.attention_mechanism = self.build_attention_mechanism(dense_units, None, BATCH_SIZE * [Tx])
        self.rnn_cell = self.build_rnn_cell(BATCH_SIZE)
        self.decoder = tfa.seq2seq.BasicDecoder(self.rnn_cell, sampler=self.sampler,
                                                output_layer=self.dense_layer)

    def build_attention_mechanism(self, units, memory, memory_sequence_length):
        return tfa.seq2seq.LuongAttention(units, memory=memory,
                                          memory_sequence_length=memory_sequence_length)
        #return tfa.seq2seq.BahdanauAttention(units, memory=memory, memory_sequence_length=memory_sequence_length)

    # wrap decoder rnn cell
    def build_rnn_cell(self, batch_size):
        rnn_cell = tfa.seq2seq.AttentionWrapper(self.decoder_rnncell, self.attention_mechanism,
                                                attention_layer_size=dense_units)
        return rnn_cell

    def build_decoder_initial_state(self, batch_size, encoder_state, Dtype):
        decoder_initial_state = self.rnn_cell.get_initial_state(batch_size=batch_size,
                                                                dtype=Dtype)
        decoder_initial_state = decoder_initial_state.clone(cell_state=encoder_state)
        return decoder_initial_state
I create an instance of EncoderNetwork and DecoderNetwork with my arguments and use the loss_function and train_step already defined to train my model:
def loss_function(y_pred, y):
    # shape of y: [batch_size, Ty]
    # shape of y_pred: [batch_size, Ty, output_vocab_size]
    sparsecategoricalcrossentropy = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True,
                                                                                  reduction='none')
    loss = sparsecategoricalcrossentropy(y_true=y, y_pred=y_pred)
    mask = tf.logical_not(tf.math.equal(y, 0))  # output 0 for y=0 else output 1
    mask = tf.cast(mask, dtype=loss.dtype)
    loss = mask * loss
    loss = tf.reduce_mean(loss)
    return loss
def train_step(input_batch, output_batch, encoder_initial_cell_state):
    # initialize loss = 0
    loss = 0
    with tf.GradientTape() as tape:
        encoder_emb_inp = encoderNetwork.encoder_embedding(input_batch)
        a, a_tx, c_tx = encoderNetwork.encoder_rnnlayer(encoder_emb_inp,
                                                        initial_state=encoder_initial_cell_state)
        # [last step activations, last memory_state] of encoder passed as input to decoder network
        # Prepare correct decoder input & output sequence data
        decoder_input = output_batch[:, :-1]  # ignore <end>
        # compare logits with timestep +1 version of decoder_input
        decoder_output = output_batch[:, 1:]  # ignore <start>
        # Decoder embeddings
        decoder_emb_inp = decoderNetwork.decoder_embedding(decoder_input)
        # Setting up decoder memory from encoder output and zero state for AttentionWrapperState
        decoderNetwork.attention_mechanism.setup_memory(a)
        decoder_initial_state = decoderNetwork.build_decoder_initial_state(BATCH_SIZE,
                                                                           encoder_state=[a_tx, c_tx],
                                                                           Dtype=tf.float32)
        # BasicDecoderOutput
        outputs, _, _ = decoderNetwork.decoder(decoder_emb_inp, initial_state=decoder_initial_state,
                                               sequence_length=BATCH_SIZE * [Ty - 1])
        logits = outputs.rnn_output
        # Calculate loss
        loss = loss_function(logits, decoder_output)
    # Returns the list of all layer variables / weights.
    variables = encoderNetwork.trainable_variables + decoderNetwork.trainable_variables
    # differentiate loss wrt variables
    gradients = tape.gradient(loss, variables)
    # grads_and_vars: list of (gradient, variable) pairs
    grads_and_vars = zip(gradients, variables)
    optimizer.apply_gradients(grads_and_vars)
    return loss
The training does not use the fit() method; it runs like this:
epochs = 20
for i in range(1, epochs + 1):
    encoder_initial_cell_state = initialize_initial_state()
    total_loss = 0.0
    for (batch, (input_batch, output_batch)) in enumerate(dataset.take(steps_per_epoch)):
        batch_loss = train_step(input_batch, output_batch, encoder_initial_cell_state)
        total_loss += batch_loss
        if (batch + 1) % 5 == 0:
            print("total loss: {} epoch {} batch {} ".format(batch_loss.numpy(), i, batch + 1))
The results are fine and the predict function (a custom predict function) works perfectly, but how can I save the model? I tried pickle and keras.save(), but neither works. Any idea?
I am trying to save a fine-tuned BERT model. I have run the code correctly: it works fine, and in the IPython console I am able to call getPrediction and have it return the result.
I have my weight files saved (the highest being model.ckpt-333.data-00000-of-00001).
I have no idea how I would go about saving the model to be reusable.
I am using bert-tensorflow.
import json
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
from datetime import datetime
from sklearn.model_selection import train_test_split
import os
print("tensorflow version : ", tf.__version__)
print("tensorflow_hub version : ", hub.__version__)
#Importing BERT modules
import bert
from bert import run_classifier
from bert import optimization
from bert import tokenization
#set output directory of the model
OUTPUT_DIR = 'model'
# Whether or not to clear/delete the directory and create a new one
DO_DELETE = False
if DO_DELETE:
    try:
        tf.gfile.DeleteRecursively(OUTPUT_DIR)
    except:
        pass
tf.io.gfile.makedirs(OUTPUT_DIR)
print('***** Model output directory: {} *****'.format(OUTPUT_DIR))
### Load the data
data = pd.read_csv("data/bbc-text.csv")
data.columns = ['category', 'text']
print('*****Data Loaded: {} *****'.format(data.head()))
#check to see if any null values are present.
print('*****Empty Data: {} *****'.format(data[data.isnull().any(axis=1)]))
#encode category variable into numeric
data.category = pd.Categorical(data.category)
data['code'] = data.category.cat.codes
from sklearn.model_selection import train_test_split
train, test = train_test_split(data, test_size=0.2, random_state=200)
## 2 -- Data Visualisation
print(data.code.unique())
import matplotlib.pyplot as plt
train['code'].value_counts().plot(kind = 'bar')
DATA_COLUMN = 'text'
LABEL_COLUMN = 'code'
label_list = [0, 1, 2, 3, 4]
plt.show()
## 2 -- Data Preprocessing
train_InputExamples = train.apply(lambda x: bert.run_classifier.InputExample(guid=None,
text_a = x[DATA_COLUMN],
text_b = None,
label = x[LABEL_COLUMN]), axis = 1)
test_InputExamples = test.apply(lambda x: bert.run_classifier.InputExample(guid=None,
text_a = x[DATA_COLUMN],
text_b = None,
label = x[LABEL_COLUMN]), axis = 1)
# This is a path to an uncased (all lowercase) version of BERT
BERT_MODEL_HUB = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"
def create_tokenizer_from_hub_module():
    """Get the vocab file and casing info from the Hub module."""
    with tf.Graph().as_default():
        bert_module = hub.Module(BERT_MODEL_HUB)
        tokenization_info = bert_module(signature="tokenization_info", as_dict=True)
        with tf.compat.v1.Session() as sess:
            vocab_file, do_lower_case = sess.run([tokenization_info["vocab_file"],
                                                  tokenization_info["do_lower_case"]])
    return bert.tokenization.FullTokenizer(
        vocab_file=vocab_file, do_lower_case=do_lower_case)
tokenizer = create_tokenizer_from_hub_module()
# We'll set sequences to be at most 128 tokens long.
MAX_SEQ_LENGTH = 128
# Convert our train and validation features to InputFeatures that BERT understands.
train_features = bert.run_classifier.convert_examples_to_features(train_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)
test_features = bert.run_classifier.convert_examples_to_features(test_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)
#Example on first observation in the training set
print("Example of train[0] as a training set")
print("Sentence : ", train_InputExamples.iloc[0].text_a)
print("-"*30)
print("Tokens : ", tokenizer.tokenize(train_InputExamples.iloc[0].text_a))
print("-"*30)
print("Input IDs : ", train_features[0].input_ids)
print("-"*30)
print("Input Masks : ", train_features[0].input_mask)
print("-"*30)
print("Segment IDs : ", train_features[0].segment_ids)
## 3. Creating a Multiclass Classifier
def create_model(is_predicting, input_ids, input_mask, segment_ids, labels,
                 num_labels):
    bert_module = hub.Module(
        BERT_MODEL_HUB,
        trainable=True)
    bert_inputs = dict(
        input_ids=input_ids,
        input_mask=input_mask,
        segment_ids=segment_ids)
    bert_outputs = bert_module(
        inputs=bert_inputs,
        signature="tokens",
        as_dict=True)
    # Use "pooled_output" for classification tasks on an entire sentence.
    # Use "sequence_outputs" for token-level output.
    output_layer = bert_outputs["pooled_output"]
    hidden_size = output_layer.shape[-1].value
    # Create our own layer to tune for politeness data.
    output_weights = tf.compat.v1.get_variable(
        "output_weights", [num_labels, hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))
    output_bias = tf.compat.v1.get_variable(
        "output_bias", [num_labels], initializer=tf.zeros_initializer())
    with tf.compat.v1.variable_scope("loss"):
        # Dropout helps prevent overfitting
        output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)
        logits = tf.matmul(output_layer, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)
        # Convert labels into one-hot encoding
        one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)
        predicted_labels = tf.squeeze(tf.argmax(log_probs, axis=-1, output_type=tf.int32))
        # If we're predicting, we want predicted labels and the probabilities.
        if is_predicting:
            return (predicted_labels, log_probs)
        # If we're training/evaluating, compute loss between predicted and actual label
        per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
        loss = tf.reduce_mean(per_example_loss)
        return (loss, predicted_labels, log_probs)
#A function that adapts our model to work for training, evaluation, and prediction.
# model_fn_builder actually creates our model function
# using the passed parameters for num_labels, learning_rate, etc.
def model_fn_builder(num_labels, learning_rate, num_train_steps,
                     num_warmup_steps):
    """Returns `model_fn` closure for TPUEstimator."""

    def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
        """The `model_fn` for TPUEstimator."""
        input_ids = features["input_ids"]
        input_mask = features["input_mask"]
        segment_ids = features["segment_ids"]
        label_ids = features["label_ids"]
        is_predicting = (mode == tf.estimator.ModeKeys.PREDICT)
        # TRAIN and EVAL
        if not is_predicting:
            (loss, predicted_labels, log_probs) = create_model(
                is_predicting, input_ids, input_mask, segment_ids, label_ids, num_labels)
            train_op = bert.optimization.create_optimizer(
                loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu=False)

            # Calculate evaluation metrics.
            def metric_fn(label_ids, predicted_labels):
                accuracy = tf.compat.v1.metrics.accuracy(label_ids, predicted_labels)
                true_pos = tf.compat.v1.metrics.true_positives(
                    label_ids,
                    predicted_labels)
                true_neg = tf.compat.v1.metrics.true_negatives(
                    label_ids,
                    predicted_labels)
                false_pos = tf.compat.v1.metrics.false_positives(
                    label_ids,
                    predicted_labels)
                false_neg = tf.compat.v1.metrics.false_negatives(
                    label_ids,
                    predicted_labels)
                return {
                    "eval_accuracy": accuracy,
                    "true_positives": true_pos,
                    "true_negatives": true_neg,
                    "false_positives": false_pos,
                    "false_negatives": false_neg
                }

            eval_metrics = metric_fn(label_ids, predicted_labels)
            if mode == tf.estimator.ModeKeys.TRAIN:
                return tf.estimator.EstimatorSpec(mode=mode,
                                                  loss=loss,
                                                  train_op=train_op)
            else:
                return tf.estimator.EstimatorSpec(mode=mode,
                                                  loss=loss,
                                                  eval_metric_ops=eval_metrics)
        else:
            (predicted_labels, log_probs) = create_model(
                is_predicting, input_ids, input_mask, segment_ids, label_ids, num_labels)
            predictions = {
                'probabilities': log_probs,
                'labels': predicted_labels
            }
            return tf.estimator.EstimatorSpec(mode, predictions=predictions)

    # Return the actual model function in the closure
    return model_fn
# Compute train and warmup steps from batch size
# These hyperparameters are copied from this colab notebook (https://colab.sandbox.google.com/github/tensorflow/tpu/blob/master/tools/colab/bert_finetuning_with_cloud_tpus.ipynb)
BATCH_SIZE = 16
LEARNING_RATE = 2e-5
NUM_TRAIN_EPOCHS = 3.0
# Warmup is a period of time where the learning rate is small and gradually increases--usually helps training.
WARMUP_PROPORTION = 0.1
# Model configs
SAVE_CHECKPOINTS_STEPS = 300
SAVE_SUMMARY_STEPS = 100
# Compute train and warmup steps from batch size
num_train_steps = int(len(train_features) / BATCH_SIZE * NUM_TRAIN_EPOCHS)
num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)
# Specify output directory and number of checkpoint steps to save
run_config = tf.estimator.RunConfig(
model_dir=OUTPUT_DIR,
save_summary_steps=SAVE_SUMMARY_STEPS,
save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS)
#Initializing the model and the estimator
model_fn = model_fn_builder(
num_labels=len(label_list),
learning_rate=LEARNING_RATE,
num_train_steps=num_train_steps,
num_warmup_steps=num_warmup_steps)
estimator = tf.estimator.Estimator(
model_fn=model_fn,
config=run_config,
params={"batch_size": BATCH_SIZE})
# Create an input function for training. drop_remainder = True for using TPUs.
train_input_fn = bert.run_classifier.input_fn_builder(
features=train_features,
seq_length=MAX_SEQ_LENGTH,
is_training=True,
drop_remainder=False)
# Create an input function for validating. drop_remainder = True for using TPUs.
test_input_fn = run_classifier.input_fn_builder(
features=test_features,
seq_length=MAX_SEQ_LENGTH,
is_training=False,
drop_remainder=False)
# #Training the model
print(f'Beginning Training!')
current_time = datetime.now()
estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
print("Training took time ", datetime.now() - current_time)
#Evaluating the model with Validation set
accuracy = estimator.evaluate(input_fn=test_input_fn, steps=None)
# A method to get predictions
def getPrediction(in_sentences):
    # A list to map the actual labels to the predictions
    labels = ["business", "entertainment", "politics", "sports", "tech"]
    # Transforming the test data into BERT accepted form
    input_examples = [run_classifier.InputExample(guid="", text_a=x, text_b=None, label=0) for x in in_sentences]
    # Creating input features for test data
    input_features = run_classifier.convert_examples_to_features(input_examples, label_list, MAX_SEQ_LENGTH, tokenizer)
    # Predicting the classes
    predict_input_fn = run_classifier.input_fn_builder(features=input_features, seq_length=MAX_SEQ_LENGTH,
                                                       is_training=False, drop_remainder=False)
    predictions = estimator.predict(predict_input_fn)
    return [(sentence, prediction['probabilities'], prediction['labels'], labels[prediction['labels']]) for
            sentence, prediction in zip(in_sentences, predictions)]
pred_sentences = list(test['text'])
predictions = getPrediction(pred_sentences)
enc_labels = []
act_labels = []
for i in range(len(predictions)):
    enc_labels.append(predictions[i][2])
    act_labels.append(predictions[i][3])
pd.DataFrame(enc_labels, columns = ['category']).to_excel('data/submission_bert.xlsx', index = False)
## Random tester
#Classifying random sentences
tests = getPrediction(['Mr.Modi is the Indian Prime Minister',
'Gaming machines are powered by efficient micro processores and GPUs',
'That HBO TV series is really good',
'A trillion dollar economy '
])
As the question asks how to save the model, here is how it works:
import torch
torch.save(model, 'path/to/model')
saved_model = torch.load('path/to/model')
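Note that torch.save(model, ...) pickles the entire module, so the model's class definition must be importable under the same name when you later call torch.load; saving a state_dict instead, as in the last answer below, is the more portable option.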
I think you can just rename your model.ckpt-333.data-00000-of-00001 to bert_model.ckpt and then use it in the same way you would use a non-finetuned model. For example, run
python run_classifier.py \
--task_name=MRPC \
--do_predict=true \
--data_dir=$GLUE_DIR/MRPC \
--vocab_file=$BERT_BASE_DIR/vocab.txt \
--bert_config_file=$BERT_BASE_DIR/bert_config.json \
--init_checkpoint=$TRAINED_CLASSIFIER
with --init_checkpoint pointing to your model's dir, or run bert-as-service
bert-serving-start -model_dir $TRAINED_CLASSIFIER
with the right -model_dir.
You can use this method:
model = MyModel(num_classes).to(device)
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=1e-2)
output_model = './models/nameOfYourModel.pth'

# save
def save(model, optimizer):
    torch.save({
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict()
    }, output_model)

save(model, optimizer)

# load
checkpoint = torch.load(output_model, map_location='cpu')
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
Source: https://github.com/huggingface/transformers/issues/7849#issuecomment-718726121