Saving a 'fine-tuned' bert model - python

I am trying to save a fine tuned bert model. I have ran the code correctly - it works fine, and in the ipython console I am able to call getPrediction and have it result the result.
I have my weight files saved (highest being model.ckpt-333.data-00000-of-00001
I have no idea how I would go about saving the model to be reuseable.
I am using bert-tensorflow.
import json
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
from datetime import datetime
from sklearn.model_selection import train_test_split
import os
print("tensorflow version : ", tf.__version__)
print("tensorflow_hub version : ", hub.__version__)
#Importing BERT modules
import bert
from bert import run_classifier
from bert import optimization
from bert import tokenization
#set output directory of the model
OUTPUT_DIR = 'model'
##markdown Whether or not to clear/delete the directory and create a new one
DO_DELETE = False ##param {type:"boolean"}
if DO_DELETE:
try:
tf.gfile.DeleteRecursively(OUTPUT_DIR)
except:
pass
tf.io.gfile.makedirs(OUTPUT_DIR)
print('***** Model output directory: {} *****'.format(OUTPUT_DIR))
### Load the data
data = pd.read_csv("data/bbc-text.csv")
data.columns = ['category', 'text']
print('*****Data Loaded: {} *****'.format(data.head()))
#check to see if any null values are present.
print('*****Empty Data: {} *****'.format(data[data.isnull().any(axis=1)]))
#encode category variable into numeric
data.category = pd.Categorical(data.category)
data['code'] = data.category.cat.codes
from sklearn.model_selection import train_test_split
train, test = train_test_split(data, test_size=0.2, random_state=200)
## 2 -- Data Visualisation
print(data.code.unique())
import matplotlib.pyplot as plt
train['code'].value_counts().plot(kind = 'bar')
DATA_COLUMN = 'text'
LABEL_COLUMN = 'code'
label_list = [0, 1, 2, 3, 4]
plt.show()
## 2 -- Data Preprocessing
train_InputExamples = train.apply(lambda x: bert.run_classifier.InputExample(guid=None,
text_a = x[DATA_COLUMN],
text_b = None,
label = x[LABEL_COLUMN]), axis = 1)
test_InputExamples = test.apply(lambda x: bert.run_classifier.InputExample(guid=None,
text_a = x[DATA_COLUMN],
text_b = None,
label = x[LABEL_COLUMN]), axis = 1)
# This is a path to an uncased (all lowercase) version of BERT
BERT_MODEL_HUB = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"
def create_tokenizer_from_hub_module():
"""Get the vocab file and casing info from the Hub module."""
with tf.Graph().as_default():
bert_module = hub.Module(BERT_MODEL_HUB)
tokenization_info = bert_module(signature="tokenization_info", as_dict=True)
with tf.compat.v1.Session() as sess:
vocab_file, do_lower_case = sess.run([tokenization_info["vocab_file"],
tokenization_info["do_lower_case"]])
return bert.tokenization.FullTokenizer(
vocab_file=vocab_file, do_lower_case=do_lower_case)
tokenizer = create_tokenizer_from_hub_module()
# We'll set sequences to be at most 128 tokens long.
MAX_SEQ_LENGTH = 128
# Convert our train and validation features to InputFeatures that BERT understands.
train_features = bert.run_classifier.convert_examples_to_features(train_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)
test_features = bert.run_classifier.convert_examples_to_features(test_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)
#Example on first observation in the training set
print("Example of train[0] as a training set")
print("Sentence : ", train_InputExamples.iloc[0].text_a)
print("-"*30)
print("Tokens : ", tokenizer.tokenize(train_InputExamples.iloc[0].text_a))
print("-"*30)
print("Input IDs : ", train_features[0].input_ids)
print("-"*30)
print("Input Masks : ", train_features[0].input_mask)
print("-"*30)
print("Segment IDs : ", train_features[0].segment_ids)
## 3. Creating a Multiclass Classifier
def create_model(is_predicting, input_ids, input_mask, segment_ids, labels,
num_labels):
bert_module = hub.Module(
BERT_MODEL_HUB,
trainable=True)
bert_inputs = dict(
input_ids=input_ids,
input_mask=input_mask,
segment_ids=segment_ids)
bert_outputs = bert_module(
inputs=bert_inputs,
signature="tokens",
as_dict=True)
# Use "pooled_output" for classification tasks on an entire sentence.
# Use "sequence_outputs" for token-level output.
output_layer = bert_outputs["pooled_output"]
hidden_size = output_layer.shape[-1].value
# Create our own layer to tune for politeness data.
output_weights = tf.compat.v1.get_variable(
"output_weights", [num_labels, hidden_size],
initializer=tf.truncated_normal_initializer(stddev=0.02))
output_bias = tf.compat.v1.get_variable(
"output_bias", [num_labels], initializer=tf.zeros_initializer())
with tf.compat.v1.variable_scope("loss"):
# Dropout helps prevent overfitting
output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)
logits = tf.matmul(output_layer, output_weights, transpose_b=True)
logits = tf.nn.bias_add(logits, output_bias)
log_probs = tf.nn.log_softmax(logits, axis=-1)
# Convert labels into one-hot encoding
one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)
predicted_labels = tf.squeeze(tf.argmax(log_probs, axis=-1, output_type=tf.int32))
# If we're predicting, we want predicted labels and the probabiltiies.
if is_predicting:
return (predicted_labels, log_probs)
# If we're train/eval, compute loss between predicted and actual label
per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
loss = tf.reduce_mean(per_example_loss)
return (loss, predicted_labels, log_probs)
#A function that adapts our model to work for training, evaluation, and prediction.
# model_fn_builder actually creates our model function
# using the passed parameters for num_labels, learning_rate, etc.
def model_fn_builder(num_labels, learning_rate, num_train_steps,
num_warmup_steps):
"""Returns `model_fn` closure for TPUEstimator."""
def model_fn(features, labels, mode, params): # pylint: disable=unused-argument
"""The `model_fn` for TPUEstimator."""
input_ids = features["input_ids"]
input_mask = features["input_mask"]
segment_ids = features["segment_ids"]
label_ids = features["label_ids"]
is_predicting = (mode == tf.estimator.ModeKeys.PREDICT)
# TRAIN and EVAL
if not is_predicting:
(loss, predicted_labels, log_probs) = create_model(
is_predicting, input_ids, input_mask, segment_ids, label_ids, num_labels)
train_op = bert.optimization.create_optimizer(
loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu=False)
# Calculate evaluation metrics.
def metric_fn(label_ids, predicted_labels):
accuracy = tf.compat.v1.metrics.accuracy(label_ids, predicted_labels)
true_pos = tf.compat.v1.metrics.true_positives(
label_ids,
predicted_labels)
true_neg = tf.compat.v1.metrics.true_negatives(
label_ids,
predicted_labels)
false_pos = tf.compat.v1.metrics.false_positives(
label_ids,
predicted_labels)
false_neg = tf.compat.v1.metrics.false_negatives(
label_ids,
predicted_labels)
return {
"eval_accuracy": accuracy,
"true_positives": true_pos,
"true_negatives": true_neg,
"false_positives": false_pos,
"false_negatives": false_neg
}
eval_metrics = metric_fn(label_ids, predicted_labels)
if mode == tf.estimator.ModeKeys.TRAIN:
return tf.estimator.EstimatorSpec(mode=mode,
loss=loss,
train_op=train_op)
else:
return tf.estimator.EstimatorSpec(mode=mode,
loss=loss,
eval_metric_ops=eval_metrics)
else:
(predicted_labels, log_probs) = create_model(
is_predicting, input_ids, input_mask, segment_ids, label_ids, num_labels)
predictions = {
'probabilities': log_probs,
'labels': predicted_labels
}
return tf.estimator.EstimatorSpec(mode, predictions=predictions)
# Return the actual model function in the closure
return model_fn
# Compute train and warmup steps from batch size
# These hyperparameters are copied from this colab notebook (https://colab.sandbox.google.com/github/tensorflow/tpu/blob/master/tools/colab/bert_finetuning_with_cloud_tpus.ipynb)
BATCH_SIZE = 16
LEARNING_RATE = 2e-5
NUM_TRAIN_EPOCHS = 3.0
# Warmup is a period of time where the learning rate is small and gradually increases--usually helps training.
WARMUP_PROPORTION = 0.1
# Model configs
SAVE_CHECKPOINTS_STEPS = 300
SAVE_SUMMARY_STEPS = 100
# Compute train and warmup steps from batch size
num_train_steps = int(len(train_features) / BATCH_SIZE * NUM_TRAIN_EPOCHS)
num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)
# Specify output directory and number of checkpoint steps to save
run_config = tf.estimator.RunConfig(
model_dir=OUTPUT_DIR,
save_summary_steps=SAVE_SUMMARY_STEPS,
save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS)
# Specify output directory and number of checkpoint steps to save
run_config = tf.estimator.RunConfig(
model_dir=OUTPUT_DIR,
save_summary_steps=SAVE_SUMMARY_STEPS,
save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS)
#Initializing the model and the estimator
model_fn = model_fn_builder(
num_labels=len(label_list),
learning_rate=LEARNING_RATE,
num_train_steps=num_train_steps,
num_warmup_steps=num_warmup_steps)
estimator = tf.estimator.Estimator(
model_fn=model_fn,
config=run_config,
params={"batch_size": BATCH_SIZE})
# Create an input function for training. drop_remainder = True for using TPUs.
train_input_fn = bert.run_classifier.input_fn_builder(
features=train_features,
seq_length=MAX_SEQ_LENGTH,
is_training=True,
drop_remainder=False)
# Create an input function for validating. drop_remainder = True for using TPUs.
test_input_fn = run_classifier.input_fn_builder(
features=test_features,
seq_length=MAX_SEQ_LENGTH,
is_training=False,
drop_remainder=False)
# #Training the model
print(f'Beginning Training!')
current_time = datetime.now()
estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
print("Training took time ", datetime.now() - current_time)
#Evaluating the model with Validation set
accuracy = estimator.evaluate(input_fn=test_input_fn, steps=None)
# A method to get predictions
def getPrediction(in_sentences):
# A list to map the actual labels to the predictions
labels = ["business", "entertainment", "politics", "sports", "tech"]
# Transforming the test data into BERT accepted form
input_examples = [run_classifier.InputExample(guid="", text_a=x, text_b=None, label=0) for x in in_sentences]
# Creating input features for Test data
input_features = run_classifier.convert_examples_to_features(input_examples, label_list, MAX_SEQ_LENGTH, tokenizer)
# Predicting the classes
predict_input_fn = run_classifier.input_fn_builder(features=input_features, seq_length=MAX_SEQ_LENGTH,
is_training=False, drop_remainder=False)
predictions = estimator.predict(predict_input_fn)
return [(sentence, prediction['probabilities'], prediction['labels'], labels[prediction['labels']]) for
sentence, prediction in zip(in_sentences, predictions)]
pred_sentences = list(test['text'])
predictions = getPrediction(pred_sentences)
enc_labels = []
act_labels = []
for i in range(len(predictions)):
enc_labels.append(predictions[i][2])
act_labels.append(predictions[i][3])
pd.DataFrame(enc_labels, columns = ['category']).to_excel('data/submission_bert.xlsx', index = False)
## Random tester
#Classifying random sentences
tests = getPrediction(['Mr.Modi is the Indian Prime Minister',
'Gaming machines are powered by efficient micro processores and GPUs',
'That HBO TV series is really good',
'A trillion dollar economy '
])

As the question clearly says to save the model, here is how it works:
import torch
torch.save(model, 'path/to/model')
saved_model = torch.load('path/to/model')

I think you can just rename your model.ckpt-333.data-00000-of-00001 to bert_model.ckpt and then use it in the same way you would use a non-finetuned model. For example, run
python run_classifier.py \
--task_name=MRPC \
--do_predict=true \
--data_dir=$GLUE_DIR/MRPC \
--vocab_file=$BERT_BASE_DIR/vocab.txt \
--bert_config_file=$BERT_BASE_DIR/bert_config.json \
--init_checkpoint=$TRAINED_CLASSIFIER
with --init_checkpoint pointing to your model's dir, or run bert-as-service
bert-serving-start -model_dir $TRAINED_CLASSIFIER
with the right -model_dir.

You can use these method:
model = MyModel(num_classes).to(device)
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=1e-2)
output_model = './models/nameOfYourModel.pth'
# save
def save(model, optimizer):
# save
torch.save({
'model_state_dict': model.state_dict(),
'optimizer_state_dict': optimizer.state_dict()
}, output_model)
save(model, optimizer)
# load
checkpoint = torch.load(output_model, map_location='cpu')
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
Source: https://github.com/huggingface/transformers/issues/7849#issuecomment-718726121

Related

How to use pre-trained models for text classification?Comparing a fine-tuned model with a pre-trained model without fine-tuning

I want to know how much the fine-tuned model improves compared to the model without fine-tuning.I want to compare the performance of the pre-trained model(BERT) and the model(fine-tuned BERT) obtained by fine-tuning the pre-trained model on text classification.I know how to fine-tune BERT for text classification, but not very clear on how to use BERT directly for classification.what should I do?The following is the code for fine-tuning the model, how to rewrite it to directly use the pre-trained model.
<!-- language: python -->
from transformers import BertTokenizer, BertModel
import torch
import torch.nn as nn
import torch.utils.data as Data
import torch.optim as optim
from sklearn.metrics import accuracy_score,matthews_corrcoef
from sklearn.model_selection import train_test_split
tokenizer_model = BertTokenizer.from_pretrained('bert-base-uncased')
pretrained_model = BertModel.from_pretrained("bert-base-uncased")
class MyDataSet(Data.Dataset):
def __init__ (self, data, label):
self.data = data
self.label = label
self.tokenizer = tokenizer_model
def __getitem__(self, idx):
text = self.data[idx]
label = self.label[idx]
inputs = self.tokenizer(text, return_tensors="pt",padding='max_length',max_length=256,truncation=True)
input_ids = inputs.input_ids.squeeze(0)
#token_type_ids = inputs.token_type_ids.squeeze(0)
attention_mask = inputs.attention_mask.squeeze(0)
#return input_ids, token_type_ids, attention_mask, label
return input_ids, attention_mask, label
def __len__(self):
return len(self.data)
data,label = [],[]
with open(path) as f:
for line in f.readlines():
a,b = line.strip().split('\t')
data.append(b)
if a == 'LOW':
label.append('0')
elif a == 'MEDIUM':
label.append('1')
else:
label.append('2')
label = [int(i) for i in label]
train_x,test_x,train_y,test_y = train_test_split(data, label, test_size = 0.15,random_state = 32, stratify=label)
dataset_train = MyDataSet(train_x,train_y)
dataset_test = MyDataSet(test_x,test_y)
dataloader_train = Data.DataLoader(dataset_train, batch_size=128, shuffle=True,num_workers=32,pin_memory=True)
dataloader_test = Data.DataLoader(dataset_test, batch_size=128, shuffle=True,num_workers=32,pin_memory=True)
class MyModel(nn.Module):
def __init__(self):
super(MyModel, self).__init__()
self.bert = pretrained_model
self.linear = nn.Linear(768,3)
def forward(self, input_ids, attention_mask):
output = self.bert(input_ids, attention_mask).pooler_output
print(output.shape) # torch.Size([1, 768])
output = self.linear(output)
return output
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if torch.cuda.device_count() > 1:
print("Use", torch.cuda.device_count(), 'gpus')
model = MyModel()
model = nn.DataParallel(model)
model = model.to(device)
## model = MyModel().to(device)
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-5)
for epoch in range(10):
for input_ids,attention_mask,label in dataloader_train:
train_input_ids,train_attention_mask,train_label = input_ids.to(device),attention_mask.to(device),label.to(device)
model.train()
pred = model(train_input_ids,train_attention_mask)
print('epoch:',epoch)
#print('pred,label:',pred,label)
loss = loss_fn(pred, train_label)
print('Loss:',loss.item())
pred = torch.argmax(pred,dim=1)
acc = (pred == train_label).float().mean()
print('acc:',acc)
loss.backward()
optimizer.step()
optimizer.zero_grad()
savename_train = str(path) +'_' + str(name) + '_train' + '.txt'
with open(savename_train,'a') as f:
f.write(str(epoch)+'\t'+str(loss.item())+'\t'+str(acc.item())+'\n')
model.eval()
with torch.no_grad():
for input_ids,attention_mask,label in dataloader_test:
validation_input_ids,validation_attention_mask,validation_label = input_ids.to(device),attention_mask.to(device),label.to(device)
pred = model(validation_input_ids,validation_attention_mask)
loss = loss_fn(pred, validation_label)
pred = torch.argmax(pred, dim=1)
acc = (pred == validation_label).float().mean()
print('acc:',acc)
savename_eval = str(path) +'_' + str(name) + '_val' + '.txt'
with open(savename_eval,'a') as f:
f.write(str(epoch)+'\t'+str(loss.item())+'\t'+str(acc.item())+'\n')
What you are trying to do does not make sense. The naive BERT model was retrained using a combination of masked language modelling objective and next sentence prediction. So, all it can do is predicting masked tokens, predicting if a pair of given sentence can be next to each other in a text. Most importantly, it can provide embeddings.
To use for classification you have to add a classification head to the end of the model. Initially, the weights of that layer is randomly initialised. If you do not fine tune the last layer, what do you really expect from random weights?
If you really want to compare the fine-tuned model to a baseline, take the embeddings vector from the BERT and use a tradional ML model like SVM or Tree based calssifier.

tensorboard,, the scalar value is not recorded

The scalar value is not being recorded as in these pictures.
enter image description here 2.
enter image description here
I don't know if I'm doing the wrong with code for training or code for tensorboard.
It's my first time using a tensorboard, so I don't know if I wrote the correct code for viewing the tensorboard.
If the scalar value is not recorded as shown in the attached image even though the code for the tensorboard is written correctly, I would like to look at the code part for training.
Therefore, I would appreciate it if you could check code only from recording the scalar value to verifying it with the tensorboard
part of the main function of <run.py>
def main():
args = parser.parse_args()
assert args.n_views == 2, "Only two view training is supported. Please use --n-views 2."
# check if gpu training is available
if not args.disable_cuda and torch.cuda.is_available():
args.device = torch.device('cuda')
cudnn.deterministic = True
cudnn.benchmark = True
else:
args.device = torch.device('cpu')
args.gpu_index = -1
dataset = ContrastiveLearningDataset(args.data)
train_dataset = dataset.__getitem__(args.dataset_name, args.n_views, 1)
train_loader = torch.utils.data.DataLoader(
train_dataset, batch_size=args.batch_size, shuffle=True,
num_workers=args.workers, pin_memory=True, drop_last=True)
#f(*)
model = ResNetSimCLR(base_model=args.arch, out_dim=args.out_dim)
optimizer = torch.optim.Adam(model.parameters(), args.lr, weight_decay=args.weight_decay)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=len(train_loader), eta_min=0,
last_epoch=-1)
with torch.cuda.device(args.gpu_index):
simclr = SimCLR(model=model, optimizer=optimizer, scheduler=scheduler, args=args)
simclr.train(train_loader)
simclr.writer.flush()
<simclr.py> to write train function in main()
class SimCLR(object):
def __init__(self, *args, **kwargs):
self.args = kwargs['args']
self.model = kwargs['model'].to(self.args.device)
self.optimizer = kwargs['optimizer']
self.scheduler = kwargs['scheduler']
self.writer = SummaryWriter()
logging.basicConfig(filename=os.path.join(self.writer.log_dir, 'training.log'), level=logging.DEBUG)
self.criterion = torch.nn.CrossEntropyLoss().to(self.args.device)
def info_nce_loss(self, features):
labels = torch.cat([torch.arange(self.args.batch_size) for i in range(self.args.n_views)], dim=0)
labels = (labels.unsqueeze(0) == labels.unsqueeze(1)).float()
labels = labels.to(self.args.device)
features = F.normalize(features, dim=1)
similarity_matrix = torch.matmul(features, features.T)
mask = torch.eye(labels.shape[0], dtype=torch.bool).to(self.args.device)
labels = labels[~mask].view(labels.shape[0], -1)
similarity_matrix = similarity_matrix[~mask].view(similarity_matrix.shape[0], -1)
positives = similarity_matrix[labels.bool()].view(labels.shape[0], -1)
negatives = similarity_matrix[~labels.bool()].view(similarity_matrix.shape[0], -1)
logits = torch.cat([positives, negatives], dim=1)
labels = torch.zeros(logits.shape[0], dtype=torch.long).to(self.args.device)
logits = logits / self.args.temperature
return logits, labels
def train(self, train_loader):
scaler = GradScaler(enabled=self.args.fp16_precision)
# save config file
save_config_file(self.writer.log_dir, self.args)
n_iter = 0 #global optimization step
logging.info(f"Start SimCLR training for {self.args.epochs} epochs.")
logging.info(f"Training with gpu: {self.args.disable_cuda}.")
for epoch_counter in range(self.args.epochs):
for images, _ in tqdm(train_loader):
images = torch.cat(images, dim=0)
images = images.to(self.args.device)
with autocast(enabled=self.args.fp16_precision):
features = self.model(images)
logits, labels = self.info_nce_loss(features)
loss = self.criterion(logits, labels)
self.optimizer.zero_grad()
scaler.scale(loss).backward()
scaler.step(self.optimizer)
scaler.update()
if n_iter % self.args.log_every_n_steps == 0:
top1, top5 = accuracy(logits, labels, topk=(1, 5))
self.writer.add_scalar('loss', loss, global_step=n_iter)
self.writer.add_scalar('acc/top1', top1[0], global_step=n_iter)
self.writer.add_scalar('acc/top5', top5[0], global_step=n_iter)
self.writer.add_scalar('learning_rate', self.scheduler.get_lr()[0], global_step=n_iter)
n_iter += 1
# warmup for the first 10 epochs
if epoch_counter >= 1:
try:
self.scheduler.step()
except ZeroDivisionError:
print("ZeroDivision")
logging.debug(f"Epoch: {epoch_counter}\tLoss: {loss}\tTop1 accuracy: {top1[0]}")
logging.info("Training has finished.")
# save model checkpoints
checkpoint_name = 'checkpoint_{:04d}.pth.tar'.format(self.args.epochs)
save_checkpoint({
'epoch': self.args.epochs,
'arch': self.args.arch,
'state_dict': self.model.state_dict(),
'optimizer': self.optimizer.state_dict(),
}, is_best=False, filename=os.path.join(self.writer.log_dir, checkpoint_name))
logging.info(f"Model checkpoint and metadata has been saved at {self.writer.log_dir}.")

Using tensorflow and TFBertForNextSentencePrediction to further train bert on a specific corpus

I'm trying to train TFBertForNextSentencePrediction on my own corpus, not from scratch, but rather taking the existing bert model with only a next sentence prediction head and further train it on a specific cuprous of text (pairs of sentences). Then I want to use the model I trained to be able to extract sentence embeddings from the last hidden state for other texts.
Currently the problem I encounter is that after I train the keras model I am not able to extract the hidden states of the last layer before the next sentence prediction head.
Below is the code. Here I only train it on a few sentences just to make sure the code works.
Any help will be greatly appreciated.
Thanks,
Ayala
import numpy as np
import pandas as pd
import tensorflow as tf
from datetime import datetime
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.callbacks import ModelCheckpoint
from transformers import BertTokenizer, PreTrainedTokenizer, BertConfig, TFBertForNextSentencePrediction
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
PRETRAINED_MODEL = 'bert-base-uncased'
# set paths and file names
time_stamp = str(datetime.now().year) + "_" + str(datetime.now().month) + "_" + str(datetime.now().day) + "_" + \
str(datetime.now().hour) + "_" + str(datetime.now().minute)
model_name = "pretrained_nsp_model"
model_dir_data = model_name + "_" + time_stamp
model_fn = model_dir_data + ".h5"
base_path = os.path.dirname(__file__)
input_path = os.path.join(base_path, "input_data")
output_path = os.path.join(base_path, "output_models")
model_path = os.path.join(output_path, model_dir_data)
if not os.path.exists(model_path):
os.makedirs(model_path)
# set model checkpoint
checkpoint = ModelCheckpoint(os.path.join(model_path, model_fn), monitor="val_loss", verbose=1, save_best_only=True,
save_weights_only=True, mode="min")
# read data
max_length = 512
def get_tokenizer(pretrained_model_name):
tokenizer = BertTokenizer.from_pretrained(pretrained_model_name)
return tokenizer
def tokenize_nsp_data(A, B, max_length):
data_inputs = tokenizer(A, B, add_special_tokens=True, max_length=max_length, truncation=True,
pad_to_max_length=True, return_attention_mask=True,
return_tensors="tf")
return data_inputs
def get_data_features(data_inputs, max_length):
data_features = {}
for key in data_inputs:
data_features[key] = sequence.pad_sequences(data_inputs[key], maxlen=max_length, truncating="post",
padding="post", value=0)
return data_features
def get_transformer_model(transformer_model_name):
# get transformer model
config = BertConfig(output_attentions=True)
config.output_hidden_states = True
config.return_dict = True
transformer_model = TFBertForNextSentencePrediction.from_pretrained(transformer_model_name, config=config)
return transformer_model
def get_keras_model(transformer_model):
# get keras model
input_ids = tf.keras.layers.Input(shape=(max_length,), name='input_ids', dtype='int32')
input_masks_ids = tf.keras.layers.Input(shape=(max_length,), name='attention_mask', dtype='int32')
token_type_ids = tf.keras.layers.Input(shape=(max_length,), name='token_type_ids', dtype='int32')
X = transformer_model({'input_ids': input_ids, 'attention_mask': input_masks_ids, 'token_type_ids': token_type_ids})[0]
model = tf.keras.Model(inputs=[input_ids, input_masks_ids, token_type_ids], outputs=X)
model.summary()
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
optimizer=tf.optimizers.Adam(learning_rate=0.00005), metrics=['accuracy'])
return model
def get_metrices(true_values, pred_values):
cm = confusion_matrix(true_values, pred_values)
acc_score = accuracy_score(true_values, pred_values)
f1 = f1_score(true_values, pred_values, average="binary")
precision = precision_score(true_values, pred_values, average="binary")
recall = recall_score(true_values, pred_values, average="binary")
metrices = {'confusion_matrix': cm,
'acc_score': acc_score,
'f1': f1,
'precision': precision,
'recall': recall
}
for k, v in metrices.items():
print(k, ':\n', v)
return metrices
# get tokenizer
tokenizer = get_tokenizer(PRETRAINED_MODEL)
# train
prompt = ["Hello", "Hello", "Hello", "Hello"]
next_sentence = ["How are you?", "Pizza", "How are you?", "Pizza"]
train_labels = [0, 1, 0, 1]
train_labels = to_categorical(train_labels)
train_inputs = tokenize_nsp_data(prompt, next_sentence, max_length)
train_data_features = get_data_features(train_inputs, max_length)
# val
prompt = ["Hello", "Hello", "Hello", "Hello"]
next_sentence = ["How are you?", "Pizza", "How are you?", "Pizza"]
val_labels = [0, 1, 0, 1]
val_labels = to_categorical(val_labels)
val_inputs = tokenize_nsp_data(prompt, next_sentence, max_length)
val_data_features = get_data_features(val_inputs, max_length)
# get transformer model
transformer_model = get_transformer_model(PRETRAINED_MODEL)
# get keras model
model = get_keras_model(transformer_model)
callback_list = []
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=4, min_delta=0.005, verbose=1)
callback_list.append(early_stop)
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2, epsilon=0.001)
callback_list.append(reduce_lr)
callback_list.append(checkpoint)
history = model.fit([train_data_features['input_ids'], train_data_features['attention_mask'],
train_data_features['token_type_ids']], np.array(train_labels), batch_size=2, epochs=3,
validation_data=([val_data_features['input_ids'], val_data_features['attention_mask'],
val_data_features['token_type_ids']], np.array(val_labels)), verbose=1,
callbacks=callback_list)
model.layers[3].save_pretrained(model_path) # need to save this and make sure i can get the hidden states
## predict
# load model
transformer_model = get_transformer_model(model_path)
model = get_keras_model(transformer_model)
model.summary()
model.load_weights(os.path.join(model_path, model_fn))
# test
prompt = ["Hello", "Hello"]
next_sentence = ["How are you?", "Pizza"]
test_labels = [0, 1]
test_df = pd.DataFrame({'A': prompt, 'B': next_sentence, 'label': test_labels})
test_labels = to_categorical(val_labels)
test_inputs = tokenize_nsp_data(prompt, next_sentence, max_length)
test_data_features = get_data_features(test_inputs, max_length)
# predict
pred_test = model.predict([test_data_features['input_ids'], test_data_features['attention_mask'], test_data_features['token_type_ids']])
preds = tf.keras.activations.softmax(tf.convert_to_tensor(pred_test)).numpy()
true_test = test_df['label'].to_list()
pred_test = [1 if p[1] > 0.5 else 0 for p in preds]
test_df['pred_val'] = pred_test
metrices = get_metrices(true_test, pred_test)
I am also attaching a picture from the debugging mode in which I try (with no success) to view the hidden state. The problem is I am not able to see and save the transform model I trained and view the embeddings of the last hidden state. I tried converting the KerasTensor to numpy array but without success.
The issue resides in your 'get_keras_model()' function. You defined here that you are only interested in the first of the element of the output (i.e. logits) with:
X = transformer_model({'input_ids': input_ids, 'attention_mask': input_masks_ids, 'token_type_ids': token_type_ids})[0]
Just do the index selection as conditional like this to get the whole output of the model
def get_keras_model(transformer_model, is_training=True):
###your other code
X = transformer_model({'input_ids': input_ids, 'attention_mask': input_masks_ids, 'token_type_ids': token_type_ids})
if is_training:
X= X[0]
###your other code
return model
#predict
###your other code
model = get_keras_model(transformer_model, is_training=False)
###your other code
print(pred_test.keys())
Output:
odict_keys(['logits', 'hidden_states', 'attentions'])
P.S.: The BertTokenizer can truncate and add padding by themself (documentation).

Tensorflow: How to setup a CheckpointSaverHook

When I start the training on my tf.estimator.Estimator object,
Tensorflow automatically creates a CheckpointSaverHook whilst printing
INFO:tensorflow:Create CheckpointSaverHook.
This automatically created SaverHook will save my model at the very start and the end of the training.
What I want though is to create a checkpoint every n training steps. For this I created my own saving hook and passed it to my estimator when training.
saver_hook = tf.train.CheckpointSaverHook(
checkpoint_dir = model_dir,
save_steps = 100
)
model.train(input_fn,steps=1500,hooks=[saver_hook])
This works in theory but my own CheckpointSaverHook will just save *.meta files, while the automatically created one saves *.meta, *.index and *.data-XXXXX-of-XXXXX files.
How can I configure my own SaverHook to do that aswell?
EDIT:
Added my whole network definition
network.py
import pickle
import random
import numpy as np
import tensorflow as tf
LEARNING_RATE = 0.002
class TFDotNet:
def __init__(self,model_dir):
# model def
self.model_dir = model_dir
self.model = tf.estimator.Estimator(model_fn=model_fn,model_dir=model_dir)
# hooks
self.summary_hook = tf.train.SummarySaverHook(
save_steps=50,
output_dir=model_dir,
scaffold=tf.train.Scaffold()
)
self.saver_hook = tf.train.CheckpointSaverHook(
checkpoint_dir=model_dir,
save_steps=100,
)
def train(self,x_train,y_train,steps=1500,batch_size=128):
""" train the neuralnetwork """
tf.logging.set_verbosity(tf.logging.INFO)
input_fn = tf.estimator.inputs.numpy_input_fn(
x={'x': x_train}, y=y_train,batch_size=batch_size, num_epochs=None, shuffle=True
)
self.model.train(input_fn,steps=steps,hooks=[self.summary_hook,self.saver_hook])
def predict(self,x_predict):
""" predict some inputs """
input_fn = tf.estimator.inputs.numpy_input_fn(
x={'x':x_predict}, y=None, batch_size=1, shuffle=False
)
return list(self.model.predict(input_fn))
def evaluate(self,x_test,y_test):
""" evaluate network on testset """
input_fn = tf.estimator.inputs.numpy_input_fn(
x={'x': x_test}, y=y_test,batch_size=1, shuffle=False
)
return self.model.evaluate(input_fn)
def load_dataset(self,dataset_path):
""" loads a dataset from a serialized data file """
with open(dataset_path,'rb') as f:
return pickle.load(f)
def split_dataset(self,dataset,ratio,random_state=42):
""" splits a loaded dataset into training and testset """
random.seed(random_state)
random.shuffle(dataset)
length = int(ratio * len(dataset))
test_data = dataset[:length]
training_data = dataset[length:]
x_train = np.hstack([x for (x, y) in training_data]).transpose().astype('float32')
y_train = np.asarray([y for (x, y) in training_data]).reshape(-1, 1).astype('float32')
x_test = np.hstack([x for (x, y) in test_data]).transpose().astype('float32')
y_test = np.asarray([y for (x, y) in test_data]).reshape(-1, 1).astype('float32')
return x_train, y_train, x_test, y_test
def export(self):
""" exports the conv net """
def serving_input_receiver_fn():
# The outer dimension (None) allows us to batch up inputs for
# efficiency. However, it also means that if we want a prediction
# for a single instance, we'll need to wrap it in an outer list.
inputs = {"x": tf.placeholder(shape=[None, 900], dtype=tf.float32)}
return tf.estimator.export.ServingInputReceiver(inputs, inputs)
self.model.export_savedmodel(
export_dir_base=self.model_dir,
serving_input_receiver_fn=serving_input_receiver_fn)
def cnn_layout(features,reuse,is_training):
with tf.variable_scope('cnn',reuse=reuse):
# resize input to [batchsize,height,width,channel]
x = tf.reshape(features['x'], shape=[-1,30,30,1])
# conv1, 32 filter, 5 kernel
conv1 = tf.layers.conv2d(x, 32, 5, activation=tf.nn.relu, name='conv1')
# pool1, 2 stride, 2 kernel
pool1 = tf.layers.max_pooling2d(conv1, 2, 2, name='pool1')
# conv2, 64 filter, 3 kernel
conv2 = tf.layers.conv2d(pool1, 64, 3, activation=tf.nn.relu, name='conv2')
# pool2, 2 stride, 2 kernel
pool2 = tf.layers.max_pooling2d(conv2, 2, 2, name='pool2')
# flatten pool2
flatten = tf.contrib.layers.flatten(pool2)
# fc1 with 1024 neurons
fc1 = tf.layers.dense(flatten, 1024, name='fc1')
# 75% dropout
drop = tf.layers.dropout(fc1, rate=0.75, training=is_training, name='dropout')
# output logits
output = tf.layers.dense(drop, 1, name='output_logits')
return output
def model_fn(features, labels, mode):
# setup two networks one for training one for prediction while sharing weights
logits_train = cnn_layout(features=features,reuse=False,is_training=True)
logits_test = cnn_layout(features=features,reuse=True,is_training=False)
# predictions
probabilites = tf.sigmoid(logits_test, name='probabilities')
predictions = tf.round(probabilites,name='predictions')
export_outputs = tf.estimator.export.PredictOutput(outputs={'predictions':predictions,'probabilities':probabilites})
if mode == tf.estimator.ModeKeys.PREDICT:
return tf.estimator.EstimatorSpec(mode, predictions=predictions, export_outputs={'outputs':export_outputs})
# define loss and optimizer
loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=logits_train,labels=labels),name='loss')
optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE, name='optimizer')
train = optimizer.minimize(loss, global_step=tf.train.get_global_step(),name='train')
# accuracy for evaluation
accuracy = tf.metrics.accuracy(labels=labels,predictions=predictions,name='accuracy')
# summarys for tensorboard
tf.summary.scalar('loss',loss)
# return training and evalution spec
return tf.estimator.EstimatorSpec(
mode=mode,
predictions=predictions,
loss=loss,
train_op=train,
eval_metric_ops={'accuracy':accuracy}
)
training.py
from network import TFDotNet
from time import time
# settings
training_steps = 10000
mini_batch_size = 128
model_dir = 'neuralnet_data/02_networks/network01'
dataset_path = 'neuralnet_data/01_datasets/dataset.data'
# init dotnet
dotnet = TFDotNet(model_dir=model_dir)
# load dataset
print('loading dataset ...')
dataset = dotnet.load_dataset(dataset_path)
# split dataset
x_train, y_train, x_test, y_test = dotnet.split_dataset(dataset,0.1)
# train network
print('starting training ...')
t0 = time()
dotnet.train(x_train,y_train,steps=training_steps,batch_size=mini_batch_size)
print('Training took {}s'.format(time()-t0))
The problem here is that, when no Saver is specified (either directly or by the scaffold), CheckpointSaverHook will create a new Saver in its constructor. If the __init__ is not run in the same Graph as your model, then it won't find any variables so nothing will be saved (https://github.com/tensorflow/tensorflow/issues/13265).
Assuming you are using the tf.estimator framework, then the Graph you want simply does not exist yet before the call to train.
You should be able to work around that by creating the saver inside your model_fn, and pass it as a hook to the EstimatorSpec.
here my my code. it works fine, the complete code is on mygithub
start_time = datetime.datetime.now()
saver_hook = tf.train.CheckpointSaverHook(
checkpoint_dir=FLAGS.train_dir,
save_steps=100,
)
config = tf.estimator.RunConfig()
config = config.replace(session_config=sess_config)
per_example_hook = ExamplesPerSecondHook(FLAGS.train_batch_size, every_n_steps=100)
hooks = [per_example_hook,saver_hook]
classifier = tf.estimator.Estimator(
model_fn=model_fn_cnn,
model_dir= FLAGS.train_dir,
config=config,
)
classifier.train(input_fn=functools.partial(input_fn,subset="training"),
steps=FLAGS.train_steps,
hooks=hooks
)
train_time = datetime.datetime.now() - start_time

How to save estimator in Tensorflow for later use?

I followed the tutorial "A Guide to TF Layers: Building a Convolutional Neural Network" (here is the code: https://github.com/tensorflow/tensorflow/blob/r1.1/tensorflow/examples/tutorials/layers/cnn_mnist.py).
I adapted the tutorial for my needs, which is hand detection.
As far as I understand, this tutorial creates the estimator (which is a CNN), then does the fitting, and finally, it evaluates the performance of the estimator. Now, my problem is that I want to use the estimator int another file, which is going to be my main program. How do I access the estimator from another file? Do I have to fit the estimator every time I want to use it?? (I hope not)
I was wondering if someone could help me understand how to save the estimator to use it later. (as far as I understand, I cant create a saver with tf.train.Saver, because I don't have a session running).
Here is the code from my train.py file:
def main(unused_argv):
#Load training and eval data (part missing)
# Create the estimator
hand_detector = learn.Estimator(model_fn=cnn_model_fn, model_dir="\cnn_model_fn")
# Set up logging for predictions
# Log the values in the "Softmax" tensor with label "probabilities"
tensors_to_log = {"probabilities": "softmax_tensor"}
logging_hook = tf.train.LoggingTensorHook(
tensors=tensors_to_log, every_n_iter=50)
# Train the model
hand_detector.fit(
x=train_data,
y=train_labels,
batch_size=100,
steps=20000,
monitors=[logging_hook])
# Configure the accuracy metric for evaluation
metrics = {
"accuracy":
learn.MetricSpec(
metric_fn=tf.metrics.accuracy, prediction_key="classes"),
}
# Evaluate the model and print results
eval_results = hand_detector.evaluate(
x=eval_data, y=eval_labels, metrics=metrics)
print(eval_results)
# Save the model for later use (part missing!)
Almost all real applications of machine learning seek to train a model once and then save it for future uses with new data. Most classifiers spend hours in the training stage and just few seconds in the testing stage, so is fundamental learn how to save successfully a trained model.
I'm going to explain how to export "high level" Tensorflow models (using export_savedmodel).
The function export_savedmodel requires the argument serving_input_receiver_fn, that is a function without arguments, which defines the input from the model and the predictor. Therefore, you must create your own serving_input_receiver_fn, where the model input type match with the model input in the training script, and the predictor input type match with the predictor input in the testing script.
On the other hand, if you create a custom model, you must define the export_outputs, defined by the function tf.estimator.export.PredictOutput, which input is a dictionary that define the name that has to match with the name of the predictor output in the testing script.
For example:
TRAINING SCRIPT
def serving_input_receiver_fn():
serialized_tf_example = tf.placeholder(dtype=tf.string, shape=[None], name='input_tensors')
receiver_tensors = {"predictor_inputs": serialized_tf_example}
feature_spec = {"words": tf.FixedLenFeature([25],tf.int64)}
features = tf.parse_example(serialized_tf_example, feature_spec)
return tf.estimator.export.ServingInputReceiver(features, receiver_tensors)
def estimator_spec_for_softmax_classification(logits, labels, mode):
predicted_classes = tf.argmax(logits, 1)
if (mode == tf.estimator.ModeKeys.PREDICT):
export_outputs = {'predict_output': tf.estimator.export.PredictOutput({"pred_output_classes": predicted_classes, 'probabilities': tf.nn.softmax(logits)})}
return tf.estimator.EstimatorSpec(mode=mode, predictions={'class': predicted_classes, 'prob': tf.nn.softmax(logits)}, export_outputs=export_outputs) # IMPORTANT!!!
onehot_labels = tf.one_hot(labels, 31, 1, 0)
loss = tf.losses.softmax_cross_entropy(onehot_labels=onehot_labels, logits=logits)
if (mode == tf.estimator.ModeKeys.TRAIN):
optimizer = tf.train.AdamOptimizer(learning_rate=0.01)
train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
eval_metric_ops = {'accuracy': tf.metrics.accuracy(labels=labels, predictions=predicted_classes)}
return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)
def model_custom(features, labels, mode):
bow_column = tf.feature_column.categorical_column_with_identity("words", num_buckets=1000)
bow_embedding_column = tf.feature_column.embedding_column(bow_column, dimension=50)
bow = tf.feature_column.input_layer(features, feature_columns=[bow_embedding_column])
logits = tf.layers.dense(bow, 31, activation=None)
return estimator_spec_for_softmax_classification(logits=logits, labels=labels, mode=mode)
def main():
# ...
# preprocess-> features_train_set and labels_train_set
# ...
classifier = tf.estimator.Estimator(model_fn = model_custom)
train_input_fn = tf.estimator.inputs.numpy_input_fn(x={"words": features_train_set}, y=labels_train_set, batch_size=batch_size_param, num_epochs=None, shuffle=True)
classifier.train(input_fn=train_input_fn, steps=100)
full_model_dir = classifier.export_savedmodel(export_dir_base="C:/models/directory_base", serving_input_receiver_fn=serving_input_receiver_fn)
TESTING SCRIPT
def main():
# ...
# preprocess-> features_test_set
# ...
with tf.Session() as sess:
tf.saved_model.loader.load(sess, [tf.saved_model.tag_constants.SERVING], full_model_dir)
predictor = tf.contrib.predictor.from_saved_model(full_model_dir)
model_input = tf.train.Example(features=tf.train.Features( feature={"words": tf.train.Feature(int64_list=tf.train.Int64List(value=features_test_set)) }))
model_input = model_input.SerializeToString()
output_dict = predictor({"predictor_inputs":[model_input]})
y_predicted = output_dict["pred_output_classes"][0]
(Code tested in Python 3.6.3, Tensorflow 1.4.0)
Estimator has an export_savedmodel member function for that purpose. You will find the docs here.
Update to David Valenzuela Urrutia's answer(codes)
David Valenzuela Urrutia's answer was for Python 3.6.3, Tensorflow 1.4.0 so i thought of updating the answer(code samples) to Tensorflow 2.x because some funtionalities like tf.Session is not supported in Tensorflow version 2 so you need to replace it with tf.compat.v1.Session for it to work. Visit this link to know more about the changes added to tensorflow version 2
Training script updated code
def serving_input_receiver_fn():
serialized_tf_example = tf.compat.v1.placeholder(dtype=tf.string, shape=[None],
name='input_tensors')
receiver_tensors = {"predictor_inputs": serialized_tf_example}
feature_spec = {"words": tf.io.FixedLenFeature([25],tf.int64)}
features = tf.io.parse_example(serialized=serialized_tf_example,
features=feature_spec)
return tf.estimator.export.ServingInputReceiver(features, receiver_tensors)
def estimator_spec_for_softmax_classification(logits, labels, mode):
predicted_classes = tf.argmax(input=logits, axis=1)
if (mode == tf.estimator.ModeKeys.PREDICT):
export_outputs = {'predict_output':
tf.estimator.export.PredictOutput({"pred_output_classes": predicted_classes, 'probabilities': tf.nn.softmax(logits)})}
return tf.estimator.EstimatorSpec(mode=mode, predictions={'class': predicted_classes, 'prob': tf.nn.softmax(logits)}, export_outputs=export_outputs) # IMPORTANT!!!
onehot_labels = tf.one_hot(labels, 31, 1, 0)
loss =tf.compat.v1.losses.softmax_cross_entropy(onehot_labels=onehot_labels, logits=logits)
if (mode == tf.estimator.ModeKeys.TRAIN):
optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=0.01)
train_op = optimizer.minimize(loss, global_step=tf.compat.v1.train.get_global_step())
return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
eval_metric_ops = {'accuracy': tf.compat.v1.metrics.accuracy(labels=labels, predictions=predicted_classes)}
return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)
def model_custom(features, labels, mode):
bow_column = tf.feature_column.categorical_column_with_identity("words", num_buckets=1000)
bow_embedding_column = tf.feature_column.embedding_column(bow_column, dimension=50)
bow = tf.compat.v1.feature_column.input_layer(features, feature_columns=[bow_embedding_column])
logits = tf.compat.v1.layers.dense(bow, 31, activation=None)
return estimator_spec_for_softmax_classification(logits=logits, labels=labels, mode=mode)
def main():
# ...
# preprocess-> features_train_set and labels_train_set
# ...
classifier = tf.estimator.Estimator(model_fn = model_custom)
train_input_fn = tf.compat.v1.estimator.inputs.numpy_input_fn(x={"words": features_train_set}, y=labels_train_set, batch_size=batch_size_param, num_epochs=None, shuffle=True)
classifier.train(input_fn=train_input_fn, steps=100)
full_model_dir = classifier.export_savedmodel(export_dir_base="C:/models/directory_base", serving_input_receiver_fn=serving_input_receiver_fn)
Testing script updated code
def main():
# ...
# preprocess-> features_test_set
# ...
with tf.compat.v1.Session() as sess:
tf.compat.v1.saved_model.loader.load(sess, [tf.saved_model.SERVING], full_model_dir)
predictor = tf.contrib.predictor.from_saved_model(full_model_dir)
model_input = tf.train.Example(features=tf.train.Features( feature={"words": tf.train.Feature(int64_list=tf.train.Int64List(value=features_test_set)) }))
model_input = model_input.SerializeToString()
output_dict = predictor({"predictor_inputs":[model_input]})
y_predicted = output_dict["pred_output_classes"][0]

Categories