I'm a beginner in this field and am stuck. I am following this tutorial (https://towardsdatascience.com/multi-label-multi-class-text-classification-with-bert-transformer-and-keras-c6355eccb63a) to build a multi-label classifier using Hugging Face transformers.
Following is the code I'm using to train my model.
# Name of the BERT model to use
model_name = 'bert-base-uncased'
# Max length of tokens
max_length = 100
PATH = 'uncased_L-12_H-768_A-12/'
# Load transformers config and set output_hidden_states to False
config = BertConfig.from_pretrained(PATH)
config.output_hidden_states = False
# Load BERT tokenizer
tokenizer = BertTokenizerFast.from_pretrained(PATH, local_files_only=True, config = config)
# tokenizer = BertTokenizer.from_pretrained(PATH, local_files_only=True, config = config)
# Load the Transformers BERT model
transformer_model = TFBertModel.from_pretrained(PATH, config = config,from_pt=True)
#######################################
### ------- Build the model ------- ###
# Load the MainLayer
bert = transformer_model.layers[0]
# Build your model input
input_ids = Input(shape=(None,), name='input_ids', dtype='int32')
# attention_mask = Input(shape=(max_length,), name='attention_mask', dtype='int32')
# inputs = {'input_ids': input_ids, 'attention_mask': attention_mask}
inputs = {'input_ids': input_ids}
# Load the Transformers BERT model as a layer in a Keras model
bert_model = bert(inputs)[1]
dropout = Dropout(config.hidden_dropout_prob, name='pooled_output')
pooled_output = dropout(bert_model, training=False)
# Then build your model output
issue = Dense(units=len(data.U_label.value_counts()), kernel_initializer=TruncatedNormal(stddev=config.initializer_range), name='issue')(pooled_output)
outputs = {'issue': issue}
# And combine it all in a model object
model = Model(inputs=inputs, outputs=outputs, name='BERT_MultiLabel_MultiClass')
# Take a look at the model
model.summary()
#######################################
### ------- Train the model ------- ###
# Set an optimizer
optimizer = Adam(
    learning_rate=5e-05,
    epsilon=1e-08,
    decay=0.01,
    clipnorm=1.0)
# Set loss and metrics
loss = {'issue': CategoricalCrossentropy(from_logits = True)}
# loss = {'issue': CategoricalCrossentropy()}
metric = {'issue': CategoricalAccuracy('accuracy')}
# Compile the model
model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=metric)
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(data['U_H_Label'])
# Ready output data for the model
y_issue = to_categorical(le.transform(data['U_H_Label']))
# Tokenize the input (takes some time)
x = tokenizer(
    text=data['Input_Data'].to_list(),
    add_special_tokens=True,
    max_length=max_length,
    truncation=True,
    padding=True,
    return_tensors='tf',
    return_token_type_ids=False,
    return_attention_mask=True,
    verbose=True)
# Fit the model
history = model.fit(
    # x={'input_ids': x['input_ids'], 'attention_mask': x['attention_mask']},
    x={'input_ids': x['input_ids']},
    y={'issue': y_issue},
    validation_split=0.2,
    batch_size=64,
    epochs=10)
When I use the model.predict() function, I think I get logit scores for each class and would like to convert them to probability scores ranging from 0 to 1.
I have read in multiple blogs that a softmax function is what I have to use, but I am not able to figure out where and how to apply it. If anyone could tell me what line of code is required, I'd be grateful!
Once you have the logit scores from model.predict(), you can do the following:
from torch.nn import functional as F
import torch
# Convert the logit scores to a torch tensor
torch_logits = torch.from_numpy(logit_score)
# Apply softmax over the class axis to get probabilities, then convert back to numpy
probabilities_scores = F.softmax(torch_logits, dim=-1).numpy()[0]
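Since the model above is a Keras/TensorFlow model, you can also stay within TensorFlow and avoid the PyTorch dependency. A minimal sketch, assuming logit_score is the logits array returned for the 'issue' output:
import tensorflow as tf
# Softmax over the class axis turns logits into probabilities that sum to 1
probability_scores = tf.nn.softmax(logit_score, axis=-1).numpy()
Alternatively, you can bake the softmax into the model itself by giving the final Dense layer activation='softmax' and dropping from_logits=True from the loss; model.predict() then returns probabilities directly.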
Related
I am trying to fine-tune a transformer model for text classification, but I am having trouble training the model. I have tried many things but none of them seem to work. I have also tried solutions from other questions, but they didn't work either. I am using the 'microsoft/deberta-v3-base' model for fine-tuning. Here's my code:
train_dataset = Dataset.from_pandas(df_tr[['text', 'label']]).class_encode_column("label")
val_dataset = Dataset.from_pandas(df_tes[['text', 'label']]).class_encode_column("label")
train_tok_dataset = train_dataset.map(tokenizer_func, batched=True, remove_columns=('text'))
val_tok_dataset = val_dataset.map(tokenizer_func, batched=True, remove_columns=('text'))
from transformers import TFAutoModelForSequenceClassification
model = TFAutoModelForSequenceClassification.from_pretrained(config.model_name, num_labels=3)
transformer_model = TFAutoModelForSequenceClassification.from_pretrained(config.model_name, output_hidden_states=True)
input_ids = tf.keras.Input(shape=(config.max_len, ),dtype='int32')
attention_mask = tf.keras.Input(shape=(config.max_len, ), dtype='int32')
transformer = transformer_model([input_ids, attention_mask])
hidden_states = transformer[1] # get output_hidden_states
#print(hidden_states)
hidden_states_size = 4  # count of the last hidden states to use
hidden_states_ind = list(range(-hidden_states_size, 0, 1))
selected_hidden_states = tf.keras.layers.concatenate(tuple([hidden_states[i] for i in hidden_states_ind]))
# Now we can use selected_hidden_states as we want
output = tf.keras.layers.Dense(128, activation='relu')(selected_hidden_states)
output=tf.keras.layers.Flatten()(output)
output = tf.keras.layers.Dense(3, activation='softmax')(output)
model = tf.keras.models.Model(inputs = [input_ids, attention_mask], outputs = output)
from transformers import create_optimizer
import tensorflow as tf
batch_size = 8
num_epochs = config.epochs
batches_per_epoch = len(train_tok_dataset) // batch_size
total_train_steps = int(batches_per_epoch * num_epochs)
optimizer, schedule = create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps)
model.compile(optimizer=optimizer)
with tf.device('GPU:0'):
    model.fit(x=[np.array(train_tok_dataset["input_ids"]), np.array(train_tok_dataset["attention_mask"])],
              y=tf.keras.utils.to_categorical(y_train, num_classes=3),
              validation_data=([np.array(val_tok_dataset["input_ids"]), np.array(val_tok_dataset["attention_mask"])],
                               tf.keras.utils.to_categorical(y_test, num_classes=3)),
              epochs=config.epochs, class_weight={0: 0.57, 1: 0.18, 2: 0.39})
It seems like a small issue, but I am new to TensorFlow and transformers, so I couldn't sort it out myself.
I would say it's probably because you are not passing a loss to compile, so no gradient can be computed with respect to it:
model.compile(optimizer=optimizer)
^^^^^^^^^^^^^^^^^^^^---- no "loss = tf.keras.losses...
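A minimal sketch of the fix, assuming the one-hot targets produced by to_categorical above (the final Dense layer already applies a softmax, so from_logits stays at its default of False):
model.compile(optimizer=optimizer,
              loss=tf.keras.losses.CategoricalCrossentropy(),
              metrics=['accuracy'])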
Maybe you're just missing an = on the right side of validation_data.
model.fit(
    x=[np.array(...), np.array(...)],
    y=tf.keras.utils.to_categorical(...),
    validation_data=([np.array(...), np.array(...)], tf.keras.utils.to_categorical(...)),
    ...
)
I'm trying to train a basic graph neural network using the StellarGraph library, in particular starting from the example provided in [0].
The example works fine, but now I would like to repeat the same exercise removing the N-fold cross-validation and providing specific training, validation and test sets. I'm trying to do so with the following code:
# One hot encoding
graph_training_set_labels_encoded = pd.get_dummies(graphs_training_set_labels, drop_first=True)
graph_validation_set_labels_encoded = pd.get_dummies(graphs_validation_set_labels, drop_first=True)
graphs = graphs_training_set + graphs_validation_set
# Graph generator preparation
generator = PaddedGraphGenerator(graphs=graphs)
train_gen = generator.flow([x for x in range(0, len(graphs_training_set))],
                           targets=graph_training_set_labels_encoded,
                           batch_size=batch_size)
valid_gen = generator.flow([x for x in range(len(graphs_training_set),
                                             len(graphs_training_set) + len(graphs_validation_set))],
                           targets=graph_validation_set_labels_encoded,
                           batch_size=batch_size)
# Stopping criterion
es = EarlyStopping(monitor="val_loss",
                   min_delta=0,
                   patience=20,
                   restore_best_weights=True)
# Model definition
gc_model = GCNSupervisedGraphClassification(layer_sizes=[64, 64],
                                            activations=["relu", "relu"],
                                            generator=generator,
                                            dropout=dropout_value)
x_inp, x_out = gc_model.in_out_tensors()
predictions = Dense(units=32, activation="relu")(x_out)
predictions = Dense(units=16, activation="relu")(predictions)
predictions = Dense(units=1, activation="sigmoid")(predictions)
# Creating Keras model and preparing it for training
model = Model(inputs=x_inp, outputs=predictions)
model.compile(optimizer=Adam(adam_value), loss=binary_crossentropy, metrics=["acc"])
# GNN Training
history = model.fit(train_gen, epochs=num_epochs, validation_data=valid_gen, verbose=0, callbacks=[es])
# Calculate performance on the validation data
test_metrics = model.evaluate(valid_gen, verbose=0)
valid_acc = test_metrics[model.metrics_names.index("acc")]
print(f"Test Accuracy model = {valid_acc}")
where graphs_training_set and graphs_validation_set are lists of StellarDiGraphs.
I am able to run this piece of code, but it returns NaN as the result. What could be the problem?
This is the first time I am using StellarGraph, and in particular PaddedGraphGenerator, so I think my mistake lies in the usage of that generator; however, providing the training and validation sets in a different manner didn't produce better results.
Thank you in advance.
UPDATE: Fixed a typo in the code, as pointed out here (thanks to george123).
[0] https://stellargraph.readthedocs.io/en/stable/demos/graph-classification/gcn-supervised-graph-classification.html
I found a solution by digging into the StellarGraph documentation for PaddedGraphGenerator and the GCN neural network class GCNSupervisedGraphClassification. Furthermore, I found a similar question on the StellarGraph issue tracker which also points to the solution.
# Graph generator preparation
generator = PaddedGraphGenerator(graphs=graphs)
train_gen = generator.flow([x for x in range(0, num_graphs_for_training)],
                           targets=training_graphs_labels,
                           batch_size=35)
valid_gen = generator.flow([x for x in range(num_graphs_for_training, num_graphs_for_training + num_graphs_for_validation)],
                           targets=validation_graphs_labels,
                           batch_size=35)
# Stopping criterion
es = EarlyStopping(monitor="val_loss",
                   min_delta=0.001,
                   patience=10,
                   restore_best_weights=True)
# Model definition
gc_model = GCNSupervisedGraphClassification(layer_sizes=[64, 64],
                                            activations=["relu", "relu"],
                                            generator=generator,
                                            dropout=dropout_value)
x_inp, x_out = gc_model.in_out_tensors()
predictions = Dense(units=32, activation="relu")(x_out)
predictions = Dense(units=16, activation="relu")(predictions)
predictions = Dense(units=1, activation="sigmoid")(predictions)
# Let's create the Keras model and prepare it for training
model = Model(inputs=x_inp, outputs=predictions)
model.compile(optimizer=Adam(adam_value), loss=binary_crossentropy, metrics=["acc"])
# GNN Training
history = model.fit(train_gen, epochs=num_epochs, validation_data=valid_gen, verbose=1, callbacks=[es])
# Evaluate performance on the validation data
valid_metrics = model.evaluate(valid_gen, verbose=0)
valid_acc = valid_metrics[model.metrics_names.index("acc")]
# Define test set indices temporary vars
index_begin_test_set = num_graphs_for_training + num_graphs_for_validation
index_end_test_set = index_begin_test_set + num_graphs_for_testing
test_set_indices = [x for x in range(index_begin_test_set, index_end_test_set)]
# Evaluate performance on test set
generator_for_test_set = PaddedGraphGenerator(graphs=graphs)
test_gen = generator_for_test_set.flow(test_set_indices)
result = model.predict(test_gen)
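Since the output head is a single sigmoid unit, result holds one probability per test graph. A short sketch for turning those into hard labels (the 0.5 threshold is an assumption you may want to tune):
# result has shape (num_graphs_for_testing, 1)
predicted_labels = (result > 0.5).astype(int).ravel()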
I have this model:
# Set random seed
tf.random.set_seed(42)
# Create some regression data
X_regression = np.expand_dims(np.arange(0, 1000, 5), axis=0)
y_regression = np.expand_dims(np.arange(100, 1100, 5), axis=0)
# Split it into training and test sets
X_reg_train = X_regression[:150]
X_reg_test = X_regression[150:]
y_reg_train = y_regression[:150]
y_reg_test = y_regression[150:]
# Setup random seed
tf.random.set_seed(42)
# Recreate the model
model_3 = tf.keras.Sequential([
    tf.keras.layers.Dense(100),
    tf.keras.layers.Dense(10),
    tf.keras.layers.Dense(1)
])
# Change the loss and metrics of our compiled model
model_3.compile(loss=tf.keras.losses.mae,  # change the loss function to be regression-specific
                optimizer=tf.keras.optimizers.Adam(learning_rate=0.01),
                metrics=['mae'])  # change the metric to be regression-specific
# Fit the recompiled model
model_3.fit(X_reg_train, y_reg_train, epochs=100)
To begin with, the model does not train well.
In addition, when I try to predict using that model, I get the following error:
Why am I getting the above error and how can I fix it?
Change the axis in expand_dims to 1 and slice your data accordingly, since it is 2D. With axis=0 the arrays have shape (1, 200), so X_regression[:150] is still the whole single row and X_regression[150:] is empty; with axis=1 you get 200 samples of one feature each:
import tensorflow as tf
import numpy as np
tf.random.set_seed(42)
# Create some regression data
X_regression = np.expand_dims(np.arange(0, 1000, 5), axis=1)
y_regression = np.expand_dims(np.arange(100, 1100, 5), axis=1)
# Split it into training and test sets
X_reg_train = X_regression[:150, :]
X_reg_test = X_regression[150:, :]
y_reg_train = y_regression[:150, :]
y_reg_test = y_regression[150:, :]
tf.random.set_seed(42)
# Recreate the model
model_3 = tf.keras.Sequential([
    tf.keras.layers.Dense(100),
    tf.keras.layers.Dense(10),
    tf.keras.layers.Dense(1)
])
# Change the loss and metrics of our compiled model
model_3.compile(loss=tf.keras.losses.mae,  # change the loss function to be regression-specific
                optimizer=tf.keras.optimizers.Adam(learning_rate=0.01),
                metrics=['mae'])  # change the metric to be regression-specific
# Fit the recompiled model
model_3.fit(X_reg_train, y_reg_train, epochs=100)
model_3.predict(X_reg_test)
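As a quick sanity check, the shapes now line up as 200 samples of one feature each:
print(X_regression.shape)                    # (200, 1)
print(X_reg_train.shape, X_reg_test.shape)   # (150, 1) (50, 1)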
I have trained a NN using tf.keras and saved the whole model with ModelCheckpoint in a .h5 file.
However, when I restore it with models.load_model and then train it again with fit, it only returns a History object and does nothing more.
Below is the minimal example for the training:
import numpy as np
import tensorflow as tf
# Creates dummy data
train_x = np.random.randint(10,size=40).reshape(-1,1)
train_y = np.random.randint(2,size=40).reshape(-1,1)
train_set = (train_x,train_y)
val_x = np.random.randint(10,size=20).reshape(-1,1)
val_y = np.random.randint(2,size=20).reshape(-1,1)
val_set = (val_x,val_y)
# Set Learning Rate Decay
import math
def step_decay(epoch):
    print('--- Epoch:', epoch)
    print(tf.keras.callbacks.History())
    init_lr = 0.001
    drop = 0.9
    epochs_drop = 1.0
    lr = init_lr * math.pow(drop, math.floor((1 + epoch) / epochs_drop))
    return lr
lr_callback = tf.keras.callbacks.LearningRateScheduler(step_decay)
# Saves the whole model
cp_callback = tf.keras.callbacks.ModelCheckpoint('model.h5',
                                                 save_weights_only=False,
                                                 verbose=True)
# Creates the model
model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(1,activation='relu',use_bias=False,input_dim=(1)))
model.add(tf.keras.layers.Dense(100,activation='relu',use_bias=False))
model.add(tf.keras.layers.Dense(1,activation='relu',use_bias=False))
model.compile(loss='mean_squared_error',
              optimizer=tf.keras.optimizers.Adam(),
              metrics=['accuracy'])
print('Learning Rate: ',tf.keras.backend.eval(model.optimizer.lr))
# Train the model
model.fit(x=train_set[0], y=train_set[1], epochs=2, steps_per_epoch=40,
          validation_data=val_set, validation_steps=20,
          callbacks=[lr_callback, cp_callback])
print('Learning Rate: ',tf.keras.backend.eval(model.optimizer.lr))
The code I am currently using to load it again is the following:
import numpy as np
import tensorflow as tf
# Creates dummy data
train_x = np.random.randint(10,size=40).reshape(-1,1)
train_y = np.random.randint(2,size=40).reshape(-1,1)
train_set = (train_x,train_y)
val_x = np.random.randint(10,size=20).reshape(-1,1)
val_y = np.random.randint(2,size=20).reshape(-1,1)
val_set = (val_x,val_y)
# Set Learning Rate Decay
import math
def step_decay(epoch):
    print('--- Epoch:', epoch)
    print(tf.keras.callbacks.History())
    init_lr = 0.001
    drop = 0.9
    epochs_drop = 1.0
    lr = init_lr * math.pow(drop, math.floor((1 + epoch) / epochs_drop))
    return lr
lr_callback = tf.keras.callbacks.LearningRateScheduler(step_decay)
# Saves the whole model
cp_callback = tf.keras.callbacks.ModelCheckpoint('model.h5',
                                                 save_weights_only=False,
                                                 verbose=True)
# Load model
model = tf.keras.models.load_model('model.h5')
print('Learning Rate: ',tf.keras.backend.eval(model.optimizer.lr))
model.fit(x=train_set[0], y=train_set[1], epochs=2, steps_per_epoch=40,
          validation_data=val_set, validation_steps=20, initial_epoch=3,
          callbacks=[lr_callback, cp_callback])
As you can observe when running it, the learning rate is restored, and hence (I think) the whole model as well. However, after running model.fit(...) it does nothing but return <tensorflow.python.keras.callbacks.History object at 0x7f11c81cb940>. Any idea how to allow it to train again?
EDIT: I also tried to compile it after loading by setting the compile argument of load_model to True.
Did you try to compile it after loading?
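For reference, a minimal sketch of recompiling after loading, reusing the same settings as the original training script:
model = tf.keras.models.load_model('model.h5', compile=False)
# Recompile with the same loss/optimizer used before saving
model.compile(loss='mean_squared_error',
              optimizer=tf.keras.optimizers.Adam(),
              metrics=['accuracy'])
Note also that the reload script calls fit with epochs=2 but initial_epoch=3; since initial_epoch is already past epochs, Keras trains for zero epochs and immediately returns a History object, so those two arguments need adjusting as well (e.g. epochs=5, initial_epoch=3 to train two more epochs).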
I want to make a Seq2Seq model for reconstruction purposes: a model trained to reconstruct normal time-series, on the assumption that such a model would do badly at reconstructing anomalous time-series it has not seen during training.
I have some gaps in my code and also in my understanding. I took this as an orientation and did the following so far:
training data: input_data.shape (1000, 60, 1) and target_data.shape (1000, 60, 1), with the target data being the same training data, only in reversed order, as suggested in the paper here.
For inference: I want to predict another time-series with the trained model, having the shape (3000, 60, 1). Now 2 points are open: how do I specify the input data for my training model, and how do I build the inference part with the stop condition?
Please correct any mistakes.
from keras.models import Model
from keras.layers import Input
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import TimeDistributed
num_encoder_tokens = 1  # number of features
num_decoder_tokens = 1  # number of features
encoder_seq_length = None
decoder_seq_length = None
batch_size = 50
epochs = 40
# same data for training
input_seqs=()#shape (1000,60,1) with sliding windows
target_seqs=()#shape(1000,60,1) with sliding windows but reversed
x= #what has x to be ?
#data for inference
# how do I specify the input data for my other time series ?
# Define training model
encoder_inputs = Input(shape=(encoder_seq_length, num_encoder_tokens))
encoder = LSTM(128, return_state=True, return_sequences=True)
encoder_outputs = encoder(encoder_inputs)
_, encoder_states = encoder_outputs[0], encoder_outputs[1:]
decoder_inputs = Input(shape=(decoder_seq_length, num_decoder_tokens))
decoder = LSTM(128, return_sequences=True)
decoder_outputs = decoder(decoder_inputs, initial_state=encoder_states)
decoder_outputs = TimeDistributed(
    Dense(num_decoder_tokens, activation='tanh'))(decoder_outputs)
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
# Training
model.compile(optimizer='adam', loss='mse')
model.fit([input_seqs, x], target_seqs,
          batch_size=batch_size, epochs=epochs)
# Define sampling models for inference
encoder_model = Model(encoder_inputs, encoder_states)
decoder_state_input_h = Input(shape=(128,))  # must match the decoder LSTM units (128)
decoder_state_input_c = Input(shape=(128,))
decoder_states = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs = decoder(decoder_inputs,
                          initial_state=decoder_states)
decoder_model = Model([decoder_inputs] + decoder_states,
                      decoder_outputs)
# Sampling loop for a batch of sequences
states_values = encoder_model.predict(input_seqs)
stop_condition = False
while not stop_condition:
    output_tokens = decoder_model.predict([target_seqs] + states_values)
    # what else do I need to include here ?
    break
from numpy import array

def predict_sequence(infenc, infdec, source, n_steps, cardinality):
    # encode the source sequence into the initial decoder states
    state = infenc.predict(source)
    # start-of-sequence input
    target_seq = array([0.0 for _ in range(cardinality)]).reshape(1, 1, cardinality)
    # collect predictions
    output = list()
    for t in range(n_steps):
        # predict the next time step
        yhat, h, c = infdec.predict([target_seq] + state)
        # store the prediction
        output.append(yhat[0, 0, :])
        # update the states
        state = [h, c]
        # update the target sequence (feed the output back in)
        target_seq = yhat
    return array(output)
You can see that the output from every timestep is fed back into the LSTM cell externally.
You can refer to this blog post to see how it is done during inference:
https://machinelearningmastery.com/develop-encoder-decoder-model-sequence-sequence-prediction-keras/
During training, we feed the data in a one-shot manner; I think you understand that part.
But at inference time we can't do that. We have to feed the data one time step at a time, get back the cell and hidden states, and keep the loop going until the last element is generated.
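With that in place, here is a sketch of how predict_sequence could be called for the reconstruction setup in the question. Note the assumptions: infdec must return the states as well (the decoder model in the question only returns outputs, so its LSTM would need return_state=True and the states wired into the Model outputs), and anomalous_window is a hypothetical (1, 60, 1) slice of the series you want to reconstruct:
# anomalous_window: hypothetical input of shape (1, 60, 1)
reconstruction = predict_sequence(encoder_model, decoder_model,
                                  anomalous_window, n_steps=60, cardinality=1)
# reconstruction has shape (60, 1); a large reconstruction error flags an anomaly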