I am trying to figure out sentiment classification on movie reviews using BERT, transformers and tensorflow. This is the code I currently have:
def read_dataset(filename, model_name="bert-base-uncased"):
"""Reads a dataset from the specified path and returns sentences and labels"""
tokenizer = BertTokenizer.from_pretrained(model_name)
with open(filename, "r", encoding="utf-8") as f:
lines = f.readlines()
# preallocate memory for the data
sents, labels = list(), np.empty((len(lines), 1), dtype=int)
for i, line in enumerate(lines):
text, str_label, _ = line.split("\t")
labels[i] = int(str_label.split("=")[1] == "POS")
sents.append(text)
return dict(tokenizer(sents, padding=True, truncation=True, return_tensors="tf")), labels
class BertMLP(tf.keras.Model):
def __init__(self, embed_batch_size=100, model_name="bert-base-cased"):
super(BertMLP, self).__init__()
self.bs = embed_batch_size
self.model = TFBertModel.from_pretrained(model_name)
self.classification_head = tf.keras.models.Sequential(
layers = [
tf.keras.Input(shape=(self.model.config.hidden_size,)),
tf.keras.layers.Dense(350, activation="tanh"),
tf.keras.layers.Dense(200, activation="tanh"),
tf.keras.layers.Dense(50, activation="tanh"),
tf.keras.layers.Dense(1, activation="sigmoid", use_bias=False)
]
)
def call(self, inputs):
outputs = self.model(inputs)
return outputs
def evaluate(model, inputs, labels, loss_func):
mean_loss = tf.keras.metrics.Mean(name="train_loss")
accuracy = tf.keras.metrics.BinaryAccuracy(name="train_accuracy")
predictions = model(inputs)
mean_loss(loss_func(labels, predictions))
accuracy(labels, predictions)
return mean_loss.result(), accuracy.result() * 100
if __name__ == "__main__":
train = read_dataset("datasets/rt-polarity.train.vecs")
dev = read_dataset("datasets/rt-polarity.dev.vecs")
test = read_dataset("datasets/rt-polarity.test.vecs")
mlp = BertMLP()
mlp.compile(tf.keras.optimizers.SGD(learning_rate=0.01), loss='mse')
dev_loss, dev_acc = evaluate(mlp, *dev, tf.keras.losses.MeanSquaredError())
print("Before training:", f"Dev Loss: {dev_loss}, Dev Acc: {dev_acc}")
mlp.fit(*train, epochs=10, batch_size=10)
dev_loss, dev_acc = evaluate(mlp, *dev, tf.keras.losses.MeanSquaredError())
print("After training:", f"Dev Loss: {dev_loss}, Dev Acc: {dev_acc}")
However, when I run this code, I get an error:
Traceback (most recent call last):
File "C:\Users\home\anaconda3\lib\site-packages\spyder_kernels\py3compat.py", line 356, in compat_exec
exec(code, globals, locals)
File "c:\users\home\downloads\mlp.py", line 60, in <module>
dev_loss, dev_acc = evaluate(mlp, *dev, tf.keras.losses.MeanSquaredError())
File "c:\users\home\downloads\mlp.py", line 46, in evaluate
predictions = model(inputs)
File "C:\Users\home\anaconda3\lib\site-packages\keras\utils\traceback_utils.py", line 67, in error_handler
raise e.with_traceback(filtered_tb) from None
File "c:\users\home\downloads\mlp.py", line 39, in call
outputs = self.model(inputs)
File "C:\Users\home\anaconda3\lib\site-packages\transformers\modeling_tf_utils.py", line 409, in run_call_with_unpacked_inputs
return func(self, **unpacked_inputs)
File "C:\Users\home\anaconda3\lib\site-packages\transformers\models\bert\modeling_tf_bert.py", line 1108, in call
outputs = self.bert(
File "C:\Users\home\anaconda3\lib\site-packages\transformers\modeling_tf_utils.py", line 409, in run_call_with_unpacked_inputs
return func(self, **unpacked_inputs)
File "C:\Users\home\anaconda3\lib\site-packages\transformers\models\bert\modeling_tf_bert.py", line 781, in call
embedding_output = self.embeddings(
File "C:\Users\home\anaconda3\lib\site-packages\transformers\models\bert\modeling_tf_bert.py", line 203, in call
inputs_embeds = tf.gather(params=self.weight, indices=input_ids)
InvalidArgumentError: Exception encountered when calling layer "embeddings" (type TFBertEmbeddings).
indices[1174,8] = 29550 is not in [0, 28996) [Op:ResourceGather]
Call arguments received:
• input_ids=tf.Tensor(shape=(1599, 73), dtype=int32)
• position_ids=None
• token_type_ids=tf.Tensor(shape=(1599, 73), dtype=int32)
• inputs_embeds=None
• past_key_values_length=0
• training=False
I googled for a while, and I can't find anything conclusive. I am pretty sure it has something to do with this part:
def call(self, inputs):
outputs = self.model(inputs)
return outputs
But again, I have tried a lot of different things, including limiting dataset size and installing different versions of transformers and tensorflow, but to no avail. Please let me know what I'm doing wrong. Thank you!
OP was using bert-base-cased for their model, and bert-base-uncased for their tokenizer, causing issues during training when the vocab size of the model and the tokenized data differed.
Related
I am a beginner to machine learning and trying to train a model on counting the amount of numbers below 0.5 in a 1D Vector with the length of 10. The input vectors contain number between 0 and 1. I generate the input data and the labels in my script instead of having them in a seperate file, because the data is so simple.
This is the Code:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
class MyNet(nn.Module):
def __init__(self):
super(MyNet, self).__init__()
self.lin1 = nn.Linear(10,10)
self.lin2 = nn.Linear(10,1)
def forward(self,x):
x = self.lin1(x)
x = F.relu(x)
x = self.lin2(x)
return x
net = MyNet()
net.to(device)
def train():
criterion = nn.MSELoss()
optimizer = optim.SGD(net.parameters(), lr=0.1)
for epochs in range(100):
target = 0
data = torch.rand(10)
for entry in data:
if entry < 0.5:
target += 1
# print(target)
# print(data)
data = data.to(device)
out = net(data)
# print(out)
target = torch.Tensor(target)
target = target.to(device)
loss = criterion(out, target)
print(loss)
net.zero_grad()
loss.backward()
optimizer.step()
def test():
acc_error = 0
for i in range(100):
test_data = torch.rand(10)
test_data.to(device)
test_target = 0
for entry in test_data:
if entry < 0.5:
test_target += 1
out = net(test_data)
error = test_target - out
if error < 0:
error *= -1
acc_error += error
overall_error = acc_error / 100
print(overall_error)
train()
test()
This is the error:
Traceback (most recent call last):
File "test1.py", line 70, in <module>
test()
File "test1.py", line 59, in test
out = net(test_data)
File "/vol/fob-vol7/mi18/radtklau/SP/sem_project/lib64/python3.6/site-packages/torch/nn/modules/module.py", line 889, in _call_impl
result = self.forward(*input, **kwargs)
File "test1.py", line 15, in forward
x = self.lin1(x)
File "/vol/fob-vol7/mi18/radtklau/SP/sem_project/lib64/python3.6/site-packages/torch/nn/modules/module.py", line 889, in _call_impl
result = self.forward(*input, **kwargs)
File "/vol/fob-vol7/mi18/radtklau/SP/sem_project/lib64/python3.6/site-packages/torch/nn/modules/linear.py", line 94, in forward
return F.linear(input, self.weight, self.bias)
File "/vol/fob-vol7/mi18/radtklau/SP/sem_project/lib64/python3.6/site-packages/torch/nn/functional.py", line 1753, in linear
return torch._C._nn.linear(input, weight, bias)
RuntimeError: Tensor for 'out' is on CPU, Tensor for argument #1 'self' is on CPU, but expected them to be on GPU (while checking arguments for addmm)
The other posts regarding the topic have not solved my problem. Maybe somebody can help. Thanks!
Notice how your error message traces back to test, while train works fine.
You've transfered your data correctly in train:
data = data.to(device)
But not in test:
test_data.to(device)
Instead it should be reassigned to test_data, since torch.Tensor.to makes a copy:
test_data = test_data.to(device)
I am saving my model, optimizer, scheduler, and scaler in a general checkpoint.
Now when I load them, they load properly but after the first iteration the scaler.step(optimizer) throws this error:
Traceback (most recent call last):
File "HistNet/trainloop.py", line 92, in <module>
scaler.step(optimizer)
File "/opt/conda/lib/python3.8/site-packages/torch/cuda/amp/grad_scaler.py", line 333, in step
retval = optimizer.step(*args, **kwargs)
File "/opt/conda/lib/python3.8/site-packages/torch/optim/lr_scheduler.py", line 65, in wrapper
return wrapped(*args, **kwargs)
File "/opt/conda/lib/python3.8/site-packages/torch/optim/optimizer.py", line 89, in wrapper
return func(*args, **kwargs)
File "/opt/conda/lib/python3.8/site-packages/torch/autograd/grad_mode.py", line 27, in decorate_context
return func(*args, **kwargs)
File "/opt/conda/lib/python3.8/site-packages/torch/optim/adam.py", line 108, in step
F.adam(params_with_grad,
File "/opt/conda/lib/python3.8/site-packages/torch/optim/functional.py", line 86, in adam
exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
RuntimeError: The size of tensor a (32) must match the size of tensor b (64) at non-singleton dimension 0
Now I don't really understand why a shape mismatch of all things is there. I'm doing everything similarly to official docs, here is shortened version of my code:
dataloader = DataLoader(Dataset)
model1 = model1()
optimizer = optim.Adam(parameters, lr, betas)
scheduler = optim.lr_scheduler.LambdaLR(optimizer, lambda epoch: decay_rate**epoch)
scaler = amp.GradScaler()
if resume: epoch_resume = load_checkpoint(path, model1, optimizer, scheduler, scaler)
for epoch in trange(epoch_resume, config['epochs']+1, desc='Epochs'):
for content_image, style_image in tqdm(dataloader, desc='Dataloader'):
content_image, style_image = content_image.to(device), style_image.to(device)
with amp.autocast():
content_image = TF.rgb_to_grayscale(content_image)
s = TF.rgb_to_grayscale(style_image)
deformation_field = model1(s, content_image)
output_image = F.grid_sample(content_image, deformation_field.float(), align_corners=False)
loss_after = cost_function(output_image, s, device=device)
loss_list += [loss_after]
scaler.scale(loss_after).backward()
scaler.step(optimizer)
scaler.update()
optimizer.zero_grad()
scheduler.step()
torch.save({
'epoch': epoch,
'model1_state_dict': model1.state_dict(),
'optimizer_state_dict': optimizer.state_dict(),
'scheduler_state_dict': scheduler.state_dict(),
'scaler_state_dict': scaler.state_dict(),
}, path)
def load_checkpoint(checkpoint_path, model1, optimizer, scheduler, scaler):
checkpoint = torch.load(checkpoint_path)
model1.load_state_dict(checkpoint['model1_state_dict'])
model1.train()
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
scaler.load_state_dict(checkpoint['scaler_state_dict'])
epoch = checkpoint['epoch']
return epoch+1
For anyone with similar issue:
It boiled down to my use of 2 models and 1 optimizer. I did:
parameters = set()
for net in nets:
parameters |= set(net.parameters())
which resulted in unordered list of parameters which was unsurprisingly different with each resume.
I currently changed it to:
parameters = []
for net in nets:
parameters += list(net.parameters())
which works but I haven't seen the use of list in any other code as of now and I have seen the use of a set. So be wary of some potential unwanted behavior. As of now I understand you lose only the fact that you can have multiple same tensors in a list. But with two different models I don't see how it could affect the optimizer. If you know more than me, please correct me.
I want to implement perceptual loss for sequential image data of the shape [batch_size, sequence_length, height, width, channels]
The predictions of my model also have the same shape as the input.
My problem is, that I'm not able to properly feed in my predictions to the VGG16 in order to calculate the loss.
Firstly, I build my vgg as follows:
def build_vgg_model(self, weights="imagenet"):
# Input image to extract features from
img = Input(shape=(self.img_rows, self.img_cols, 3))
# Mean center and rescale by variance as in PyTorch
processed = Lambda(lambda x: (x - self.mean) / self.std)(img)
# If inference only, just return empty model
if self.inference_only:
model = Model(inputs=img, outputs=[img for _ in range(len(self.vgg_layers))])
model.trainable = False
model.compile(loss='mse', optimizer='adam')
return model
# Get the vgg network from Keras applications
if weights in ['imagenet', None]:
vgg = VGG16(weights=weights, include_top=False)
# model = Model(inputs=base_model.input, outputs=base_model.get_layer('block4_pool').output)
else:
vgg = VGG16(weights=None, include_top=False)
vgg.load_weights(weights, by_name=True)
# Output the first three pooling layers
vgg.outputs = [vgg.layers[i].output for i in self.vgg_layers]
# Create model and compile
model = Model(inputs=img, outputs=vgg(processed))
model.trainable = False
return model
Then I define my loss function:
def total_loss(self, mask):
def loss(y_true, y_pred):
# Compute predicted image with non-hole pixels set to ground truth
y_comp = mask * y_true + (1 - mask) * y_pred
# Compute the vgg features.
if self.vgg_device:
with tf.device(self.vgg_device):
vgg_out = self.vgg(y_pred)
vgg_gt = self.vgg(y_true)
vgg_comp = self.vgg(y_comp)
else:
vgg_out = self.vgg(y_pred)
vgg_gt = self.vgg(y_true)
vgg_comp = self.vgg(y_comp)
# Compute loss components
l1 = self.loss_valid(mask, y_true, y_pred)
l2 = self.loss_hole(mask, y_true, y_pred)
l3 = self.loss_perceptual(vgg_out, vgg_gt, vgg_comp)
# Return loss function
return l1 + 6 * l2 + 0.05 * l3
return loss
def loss_perceptual(self, vgg_out, vgg_gt, vgg_comp):
loss = 0
for o, c, g in zip(vgg_out, vgg_comp, vgg_gt):
loss += self.l1(o, g) + self.l1(c, g)
return loss
If I now run my code, I get the following error:
Traceback (most recent call last):
File "convlstm_main.py", line 51, in <module>
model = ConvLSTM(64, 64, 3, 5)
File "/mnt/workspace/lm78463/RecurrentDiFoRem/src/model/conv_lstm.py", line 40, in __init__
self.model = self.compile()
File "/mnt/workspace/lm78463/RecurrentDiFoRem/src/model/conv_lstm.py", line 116, in compile
self.model.compile(loss=self.loss_total, optimizer='adadelta')
File "/usr/local/lib/python3.6/dist-packages/keras/engine/training.py", line 229, in compile
self.total_loss = self._prepare_total_loss(masks)
File "/usr/local/lib/python3.6/dist-packages/keras/engine/training.py", line 692, in _prepare_total_loss
y_true, y_pred, sample_weight=sample_weight)
File "/usr/local/lib/python3.6/dist-packages/keras/losses.py", line 71, in __call__
losses = self.call(y_true, y_pred)
File "/usr/local/lib/python3.6/dist-packages/keras/losses.py", line 132, in call
return self.fn(y_true, y_pred, **self._fn_kwargs)
File "/mnt/workspace/lm78463/RecurrentDiFoRem/src/model/conv_lstm.py", line 180, in loss_total
vgg_gts = self.vgg(y_trues)
File "/usr/local/lib/python3.6/dist-packages/keras/engine/base_layer.py", line 489, in __call__
output = self.call(inputs, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/keras/engine/network.py", line 583, in call
output_tensors, _, _ = self.run_internal_graph(inputs, masks)
File "/usr/local/lib/python3.6/dist-packages/keras/engine/network.py", line 740, in run_internal_graph
layer.call(computed_tensor, **kwargs))
File "/usr/local/lib/python3.6/dist-packages/keras/engine/network.py", line 583, in call
output_tensors, _, _ = self.run_internal_graph(inputs, masks)
File "/usr/local/lib/python3.6/dist-packages/keras/engine/network.py", line 740, in run_internal_graph
layer.call(computed_tensor, **kwargs))
File "/usr/local/lib/python3.6/dist-packages/keras/layers/convolutional.py", line 171, in call
dilation_rate=self.dilation_rate)
File "/usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py", line 3717, in conv2d
**kwargs)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/nn_ops.py", line 917, in convolution
name=name)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/nn_ops.py", line 979, in convolution_internal
strides = _get_sequence(strides, n, channel_index, "strides")
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/nn_ops.py", line 74, in _get_sequence
name, n, n + 2, current_n))
ValueError: strides should be of length 1, 3 or 5 but was 2
I'm using keras (tf.keras) in tensorflow 2.0.0
I've a network, whose input is an image and output is also an image. I want to use a combination of MSE, MSE in VGG feature space and some other losses, which depend on intermediate layer output. I'm defining a custom loss function. I'm able to build the model, compile with the custom loss. But when I train using fit_generator, I'm getting a SymbolicException saying Inputs to eager execution function cannot be Keras symbolic tensors
Full Code:
Train File:
def __init__(self, gray_images: bool, verbose: bool = True):
super().__init__(gray_images, verbose)
self.model = None
self.vgg_feature_extractor = VggFeaturesExtractor(model_name='vgg16', layers=[3, 6, 10])
def build_model():
image_input = Input(shape=(None, None, num_input_channels))
out1 = self.build_out1_model(image_input, num_filters, depth_t)
out2 = self.build_out2_model(image_input, num_filters, depth_n, use_bnorm)
enhanced_image = ... # Some function of image_input, out1 and out2
self.model = Model(inputs=image_input, outputs=enhanced_image)
self.model.add_loss(loss_weights[1] * self.loss2(out2))
self.model.compile(optimizer='adam', loss=self.vgg_loss)
def vgg_loss(self, gt_image, est_image):
gt_features = self.vgg_feature_extractor.extract_features(gt_image)
est_features = self.vgg_feature_extractor.extract_features(est_image)
loss = tf.reduce_mean(tf.square(gt_features[0] - est_features[0])) + \
tf.reduce_mean(tf.square(gt_features[1] - est_features[1])) + \
tf.reduce_mean(tf.square(gt_features[2] - est_features[2]))
return loss
VggFeatures.py:
class VggFeaturesExtractor:
def __init__(self, model_name: str, layers: List[int]):
self.model_name = model_name
self.layers = layers
if model_name == 'vgg16':
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
vgg_model = VGG16(include_top=False)
self.preprocess_input = preprocess_input
elif model_name == 'vgg19':
from tensorflow.keras.applications.vgg19 import VGG19, preprocess_input
vgg_model = VGG19(include_top=False)
self.preprocess_input = preprocess_input
else:
raise RuntimeError(f'Unknown Model: {model_name}')
outputs = []
for layer_num in layers:
outputs.append(vgg_model.layers[layer_num].output)
self.feature_extractor = keras.Model(inputs=vgg_model.input, outputs=outputs)
def extract_features(self, images: numpy.ndarray):
preprocessed_images = self.preprocess_input(images)
features = self.feature_extractor(preprocessed_images)
return features
Stack trace:
Epoch 1/1000
Traceback (most recent call last):
File "/media/nagabhushan/Data02/SoftwareFiles/Anaconda/anaconda3/envs/.../lib/python3.7/site-packages/tensorflow_core/python/eager/execute.py", line 61, in quick_execute
num_outputs)
TypeError: An op outside of the function building code is being passed
a "Graph" tensor. It is possible to have Graph tensors
leak out of the function building context by including a
tf.init_scope in your function building code.
For example, the following function will fail:
#tf.function
def has_init_scope():
my_constant = tf.constant(1.)
with tf.init_scope():
added = my_constant * 2
The graph tensor has name: StridedSliceGrad:0
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/media/nagabhushan/Data02/SNB/IISc/Research/.../Workspace/Ideas/01_Supervised/src/N09.py", line 363, in <module>
main()
File "/media/nagabhushan/Data02/SNB/IISc/Research/.../Workspace/Ideas/01_Supervised/src/N09.py", line 343, in main
args.save_interval)
File "/media/nagabhushan/Data02/SNB/IISc/Research/.../Workspace/Ideas/01_Supervised/src/N09.py", line 92, in train_model
verbose=self.verbose)
File "/media/nagabhushan/Data02/SoftwareFiles/Anaconda/anaconda3/envs/.../lib/python3.7/site-packages/tensorflow_core/python/keras/engine/training.py", line 1297, in fit_generator
steps_name='steps_per_epoch')
File "/media/nagabhushan/Data02/SoftwareFiles/Anaconda/anaconda3/envs/.../lib/python3.7/site-packages/tensorflow_core/python/keras/engine/training_generator.py", line 265, in model_iteration
batch_outs = batch_function(*batch_data)
File "/media/nagabhushan/Data02/SoftwareFiles/Anaconda/anaconda3/envs/.../lib/python3.7/site-packages/tensorflow_core/python/keras/engine/training.py", line 973, in train_on_batch
class_weight=class_weight, reset_metrics=reset_metrics)
File "/media/nagabhushan/Data02/SoftwareFiles/Anaconda/anaconda3/envs/.../lib/python3.7/site-packages/tensorflow_core/python/keras/engine/training_v2_utils.py", line 264, in train_on_batch
output_loss_metrics=model._output_loss_metrics)
File "/media/nagabhushan/Data02/SoftwareFiles/Anaconda/anaconda3/envs/.../lib/python3.7/site-packages/tensorflow_core/python/keras/engine/training_eager.py", line 311, in train_on_batch
output_loss_metrics=output_loss_metrics))
File "/media/nagabhushan/Data02/SoftwareFiles/Anaconda/anaconda3/envs/.../lib/python3.7/site-packages/tensorflow_core/python/keras/engine/training_eager.py", line 268, in _process_single_batch
grads = tape.gradient(scaled_total_loss, trainable_weights)
File "/media/nagabhushan/Data02/SoftwareFiles/Anaconda/anaconda3/envs/.../lib/python3.7/site-packages/tensorflow_core/python/eager/backprop.py", line 1014, in gradient
unconnected_gradients=unconnected_gradients)
File "/media/nagabhushan/Data02/SoftwareFiles/Anaconda/anaconda3/envs/.../lib/python3.7/site-packages/tensorflow_core/python/eager/imperative_grad.py", line 76, in imperative_grad
compat.as_str(unconnected_gradients.value))
File "/media/nagabhushan/Data02/SoftwareFiles/Anaconda/anaconda3/envs/.../lib/python3.7/site-packages/tensorflow_core/python/eager/function.py", line 911, in _backward_function_wrapper
processed_args, remapped_captures)
File "/media/nagabhushan/Data02/SoftwareFiles/Anaconda/anaconda3/envs/.../lib/python3.7/site-packages/tensorflow_core/python/eager/function.py", line 1224, in _call_flat
ctx, args, cancellation_manager=cancellation_manager)
File "/media/nagabhushan/Data02/SoftwareFiles/Anaconda/anaconda3/envs/.../lib/python3.7/site-packages/tensorflow_core/python/eager/function.py", line 511, in call
ctx=ctx)
File "/media/nagabhushan/Data02/SoftwareFiles/Anaconda/anaconda3/envs/.../lib/python3.7/site-packages/tensorflow_core/python/eager/execute.py", line 75, in quick_execute
"tensors, but found {}".format(keras_symbolic_tensors))
tensorflow.python.eager.core._SymbolicException: Inputs to eager execution function cannot be Keras symbolic tensors, but found [<tf.Tensor 'StridedSliceGrad:0' shape=(16, 64, 64, 3) dtype=float32>]
Process finished with exit code 1
Note:
1. If I replace self.model.compile(optimizer='adam', loss=self.vgg_loss) with self.model.compile(optimizer='adam', loss='mse'), code works fine, which implies the other part of code is working correctly.
2. Almost every question I found on SO regarding VGG loss advises to append VGG network to the main network, set trainable=False for VGG network and then train with MSE loss. But I can't do that, since I have many components in my loss function.
I was able to fix this issue by disabling eager execution. In tensorflow 2.0, eager execution is enabled by default.
tf.compat.v1.disable_eager_execution()
I didn't understand how this was able to fix the issue though. If anybody stumbles on a similar problem, you can try disabling eager execution.
I am using the Dataset API to generate training data and sort it into batches for a NN.
Here is a minimum working example of my code:
import tensorflow as tf
import numpy as np
import random
def my_generator():
while True:
x = np.random.rand(4, 20)
y = random.randint(0, 11)
label = tf.one_hot(y, depth=12)
yield x.reshape(4, 20, 1), label
def my_input_fn():
dataset = tf.data.Dataset.from_generator(lambda: my_generator(),
output_types=(tf.float64, tf.int32))
dataset = dataset.batch(32)
iterator = dataset.make_one_shot_iterator()
batch_features, batch_labels = iterator.get_next()
return batch_features, batch_labels
if __name__ == "__main__":
tf.enable_eager_execution()
model = tf.keras.Sequential([tf.keras.layers.Flatten(input_shape=(4, 20, 1)),
tf.keras.layers.Dense(128, activation=tf.nn.relu),
tf.keras.layers.Dense(12, activation=tf.nn.softmax)])
model.compile(optimizer='adam',
loss='categorical_crossentropy',
metrics=['accuracy'])
data_generator = my_input_fn()
model.fit(data_generator)
The code fails using TensorFlow 1.13.1 at the model.fit() call with the following error:
Traceback (most recent call last):
File "scripts/min_working_example.py", line 37, in <module>
model.fit(data_generator)
File "~/.local/lib/python3.6/site-packages/tensorflow/python/keras/engine/training.py", line 880, in fit
validation_steps=validation_steps)
File "~/.local/lib/python3.6/site-packages/tensorflow/python/keras/engine/training_arrays.py", line 310, in model_iteration
ins_batch = slice_arrays(ins[:-1], batch_ids) + [ins[-1]]
File "~/.local/lib/python3.6/site-packages/tensorflow/python/keras/utils/generic_utils.py", line 526, in slice_arrays
return [None if x is None else x[start] for x in arrays]
File "~/.local/lib/python3.6/site-packages/tensorflow/python/keras/utils/generic_utils.py", line 526, in <listcomp>
return [None if x is None else x[start] for x in arrays]
File "~/.local/lib/python3.6/site-packages/tensorflow/python/ops/array_ops.py", line 654, in _slice_helper
name=name)
File "~/.local/lib/python3.6/site-packages/tensorflow/python/ops/array_ops.py", line 820, in strided_slice
shrink_axis_mask=shrink_axis_mask)
File "~/.local/lib/python3.6/site-packages/tensorflow/python/ops/gen_array_ops.py", line 9334, in strided_slice
_six.raise_from(_core._status_to_exception(e.code, message), None)
File "<string>", line 3, in raise_from
tensorflow.python.framework.errors_impl.InvalidArgumentError: Attr shrink_axis_mask has value 4294967295 out of range for an int32 [Op:StridedSlice] name: strided_slice/
I tried running the same code on a different machine using TensorFlow 2.0 (after removing the line tf.enable_eager_execution() because it runs eagerly by default) and I got the following error:
Traceback (most recent call last):
File "scripts/min_working_example.py", line 37, in <module>
model.fit(data_generator)
File "~/.local/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py", line 873, in fit
steps_name='steps_per_epoch')
File "~/.local/lib/python3.7/site-packages/tensorflow/python/keras/engine/training_arrays.py", line 352, in model_iteration
batch_outs = f(ins_batch)
File "~/.local/lib/python3.7/site-packages/tensorflow/python/keras/backend.py", line 3217, in __call__
outputs = self._graph_fn(*converted_inputs)
File "~/.local/lib/python3.7/site-packages/tensorflow/python/eager/function.py", line 558, in __call__
return self._call_flat(args)
File "~/.local/lib/python3.7/site-packages/tensorflow/python/eager/function.py", line 627, in _call_flat
outputs = self._inference_function.call(ctx, args)
File "~/.local/lib/python3.7/site-packages/tensorflow/python/eager/function.py", line 397, in call
(len(args), len(list(self.signature.input_arg))))
ValueError: Arguments and signature arguments do not match: 21 23
I tried changing model.fit() to model.fit_generator() but this fails on both TensorFlow versions too. On TF 1.13.1 I get the following error:
Traceback (most recent call last):
File "scripts/min_working_example.py", line 37, in <module>
model.fit_generator(data_generator)
File "~/.local/lib/python3.6/site-packages/tensorflow/python/keras/engine/training.py", line 1426, in fit_generator
initial_epoch=initial_epoch)
File "~/.local/lib/python3.6/site-packages/tensorflow/python/keras/engine/training_generator.py", line 115, in model_iteration
shuffle=shuffle)
File "~/.local/lib/python3.6/site-packages/tensorflow/python/keras/engine/training_generator.py", line 377, in convert_to_generator_like
num_samples = int(nest.flatten(data)[0].shape[0])
TypeError: __int__ returned non-int (type NoneType)
and on TF 2.0 I get the following error:
Traceback (most recent call last):
File "scripts/min_working_example.py", line 37, in <module>
model.fit_generator(data_generator)
File "~/.local/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py", line 1515, in fit_generator
steps_name='steps_per_epoch')
File "~/.local/lib/python3.7/site-packages/tensorflow/python/keras/engine/training_generator.py", line 140, in model_iteration
shuffle=shuffle)
File "~/.local/lib/python3.7/site-packages/tensorflow/python/keras/engine/training_generator.py", line 477, in convert_to_generator_like
raise ValueError('You must specify `batch_size`')
ValueError: You must specify `batch_size`
yet batch_size is not a recognized keyword for fit_generator().
I am puzzled by these error messages and I would appreciate if anyone can shed some light on them, or point out what I am doing wrong.
While the origin of the errors is still nebulous, I have found a solution that makes the code work. I'll post it here in case it is useful to anyone in a similar situation.
Basically, I changed the my_input_fn() into a generator and used model.fit_generator() as follows:
import tensorflow as tf
import numpy as np
import random
def my_generator(total_items):
i = 0
while i < total_items:
x = np.random.rand(4, 20)
y = random.randint(0, 11)
label = tf.one_hot(y, depth=12)
yield x.reshape(4, 20, 1), label
i += 1
def my_input_fn(total_items, epochs):
dataset = tf.data.Dataset.from_generator(lambda: my_generator(total_items),
output_types=(tf.float64, tf.int64))
dataset = dataset.repeat(epochs)
dataset = dataset.batch(32)
iterator = dataset.make_one_shot_iterator()
while True:
batch_features, batch_labels = iterator.get_next()
yield batch_features, batch_labels
if __name__ == "__main__":
tf.enable_eager_execution()
model = tf.keras.Sequential([tf.keras.layers.Flatten(input_shape=(4, 20, 1)),
tf.keras.layers.Dense(64, activation=tf.nn.relu),
tf.keras.layers.Dense(12, activation=tf.nn.softmax)])
model.compile(optimizer='adam',
loss='categorical_crossentropy',
metrics=['accuracy'])
total_items = 200
batch_size = 32
epochs = 10
num_batches = int(total_items/batch_size)
train_data_generator = my_input_fn(total_items, epochs)
model.fit_generator(generator=train_data_generator, steps_per_epoch=num_batches, epochs=epochs, verbose=1)
EDIT
As implied by giser_yugang in a comment, it is also possible to do it with my_input_fn() as a function returning the dataset instead of the individual batches.
def my_input_fn(total_items, epochs):
dataset = tf.data.Dataset.from_generator(lambda: my_generator(total_items),
output_types=(tf.float64, tf.int64))
dataset = dataset.repeat(epochs)
dataset = dataset.batch(32)
return dataset
if __name__ == "__main__":
tf.enable_eager_execution()
model = tf.keras.Sequential([tf.keras.layers.Flatten(input_shape=(4, 20, 1)),
tf.keras.layers.Dense(64, activation=tf.nn.relu),
tf.keras.layers.Dense(12, activation=tf.nn.softmax)])
model.compile(optimizer='adam',
loss='categorical_crossentropy',
metrics=['accuracy'])
total_items = 100
batch_size = 32
epochs = 10
num_batches = int(total_items/batch_size)
dataset = my_input_fn(total_items, epochs)
model.fit_generator(dataset, epochs=epochs, steps_per_epoch=num_batches)
There does not appear to be any average performance difference between the approaches.