I am following a tutorial to build a basic GAN on the MNIST handwritten digits data. Here is a link to the tutorial: TUTORIAL.
I copied the exact code and changed the imports to use tf.keras and the legacy Adam optimizer (to work with my system). My generator outputs almost the exact same image with each generation, and the discriminator loss goes to zero incredibly quickly, with the generator never recovering. What could be causing this? I have followed three tutorials verbatim, and always end up with this same result. My full code is at the bottom. The only issue I can think of that might be causing problems is using the legacy Adam optimizer. I'm not sure if this is an issue with my code, my system, or both.
# example of training a gan on mnist
from numpy import expand_dims, zeros, ones, vstack
from numpy.random import randn
from numpy.random import randint
from tensorflow.keras.datasets.mnist import load_data
from tensorflow.keras.optimizers.legacy import Adam
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Reshape, Flatten, Conv2D, Conv2DTranspose, LeakyReLU, Dropout, BatchNormalization
from matplotlib import pyplot
# define the standalone discriminator model
def define_discriminator(in_shape=(28,28,1)):
model = Sequential()
model.add(Conv2D(64, (3,3), strides=(2, 2), padding='same', input_shape=in_shape))
model.add(LeakyReLU(alpha=0.2))
model.add(Dropout(0.4))
model.add(Conv2D(64, (3,3), strides=(2, 2), padding='same'))
model.add(LeakyReLU(alpha=0.2))
model.add(Dropout(0.4))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
# compile model
opt = Adam(learning_rate=0.0002, beta_1=0.5)
model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])
return model
# define the standalone generator model
def define_generator(latent_dim):
model = Sequential()
# foundation for 7x7 image
n_nodes = 128 * 7 * 7
model.add(Dense(n_nodes, input_dim=latent_dim))
model.add(LeakyReLU(alpha=0.2))
model.add(Reshape((7, 7, 128)))
# upsample to 14x14
model.add(Conv2DTranspose(128, (4,4), strides=(2,2), padding='same'))
model.add(LeakyReLU(alpha=0.2))
# upsample to 28x28
model.add(Conv2DTranspose(128, (4,4), strides=(2,2), padding='same'))
model.add(LeakyReLU(alpha=0.2))
model.add(Conv2D(1, (7,7), activation='tanh', padding='same'))
opt = Adam(learning_rate=0.0002, beta_1=0.5)
model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])
return model
# define the combined generator and discriminator model, for updating the generator
def define_gan(g_model, d_model):
# make weights in the discriminator not trainable
d_model.trainable = False
# connect them
model = Sequential()
# add generator
model.add(g_model)
# add the discriminator
model.add(d_model)
# compile model
opt = Adam(learning_rate=0.0002, beta_1=0.5)
model.compile(loss='binary_crossentropy', optimizer=opt)
return model
# load and prepare mnist training images
def load_real_samples():
# load mnist dataset
(trainX, _), (_, _) = load_data()
# expand to 3d, e.g. add channels dimension
X = expand_dims(trainX, axis=-1)
# convert from unsigned ints to floats
X = X.astype('float32')
# scale from [0,255] to [0,1]
X = X / 255.0
return X
# select real samples
def generate_real_samples(dataset, n_samples):
# choose random instances
ix = randint(0, dataset.shape[0], n_samples)
# retrieve selected images
X = dataset[ix]
# generate 'real' class labels (1)
y = ones((n_samples, 1))
return X, y
# generate points in latent space as input for the generator
def generate_latent_points(latent_dim, n_samples):
# generate points in the latent space
x_input = randn(latent_dim * n_samples)
# reshape into a batch of inputs for the network
x_input = x_input.reshape(n_samples, latent_dim)
return x_input
# use the generator to generate n fake examples, with class labels
def generate_fake_samples(g_model, latent_dim, n_samples):
# generate points in latent space
x_input = generate_latent_points(latent_dim, n_samples)
# predict outputs
X = g_model.predict(x_input)
# create 'fake' class labels (0)
y = zeros((n_samples, 1))
return X, y
# create and save a plot of generated images (reversed grayscale)
def save_plot(examples, epoch, n=10):
# plot images
for i in range(n * n):
# define subplot
pyplot.subplot(n, n, 1 + i)
# turn off axis
pyplot.axis('off')
# plot raw pixel data
pyplot.imshow(examples[i, :, :, 0], cmap='gray_r')
# save plot to file
filename = 'generated_plot_e%03d.png' % (epoch+1)
pyplot.savefig(filename)
pyplot.close()
# evaluate the discriminator, plot generated images, save generator model
def summarize_performance(epoch, g_model, d_model, dataset, latent_dim, n_samples=100):
# prepare real samples
X_real, y_real = generate_real_samples(dataset, n_samples)
# evaluate discriminator on real examples
_, acc_real = d_model.evaluate(X_real, y_real, verbose=0)
# prepare fake examples
x_fake, y_fake = generate_fake_samples(g_model, latent_dim, n_samples)
# evaluate discriminator on fake examples
_, acc_fake = d_model.evaluate(x_fake, y_fake, verbose=0)
# summarize discriminator performance
print('>Accuracy real: %.0f%%, fake: %.0f%%' % (acc_real*100, acc_fake*100))
# save plot
save_plot(x_fake, epoch)
# save the generator model to file
filename = 'generator_model_%03d.h5' % (epoch + 1)
g_model.save(filename)
# train the generator and discriminator
def train(g_model, d_model, gan_model, dataset, latent_dim, n_epochs, n_batch=256):
bat_per_epo = int(dataset.shape[0] / n_batch)
half_batch = int(n_batch / 2)
# manually enumerate epochs
for i in range(n_epochs):
# enumerate batches over the training set
for j in range(bat_per_epo):
# get randomly selected 'real' samples
X_real, y_real = generate_real_samples(dataset, half_batch)
# generate 'fake' examples
X_fake, y_fake = generate_fake_samples(g_model, latent_dim, half_batch)
# create training set for the discriminator
X, y = vstack((X_real, X_fake)), vstack((y_real, y_fake))
# update discriminator model weights
d_loss, _ = d_model.train_on_batch(X, y)
# prepare points in latent space as input for the generator
X_gan = generate_latent_points(latent_dim, n_batch)
# create inverted labels for the fake samples
y_gan = ones((n_batch, 1))
# update the generator via the discriminator's error
g_loss = gan_model.train_on_batch(X_gan, y_gan)
# summarize loss on this batch
print('>%d, %d/%d, d=%.3f, g=%.3f' % (i+1, j+1, bat_per_epo, d_loss, g_loss))
# evaluate the model performance, sometimes
if (i+1) % 10 == 0:
summarize_performance(i, g_model, d_model, dataset, latent_dim)
# size of the latent space
latent_dim = 100
# number of epochs
n_epochs = 20
# create the discriminator
d_model = define_discriminator()
# create the generator
g_model = define_generator(latent_dim)
# create the gan
gan_model = define_gan(g_model, d_model)
# load image data
dataset = load_real_samples()
# train model
train(g_model, d_model, gan_model, dataset, latent_dim, n_epochs)
(image: sample of generated output)
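One inconsistency visible in the code above: load_real_samples() scales the images to [0, 1], while the generator's final Conv2D layer uses a tanh activation, whose output lies in [-1, 1]. Below is a minimal sketch of rescaling the data to the tanh range (the usual DCGAN convention); whether this alone resolves the collapse is an assumption.
from numpy import expand_dims
from tensorflow.keras.datasets.mnist import load_data

def load_real_samples():
    # load mnist dataset and add a channels dimension
    (trainX, _), (_, _) = load_data()
    X = expand_dims(trainX, axis=-1).astype('float32')
    # scale from [0, 255] to [-1, 1] so the range matches the tanh output
    X = (X - 127.5) / 127.5
    return X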
Hello! I am trying to train a model. I have a dataset of 3,000 images (500~1,000 bytes / 8-bit / size: 200x50) and tried to train on it using the package "CaptchaCracker", but it doesn't work. The interesting thing is that training fails when the dataset has 3,000 images, but it works when the dataset has only 300~400 images.
With 3,000 images it also only works with batch size 1.
But I want to train on more than 3,000 images with a batch size greater than 1.
I tried Python 3.7 / numpy 1.19.2 / tensorflow 2.3.0 and Python 3.7 / numpy 1.19.5 / tensorflow 2.5.0.
Please help me.
This is the training code I tried:
import glob
import CaptchaCracker as cc
train_img_path_list = glob.glob("data/*.png")
img_width = 200
img_height = 50
CM = cc.CreateModel(train_img_path_list, img_width, img_height)
model = CM.train_model(epochs=100)
model.save_weights("weights.h5")
I get the following errors:
Epoch 1/100
85/91 [===========================>..] - ETA: 2s - loss: 25.1152Traceback (most recent call last):
File "C:\Users\rlack\OneDrive\wallpaper\ver_1\train.py", line 10, in <module>
model = CM.train_model(epochs=100)
File "C:\Users\rlack\anaconda3\envs\cracker\lib\site-packages\CaptchaCracker\core.py", line 110, in train_model
epochs=epochs
File "C:\Users\rlack\anaconda3\envs\cracker\lib\site-packages\tensorflow\python\keras\engine\training.py", line 108, in _method_wrapper
return method(self, *args, **kwargs)
File "C:\Users\rlack\anaconda3\envs\cracker\lib\site-packages\tensorflow\python\keras\engine\training.py", line 1098, in fit
tmp_logs = train_function(iterator)
File "C:\Users\rlack\anaconda3\envs\cracker\lib\site-packages\tensorflow\python\eager\def_function.py", line 780, in __call__
result = self._call(*args, **kwds)
File "C:\Users\rlack\anaconda3\envs\cracker\lib\site-packages\tensorflow\python\eager\def_function.py", line 807, in _call
return self._stateless_fn(*args, **kwds) # pylint: disable=not-callable
File "C:\Users\rlack\anaconda3\envs\cracker\lib\site-packages\tensorflow\python\eager\function.py", line 2829, in __call__
return graph_function._filtered_call(args, kwargs) # pylint: disable=protected-access
File "C:\Users\rlack\anaconda3\envs\cracker\lib\site-packages\tensorflow\python\eager\function.py", line 1848, in _filtered_call
cancellation_manager=cancellation_manager)
File "C:\Users\rlack\anaconda3\envs\cracker\lib\site-packages\tensorflow\python\eager\function.py", line 1924, in _call_flat
ctx, args, cancellation_manager=cancellation_manager))
File "C:\Users\rlack\anaconda3\envs\cracker\lib\site-packages\tensorflow\python\eager\function.py", line 550, in call
ctx=ctx)
File "C:\Users\rlack\anaconda3\envs\cracker\lib\site-packages\tensorflow\python\eager\execute.py", line 60, in quick_execute
inputs, attrs, num_outputs)
tensorflow.python.framework.errors_impl.InvalidArgumentError: Cannot add tensor to the batch: number of elements does not match. Shapes are: [tensor]: [5], [batch]: [6]
[[node IteratorGetNext (defined at \anaconda3\envs\cracker\lib\site-packages\CaptchaCracker\core.py:110) ]] [Op:__inference_train_function_12337]
Function call stack:
train_function
And this is the core code of "CaptchaCracker":
import os
import glob
import numpy as np
from pathlib import Path
from collections import Counter
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
class CTCLayer(layers.Layer):
def __init__(self, name=None):
super().__init__(name=name)
self.loss_fn = keras.backend.ctc_batch_cost
def call(self, y_true, y_pred):
# Compute the training-time loss value and add it
# to the layer using `self.add_loss()`.
batch_len = tf.cast(tf.shape(y_true)[0], dtype="int64")
input_length = tf.cast(tf.shape(y_pred)[1], dtype="int64")
label_length = tf.cast(tf.shape(y_true)[1], dtype="int64")
input_length = input_length * tf.ones(shape=(batch_len, 1), dtype="int64")
label_length = label_length * tf.ones(shape=(batch_len, 1), dtype="int64")
loss = self.loss_fn(y_true, y_pred, input_length, label_length)
self.add_loss(loss)
# At test time, just return the computed predictions
return y_pred
class CreateModel:
def __init__(self, train_img_path, img_width=200, img_height=50):
# Image Size
self.img_width = img_width
self.img_height = img_height
# Learning Image File Path List
self.images = sorted(train_img_path)
# Learning Image File Label List
self.labels = [img.split(os.path.sep)[-1].split(".png")[0] for img in self.images]
# Label SET
self.characters = set(char for label in self.labels for char in label)
# Label maximum length
self.max_length = max([len(label) for label in self.labels])
# Mapping characters to integers
self.char_to_num = layers.experimental.preprocessing.StringLookup(
vocabulary=sorted(self.characters), num_oov_indices=0, mask_token=None
)
# Mapping integers back to original characters
self.num_to_char = layers.experimental.preprocessing.StringLookup(
vocabulary=self.char_to_num.get_vocabulary(), mask_token=None, invert=True
)
def train_model(self, epochs=100, earlystopping=False):
# Define batch size for learning and validation
batch_size = 16
# Number of downsampling factors (Conv: 2, Pooling: 2)
downsample_factor = 4
# Splitting data into training and validation sets
x_train, x_valid, y_train, y_valid = self.split_data(np.array(self.images), np.array(self.labels))
train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
train_dataset = (
train_dataset.map(
self.encode_single_sample, num_parallel_calls=tf.data.experimental.AUTOTUNE
)
.batch(batch_size)
.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
)
validation_dataset = tf.data.Dataset.from_tensor_slices((x_valid, y_valid))
validation_dataset = (
validation_dataset.map(
self.encode_single_sample, num_parallel_calls=tf.data.experimental.AUTOTUNE
)
.batch(batch_size)
.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
)
# Get the model
model = self.build_model()
if earlystopping == True:
early_stopping_patience = 10
# Add early stopping
early_stopping = keras.callbacks.EarlyStopping(
monitor="val_loss", patience=early_stopping_patience, restore_best_weights=True
)
# Train the model
history = model.fit(
train_dataset,
validation_data=validation_dataset,
epochs=epochs,
callbacks=[early_stopping],
)
else:
# Train the model
history = model.fit(
train_dataset,
validation_data=validation_dataset,
epochs=epochs
)
return model
def encode_single_sample(self, img_path, label):
# 1. Read image
img = tf.io.read_file(img_path)
# 2. Decode and convert to grayscale
img = tf.io.decode_png(img, channels=1)
# 3. Convert to float32 in [0, 1] range
img = tf.image.convert_image_dtype(img, tf.float32)
# 4. Resize to the desired size
img = tf.image.resize(img, [self.img_height, self.img_width])
# 5. Transpose the image because we want the time
# dimension to correspond to the width of the image.
img = tf.transpose(img, perm=[1, 0, 2])
# 6. Map the characters in label to numbers
label = self.char_to_num(tf.strings.unicode_split(label, input_encoding="UTF-8"))
# 7. Return a dict as our model is expecting two inputs
return {"image": img, "label": label}
def build_model(self):
# Inputs to the model
input_img = layers.Input(
shape=(self.img_width, self.img_height, 1), name="image", dtype="float32"
)
labels = layers.Input(name="label", shape=(None,), dtype="float32")
# First conv block
x = layers.Conv2D(
32,
(3, 3),
activation="relu",
kernel_initializer="he_normal",
padding="same",
name="Conv1",
)(input_img)
x = layers.MaxPooling2D((2, 2), name="pool1")(x)
# Second conv block
x = layers.Conv2D(
64,
(3, 3),
activation="relu",
kernel_initializer="he_normal",
padding="same",
name="Conv2",
)(x)
x = layers.MaxPooling2D((2, 2), name="pool2")(x)
# We have used two max pool with pool size and strides 2.
# Hence, downsampled feature maps are 4x smaller. The number of
# filters in the last layer is 64. Reshape accordingly before
# passing the output to the RNN part of the model
new_shape = ((self.img_width // 4), (self.img_height // 4) * 64)
x = layers.Reshape(target_shape=new_shape, name="reshape")(x)
x = layers.Dense(64, activation="relu", name="dense1")(x)
x = layers.Dropout(0.2)(x)
# RNNs
x = layers.Bidirectional(layers.LSTM(128, return_sequences=True, dropout=0.25))(x)
x = layers.Bidirectional(layers.LSTM(64, return_sequences=True, dropout=0.25))(x)
# Output layer
x = layers.Dense(len(self.characters) + 1, activation="softmax", name="dense2")(x)
# Add CTC layer for calculating CTC loss at each step
output = CTCLayer(name="ctc_loss")(labels, x)
# Define the model
model = keras.models.Model(
inputs=[input_img, labels], outputs=output, name="ocr_model_v1"
)
# Optimizer
opt = keras.optimizers.Adam()
# Compile the model and return
model.compile(optimizer=opt)
return model
def split_data(self, images, labels, train_size=0.9, shuffle=True):
# 1. Get the total size of the dataset
size = len(images)
# 2. Make an indices array and shuffle it, if required
indices = np.arange(size)
if shuffle:
np.random.shuffle(indices)
# 3. Get the size of training samples
train_samples = int(size * train_size)
# 4. Split data into training and validation sets
x_train, y_train = images[indices[:train_samples]], labels[indices[:train_samples]]
x_valid, y_valid = images[indices[train_samples:]], labels[indices[train_samples:]]
return x_train, x_valid, y_train, y_valid
class ApplyModel:
def __init__(self,
weights_path,
img_width=200,
img_height=50,
max_length=6,
characters={'0', '1', '2', '3', '4', '5', '6', '7', '8', '9'}):
self.img_width = img_width
self.img_height = img_height
self.max_length = max_length
self.characters = characters
# Mapping characters to integers
self.char_to_num = layers.experimental.preprocessing.StringLookup(
vocabulary=sorted(self.characters), num_oov_indices=0, mask_token=None
)
# Mapping integers back to original characters
self.num_to_char = layers.experimental.preprocessing.StringLookup(
vocabulary=self.char_to_num.get_vocabulary(), mask_token=None, invert=True
)
# Model
self.model = self.build_model()
self.model.load_weights(weights_path)
self.prediction_model = keras.models.Model(
self.model.get_layer(name="image").input, self.model.get_layer(name="dense2").output
)
def predict(self, target_img_path):
target_img = self.encode_single_sample(target_img_path)['image']
target_img = tf.reshape(target_img, shape=[1,self.img_width,self.img_height,1])
pred_val = self.prediction_model.predict(target_img)
pred = self.decode_batch_predictions(pred_val)[0]
return pred
def encode_single_sample(self, img_path):
# 1. Read image
img = tf.io.read_file(img_path)
# 2. Decode and convert to grayscale
img = tf.io.decode_png(img, channels=1)
# 3. Convert to float32 in [0, 1] range
img = tf.image.convert_image_dtype(img, tf.float32)
# 4. Resize to the desired size
img = tf.image.resize(img, [self.img_height, self.img_width])
# 5. Transpose the image because we want the time
# dimension to correspond to the width of the image.
img = tf.transpose(img, perm=[1, 0, 2])
# 6. Map the characters in label to numbers
# 7. Return a dict as our model is expecting two inputs
return {"image": img}
def build_model(self):
# Inputs to the model
input_img = layers.Input(
shape=(self.img_width, self.img_height, 1), name="image", dtype="float32"
)
labels = layers.Input(name="label", shape=(None,), dtype="float32")
# First conv block
x = layers.Conv2D(
32,
(3, 3),
activation="relu",
kernel_initializer="he_normal",
padding="same",
name="Conv1",
)(input_img)
x = layers.MaxPooling2D((2, 2), name="pool1")(x)
# Second conv block
x = layers.Conv2D(
64,
(3, 3),
activation="relu",
kernel_initializer="he_normal",
padding="same",
name="Conv2",
)(x)
x = layers.MaxPooling2D((2, 2), name="pool2")(x)
# We have used two max pool with pool size and strides 2.
# Hence, downsampled feature maps are 4x smaller. The number of
# filters in the last layer is 64. Reshape accordingly before
# passing the output to the RNN part of the model
new_shape = ((self.img_width // 4), (self.img_height // 4) * 64)
x = layers.Reshape(target_shape=new_shape, name="reshape")(x)
x = layers.Dense(64, activation="relu", name="dense1")(x)
x = layers.Dropout(0.2)(x)
# RNNs
x = layers.Bidirectional(layers.LSTM(128, return_sequences=True, dropout=0.25))(x)
x = layers.Bidirectional(layers.LSTM(64, return_sequences=True, dropout=0.25))(x)
# Output layer
x = layers.Dense(len(self.characters) + 1, activation="softmax", name="dense2")(x)
# Add CTC layer for calculating CTC loss at each step
output = CTCLayer(name="ctc_loss")(labels, x)
# Define the model
model = keras.models.Model(
inputs=[input_img, labels], outputs=output, name="ocr_model_v1"
)
# Optimizer
opt = keras.optimizers.Adam()
# Compile the model and return
model.compile(optimizer=opt)
return model
def split_data(self, images, labels, train_size=0.9, shuffle=True):
# 1. Get the total size of the dataset
size = len(images)
# 2. Make an indices array and shuffle it, if required
indices = np.arange(size)
if shuffle:
np.random.shuffle(indices)
# 3. Get the size of training samples
train_samples = int(size * train_size)
# 4. Split data into training and validation sets
x_train, y_train = images[indices[:train_samples]], labels[indices[:train_samples]]
x_valid, y_valid = images[indices[train_samples:]], labels[indices[train_samples:]]
return x_train, x_valid, y_train, y_valid
# A utility function to decode the output of the network
def decode_batch_predictions(self, pred):
input_len = np.ones(pred.shape[0]) * pred.shape[1]
# Use greedy search. For complex tasks, you can use beam search
results = keras.backend.ctc_decode(pred, input_length=input_len, greedy=True)[0][0][
:, :self.max_length
]
# Iterate over the results and get back the text
output_text = []
for res in results:
res = tf.strings.reduce_join(self.num_to_char(res+1)).numpy().decode("utf-8")
output_text.append(res)
return output_text
What I tried:
- Changing the image size of the dataset
- Changing the versions of Python, numpy, tensorflow...
- Switching to a computer with a good GPU
I need help. Thank you for reading.
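The error "Cannot add tensor to the batch: number of elements does not match. Shapes are: [tensor]: [5], [batch]: [6]" usually means that examples of different shapes are being batched together. Since CaptchaCracker derives each label from the file name (img.split(os.path.sep)[-1].split(".png")[0]), file names of different lengths would produce labels that cannot be stacked into one batch. Below is a minimal sketch for checking that, under the assumption that the data lives in data/*.png as in the training script above.
import glob
import os
from collections import Counter

# Count how many images have each label length; more than one length
# would explain the batching error above.
train_img_path_list = glob.glob("data/*.png")
label_lengths = Counter(
    len(os.path.basename(p).split(".png")[0]) for p in train_img_path_list
)
print(label_lengths)  # e.g. Counter({6: 2950, 5: 50})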
I'm working on a project using Keras Model Subclassing to create a model with 2 inputs and 2 outputs. The training data for this model is essentially a dataset of other image classification datasets, with each image paired with its corresponding label; a dataset of datasets. One input of the network receives the label, the other receives the image.
train_img = generate_tensors(train, 0)
train_ans = generate_tensors(train, 1)
val_img = generate_tensors(val, 0)
val_ans = generate_tensors(val, 1)
train_img_b = train_img.batch(batch_size) # b for batched
train_ans_b = train_ans.batch(batch_size)
structuremodel = StructureModel()
hnet_output, anet_output = structuremodel([train_img_b, train_ans_b])
In the above code, I'm trying to perform a single forward propagation on my custom "StructureModel" class. "train_img" and "train_ans" are of shapes (None, 100, 224, 224, 1) and [insert shape] respectively. I have set the batch_size to 1.
The model itself is defined as follows:
class StructureModel(keras.Model):
num_images = 100 # images per timestep
resolution = [224, 224]
hnet_pred_vars = 9
anet_pred_vars = 25 # the thing on my whiteboard didn't include a stopping node
alphabet = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%&'()*+,-./:;<=>?#[\\]^_`{|}~ "
def __init__(self):
super().__init__()
self.anet_layer = ArchitectureNet(self.anet_pred_vars)
def call(self, inputs):
# CNN-RNN/CNN-LSTM for processing images and corresponding answers
# Copied VGG16 for structure
# Image processing
# shape=(timesteps,resolution,resolution,rgb channels)
images = inputs[0]
answers = inputs[1]
x = TimeDistributed(Conv2D(filters=64, kernel_size=(3, 3), padding='same', activation='relu'))(images)
x = TimeDistributed(Conv2D(filters=64, kernel_size=(3, 3), padding='same', activation='relu'))(x)
x = TimeDistributed(MaxPooling2D(pool_size=(2, 2), strides=2))(x)
filters_convs = [(128, 2), (256, 3), (512, 3), (512, 3)]
for n_filters, n_convs in filters_convs:
for _ in range(n_convs):
x = TimeDistributed(Conv2D(filters=n_filters, kernel_size=(3, 3), padding='same', activation='relu'))(x)
x = TimeDistributed(MaxPooling2D(pool_size=(2, 2), strides=2))(x)
x = TimeDistributed(Flatten())(x)
img_embed = TimeDistributed(Dense(units=1000), name='Image_Preprocessing')(x)
# Answer embedding
# Number of image-answer pairs, characters in answer, single character
x = TimeDistributed(LSTM(units=500))(answers) # All answers, shape (100, None, 95)
answer_embed = TimeDistributed(Dense(units=1000), name='Answer_Preprocessing/Embed')(x)
# Combines both models
merge = Concatenate(axis=2)([img_embed, answer_embed])
x = LSTM(units=100)(merge)
dataset_embed = Dense(units=100, activation='relu', name='Dataset_Embed')(x)
# hnet
x = Dense(units=50)(dataset_embed)
hnet_output = Dense(units=self.hnet_pred_vars, name='Hyperparameters')(x)
# anet
anet_output = self.anet_layer(dataset_embed)
return hnet_output, anet_output
There's a lot of extra fluff in it, and I'm sure there are many other errors in the model, but the main one I care about is the TypeError I keep receiving. Without resolving that, I can't get to debugging anything else. The error is as follows:
File ~\Documents\Programming\Python\HYPAT\NetworksV2.py:83 in call
x = TimeDistributed(Conv2D(filters=64, kernel_size=(3, 3), padding='same', activation='relu'))(images)
TypeError: Exception encountered when calling layer "structure_model_7" (type StructureModel).
'<' not supported between instances of 'NoneType' and 'int'
Call arguments received by layer "structure_model_7" (type StructureModel):
• inputs=['<BatchDataset element_spec=TensorSpec(shape=(None, 100, 224, 224, 1), dtype=tf.float32, name=None)>', '<BatchDataset element_spec=TensorSpec(shape=(None, 100, 2, 95), dtype=tf.float64, name=None)>']
If it would be of any use, here's the entirety of the code.
import keras
from keras.layers import TimeDistributed, Conv2D, Dense, MaxPooling2D, Flatten, LSTM, Concatenate
from tensorflow.keras.utils import plot_model
import pickle
import tqdm
import tensorflow as tf
from varname import nameof
# constants/hyperparameters
batch_size = 1
epochs = 10
train_test_split = 0.25
with open("datasets", "rb") as fp:
datasets = pickle.load(fp)
class ArchitectureNet(keras.layers.Layer):
def __init__(self, anet_pred_vars, **kwargs):
super().__init__()
self.anet_pred_vars = anet_pred_vars
self.concat = Concatenate(axis=1)
self.dense1 = Dense(units=50, activation='relu')
self.dense2 = Dense(units=50, activation='relu')
self.anet_output = Dense(units=self.anet_pred_vars, name='Architecture')
self.stopping_node = Dense(units=1, activation='sigmoid')
def call(self, prev_output, dataset_embed):
x = self.concat([prev_output, dataset_embed])
x = self.dense1(x)
x = self.dense2(x)
anet_output = self.anet_output(x)
stop_node_output = self.stopping_node(x)
print(tf.make_ndarray(stop_node_output))
return anet_output
class StructureModel(keras.Model):
num_images = 100 # images per timestep
resolution = [224, 224]
hnet_pred_vars = 9
anet_pred_vars = 25 # the thing on my whiteboard didn't include a stopping node
alphabet = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!\"#$%&'()*+,-./:;<=>?#[\\]^_`{|}~ "
def __init__(self):
super().__init__()
self.anet_layer = ArchitectureNet(self.anet_pred_vars)
def call(self, inputs):
# CNN-RNN/CNN-LSTM for processing images and corresponding answers
# Copied VGG16 for structure
# Image processing
# shape=(timesteps,resolution,resolution,rgb channels)
images = inputs[0]
answers = inputs[1]
x = TimeDistributed(Conv2D(filters=64, kernel_size=(3, 3), padding='same', activation='relu'))(images)
x = TimeDistributed(Conv2D(filters=64, kernel_size=(3, 3), padding='same', activation='relu'))(x)
x = TimeDistributed(MaxPooling2D(pool_size=(2, 2), strides=2))(x)
filters_convs = [(128, 2), (256, 3), (512, 3), (512, 3)]
for n_filters, n_convs in filters_convs:
for _ in range(n_convs):
x = TimeDistributed(Conv2D(filters=n_filters, kernel_size=(3, 3), padding='same', activation='relu'))(x)
x = TimeDistributed(MaxPooling2D(pool_size=(2, 2), strides=2))(x)
x = TimeDistributed(Flatten())(x)
img_embed = TimeDistributed(Dense(units=1000), name='Image_Preprocessing')(x)
# Answer embedding
# Number of image-answer pairs, characters in answer, single character
x = TimeDistributed(LSTM(units=500))(answers) # All answers, shape (100, None, 95)
answer_embed = TimeDistributed(Dense(units=1000), name='Answer_Preprocessing/Embed')(x)
# Combines both models
merge = Concatenate(axis=2)([img_embed, answer_embed])
x = LSTM(units=100)(merge)
dataset_embed = Dense(units=100, activation='relu', name='Dataset_Embed')(x)
# hnet
x = Dense(units=50)(dataset_embed)
hnet_output = Dense(units=self.hnet_pred_vars, name='Hyperparameters')(x)
# anet
anet_output = self.anet_layer(dataset_embed)
return hnet_output, anet_output
def compile(self):
super().compile()
# Reserve 10,000 samples for validation
ratio = int(train_test_split * len(datasets))
val = datasets[:ratio]
train = datasets[ratio:]
if len(val) == 0: # look at me mom i'm a real programmer
raise IndexError('List \"x_val\" is empty; \"train_test_split\" is set too small')
# Prepare the training and testing datasets
def generate_tensors(data, img_or_ans): # 0 for image, 1 for ans
# technically the images aren't ragged arrays but for simplicity's sake we'll keep them all as ragged tensors
column = [i[img_or_ans] for i in data]
tensor_data = tf.ragged.constant(column)
tensor_data = tensor_data.to_tensor()
tensor_dataset = tf.data.Dataset.from_tensor_slices(tensor_data)
return tensor_dataset
train_img = generate_tensors(train, 0)
train_ans = generate_tensors(train, 1)
val_img = generate_tensors(val, 0)
val_ans = generate_tensors(val, 1)
# TODO: Test if CIFAR 100 dataset (which has variable length answers) will work
#train_dataset = tf.data.Dataset.zip((train_img, train_ans))
#train_dataset = train_dataset.batch(batch_size)
train_img_b = train_img.batch(batch_size) # b for batched
train_ans_b = train_ans.batch(batch_size)
structuremodel = StructureModel()
hnet_output, anet_output = structuremodel([train_img_b, train_ans_b])
plot_model(StructureModel, to_file='aeu.png', show_shapes=True)
"""
for epoch in tqdm.trange(epochs, desc="Epochs"):
# Iterate over the batches of the dataset.
for step, (x_batch_train, y_batch_train) in tqdm(enumerate(train_dataset), leave=False):
# Open a GradientTape to record the operations run
# during the forward pass, which enables auto-differentiation.
with tf.GradientTape() as tape:
# Run the forward pass of the layer.
# The operations that the layer applies
# to its inputs are going to be recorded
# on the GradientTape.
# Logits for this minibatch
logits = model(x_batch_train, training=True)
# Compute the loss value for this minibatch.
loss_value = loss_fn(y_batch_train, logits)
# Use the gradient tape to automatically retrieve
# the gradients of the trainable variables with respect to the loss.
grads = tape.gradient(loss_value, model.trainable_weights)
# Run one step of gradient descent by updating
# the value of the variables to minimize the loss.
optimizer.apply_gradients(zip(grads, model.trainable_weights))
# Log every 200 batches.
if step % 200 == 0:
print(
"Training loss (for one batch) at step %d: %.4f"
% (step, float(loss_value))
)
print("Seen so far: %s samples" % ((step + 1) * batch_size))
"""
You cannot feed tf.data.Datasets directly to keras layers. Try this:
dataset1 = tf.data.Dataset.from_tensor_slices((tf.random.uniform((5, 100, 224, 224, 1)))).batch(1)
dataset2 = tf.data.Dataset.from_tensor_slices((tf.random.uniform((5, 100, 2, 95)))).batch(1)
structuremodel = StructureModel()
for (x1, x2) in zip(dataset1.take(1), dataset2.take(1)):
    hnet_output, anet_output = structuremodel([x1, x2])
Note, however, that StructureModel is buggy, but I'm sure you know that.
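As a follow-up (an assumption about the intended training setup, not part of the fix above): if the two inputs need to stay paired beyond a single batch, the datasets can be zipped into one so a single loop yields matching batches.
import tensorflow as tf

# Sketch: zip the two datasets so each iteration yields matching batches.
dataset1 = tf.data.Dataset.from_tensor_slices(tf.random.uniform((5, 100, 224, 224, 1))).batch(1)
dataset2 = tf.data.Dataset.from_tensor_slices(tf.random.uniform((5, 100, 2, 95))).batch(1)
paired = tf.data.Dataset.zip((dataset1, dataset2))
for x1, x2 in paired.take(1):
    print(x1.shape, x2.shape)  # (1, 100, 224, 224, 1) (1, 100, 2, 95)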
Please help me understand why predict doesn't work correctly when the train/test accuracy is 0.97.
Is the problem in the data, or should the network be changed?
The input data are 32,500 RGB images (5 gestures with 6,500 images each) at 640*480 pixels.
dataset
Images are loaded and resized with IMG_WIDTH = 100, IMG_HEIGHT = 77.
Here's the function which loads and resizes the images and returns np.arrays.
def load_data(data_dir):
"""
Load image data from directory `data_dir`.
Assume `data_dir` has one directory named after each category, numbered
0 through NUM_CATEGORIES - 1. Inside each category directory will be some
number of image files.
Return tuple `(images, labels)`. `images` should be a list of all
of the images in the data directory, where each image is formatted as a
numpy ndarray with dimensions IMG_WIDTH x IMG_HEIGHT x 3. `labels` should
be a list of integer labels, representing the categories for each of the
corresponding `images`.
"""
images = []
labels = []
for dir in range(0, NUM_CATEGORIES):
# get path for each gesture
d = os.path.join(data_dir, f"{str(dir)}")
# os.listdir(d) return the list of all names of images in that folder
for image_path in os.listdir(d):
# get the full path of specific image
full_path = os.path.join(data_dir, f"{str(dir)}", image_path)
# Returns an image that is loaded from the specified file
image = cv2.imread(full_path)
# get dimension for each image
dim = (100, 77)
# resized the image
image_resized = cv2.resize(image, dim, interpolation = cv2.INTER_AREA)
# add image and their directory name to images and labels list
images.append(image_resized)
labels.append(dir)
return images, labels
Here's my model.
def get_model():
"""
Returns a compiled convolutional neural network model. Assume that the
`input_shape` of the first layer is `(IMG_WIDTH=100, IMG_HEIGHT=77, 3)`.
The output layer should have `NUM_GESTURE = 5` units, one for each category.
"""
# Create a convolutional neural network
model = tf.keras.models.Sequential(
[
# Convolutional layer. Learn 32 filters using a 3x3 kernel
tf.keras.layers.Conv2D(
32, (5, 5), activation='relu', input_shape=(IMG_WIDTH, IMG_HEIGHT, 3)
),
# Max-pooling layer, using 2x2 pool size
tf.keras.layers.MaxPool2D(pool_size=(2, 2)),
tf.keras.layers.Conv2D(
64, (3, 3), activation='relu', input_shape=(IMG_WIDTH, IMG_HEIGHT, 3)
),
# Max-pooling layer, using 2x2 pool size
tf.keras.layers.MaxPool2D(pool_size=(2, 2)),
tf.keras.layers.Conv2D(
64, (3, 3), activation='relu', input_shape=((IMG_WIDTH), (IMG_HEIGHT), 3)
),
tf.keras.layers.MaxPool2D(pool_size=(2, 2)),
tf.keras.layers.Conv2D(
128, (3, 3), activation='relu', input_shape=((IMG_WIDTH), (IMG_HEIGHT), 3)
),
tf.keras.layers.MaxPool2D(pool_size=(2, 2)),
tf.keras.layers.Flatten(),
# Add a hidden layer with dropout
tf.keras.layers.Dense(128, activation='relu'),
tf.keras.layers.Dropout(0.3),
# Add an output layer with output units for all 5 gestures
tf.keras.layers.Dense(5, activation='softmax')
])
# Train neural network
model.compile(
optimizer='adam',
loss="categorical_crossentropy",
metrics=["accuracy"]
)
return model
labels = tf.keras.utils.to_categorical(labels)
x_train, x_test, y_train, y_test = train_test_split(
np.array(images), np.array(labels), test_size=0.4)
model = get_model()
model.fit(x_train, y_train, batch_size=64, epochs=10)
model.evaluate(x_test, y_test, verbose=2)
The result is 0.97.
fit result
From the video I save a frame and want to predict the hand gesture in real time.
GESTURE = {0:"ok", 1:"down", 2:"up", 3:"palm", 4:"l"}
video = cv2.VideoCapture(0)
while True:
# Capture the video frame
ret, img = video.read()
# Display the resulting frame
# to flip the video with 180 degree
image = cv2.flip(img, 1)
# save image for prediction
image = cv2.imwrite('Frame'+str(0)+'.jpg', image)
image_addr = "Frame0.jpg"
image = cv2.imread(image_addr)
dim = (100,77)
image = tf.keras.preprocessing.image.load_img(image_addr, target_size=dim)
# Converts a PIL Image instance to a Numpy array. Return a 3D Numpy array.
input_arr = tf.keras.preprocessing.image.img_to_array(image)
# Convert single image to a batch.
input_arr = np.array([input_arr])
input_arr = input_arr.astype('float32')/255
# Generates output predictions for the input samples. Return Numpy array(s) of predictions.
predictions = model.predict(input_arr)
# Return the index_array of the maximum values along an axis.
pre_class = np.argmax(predictions, axis=-1)
# for writing in the video
text = GESTURE[pre_class[0]]
font = cv2.FONT_HERSHEY_SIMPLEX
image = cv2.flip(img, 1)
cv2.putText(image,
text,
(50, 50),
font, 2,
(0, 0, 0),
2,
cv2.LINE_4)
cv2.imshow('video', image)
# the 'q' button is set as the
# quitting button you may use any
# desired button of your choice
k = cv2.waitKey(1)
if k == ord('q'):
break
video.release()
cv2.destroyAllWindows()
github link
I am no expert, but typically when you perform well on the training and testing data ("The result is 0.97") yet perform poorly on new end-user data, it is because there is a data mismatch (although it could also be overfitting).
That is, the data you trained and tested on is so different from the end-user data (pixel values, probability distribution of pixels, or differences invisible to you but noticeable to the model) that the model could not generalize to it and performed badly.
It is good practice to use the same kind of data you would see in production/the final product as your test set. Andrew Ng uses the following dataset split (applicable if you have enough data); a rough code sketch follows the list:
From the training data:
Training Set
Train-Dev (same as Validation, I think) Set
From the end-product data:
Development Set
Test Set
You can check this post for more information regarding why: https://cs230.stanford.edu/blog/split/
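A rough sketch of that split, assuming recorded_images/recorded_labels are the originally collected 32,500 images and webcam_images/webcam_labels are frames captured exactly the way the final product captures them (all four names are placeholders):
from sklearn.model_selection import train_test_split

# Training set + train-dev set come from the recorded dataset
x_train, x_train_dev, y_train, y_train_dev = train_test_split(
    recorded_images, recorded_labels, test_size=0.1)
# Development set + test set come from end-product-style data
x_dev, x_test, y_dev, y_test = train_test_split(
    webcam_images, webcam_labels, test_size=0.5)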
Also, your preprocessing in the training step does not match the preprocessing in your predict step:
input_arr = input_arr.astype('float32')/255
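In other words, the model was trained on raw pixel values in [0, 255] but is asked to predict on values scaled to [0, 1]. A minimal sketch of making the two consistent, assuming you keep the /255 scaling at predict time and reuse the x_train/x_test arrays from the question:
# Scale the training and test images the same way as the prediction input
x_train = x_train.astype('float32') / 255
x_test = x_test.astype('float32') / 255
model.fit(x_train, y_train, batch_size=64, epochs=10)
model.evaluate(x_test, y_test, verbose=2)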
I have an image dataset that I want to use to train a CNN. I have initialized a "tensorflow.python.data.ops.dataset_ops.DatasetV1Adapter" object, which I understand is essentially an iterator that caches the training images in batches so that the entire dataset need not be loaded at once.
I received this error when calling model.fit():
ValueError: Error when checking input: expected conv2d_input to have 4 dimensions, but got array with
shape (None, 1)
I understand that I need to add a dimension to my model input. I want to add a channels dimension to my images. I have tried np.expand_dims() and tf.expand_dims() on the "DatasetV1Adapter" object, but the former changes the object type and the latter is not supported for that class. Any help is appreciated. Below is my model structure:
model = models.Sequential()
model.add(layers.Conv2D(32, (3, 3), activation='relu', input_shape=(32, 32, 3)))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(64, (3, 3), activation='relu'))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(64, (3, 3), activation='relu'))
model.summary()
model.add(layers.Flatten())
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(10))
history = model.fit(train_data, epochs=10, validation_data=(val_data),steps_per_epoch=x,
validation_steps=y)
I have been following the tutorial listed here, https://www.tensorflow.org/tutorials/load_data/images, but have tried to create and load my own dataset.
Below is my tf pipeline:
BATCH_SIZE = 32
IMG_HEIGHT = 224
IMG_WIDTH = 224
STEPS_PER_EPOCH = np.ceil(image_count/BATCH_SIZE)
data_dir = 'C:\\Users\\rtlum\\Documents\\DataSci_Projects\\PythonTensorFlowProjects\\google-images-download\\images'
list_ds = tf.data.Dataset.list_files(str(data_dir+"*.jpg")) #Make dataset of file paths
class_names = ['sad', 'angry']
size = 0
for count in enumerate(list_ds):
size += 1
val_data_size = size * .2
for f in list_ds.take(5):#test for correct file paths
print(f.numpy())
def get_label(file_path):
# convert the path to a list of path components
parts = tf.strings.split(file_path, os.path.sep)
# The second to last is the class-directory
return parts[-2] == class_names
def decode_img(img):
# convert the compressed string to a 3D uint8 tensor
img = tf.image.decode_jpeg(img, channels=3)
# Use `convert_image_dtype` to convert to floats in the [0,1] range.
img = tf.image.convert_image_dtype(img, tf.float32)
# resize the image to the desired size.
return tf.image.resize(img, [64, 64])
def process_path(file_path):
label = get_label(file_path)
# load the raw data from the file as a string
img = tf.io.read_file(file_path)
img = decode_img(img)
return img, label
# Set `num_parallel_calls` so multiple images are loaded/processed in parallel.
labeled_ds = list_ds.map(process_path)
for image, label in labeled_ds.take(1):
print("Image shape: ", image.numpy().shape)
print("Label: ", label.numpy())
shuffle_buffer_size=1000
def prepare_for_training(ds, cache=True, shuffle_buffer_size=1000):
# This is a small dataset, only load it once, and keep it in memory.
# use `.cache(filename)` to cache preprocessing work for datasets that don't
# fit in memory.
if cache:
if isinstance(cache, str):
ds = ds.cache(cache)
else:
ds = ds.cache()
ds = ds.shuffle(buffer_size=shuffle_buffer_size)
# Repeat forever
ds = ds.repeat()
ds = ds.batch(BATCH_SIZE)
# `prefetch` lets the dataset fetch batches in the background while the model
# is training.
ds = ds.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
return ds
ds = prepare_for_training(list_ds)
val_data = ds.take(int(val_data_size))
train_data = ds.skip(int(val_data_size))
Two problems:
Your wildcard for directory matching appears to be incorrect.
Looking at your code, it seems that your data needs to follow a structure like:
data
|- sad
|- 1.jpg
...
|- angry
|- 1.jpg
...
This is not what you're matching when you write:
tf.data.Dataset.list_files(str(data_dir+"*.jpg"))
It should be:
tf.data.Dataset.list_files(str(data_dir+os.path.sep+"*"+os.path.sep+"*.jpg"))
You are passing the wrong dataset.
You have:
ds = prepare_for_training(list_ds)
It should be:
ds = prepare_for_training(labeled_ds)
Other issues:
You are resizing the data to 64x64, but your model expects 32x32 input.
You have 2 labels, but your model expects 10 classes.
You don't have a model compilation line (i.e. model.compile(...)); see the sketch below.
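A rough sketch of a model definition consistent with the pipeline above (64x64 RGB input from decode_img, 2 output classes, and an explicit compile step); treat the layer sizes and the loss choice as assumptions rather than a prescription:
import tensorflow as tf
from tensorflow.keras import layers, models

model = models.Sequential([
    # Input shape matches the 64x64 RGB images produced by decode_img()
    layers.Conv2D(32, (3, 3), activation='relu', input_shape=(64, 64, 3)),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Flatten(),
    layers.Dense(64, activation='relu'),
    # Two output units, one per class in class_names = ['sad', 'angry']
    layers.Dense(2, activation='softmax'),
])
# get_label() yields a one-hot-style boolean vector, so a categorical loss fits;
# this is an assumption about how the labels end up encoded.
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])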