I am trying to build a VAE to encode movie names and then train it on a machine with 8 GPUs. The model compiles and fits as expected on a single GPU, but breaks when I try to run it on multiple GPUs. Here is the basic code of the autoencoder:
from keras.layers import Input, GRU, RepeatVector, Conv1D, Dense, TimeDistributed, Dropout, MaxPooling1D
from keras.models import Model
from keras.utils import to_categorical, plot_model
from keras.callbacks import ModelCheckpoint
import numpy as np
from keras import backend as K
from keras import metrics
from keras.layers import Lambda, Flatten, Layer
from keras import losses
import tensorflow as tf
import random
# Open file with 20k movie names from imdb
movies = open('/home/ubuntu/MovieNames/data/movies.dat')
data = []
# read data
for line in movies:
    data += [line.split("\t")]
names = [x[1] for x in data]
# get rid of the header
movie_names = names[1:]
chars = list('abcdefghijklmnopqrstuvwxyz ') + ['<END>', '<NULL>']
indices_for_chars = {c: i for i, c in enumerate(chars)}
NAME_MAX_LEN = 35 # include the <END> char
def name_to_vec(name, maxlen=NAME_MAX_LEN):
    name_lowercase = name.lower()  # ignore case
    v = np.zeros(maxlen, dtype=int)
    null_idx = indices_for_chars['<NULL>']
    v.fill(null_idx)
    for i, c in enumerate(name_lowercase):
        if i >= maxlen:
            break
        n = indices_for_chars.get(c, null_idx)
        v[i] = n
    v[min(len(name_lowercase), maxlen-1)] = indices_for_chars['<END>']
    return v
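# Illustrative check (not part of the original script): with the chars list above,
# 'a' -> 0, ..., 'z' -> 25, ' ' -> 26, '<END>' -> 27, '<NULL>' -> 28, so a short
# title is padded with <NULL> and terminated with <END>:
print(name_to_vec('abc', maxlen=6))   # [ 0  1  2 27 28 28]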
# convert to Keras-compatible form
names = np.array([to_categorical(name_to_vec(name),num_classes=len(chars)) for name in movie_names])
# Global parameters
NAME_LENGTH = names.shape[1]
ALPHABET = names.shape[2]
latent_dim = 10 * 8
intermediate_dim = 24 * 8
batch_size = 100 * 8
epochs = 20
epsilon_std = 0.01
i = Input(shape=(NAME_LENGTH, ALPHABET))
x = Conv1D(256, 9)(i)
x = Dropout(0.2)(x)
x = Conv1D(256, 7)(x)
x = MaxPooling1D(pool_size=3)(x)
x = Dropout(0.2)(x)
x = Conv1D(256, 3)(x)
x = Dropout(0.2)(x)
x = Flatten()(x)
x = Dense(intermediate_dim, activation='relu')(x)
x = Dropout(0.2)(x)
z_mean = Dense(latent_dim)(x)
z_log_var = Dense(latent_dim)(x)
def sampling(args):
    z_mean, z_log_var = args
    epsilon = K.random_normal(shape=(batch_size, latent_dim),
                              mean=0., stddev=epsilon_std)
    return z_mean + K.exp(z_log_var) * epsilon
z = Lambda(sampling, output_shape=(latent_dim,))([z_mean, z_log_var])
h = Dense(intermediate_dim, activation='relu')(z)
h = RepeatVector(NAME_LENGTH)(h)
h = GRU(256, return_sequences=True)(h)
h = Dropout(0.2)(h)
h = GRU(256, return_sequences=True)(h)
h = TimeDistributed(Dense(ALPHABET, activation='softmax'), name='decoded_mean')(h)
autoencoder = Model(i, h)
def vae_objective(y_true, y_pred):
    recon = K.sum(K.categorical_crossentropy(y_pred, y_true), axis=1)
    kl = 0.5 * K.sum(K.exp(z_log_var) + K.square(z_mean) - 1. - z_log_var, axis=1)
    return recon + kl
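On a single GPU this compiles and fits as described; for completeness, a hedged sketch of that single-GPU run (mirroring the fit call used later):
autoencoder.compile(optimizer='adam', loss=vae_objective)
autoencoder.fit(names[:8000], names[:8000], batch_size=batch_size, epochs=epochs)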
I then use a multi-GPU utility to parallelize the model by slicing each batch across the GPUs:
from keras import backend as K
from keras.models import Model
from keras.layers import Input
from keras.layers.core import Lambda
from keras.layers.merge import Concatenate
def slice_batch(x, n_gpus, part):
    """
    Divide the input batch into [n_gpus] slices, and obtain slice no. [part].
    i.e. if len(x)=10, then slice_batch(x, 2, 1) will return x[5:].
    """
    sh = K.shape(x)
    L = sh[0] // n_gpus  # integer division, so the slice indices stay integers
    if part == n_gpus - 1:
        return x[part*L:]
    return x[part*L:(part+1)*L]
def to_multi_gpu(model, n_gpus=2):
    """Given a keras [model], return an equivalent model which parallelizes
    the computation over [n_gpus] GPUs.
    Each GPU gets a slice of the input batch, applies the model on that slice
    and later the outputs of the models are concatenated to a single tensor,
    hence the user sees a model that behaves the same as the original.
    """
    with tf.device('/cpu:0'):
        x = Input(model.input_shape[1:], name=model.input_names[0])
    towers = []
    for g in range(n_gpus):
        with tf.device('/gpu:' + str(g)):
            slice_g = Lambda(slice_batch, lambda shape: shape,
                             arguments={'n_gpus': n_gpus, 'part': g})(x)
            towers.append(model(slice_g))
    with tf.device('/cpu:0'):
        merged = Concatenate(axis=0)(towers)
    return Model(inputs=[x], outputs=[merged])
The issue appears when it is time to fit:
model = to_multi_gpu(autoencoder, n_gpus=8)
model.compile(loss=vae_objective, optimizer='adam', metrics=["accuracy"])
model.fit(names[:8000], names[:8000], batch_size=batch_size)
gives me the following error:
InvalidArgumentError: You must feed a value for placeholder tensor 'input_4' with dtype float
[[Node: input_4 = Placeholder[dtype=DT_FLOAT, shape=[], _device="/job:localhost/replica:0/task:0/gpu:0"]()]]
Note that all of the parameters are evenly divisible by the number of GPUs, so I don't expect that to be the problem.
Use
model = to_multi_gpu(autoencoder, n_gpus=8)
model.compile(loss=vae_objective, optimizer='adam', metrics=["accuracy"])
model.fit(names[:8000], names[:8000], batch_size=batch_size*8)
i.e. build the VAE with batch_size (the per-GPU batch), but call fit with batch_size*gpus.
Make sure the number of samples is divisible by batch_size*gpus.
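As a concrete (hedged) sketch of that arithmetic, reusing batch_size and names from the question: the batch passed to fit is the per-GPU batch times the number of GPUs, and the training slice is trimmed to a multiple of it:
n_gpus = 8
global_batch = batch_size * n_gpus                    # what fit() should receive
usable = (len(names) // global_batch) * global_batch  # largest sample count divisible by it
model.fit(names[:usable], names[:usable], batch_size=global_batch)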
I modified the code from here. What I'm trying to do is combine two matrices to predict the output matrix, which is built from the two input matrices. The problem seems to be associated with:
self.Combined_dense_1 = tf.keras.layers.Dense(units=32, activation="relu")
self.Combined_dense_2 = tf.keras.layers.Dense(units=16, activation="softmax")
The linked Medium tutorial only predicts a single number based on the combined mixed input. I, however, am trying to predict a whole matrix, but I don't know how to structure the combined layer (if that is even the problem).
The error: "ValueError: Shape mismatch: The shape of labels (received (40,)) should equal the shape of logits except for the last dimension (received (10, 16))."
The code:
import warnings
import sys
if not sys.warnoptions:
warnings.simplefilter("ignore")
import numpy as np
import os
import random
import tensorflow as tf
from tensorflow import keras
from IPython.display import clear_output
class model(keras.Model):
    def __init__(self):
        super().__init__()
        # The layers to process our image
        self.Conv2D_1 = tf.keras.layers.Conv2D(filters=32,
                                               kernel_size=(1, 1),
                                               strides=(1, 1))
        self.Conv2D_2 = tf.keras.layers.Conv2D(filters=32,
                                               kernel_size=(3, 3),
                                               strides=(1, 1))
        # our combined layers
        self.Combined_dense_1 = tf.keras.layers.Dense(units=32, activation="relu")
        self.Combined_dense_2 = tf.keras.layers.Dense(units=16, activation="softmax")

    def call(self, input_image_one, input_image_two):
        # Image model
        I = self.Conv2D_1(input_image_one)
        I = self.Conv2D_2(I)
        # Flatten I so we can merge our data.
        I = tf.keras.layers.Flatten()(I)
        N = self.Conv2D_1(input_image_two)
        N = self.Conv2D_2(N)
        N = tf.keras.layers.Flatten()(N)
        # Combined model
        x = tf.concat([N, I], 1)  # Concatenate along axis 1
        x = self.Combined_dense_1(x)
        x = self.Combined_dense_2(x)
        return x
network = model()
optimizer = tf.keras.optimizers.Adam()
loss_function = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
def train_step(model, optimizer, loss_function,
               images_one_batch, images_two_batch,
               labels):
    with tf.GradientTape() as tape:
        model_output = model(images_one_batch, images_two_batch)
        print(model_output)
        loss = loss_function(labels, model_output)  # our labels vs our predictions
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    return loss

def train(model, optimizer, loss_function, epochs,
          images_one_batch, images_two_batch,
          labels):
    loss_array = []
    for epoch in range(epochs):
        loss = train_step(model, optimizer, loss_function, images_one_batch, images_two_batch, labels)
        loss_array.append(loss)
        if (epoch + 1) % 20 == 0:
            # Calculating accuracy
            network_output = network(images_one_batch, images_two_batch)
            preds = np.argmax(network_output, axis=1)
            acc = 0
            for i in range(len(images_one_batch)):
                if preds[i] == labels[i]:
                    acc += 1
            print(" loss:", loss, " Accuracy: ", acc / len(images_one_batch) * 100, "%")
            clear_output(wait=True)
NumberofVars = 2
width = NumberofVars
height = NumberofVars
NumberOfComputationSets = 10
CM_MatrixArr1 = []
CM_MatrixArr2 = []
for j in range(NumberOfComputationSets):
    Theta1 = list(np.reshape(np.random.randint(2, size=4), (1, 4))[0])
    Theta1 = list(np.float_(Theta1))
    CM_MatrixArr1.append(Theta1)
    Theta2 = list(np.reshape(np.random.randint(2, size=4), (1, 4))[0])
    Theta2 = list(np.float_(Theta2))
    CM_MatrixArr2.append(Theta2)
combinedCM_MatrixArr = []
combinedCM_toIntArr = []
for x, y in zip(CM_MatrixArr1, CM_MatrixArr2):
    combinedCM = []
    combinedCM_toInt = 0
    for a, b in zip(x, y):
        LogVal = (a == b)
        combinedCM.append(float(LogVal == True))
    combinedCM_MatrixArr.append(combinedCM)
combinedCM_MatrixArr = np.array(combinedCM_MatrixArr)
combinedCM_MatrixArr = combinedCM_MatrixArr.reshape(NumberOfComputationSets,2,2)
CM_MatrixArr1 = np.array(CM_MatrixArr1)
CM_MatrixArr1 = CM_MatrixArr1.reshape(NumberOfComputationSets,2,2)
CM_MatrixArr1 = CM_MatrixArr1.reshape(NumberOfComputationSets, 2,2,1)
CM_MatrixArr2 = np.array(CM_MatrixArr2)
CM_MatrixArr2 = CM_MatrixArr2.reshape(NumberOfComputationSets,2,2)
CM_MatrixArr2 = CM_MatrixArr2.reshape(NumberOfComputationSets, 2,2,1)
train(network,optimizer,loss_function,300,CM_MatrixArr1,CM_MatrixArr2,combinedCM_MatrixArr)
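For reference, the shapes that reach the loss (derived from the reshapes above; shown only to make the error message concrete):
print(CM_MatrixArr1.shape)         # (10, 2, 2, 1)
print(CM_MatrixArr2.shape)         # (10, 2, 2, 1)
print(combinedCM_MatrixArr.shape)  # (10, 2, 2) -> 40 label values vs. logits of shape (10, 16)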
I am using a TF2 (2.3.0) neural network to approximate the function y that solves the ODE y' + 3y = 0.
I have defined a custom loss class and function in which I try to differentiate the single output with respect to the single input, so that the equation holds, given that y_true is zero:
from tensorflow.keras.losses import Loss
import tensorflow as tf
class CustomLossOde(Loss):
    def __init__(self, x, model, name='ode_loss'):
        super().__init__(name=name)
        self.x = x
        self.model = model

    def call(self, y_true, y_pred):
        with tf.GradientTape() as tape:
            tape.watch(self.x)
            y_p = self.model(self.x)
        dy_dx = tape.gradient(y_p, self.x)
        loss = tf.math.reduce_mean(tf.square(dy_dx + 3 * y_pred - y_true))
        return loss
but running the following NN:
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense
from tensorflow.keras import Input
from custom_loss_ode import CustomLossOde
num_samples = 1024
x_train = 4 * (tf.random.uniform((num_samples, )) - 0.5)
y_train = tf.zeros((num_samples, ))
inputs = Input(shape=(1,))
x = Dense(16, 'tanh')(inputs)
x = Dense(8, 'tanh')(x)
x = Dense(4)(x)
y = Dense(1)(x)
model = Model(inputs=inputs, outputs=y)
loss = CustomLossOde(model.input, model)
model.compile(optimizer=Adam(learning_rate=0.01, beta_1=0.9, beta_2=0.99),loss=loss)
model.run_eagerly = True
model.fit(x_train, y_train, batch_size=16, epochs=30)
For now I am getting 0 loss from the first epoch, which doesn't make any sense.
I have printed both y_true and y_pred from within the function and they seem OK, so I suspect that the problem is in the gradient, which I didn't succeed in printing.
I'd appreciate any help.
Defining a custom loss with the high-level Keras API is a bit difficult in this case. I would instead write the training loop from scratch, as it allows finer-grained control over what you can do.
I took inspiration from these two guides:
Advanced Automatic Differentiation
Writing a training loop from scratch
Basically, I used the fact that multiple tapes can interact seamlessly: one computes the loss function, the other calculates the gradients to be propagated by the optimizer.
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense
from tensorflow.keras import Input
num_samples = 1024
x_train = 4 * (tf.random.uniform((num_samples, )) - 0.5)
y_train = tf.zeros((num_samples, ))
inputs = Input(shape=(1,))
x = Dense(16, 'tanh')(inputs)
x = Dense(8, 'tanh')(x)
x = Dense(4)(x)
y = Dense(1)(x)
model = Model(inputs=inputs, outputs=y)
# using the high level tf.data API for data handling
x_train = tf.reshape(x_train,(-1,1))
dataset = tf.data.Dataset.from_tensor_slices((x_train,y_train)).batch(1)
opt = Adam(learning_rate=0.01, beta_1=0.9, beta_2=0.99)
for step, (x, y_true) in enumerate(dataset):
    # we need to convert x to a Variable if we want the tape to be
    # able to compute the gradient with respect to x
    x_variable = tf.Variable(x)
    with tf.GradientTape() as model_tape:
        with tf.GradientTape() as loss_tape:
            loss_tape.watch(x_variable)
            y_pred = model(x_variable)
        dy_dx = loss_tape.gradient(y_pred, x_variable)
        loss = tf.math.reduce_mean(tf.square(dy_dx + 3 * y_pred - y_true))
    grad = model_tape.gradient(loss, model.trainable_variables)
    opt.apply_gradients(zip(grad, model.trainable_variables))
    if step % 20 == 0:
        print(f"Step {step}: loss={loss.numpy()}")
I am trying to use batch normalization layers with a U-Net for a segmentation task. The same layers work fine for ResNet, VGG, Xception, etc., so I'm curious whether this is an architecture-dependent problem. During training everything is fine: metrics grow and losses drop. But once I try to evaluate the model or predict the mask, it generates garbage. It seems like the learned weights for those layers keep updating even during test and prediction. How can I solve this problem in Keras?
keras version = 2.2.2
I tried using batch norm layers only in the encoder part; it doesn't help.
I also tried setting the layers' parameter trainable=False; it doesn't help.
import keras
from keras.models import Input, Model
from keras.layers import Conv2D, Concatenate, MaxPooling2D, Conv2DTranspose
from keras.layers import UpSampling2D, Dropout, BatchNormalization
from keras.optimizers import Adam
from keras.utils import multi_gpu_model
def conv_block(m, dim, res, do=0):
    n = Conv2D(dim, 3, padding='same')(m)
    n = BatchNormalization()(n)
    n = keras.layers.LeakyReLU(0)(n)
    n = Dropout(do)(n) if do else n
    n = Conv2D(dim, 3, padding='same')(n)
    n = BatchNormalization()(n)
    n = keras.layers.LeakyReLU(0)(n)
    return Concatenate()([m, n]) if res else n
def conv_block_bn(m, dim, res, do=0):
    n = Conv2D(dim, 3, padding='same')(m)
    n = BatchNormalization()(n)
    n = keras.layers.LeakyReLU(0)(n)
    n = Dropout(do)(n) if do else n
    n = Conv2D(dim, 3, padding='same')(n)
    n = BatchNormalization()(n)
    n = keras.layers.LeakyReLU(0)(n)
    return Concatenate()([m, n]) if res else n
def level_block(m, dim, depth, inc, do, mp, up, res):
    if depth > 0:
        n = conv_block_bn(m, dim, res)  # (m, dim, acti, bn, res)
        m = MaxPooling2D()(n) if mp else Conv2D(dim, 3, strides=2, padding='same')(n)
        m = level_block(m, int(inc*dim), depth-1, inc, do, mp, up, res)
        if up:
            m = UpSampling2D()(m)
            m = Conv2D(dim, 2, padding='same')(m)
            m = BatchNormalization()(m)
            m = keras.layers.LeakyReLU(0)(m)
        else:
            m = Conv2DTranspose(dim, 3, strides=2, activation='relu', padding='same')(m)
        n = Concatenate()([n, m])
        m = conv_block_bn(n, dim, res)  # (n, dim, acti, bn, res)
    else:
        m = conv_block_bn(m, dim, res, do)  # (m, dim, acti, bn, res, do)
    return m
def UNet(img_shape, out_ch=1, start_ch=64, depth=4, inc_rate=2., activation='relu',
         dropout=0.5, batchnorm=False, maxpool=True, upconv=True, residual=False):
    i = Input(shape=img_shape)
    o = level_block(i, start_ch, depth, inc_rate, dropout, maxpool, upconv, residual)
    o = Conv2D(out_ch, 1, activation='sigmoid')(o)
    return Model(inputs=i, outputs=o)

model1 = UNet((512, 512, 1), out_ch=1, start_ch=64, depth=4, inc_rate=2.,
              dropout=0.5, maxpool=True, upconv=True, residual=False)
model1 = multi_gpu_model(model1, gpus=6)
# custom_losses and dice_coef are user-defined (not shown in the question)
model1.compile(Adam(lr=3.5e-6), loss=custom_losses, metrics=[dice_coef])
When predicting outputs after training you must call your model with:
Option1:
prediction = trained_model(input, training=False)
Option2:
prediction = trained_model.call(input, training=False)
Option3:
prediction = trained_model.predict(input)
The reason is that layers such as BatchNormalization and Dropout behave differently during training. Hence, you must tell the model that you are running inference, not training.
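A minimal sketch of those options applied to the U-Net above (assumptions: x_val is a hypothetical validation batch shaped (6, 512, 512, 1), and the training=False call form requires tf.keras running eagerly):
import numpy as np

x_val = np.random.rand(6, 512, 512, 1).astype('float32')  # placeholder batch, for illustration only

masks = model1.predict(x_val)            # Option 3: predict() always runs in inference mode
# masks = model1(x_val, training=False)  # Option 1: explicit inference flag (tf.keras, eager execution)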
Try batch normalization on each layer and also eliminate the dropout layers; see what you get. For prediction, also set the flag training=False.
I'm new to machine learning and trying to fit a sample data set with neural networks in Python using TensorFlow. After implementing the neural network in Dymola, I want to compare the outputs of the function with those from the neural network.
The sample data set is:
import tensorflow as tf
from keras import metrics
import numpy as np
from keras.models import *
from keras.layers import Dense, Dropout
from keras import optimizers
from keras.callbacks import *
import scipy.io as sio
import mat4py as m4p
inputs = np.linspace(0, 15, num=3000)
outputs = 1/7 * ((inputs/5)**3 - (inputs/3)**2 + 5)
Inputs and outputs are then scaled into the interval [0; 0.9]:
inputs_max = np.max(inputs)
inputs_min = np.min(inputs)
outputs_max = np.max(outputs)
outputs_min = np.min(outputs)
upper_bound = 0.9
lower_bound = 0
m_in = (upper_bound - lower_bound) / (inputs_max - inputs_min)
c_in = upper_bound - (m_in * inputs_max)
scaled_in = m_in * inputs + c_in
m_out = (upper_bound - lower_bound) / (outputs_max - outputs_min)
c_out = upper_bound - (m_out * outputs_max)
scaled_out = m_out * outputs + c_out
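For the inputs in [0, 15], these mappings work out to m_in = 0.9/15 = 0.06 and m_out ≈ 1.2173, c_out ≈ -0.3173, which is where the constants in the Dymola code further down come from (a quick check, not part of the original script):
print(m_in)          # 0.06 -> inputs = scaled_in / 0.06 inverts the input scaling
print(m_out, c_out)  # ~1.2173 and ~-0.3173 -> the constants used for scaled_outputsfunc in Dymola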
and after that the neural network is trained with:
# shuffle values
def shuffle_in_unison(a, b):
    assert len(a) == len(b)
    shuffled_a = np.empty(a.shape, dtype=a.dtype)
    shuffled_b = np.empty(b.shape, dtype=b.dtype)
    permutation = np.random.permutation(len(a))
    for old_index, new_index in enumerate(permutation):
        shuffled_a[new_index] = a[old_index]
        shuffled_b[new_index] = b[old_index]
    return shuffled_a, shuffled_b
tf_features_64 = scaled_in
tf_labels_64 = scaled_out
tf_features_32 = tf_features_64.astype(np.float32)
tf_labels_32 = tf_labels_64.astype(np.float32)
X = tf_features_32
Y = tf_labels_32
X, Y = shuffle_in_unison(X, Y)  # the function returns shuffled copies, so assign them
# define callbacks
filepath = "weights-improvement-{epoch:02d}-{val_loss:.2f}.hdf5"
savebestCallBack = ModelCheckpoint(filepath, monitor='val_loss', verbose=1,
save_best_only=True, save_weights_only=False, mode='auto', period=1)
tbCallBack = TensorBoard(log_dir='./Graph',
histogram_freq=5,
write_graph=True,
write_images=True)
esCallback = EarlyStopping(monitor='val_loss',
min_delta=0,
patience=500,
verbose=0,
mode='min')
# neural network architecture
visible = Input(shape=(1,))
x = Dense(40, activation='tanh')(visible)
x = Dense(39, activation='tanh')(x)
x = Dense(38, activation='tanh')(x)
x = Dense(30, activation='tanh')(x)
output = Dense(1)(x)
# setup optimizer
Optimizer = optimizers.adam(lr=0.0007, amsgrad=True)
model = Model(inputs=visible, outputs=output)
model.compile(optimizer=Optimizer,
loss=['mse'],
metrics=['mae', 'mse']
)
model.fit(X, Y, epochs=1000, batch_size=1, verbose=1,
shuffle=True, validation_split=0.05, callbacks=[tbCallBack, esCallback])
# return weights
weights1 = model.layers[1].get_weights()[0]
biases1 = model.layers[1].get_weights()[1]
print('Layer1---------------------------------------------------------------------------------------------------------')
print('weights1:')
print(repr(weights1.transpose()))
print('biases1:')
print(repr(biases1))
w1 = weights1.transpose()
b1 = biases1.transpose()
we1 = {'w1' : w1.tolist()}
bi1 = {'b1' : b1.tolist()}
.........
......
Later on, I implemented the trained neural network in the program "Dymola" by loading the weights and biases into pre-configured "neural network base classes" (which have been used several times and are working).
// Modelica code for Dymola:
Real inputs;
Real outputs;
Real scaled_outputs;
Real scaled_inputs(start=0);
Real scaled_outputsfunc;
der(scaled_inputs) = 0.9;
//part of the neural network implementation in Dymola
NeuralNetwork.BaseClasses.NeuralNetworkLayer neuralNetworkLayer1(
NeuronActivationFunction=NeuralNetwork.Types.ActivationFunction.TanSig,
numInputs=1,
numNeurons=40,
weightTable=[-0.367953330278397; ......])
annotation (Placement(transformation(extent={{-76,22},{-56,42}})));
//scaled inputs
neuralNetworkLayer1.u[1] = scaled_inputs;
//scaled outputs
neuralNetworkLayer5.y[1]= scaled_outputs;
//scaled_inputs = 0.06 * inputs
inputs = 1/0.06 * (scaled_inputs);
outputs = 1/875 * inputs^3 - 1/63 * inputs^2 + 5/7;
scaled_outputsfunc = 1.2173139581825052 * outputs - 0.3173139581825052;
When plotting and comparing the scaled outputs of the function with the (scaled) values returned by the neural network, I noticed that the approximation is very good on the interval [0.5, 0.8], but the closer the inputs get to the boundaries, the worse the approximation becomes.
Unfortunately, I have no clue why this is happening and how to fix this issue. I'd be very glad if someone could help me.
I want to answer my own question: I forgot to specify the activation function for the output layer in my Python code, so Keras set it to a linear activation by default; see also:
https://keras.io/layers/core/
In Dymola, where my ANN was implemented, 'tanh' was the activation function in the last layer, which led to a divergence near the boundaries.
The correct python code for this application must be:
visible = Input(shape=(1,))
x = Dense(40, activation='tanh')(visible)
x = Dense(39, activation='tanh')(x)
x = Dense(38, activation='tanh')(x)
x = Dense(30, activation='tanh')(x)
output = Dense(1, activation='tanh')(x)
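Since the targets were scaled into [0, 0.9], they lie inside tanh's output range of (-1, 1), so adding the tanh activation on the output layer does not limit what the network can represent (a quick check, not in the original post):
print(scaled_out.min(), scaled_out.max())   # both within (-1, 1), so a tanh output can reach them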
I am attempting to build a Conditional GAN model based on Jacob's keras-dcgan code (https://github.com/jacobgil/keras-dcgan).
The model architecture I assumed is the one pictured in the original paper:
http://cs231n.stanford.edu/reports/2015/pdfs/jgauthie_final_report.pdf
For the generator, I insert the condition (a bunch of one-hot vectors in this case) by first concatenating it with the noise, then feeding the concatenation through the generator.
For the discriminator, I insert the condition by concatenating it with a flattened layer in the middle of the model.
My code runs, but it generates random-looking images instead of specific digits. Which step is wrong? Did I not insert the condition appropriately?
My result after running approximately 5500 iterations:
Code:
import warnings
warnings.filterwarnings('ignore')
from keras import backend as K
from keras.models import Sequential
from keras.layers import Dense, Input, merge
from keras.layers import Reshape, concatenate
from keras.layers.core import Activation
from keras.models import Model
from keras.layers.normalization import BatchNormalization
from keras.layers.convolutional import UpSampling2D
from keras.layers.convolutional import Convolution2D, MaxPooling2D
from keras.layers.core import Flatten
from keras.optimizers import SGD
from keras.datasets import mnist
import numpy as np
import tensorflow as tf
from PIL import Image
import argparse
import math
K.set_image_dim_ordering('th')
# based on the labels below, we create an array of 100 one-hot vectors (10 per digit) and call it y_prime
labels = np.array([0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7,7,7,8,8,8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9,9,9])
def dense_to_one_hot(labels_dense, num_classes=10):
    """Convert class labels from scalars to one-hot vectors."""
    num_labels = labels_dense.shape[0]
    index_offset = np.arange(num_labels) * num_classes
    labels_one_hot = np.zeros((num_labels, num_classes))
    labels_one_hot.flat[index_offset + labels_dense.ravel()] = 1
    return labels_one_hot
# y_dim is the number of classes in the one-hot vectors, hence it's 10
# y_prime is a 100x10 matrix, and len(y_p) = 100. Note that len(y_prime) must equal batch_size for the matrices to be properly concatenated
# Also y_dim = 10, which is the size of any one-hot vector
y_p = dense_to_one_hot(labels)
y_size = len(y_p)
y_dim = len(y_p[0])
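# Quick shape check (illustrative, not in the original script): y_p stacks one
# one-hot row per entry of `labels` above.
print(y_p.shape)       # (100, 10)
print(y_size, y_dim)   # 100 10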
#g_inputs is the input for generator
#auxiliary_input is the condition
#d_inputs is the input for discriminator
g_inputs = (Input(shape=(100,), dtype='float32'))
auxiliary_input = (Input(shape=(y_dim,), dtype='float32'))
d_inputs = (Input(shape=(1,28,28), dtype='float32'))
def generator_model():
    T = concatenate([g_inputs, auxiliary_input])
    T = Dense(1024)(T)
    T = Dense(128*7*7)(T)
    T = BatchNormalization()(T)
    T = Activation('tanh')(T)
    T = Reshape((128, 7, 7), input_shape=(128*7*7,))(T)
    T = UpSampling2D(size=(2, 2))(T)
    T = Convolution2D(64, 5, 5, border_mode='same')(T)
    T = BatchNormalization()(T)
    T = Activation('tanh')(T)
    T = UpSampling2D(size=(2, 2))(T)
    T = Convolution2D(1, 5, 5, border_mode='same')(T)
    T = BatchNormalization()(T)
    T = Activation('tanh')(T)
    model = Model(input=[g_inputs, auxiliary_input], output=T)
    return model
def discriminator_model():
    T = Convolution2D(filters=64, kernel_size=(5, 5), padding='same')(d_inputs)
    T = BatchNormalization()(T)
    T = Activation('tanh')(T)
    T = MaxPooling2D(pool_size=(2, 2))(T)
    T = Convolution2D(128, 5, 5)(T)
    T = BatchNormalization()(T)
    T = Activation('tanh')(T)
    T = MaxPooling2D(pool_size=(2, 2))(T)
    T = Flatten()(T)
    T = concatenate([T, auxiliary_input])
    T = Dense(1024)(T)
    T = Activation('tanh')(T)
    T = Dense(1)(T)
    T = Activation('sigmoid')(T)
    model = Model(input=[d_inputs, auxiliary_input], output=T)
    return model
def generator_containing_discriminator(generator, discriminator):
    T1 = generator([g_inputs, auxiliary_input])
    discriminator.trainable = False
    T2 = discriminator([T1, auxiliary_input])
    model = Model(input=[g_inputs, auxiliary_input], output=T2)
    return model
def combine_images(generated_images):
    num = generated_images.shape[0]
    width = int(math.sqrt(num))
    height = int(math.ceil(float(num)/width))
    shape = generated_images.shape[2:]
    image = np.zeros((height*shape[0], width*shape[1]), dtype=generated_images.dtype)
    for index, img in enumerate(generated_images):
        i = int(index/width)
        j = index % width
        image[i*shape[0]:(i+1)*shape[0], j*shape[1]:(j+1)*shape[1]] = img[0, :, :]
    return image
def train(BATCH_SIZE, y_prime):
    (X_train, y_train), (X_test, y_test) = mnist.load_data()
    X_train = (X_train.astype(np.float32) - 127.5)/127.5
    X_train = X_train.reshape((X_train.shape[0], 1) + X_train.shape[1:])
    discriminator = discriminator_model()
    generator = generator_model()
    discriminator_on_generator = generator_containing_discriminator(generator, discriminator)
    d_optim = SGD(lr=0.0005, momentum=0.9, nesterov=True)
    g_optim = SGD(lr=0.0005, momentum=0.9, nesterov=True)
    generator.compile(loss='binary_crossentropy', optimizer="SGD")
    discriminator_on_generator.compile(loss='binary_crossentropy', optimizer=g_optim)
    discriminator.trainable = True
    discriminator.compile(loss='binary_crossentropy', optimizer=d_optim)
    noise = np.zeros((BATCH_SIZE, 100))
    for epoch in range(100):
        print("Epoch is", epoch)
        print("Number of batches", int(X_train.shape[0]/BATCH_SIZE))
        for index in range(int(X_train.shape[0]/BATCH_SIZE)):
            for i in range(BATCH_SIZE):
                noise[i, :] = np.random.uniform(-1, 1, 100)
            image_batch = X_train[index*BATCH_SIZE:(index+1)*BATCH_SIZE]
            y_batch = dense_to_one_hot(y_train[index*BATCH_SIZE:(index+1)*BATCH_SIZE])
            y_batch = np.concatenate((y_batch, y_prime))
            generated_images = generator.predict([noise, y_prime], verbose=0)
            if index % 20 == 0:
                image = combine_images(generated_images)
                image = image*127.5 + 127.5
                Image.fromarray(image.astype(np.uint8)).save(str(epoch)+"_"+str(index)+".png")
            X = np.concatenate((image_batch, generated_images))
            y = [1] * BATCH_SIZE + [0] * BATCH_SIZE
            d_loss = discriminator.train_on_batch([X, y_batch], y)
            print("batch %d d_loss : %f" % (index, d_loss))
            for i in range(BATCH_SIZE):
                noise[i, :] = np.random.uniform(-1, 1, 100)
            discriminator.trainable = False
            g_loss = discriminator_on_generator.train_on_batch([noise, y_prime], [1] * BATCH_SIZE)
            discriminator.trainable = True
            print("batch %d g_loss : %f" % (index, g_loss))
            if index % 10 == 9:
                generator.save_weights('generator', True)
                discriminator.save_weights('discriminator', True)
train(100,y_p)
Here is my code for building Conditional GAN (CGAN) with Keras: https://github.com/hklchung/GAN-GenerativeAdversarialNetwork/tree/master/CGAN
After 5 epochs on MNIST I get this:
[image: MNIST CGAN output]
and after 50 epochs on the CelebA dataset:
[image: CelebA CGAN output]
My experience is that if you don't see any good results after 20 epochs, something is wrong with your model and training it any longer won't improve your image quality.