Environment:
TF2.0
Python 3.5
ubuntu 16.04
Problem:
I am trying to use the pre-trained MobileNetV2, but the accuracy does not increase:
base_model = tf.keras.applications.MobileNetV2(input_shape=IMG_SHAPE,
                                               include_top=False,
                                               weights='imagenet')
The script is copied from the TensorFlow 2.0 transfer-learning tutorial (https://www.tensorflow.org/tutorials/images/transfer_learning?hl=zh-cn).
The only change I made is the dataset fed into the network. The original code does binary classification between dogs and cats, and everything works. However, the accuracy never increases when I use multi-class datasets such as "mnist" or "tf_flowers". Please note that I used the correct loss function and metrics.
Naive model and results:
Keras.mobilenetv2:
Here is the code:
from __future__ import absolute_import, division, print_function, unicode_literals
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Flatten, Conv2D, GlobalAveragePooling2D
from tensorflow.keras import Model
keras = tf.keras
import tensorflow_datasets as tfds
# tfds.disable_progress_bar()
IMG_SIZE = 224
IMG_SHAPE = (IMG_SIZE, IMG_SIZE, 3)
def format_example(image, label):
    if image.shape[-1] == 1:
        image = tf.concat([image, image, image], 2)
    image = tf.cast(image, tf.float32)
    image = (image/127.5) - 1
    image = tf.image.resize(image, (IMG_SIZE, IMG_SIZE))
    return image, label
##----functional model----##
class TinyModel():
    def __init__(self, num_classes, hiddens=32, input_shape=IMG_SHAPE):
        import tensorflow as tf
        self.num_classes = num_classes
        self.input_shape = input_shape
        self.hiddens = hiddens

    def build(self):
        inputs = Input(shape=self.input_shape)
        x = Conv2D(16, 3, activation="relu", strides=2)(inputs)
        x = Conv2D(32, 3, activation="relu", strides=2)(x)
        x = Conv2D(32, 3, activation="relu", strides=2)(x)
        x = Conv2D(16, 3, activation="relu")(x)
        x = Flatten()(x)
        x = Dense(self.hiddens, activation="relu")(x)
        outputs = Dense(self.num_classes, activation="softmax")(x)
        model = Model(inputs=inputs, outputs=outputs, name='my_model')
        return model
def assemble_model(num_classes, model_name='MobileNetV2'):
    import tensorflow as tf
    base_model = tf.keras.applications.MobileNetV2(input_shape=IMG_SHAPE,
                                                   weights='imagenet',
                                                   include_top=False)
    model = tf.keras.Sequential([
        base_model,
        GlobalAveragePooling2D(),
        Dense(num_classes, activation='softmax')
    ])
    model.trainable = True
    return model
## ---- dataset preparation -----##
SPLIT_WEIGHTS = (8, 1, 1)
splits = tfds.Split.TRAIN.subsplit(weighted=SPLIT_WEIGHTS)
(raw_train, raw_validation, raw_test), metadata = tfds.load(
    'tf_flowers', split=list(splits),
    with_info=True, as_supervised=True)
get_label_name = metadata.features['label'].int2str
train = raw_train.map(format_example)
validation = raw_validation.map(format_example)
test = raw_test.map(format_example)
BATCH_SIZE = 32
SHUFFLE_BUFFER_SIZE = 1000
train_ds = train.shuffle(SHUFFLE_BUFFER_SIZE).batch(BATCH_SIZE)
validation_ds = validation.batch(BATCH_SIZE)
test_ds = test.batch(BATCH_SIZE)
IMG_SHAPE = (IMG_SIZE, IMG_SIZE, 3)
## ----- model config ---- ##
# Create an instance of the model
model = TinyModel(num_classes=5).build() # model 1
# model = assemble_model(num_classes=5) # model 2
model.summary()
## ----- training config -----##
loss_object = tf.keras.losses.SparseCategoricalCrossentropy()
optimizer = tf.keras.optimizers.Adam()
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')
test_loss = tf.keras.metrics.Mean(name='test_loss')
test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='test_accuracy')
## ----- training loop -----##
#tf.function
def train_step(images, labels):
    with tf.GradientTape() as tape:
        predictions = model(images)
        loss = loss_object(labels, predictions)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    train_loss(loss)
    train_accuracy(labels, predictions)
#tf.function
def test_step(images, labels):
    predictions = model(images)
    t_loss = loss_object(labels, predictions)
    test_loss(t_loss)
    test_accuracy(labels, predictions)
EPOCHS = 5
for epoch in range(EPOCHS):
    # Reset the metrics at the start of the next epoch
    train_loss.reset_states()
    train_accuracy.reset_states()
    test_loss.reset_states()
    test_accuracy.reset_states()

    for images, labels in train_ds:
        train_step(images, labels)

    for test_images, test_labels in test_ds:
        test_step(test_images, test_labels)

    template = 'Epoch {}, Loss: {}, Accuracy: {}, Test Loss: {}, Test Accuracy: {}'
    print(template.format(epoch+1,
                          train_loss.result(),
                          train_accuracy.result()*100,
                          test_loss.result(),
                          test_accuracy.result()*100))
----------------------SOLVED-----------------------
Solution: add the argument "training=True" when calling the keras.applications model during training. For example:
model = tf.keras.applications.MobileNetV2(input_shape=IMG_SHAPE, weights="imagenet", include_top=False)
pred = model(inputs, training=True)
The cause is most likely the BatchNormalization layers. Models that contain BN layers work well with the built-in Keras training loop, model.fit(), with nothing extra to take care of. However, they cannot learn anything in a custom training loop if you forget to set training=True in the model(...) call.
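For reference, a minimal sketch of how the fix looks inside the custom training loop above (model, loss_object, optimizer and the metric objects are the ones defined earlier):

def train_step(images, labels):
    with tf.GradientTape() as tape:
        # training=True puts BatchNormalization (and any Dropout) layers into
        # training mode, so they normalize with batch statistics and update
        # their moving averages
        predictions = model(images, training=True)
        loss = loss_object(labels, predictions)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    train_loss(loss)
    train_accuracy(labels, predictions)

def test_step(images, labels):
    # training=False (the default) makes BatchNormalization use its moving statistics
    predictions = model(images, training=False)
    test_loss(loss_object(labels, predictions))
    test_accuracy(labels, predictions)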
The problem is that you set all your parameters to be non-trainable. Check this in the summary of the model; you will see something like this:
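As an illustration, the end of model.summary() for a model with a frozen base looks roughly like the lines below; the values are placeholders, not actual figures, but almost everything is reported as non-trainable:

Total params:         <all weights in the model>
Trainable params:     <only the small classification head>
Non-trainable params: <everything inside the frozen base_model>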
Change this line (or just delete it):
base_model.trainable = False
To
base_model.trainable = True
And everything will work fine
Related
I was comparing the loss of two simple MLP models, with and without dropout, on both the TF/Keras and PyTorch frameworks (on the Keras IMDB dataset). With PyTorch I am not getting the results I hoped for, and I am wondering what I am doing incorrectly.
# Keras - IMDB Dataset
model = Sequential()
model.add(Dense(16, activation = "relu", input_shape= (10000,)))
model.add(Dropout(0.5)) # comment out this line for no dropout model
model.add(Dense(16, activation = "relu"))
model.add(Dropout(0.5)) # comment out this line for no dropout model
model.add(Dense(1, activation = "sigmoid"))
model.compile(
    optimizer = "rmsprop",
    loss = "binary_crossentropy",
    metrics = ["accuracy"]
)
history = model.fit(
    X_train,
    y_train,
    epochs = 20,
    batch_size = 512,
    validation_data = (X_val, y_val)
)
The results I obtained in keras (Left figure without dropout and right with dropout)
# Pytorch - same IMDB dataset from keras
class MLP(nn.Module):
    def __init__(self, in_dims, l1, l2, out_dims):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(in_dims, l1)
        self.fc2 = nn.Linear(l1, l2)
        self.fc3 = nn.Linear(l2, out_dims)
        self.dropout = nn.Dropout(p=0.5)

    def forward(self, X):
        out = F.relu(self.fc1(X))
        out = self.dropout(out)  # comment out this line for no dropout model
        out = F.relu(self.fc2(out))
        out = self.dropout(out)  # comment out this line for no dropout model
        out = F.sigmoid(self.fc3(out))
        return out
model = MLP(10000, 16, 16, 1)
optimizer = optim.RMSprop(model.parameters(), lr = 0.001)
criterion = nn.BCELoss()
min_val_loss = np.inf
losses = []
val_losses = []
accuracy = []
val_accuracy = []
for e in range(0, 20):
    running_loss = 0
    for i, (X_train, y_train) in enumerate(train_loader):
        yhat = model.forward(X_train)
        loss = criterion(yhat.flatten(), y_train)
        running_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    losses.append(running_loss / (i+1))  # note it's i+1 since i starts from 0

    model.eval()
    with torch.no_grad():
        running_val_loss = 0
        for i, (X_val, y_val) in enumerate(val_loader):
            yhat_val = model.forward(X_val)
            val_loss = criterion(yhat_val.flatten(), y_val)
            running_val_loss += val_loss.item()
        val_losses.append(running_val_loss / (i + 1))

    if val_loss < min_val_loss:
        best_params = model.state_dict()
        min_val_loss = val_loss

    print(f"epochs : {e}, train_loss : {loss}, val_loss : {val_loss}")
The figure on the left is the result from the no-dropout model, which behaves similarly to the Keras model. However, the one with dropout does not show the same behaviour.
I come from a medical background and am a newbie in machine learning. I am trying to train my U-Net model using Keras and TensorFlow for image segmentation. However, my loss value is all NaN and the prediction is all black.
I would like to check the U-Net layer by layer, but I don't know how to feed the data or where to start. What I mean by checking each layer is that I want to feed my images to the first layer, see the output of the first layer, then move on to the second layer, and so on until the last layer. I just want to see how the output is produced at each layer and to find where the NaN values start. I really appreciate your help.
These are my codes.
import os
import matplotlib.pyplot as plt
import tensorflow as tf
from keras_preprocessing.image import ImageDataGenerator
from tensorflow import keras
#Constants
SEED = 42
BATCH_SIZE_TRAIN = 16
BATCH_SIZE_TEST = 16
IMAGE_HEIGHT = 512
IMAGE_WIDTH = 512
IMG_SIZE = (IMAGE_HEIGHT, IMAGE_WIDTH)
data_dir = 'data'
data_dir_train = os.path.join(data_dir, 'training')
data_dir_train_image = os.path.join(data_dir_train, 'img')
data_dir_train_mask = os.path.join(data_dir_train, 'mask')
data_dir_test = os.path.join(data_dir, 'test')
data_dir_test_image = os.path.join(data_dir_test, 'img')
data_dir_test_mask = os.path.join(data_dir_test, 'mask')
NUM_TRAIN = 1413
NUM_TEST = 210
NUM_OF_EPOCHS = 10
def create_segmentation_generator_train(img_path, mask_path, BATCH_SIZE):
    data_gen_args = dict(rescale=1./255)
    img_datagen = ImageDataGenerator(**data_gen_args)
    mask_datagen = ImageDataGenerator(*data_gen_args)
    img_generator = img_datagen.flow_from_directory(img_path, target_size=IMG_SIZE, class_mode=None, color_mode='grayscale', batch_size=BATCH_SIZE, seed=SEED)
    mask_generator = mask_datagen.flow_from_directory(mask_path, target_size=IMG_SIZE, class_mode=None, color_mode='grayscale', batch_size=BATCH_SIZE, seed=SEED)
    return zip(img_generator, mask_generator)

def create_segmentation_generator_test(img_path, mask_path, BATCH_SIZE):
    data_gen_args = dict(rescale=1./255)
    img_datagen = ImageDataGenerator(**data_gen_args)
    mask_datagen = ImageDataGenerator(*data_gen_args)
    img_generator = img_datagen.flow_from_directory(img_path, target_size=IMG_SIZE, class_mode=None, color_mode='grayscale', batch_size=BATCH_SIZE, seed=SEED)
    mask_generator = mask_datagen.flow_from_directory(mask_path, target_size=IMG_SIZE, class_mode=None, color_mode='grayscale', batch_size=BATCH_SIZE, seed=SEED)
    return zip(img_generator, mask_generator)
def display(display_list):
    plt.figure(figsize=(15,15))
    title = ['Input Image', 'True Mask', 'Predicted Mask']
    for i in range(len(display_list)):
        plt.subplot(1, len(display_list), i+1)
        plt.title(title[i])
        plt.imshow(tf.keras.preprocessing.image.array_to_img(display_list[i]), cmap='gray')
    plt.show()

def show_dataset(datagen, num=1):
    for i in range(0, num):
        image, mask = next(datagen)
        display([image[0], mask[0]])
def unet(n_levels, initial_features=32, n_blocks=2, kernel_size=3, pooling_size=2, in_channels=1, out_channels=1):
    # n_blocks = how many convs in each level
    inputs = keras.layers.Input(shape=(IMAGE_HEIGHT, IMAGE_WIDTH, in_channels))
    x = inputs
    convpars = dict(kernel_size=kernel_size, activation='relu', padding='same')
    # downstream
    skips = {}
    for level in range(n_levels):
        for _ in range(n_blocks):
            x = keras.layers.Conv2D(initial_features * 2 ** level, **convpars)(x)
        if level < n_levels - 1:
            skips[level] = x
            x = keras.layers.MaxPool2D(pooling_size)(x)
    # upstream
    for level in reversed(range(n_levels-1)):
        x = keras.layers.Conv2DTranspose(initial_features * 2 ** level, strides=pooling_size, **convpars)(x)
        x = keras.layers.Concatenate()([x, skips[level]])
        for _ in range(n_blocks):
            x = keras.layers.Conv2D(initial_features * 2 ** level, **convpars)(x)
    # output
    activation = 'sigmoid' if out_channels == 1 else 'softmax'
    x = keras.layers.Conv2D(out_channels, kernel_size=1, activation='sigmoid', padding='same')(x)
    return keras.Model(inputs=[inputs], outputs=[x], name=f'UNET-L{n_levels}-F{initial_features}')
EPOCH_STEP_TRAIN = NUM_TRAIN // BATCH_SIZE_TRAIN
EPOCH_STEP_TEST = NUM_TEST // BATCH_SIZE_TRAIN
model = unet(4)
model.compile(optimizer="adam", loss='binary_crossentropy', metrics=['accuracy'])
model.fit_generator(generator=train_generator, steps_per_epoch=EPOCH_STEP_TRAIN, validation_data=test_generator, validation_steps=EPOCH_STEP_TEST, epochs=NUM_OF_EPOCHS)
def show_prediction(datagen, num=1):
    for i in range(0, num):
        image, mask = next(datagen)
        pred_mask = model.predict(image)[0] > 0.5
        display([image[0], mask[0], pred_mask])
show_prediction(test_generator, 2)
To investigate your model layer by layer, please see the example below of how to show a summary of the model and also how to save the model:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# create the input
inputs = keras.Input(shape=(1,))
# create a layer
dense = layers.Dense(64, activation="relu")
x = dense(inputs)
x = layers.Dense(64, activation="relu")(x)
outputs = layers.Dense(10)(x)
# assemble the model
model = keras.Model(inputs=inputs, outputs=outputs, name="Spesiaali")
# inspect it
model.summary()
# save it
model.save(".\model_to_be_investigated_by_someone_else_to_help_you")
...this makes it possible for you to see the whole model structure for "debugging your AI". If you do not find the solution yourself, add the last line of the example to your own code, put the resulting folder e.g. on GitHub, and ask someone else to look at the structure of your model to help you solve the problem.
The blue drawing illustrates the output of command model.summary() and the red line illustrates the output shape of the first dense layer.
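As a complementary, hedged sketch of what the question asks for (inspecting the output of every layer to find where the NaNs first appear), you can build a helper Keras model that exposes all intermediate activations. This assumes the trained U-Net is in the variable model and that train_generator from the question's code yields (image, mask) batches:

import numpy as np

# grab one batch of images from the existing generator
image_batch, _ = next(train_generator)

# a debug model whose outputs are every layer's activations (skip the InputLayer)
debug_model = keras.Model(inputs=model.input,
                          outputs=[layer.output for layer in model.layers[1:]])

activations = debug_model.predict(image_batch)
for layer, act in zip(model.layers[1:], activations):
    print(f"{layer.name:30s} shape={act.shape} contains NaN: {np.isnan(act).any()}")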
I am trying to implement the code from a Pytorch beginner's tutorial. But I have written the code for loading the saved model in another Python file.
The FashionClassify file contains the code exactly as it is in the tutorial.
Below is the code:
from FashionClassify import NeuralNetwork
from FashionClassify import test_data
import torch
model = NeuralNetwork()
model.load_state_dict(torch.load("model.pth"))
classes = [
    "T-shirt/top", "Trouser", "Pullover", "Dress", "Coat", "Sandal", "Shirt", "Sneaker", "Bag", "Ankle boot",
]
model.eval()
x, y = test_data[0][0], test_data[0][1]
with torch.no_grad():
    pred = model(x)
    predicted, actual = classes[pred[0].argmax(0)], classes[y]
    print(f'Predicted: "{predicted}", Actual: "{actual}"')
However, when I run this, the entire training process starts again. Why is that so? Or is this expected behavior?
(I have gone through a couple of webpages and StackOverflow answers but couldn't find my problem)
FashionClassify file code:
import torch
from torch import nn
from torch.utils.data import DataLoader # wraps an iterable around dataset
from torchvision import datasets # stores samples and their label
from torchvision.transforms import ToTensor, Lambda, Compose
import matplotlib as plt
training_data = datasets.FashionMNIST(root='data', train=True, download=True, transform=ToTensor(), )
test_data = datasets.FashionMNIST(root='data', train=False, download=True, transform=ToTensor(), )
batch_size = 64
train_dataLoader = DataLoader(training_data, batch_size=batch_size)
test_dataLoader = DataLoader(test_data, batch_size=batch_size)
for X, y in test_dataLoader:
    print('Shape of X [N,C,H,W]:', X.size())
    print('Shape of y:', y.shape, y.dtype)
    break
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('Using {} device'.format(device))
# to define a NN, we inherit a class from nn.Module
class NeuralNetwork(nn.Module):
    def __init__(self):
        # will specify how data will proceed in the forward pass
        super(NeuralNetwork, self).__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(28 * 28, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 10),
            nn.ReLU()
        )

    def forward(self, x):
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits
model = NeuralNetwork().to(device)
print(model)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
def train(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)

        # compute prediction error
        pred = model(X)
        loss = loss_fn(pred, y)

        # backprop
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 100 == 0:
            loss, current = loss.item(), batch * len(X)
            print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]")
def test(dataloader, model):
    size = len(dataloader.dataset)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    test_loss /= size
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
epochs = 5
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train(train_dataLoader, model, loss_fn, optimizer)
    test(test_dataLoader, model)
print("Done!")
torch.save(model.state_dict(), "model.pth")
print("Saved PyTorch Model State to model.pth")
That's what happens when you import another file: all of its module-level code gets rerun.
Instead, in your training file:

class FancyNetwork(nn.Module):
    [...]

def train():
    [train code]

if __name__ == "__main__":
    train()

Now when you run this file directly, train() gets called, but when you import this file from another one, train() won't be called automatically.
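A tiny, self-contained illustration of the guard (the file names are hypothetical, not the asker's actual code):

# trainer.py
print("loading datasets...")      # module-level code: runs on import AND on direct execution

def train():
    print("training model...")    # the heavy work goes here

if __name__ == "__main__":
    train()                       # runs only for `python trainer.py`

# another_file.py
# from trainer import train      # prints "loading datasets..." but does NOT train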
I tried to make a copy of a neural network in pytorch and subsequently train the copied network, but training does not seem to change the weights in the network after copying. This post suggests that deepcopy is a convenient way to make a copy of a neural network, so I tried using that in my code.
The code below works just fine and shows that the weights and accuracy of the network are different after training from before training. However, when I toggle so that network_cp=deepcopy(network) and optimizer_cp=deepcopy(optimizer), the accuracy and weights before and after training are exactly the same.
# torch settings
torch.backends.cudnn.enabled = True
device = torch.device("cpu")
# training settings
learning_rate = 0.01
momentum = 0.5
batch_size_train = 64
batch_size_test = 1000
# get MNIST data set
train_loader, test_loader = load_mnist(batch_size_train=batch_size_train,
                                       batch_size_test=batch_size_test)
# make a network
network = Net()
optimizer = optim.SGD(network.parameters(), lr=learning_rate,
                      momentum=momentum)
network.to(device)
# train network
train(network, optimizer, train_loader, device)
# copy network
network_cp = network
#network_cp = deepcopy(network)
optimizer_cp = optimizer
#optimizer_cp = deepcopy(optimizer)
# get edge weights and accuracy of the copied network
acc1 = float(test(network_cp, optimizer_cp, test_loader, device))
weights1 = np.array(get_edge_weights(network_cp))
# train copied network
train(network_cp, optimizer_cp, train_loader, device)
# get edge weights and accuracy of the copied network after training
acc2 = float(test(network_cp, optimizer_cp, test_loader, device))
weights2 = np.array(get_edge_weights(network_cp))
# compare edge weights and accuracy of copied network before and after training
print('accuracy', acc1, acc2)
print('abs diff of weights for net1 and net2', np.sum(np.abs(weights1-weights2)))
To run the code above, include these imports and function definitions:
import torch
import torchvision
import torchvision.transforms as transforms
import torch.optim as optim
import torch.nn as tnn
import torch.nn.functional as tnf
from copy import deepcopy
import numpy as np
def load_mnist(batch_size_train=64, batch_size_test=1000):
    train_loader = torch.utils.data.DataLoader(
        torchvision.datasets.MNIST('temp/',  # '/data/users/alice/pytorch_training_files/',
                                   train=True, download=True,
                                   transform=torchvision.transforms.Compose([
                                       torchvision.transforms.ToTensor(),
                                       torchvision.transforms.Normalize(
                                           (0.1307,), (0.3081,))
                                   ])),
        batch_size=batch_size_train, shuffle=True)
    test_loader = torch.utils.data.DataLoader(
        torchvision.datasets.MNIST('temp/',  # '/data/users/alice/pytorch_training_files/',
                                   train=False, download=True,
                                   transform=torchvision.transforms.Compose([
                                       torchvision.transforms.ToTensor(),
                                       torchvision.transforms.Normalize(
                                           (0.1307,), (0.3081,))
                                   ])),
        batch_size=batch_size_test, shuffle=True)
    return (train_loader, test_loader)
def train(network, optimizer, train_loader, device, n_epochs=5):
    network.train()
    for epoch in range(1, n_epochs + 1):
        for batch_idx, (data, target) in enumerate(train_loader):
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = network(data)
            loss = tnf.nll_loss(output, target)
            loss.backward()
            optimizer.step()
def test(network, optimizer, test_loader, device):
    network.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = network(data)
            test_loss += tnf.nll_loss(output, target, size_average=False).item()
            pred = output.data.max(1, keepdim=True)[1]
            correct += pred.eq(target.data.view_as(pred)).sum()
    test_loss /= len(test_loader.dataset)
    print('\nTest set: Avg. loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))
    return float(correct) / float(len(test_loader.dataset))
def get_edge_weights(network):
    layers = [module for module in network.modules()][1:]
    output = np.zeros(1)
    for j, layer in enumerate(layers):
        weights = list(layer.parameters())[0]
        weights_arr = weights.detach().numpy()
        weights_arr = weights_arr.flatten()
        output = np.concatenate((output, weights_arr))
    return output[1:]

class Net(tnn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = tnn.Linear(784, 264)
        self.fc2 = tnn.Linear(264, 10)

    def forward(self, x):
        x = tnf.relu(self.fc1(x.view(-1, 784)))
        x = tnf.relu(self.fc2(x))
        return tnf.log_softmax(x)
After optimizer_cp = deepcopy(optimizer), optimizer_cp still wants to optimize the old model's parameters (as defined by optimizer = optim.SGD(network.parameters(), lr=learning_rate, momentum=momentum)).
After deep copying the model, the optimizer needs to be told to optimize the new, copied model's parameters:
optimizer_cp = optim.SGD(network_cp.parameters(), lr=learning_rate, momentum=momentum)
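Putting it together, a hedged sketch of how the copy section of the script above could look (same variable names as before; the deepcopy of the network is kept, and only the optimizer is recreated):

# copy network
network_cp = deepcopy(network)   # the copied weights are now independent of `network`

# build a fresh optimizer that tracks the *copied* model's parameters
optimizer_cp = optim.SGD(network_cp.parameters(), lr=learning_rate, momentum=momentum)

# training the copy now updates network_cp's weights as expected
train(network_cp, optimizer_cp, train_loader, device)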
Chainer batch normalization does not work well with my code, although batch normalization in TensorFlow works. I use the MNIST dataset, as the code below shows.
Using Chainer (version 6.1.0), the validation accuracy without batch normalization is between 0.97 and 0.98 after 100 epochs, whereas with batch normalization it is less than 0.80 after 100 epochs.
When I use the same approach with TensorFlow (version 1.14.0), the validation accuracy is around 0.98 either way, with or without batch normalization.
This is the relevant part of my code.
The number of epochs is 100, and the batch size is 1000. I use Adam as the optimizer with a learning rate of 0.01.
dataset, train data, validation data
(train_images, train_labels), (test_images, test_labels) = mnist.load_data()
train_images=train_images.reshape(60000, 28*28)
test_images = test_images.reshape(10000, 28*28)
x_train = train_images.astype('float32')/255
y_train = train_labels.astype('int32')
x_val = test_images.astype('float32')/255
y_val = test_labels.astype('int32')
model and condition (chainer)
# Define model
class MyModel(Chain):
    def __init__(self, n_in=784, n_hidden=100, n_out=10):
        initializer = chainer.initializers.HeNormal()
        super().__init__()
        with self.init_scope():
            self.l1 = L.Linear(n_in, n_hidden, initialW=initializer)
            self.l2 = L.Linear(n_hidden, n_hidden, initialW=initializer)
            self.l3 = L.Linear(n_hidden, n_out, initialW=initializer)
            self.bn = L.BatchNormalization(n_hidden, decay=0.99, eps=0.001)

    def forward(self, x):
        h = F.relu(self.bn(self.l1(x)))
        h = F.relu(self.bn(self.l2(h)))
        return self.l3(h)

model = MyModel()
optimizer = optimizers.Adam()
optimizer.setup(model)

n_epoch = 100
n_batchsize = 1000
model and condition (tensorflow)
n_inputs = 28 * 28
n_hidden1 = 100
n_hidden2 = 100
n_outputs = 10
batch_norm_momentum = 0.9
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(tf.int32, shape=(None), name="y")
training = tf.placeholder_with_default(False, shape=(), name='training')
with tf.name_scope("dnn"):
    he_init = tf.variance_scaling_initializer()
    my_batch_norm_layer = partial(tf.layers.batch_normalization,
                                  training=training,
                                  momentum=batch_norm_momentum)
    my_dense_layer = partial(tf.layers.dense,
                             kernel_initializer=he_init)
    hidden1 = my_dense_layer(X, n_hidden1, name="hidden1")
    bn1 = tf.nn.relu(my_batch_norm_layer(hidden1))
    hidden2 = my_dense_layer(bn1, n_hidden2, name="hidden2")
    bn2 = tf.nn.relu(my_batch_norm_layer(hidden2))
    logits_before_bn = my_dense_layer(bn2, n_outputs, name="outputs")
    logits = my_batch_norm_layer(logits_before_bn)

with tf.name_scope("loss"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
    loss = tf.reduce_mean(xentropy, name="loss")

with tf.name_scope("train"):
    optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
    training_op = optimizer.minimize(loss)

with tf.name_scope("eval"):
    correct = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
All code I use with chainer
import numpy as np
import chainer
from chainer import cuda, Function, gradient_check, report, training, utils, Variable
from chainer import datasets, iterators, optimizers, serializers
from chainer import Link, Chain, ChainList
import chainer.functions as F
import chainer.links as L
from chainer.training import extensions
from keras.datasets import mnist
import cupy as cp
(train_images, train_labels), (test_images, test_labels) = mnist.load_data()
train_images=train_images.reshape(60000, 28*28)
test_images = test_images.reshape(10000, 28*28)
x_train = train_images.astype('float32')/255
y_train = train_labels.astype('int32')
x_val = test_images.astype('float32')/255
y_val = test_labels.astype('int32')
# Define model
class MyModel(Chain):
    def __init__(self, n_in=784, n_hidden=100, n_out=10):
        initializer = chainer.initializers.HeNormal()
        super().__init__()
        with self.init_scope():
            self.l1 = L.Linear(n_in, n_hidden, initialW=initializer)
            self.l2 = L.Linear(n_hidden, n_hidden, initialW=initializer)
            self.l3 = L.Linear(n_hidden, n_out, initialW=initializer)
            self.bn = L.BatchNormalization(n_hidden, decay=0.9, eps=0.001)

    def forward(self, x):
        h = F.relu(self.bn(self.l1(x)))
        h = F.relu(self.bn(self.l2(h)))
        return self.l3(h)

# define optimizer
model = MyModel()
optimizer = optimizers.Adam(alpha=0.01)
optimizer.setup(model)

## learn network
n_epoch = 100
n_batchsize = 1000
iteration = 0
gpu_id = 0
cuda.get_device(gpu_id).use()
# send the network to gpu memory
model.to_gpu(gpu_id)
print("epoch train/loss val/loss train/acc val/acc")
for epoch in range(n_epoch):
    # order dataset randomly
    order = np.random.permutation(range(len(x_train)))
    loss_list = []
    accuracy_list = []
    for i in range(0, len(order), n_batchsize):
        index = order[i:i+n_batchsize]
        x_train_batch = x_train[index, :]
        y_train_batch = y_train[index]

        x_train_batch = cp.asarray(x_train_batch)
        y_train_batch = cp.asarray(y_train_batch)

        output_train = model(x_train_batch)
        loss_train_batch = F.softmax_cross_entropy(output_train, y_train_batch)
        accuracy_train_batch = F.accuracy(output_train, y_train_batch)

        loss_list.append(cuda.to_cpu(loss_train_batch.array))
        accuracy_list.append(cuda.to_cpu(accuracy_train_batch.array))

        model.cleargrads()
        loss_train_batch.backward()
        optimizer.update()

        iteration += 1

    loss_train = np.mean(loss_list)
    accuracy_train = np.mean(accuracy_list)

    # after one epoch, evaluate with validation data
    x_val = cp.asarray(x_val)
    y_val = cp.asarray(y_val)
    with chainer.using_config('train', False), chainer.using_config('enable_backprop', False):
        output_val = model(x_val)
        loss_val = F.softmax_cross_entropy(output_val, y_val)
        loss_val = cuda.to_cpu(loss_val.array)
        accuracy_val = F.accuracy(output_val, y_val)
        accuracy_val = cuda.to_cpu(accuracy_val.array)

    print('{0:>4d} {1:>10.4f} {2:>10.4f} {3:>10.4f} {4:>10.4f}'.format(
        epoch, loss_train, loss_val, accuracy_train, accuracy_val))
All code I use with tensorflow
import tensorflow as tf
from keras.datasets import mnist
from functools import partial
import numpy as np
def shuffle_batch(X, y, batch_size):
    rnd_idx = np.random.permutation(len(X))
    n_batches = len(X) // batch_size
    for batch_idx in np.array_split(rnd_idx, n_batches):
        X_batch, y_batch = X[batch_idx], y[batch_idx]
        yield X_batch, y_batch
(train_images, train_labels), (test_images, test_labels) = mnist.load_data()
train_images=train_images.reshape(60000, 28*28)
test_images = test_images.reshape(10000, 28*28)
X_train = train_images.astype('float32')/255
y_train = train_labels.astype('int32')
X_valid = test_images.astype('float32')/255
y_valid = test_labels.astype('int32')
n_inputs = 28 * 28
n_hidden1 = 100
n_hidden2 = 100
n_outputs = 10
batch_norm_momentum = 0.9
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(tf.int32, shape=(None), name="y")
training = tf.placeholder_with_default(False, shape=(), name='training')
with tf.name_scope("dnn"):
    he_init = tf.variance_scaling_initializer()
    my_batch_norm_layer = partial(tf.layers.batch_normalization,
                                  training=training,
                                  momentum=batch_norm_momentum)
    my_dense_layer = partial(tf.layers.dense)
    hidden1 = my_dense_layer(X, n_hidden1, name="hidden1")
    bn1 = tf.nn.relu(my_batch_norm_layer(hidden1))
    hidden2 = my_dense_layer(bn1, n_hidden2, name="hidden2")
    bn2 = tf.nn.relu(my_batch_norm_layer(hidden2))
    logits_before_bn = my_dense_layer(bn2, n_outputs, name="outputs")
    logits = my_batch_norm_layer(logits_before_bn)

with tf.name_scope("loss"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
    loss = tf.reduce_mean(xentropy, name="loss")

learning_rate = 0.01

with tf.name_scope("train"):
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
    training_op = optimizer.minimize(loss)

with tf.name_scope("eval"):
    correct = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
init = tf.global_variables_initializer()
saver = tf.train.Saver()
n_epochs = 100
batch_size = 1000
extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
print("epoch train/loss val/loss train/acc val/acc")
with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        loss_list = []
        accuracy_list = []
        for X_batch, y_batch in shuffle_batch(X_train, y_train, batch_size):
            sess.run([training_op, extra_update_ops],
                     feed_dict={training: True, X: X_batch, y: y_batch})
            loss_batch = loss.eval(feed_dict={X: X_batch, y: y_batch})
            accuracy_batch = accuracy.eval(feed_dict={X: X_batch, y: y_batch})
            loss_list.append(loss_batch)
            accuracy_list.append(accuracy_batch)
        loss_val = loss.eval(feed_dict={X: X_valid, y: y_valid})
        accuracy_val = accuracy.eval(feed_dict={X: X_valid, y: y_valid})
        print('{0:>4d} {1:>10.4f} {2:>10.4f} {3:>10.4f} {4:>10.4f}'
              .format(epoch, np.mean(loss_list), loss_val, np.mean(accuracy_list), accuracy_val))
I expected batch normalization with Chainer to also reach around 98%, but it got less than 80%.
Am I using batch normalization with Chainer in the wrong way, or does the structure of batch normalization differ a lot between Chainer and TensorFlow?
In your model a single BatchNormalization link is shared by both hidden layers, so it mixes the batch statistics of two different layer outputs. In order to use separate batch statistics for each layer, the model definition has to be like the following code, which achieves 98% validation accuracy after 100 epochs in my environment.
class MyModel(Chain):
    def __init__(self, n_in=784, n_hidden=100, n_out=10):
        initializer = chainer.initializers.HeNormal()
        super().__init__()
        with self.init_scope():
            self.l1 = L.Linear(n_in, n_hidden, initialW=initializer)
            self.l2 = L.Linear(n_hidden, n_hidden, initialW=initializer)
            self.l3 = L.Linear(n_hidden, n_out, initialW=initializer)
            self.bn1 = L.BatchNormalization(n_hidden, decay=0.9, eps=0.001)
            self.bn2 = L.BatchNormalization(n_hidden, decay=0.9, eps=0.001)

    def forward(self, x):
        h = F.relu(self.bn1(self.l1(x)))
        h = F.relu(self.bn2(self.l2(h)))
        return self.l3(h)