I've been trying to replicate the Eager Execution tutorial using the MNIST dataset, but it doesn't work... I get no error, but it looks like as the code just stops in the first round, the output is
Epoch 000: Loss: nan, Accuracy: 9.898%
I've tested other MNIST codes and they progressed faster... (I waited for about 30 min)
import numpy as np
import tensorflow as tf
tf.enable_eager_execution()
#Load dataset
mnist = tf.contrib.learn.datasets.load_dataset('mnist')
train_data = mnist.train.images
train_labels = np.asarray(mnist.train.labels, dtype = np.int32)
test_data = mnist.test.images
test_labels = np.asarray(mnist.test.labels, dtype = np.int32)
train_dataset = tf.data.Dataset.from_tensor_slices((train_data, train_labels))
train_dataset = train_dataset.shuffle(buffer_size=100000)
train_dataset = train_dataset.batch(10)
features, label = iter(train_dataset).next()
print("example features:", features[0])
print("example label:", label[0])
model = tf.keras.Sequential([
tf.keras.layers.Dense(10, activation="relu", input_shape=(784,)), # input shape required
tf.keras.layers.Dense(10, activation="relu"),
tf.keras.layers.Dense(3)
])
def loss(model, x, y):
y_ = model(x)
return tf.losses.sparse_softmax_cross_entropy(labels=y, logits=y_)
def grad(model, inputs, targets):
with tf.GradientTape() as tape:
loss_value = loss(model, inputs, targets)
return tape.gradient(loss_value, model.variables)
optimizer = tf.train.GradientDescentOptimizer(learning_rate = 0.01)
train_loss_results = []
train_accuracy_results = []
num_epochs = 200
for epoch in range(num_epochs):
epoch_loss_avg = tf.contrib.eager.metrics.Mean()
epoch_accuracy = tf.contrib.eager.metrics.Accuracy()
for x,y in train_dataset:
grads = grad(model, x, y)
optimizer.apply_gradients(zip(grads, model.variables),
global_step=tf.train.get_or_create_global_step())
epoch_loss_avg(loss(model, x, y))
epoch_accuracy(tf.argmax(model(x), axis=1, output_type=tf.int32), y)
train_loss_results.append(epoch_loss_avg.result())
train_accuracy_results.append(epoch_accuracy.result())
if epoch % 50 == 0:
print("Epoch {:03d}: Loss: {:.3f}, Accuracy: {:.3%}".format(epoch,
epoch_loss_avg.result(),
epoch_accuracy.result()))
PS. I know that the model probably won't be efficient, is just a test run
Related
I'm having trouble understanding why this is throwing an error. This code is pulled directly from the PyTorch documentation for a NN classifier for the fashion MNIST dataset. However when I try to flip this to the MNIST handwritten digits data set it comes up with the following error:
RuntimeError: The size of tensor a (10) must match the size of tensor b (64) at non-singleton dimension 1
This occurs when using the loss function during the training loop function. Can anyone help me understand why this is happening. Thanks!
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor, Lambda
import torchvision.models as models
import matplotlib.pyplot as plt
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#device = "cpu"
print(f"Using {device} device")
training_data = datasets.MNIST(
root="data",
train=True,
download=True,
transform=ToTensor()
)
test_data = datasets.MNIST(
root="data",
train=False,
download=True,
transform=ToTensor()
)
train_dataloader = DataLoader(training_data, batch_size=64)
test_dataloader = DataLoader(test_data, batch_size=64)
class NeuralNetwork(nn.Module):
def __init__(self):
super(NeuralNetwork, self).__init__()
self.flatten = nn.Flatten()
self.linear_relu_stack = nn.Sequential(
nn.Linear(28*28, 512),
nn.ReLU(),
nn.Linear(512, 512),
nn.ReLU(),
nn.Linear(512, 10),
)
def forward(self, x):
x = self.flatten(x)
logits = self.linear_relu_stack(x)
return logits
def train_loop(dataloader, model, loss_fn, optimizer):
size = len(dataloader.dataset)
for batch, (X, y) in enumerate(dataloader):
# Compute prediction and loss
X, y = X.to(device), y.to(device)
pred = model(X)
loss = loss_fn(pred, y)
# Backpropagation
optimizer.zero_grad()
loss.backward()
optimizer.step()
if batch % 100 == 0:
loss, current = loss.item(), batch * len(X)
print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]")
def test_loop(dataloader, model, loss_fn):
size = len(dataloader.dataset)
num_batches = len(dataloader)
test_loss, correct = 0, 0
with torch.no_grad():
for X, y in dataloader:
X, y = X.to(device), y.to(device)
pred = model(X)
test_loss += loss_fn(pred, y).item()
correct += (pred.argmax(1) == y).type(torch.float).sum().item()
test_loss /= num_batches
correct /= size
print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
def save_checkpoint(state, filename = "checkpoint.pth.tar"):
print("=> Saving checkpoint")
torch.save(state, filename)
model = NeuralNetwork().to(device)
learning_rate = 1e-3
batch_size = 64
epochs = 10
# Initialize the loss function
loss_fn = nn.MSELoss()
optimiser = torch.optim.Adam(model.parameters(), lr=learning_rate)
for t in range(epochs):
print(f"Epoch {t+1}\n-------------------------------")
train_loop(train_dataloader, model, loss_fn, optimiser)
test_loop(test_dataloader, model, loss_fn)
print("Done!")
torch.nn.MSELoss is an implemention of mean squared error. You can't measure the difference between two tensors if they're different sizes (MSELoss does not allow for broadcasting). So if you're using MSELoss, then the predictions and the targets must be the same shape. In your case, preds is a tensor of shape [64, 10], and y is a tensor of shape [64].
The reason y is of shape [64] rather than [64, 10] is that most classification dataset implementations represent targets as integer labels rather than one-hot encoded vectors. In theory, you could convert these integer label targets to one-hot encoded targets.
But in reality, since this is a classification problem, you should probably be using something like nn.CrossEntropyLoss rather than nn.MSELoss. The former is a conventional classification loss function, and it allows the targets to be integer labels rather than one-hot labels (so just swapping out MSELoss for CrossEntropyLoss should solve your problem). MSELoss is better suited for regression tasks and such.
I have experimented with training a simple tensorflow model using two scenarios: passing my model to my training loop (and to the subfunctions which are called from training loop), versus not passing my model to the training loop. The two cases result in different results. When passing my model to the training functions, the model is trained properly. But in the second scenario, something is wrong because the model is apparently not trained. I am baffled, and I wonder if it's a scope thing.
To be more specific, my setup involves dynamically creating a new model of larger size (adding some layers at each iteration of a for loop), and then training the resulting model. As stated before, I train the model in two scenarios: passing the model to training subfunctions and not doing so, and I obtain different results depending on which one I do. I verify this by giving the model a test sample (class 0 MNIST images) and checking if the correct classification is output. The models trained by passing the model as an argument is trained correctly, but, if I do not do this, then only the first model created by the for loop is correctly trained, as verified by incorrect class predictions. Can this be explained?
The code below is for training by not passing model as an argument.
import tensorflow as tf
physical_devices = tf.config.experimental.list_physical_devices('GPU')
tf.config.experimental.set_memory_growth(physical_devices[0], True)
from tensorflow.keras.optimizers import Adam
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
import time
epochs = 200
input_shape = (28,28,1)
num_classes=10
batch_size = 64
(x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()
x_val = x_train[-10000:]
y_val = y_train[-10000:]
x_train = x_train[:-10000]
y_train = y_train[:-10000]
x_train = np.expand_dims(x_train, -1)
x_val = np.expand_dims(x_val, -1)
x_test = np.expand_dims(x_test, -1)
x_train = x_train.astype("float32")
x_test = x_test.astype("float32")
x_val = x_val.astype("float32")
y_test_sorted_args_0=np.where(y_test == 0)
x_test_0=x_test[y_test_sorted_args_0]
y_test_0=np.full( (x_test_0.shape)[0],0)
train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
train_dataset = train_dataset.shuffle(buffer_size=1024).batch(batch_size)
val_dataset = tf.data.Dataset.from_tensor_slices((x_val, y_val))
val_dataset = val_dataset.batch(batch_size)
acc_metric = keras.metrics.SparseCategoricalAccuracy()
val_acc_metric = keras.metrics.SparseCategoricalAccuracy()
optimizer = keras.optimizers.SGD(learning_rate=1e-3)
loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=False)
#tf.function
def train_step(x, y):
with tf.GradientTape() as tape:
mod_output = model(x, training=True)
loss_value = loss_fn(y, mod_output)
grads = tape.gradient(loss_value, model.trainable_weights)
optimizer.apply_gradients(zip(grads, model.trainable_weights))
acc_metric.update_state(y, mod_output)
return loss_value
#tf.function
def test_step(x, y):
val = model(x, training=False)
acc_metric.update_state(y, val)
def train( epochs):
for epoch in range(epochs):
print("\nStart of epoch %d" % (epoch,))
start_time = time.time()
# Iterate over the batches of the dataset.
for step, (x_batch_train, y_batch_train) in enumerate(train_dataset):
loss_value = train_step( x_batch_train, y_batch_train)
# Log every 200 batches.
if step % 200 == 0:
print(
"Training loss (for one batch) at step %d: %.4f"
% (step, float(loss_value))
)
print("Seen so far: %d samples" % ((step + 1) * batch_size))
# Display metrics at the end of each epoch.
train_acc = acc_metric.result()
print("Training acc over epoch: %.4f" % (float(train_acc),))
# Reset training metrics at the end of each epoch
acc_metric.reset_states()
# Run a validation loop at the end of each epoch.
for x_batch_val, y_batch_val in val_dataset:
test_step(x_batch_val, y_batch_val)
val_acc = acc_metric.result()
val_acc_metric.reset_states()
print("Validation acc: %.4f" % (float(val_acc),))
print("Time taken: %.2fs" % (time.time() - start_time))
max_hidden=7
for num_hidden_layers in range(1,max_hidden,3):
model1 = keras.Sequential(
[
keras.Input(shape=input_shape),
layers.Flatten(),
]
)
for i in range(1, num_hidden_layers+1):
model1.add(layers.Dense(150, activation="relu"))
model1.add(layers.Dense(num_classes, activation="softmax"))
model=model1
train(epochs)
#verify that the model is properly trained by checking that the model correclty predicts images from class 0.
#The more class 0 predictions we have, the better.
for sample_index in range(0,10):
x_sample=x_test_0[sample_index,:,:,:]
x_sample=np.expand_dims(x_sample, 0)
print(tf.math.argmax(model(x_sample),axis=1))
time.sleep(1)
I am trying to make a deep neural network with the low level API of tensorflow although when I train the model and test it, the loss and mae of the testing set and the training set it very similar and very high compared to other models I have tried(e.g. Random Forest, AdaBoost Decision Tree). I even made the same dnn using keras and it gave me much better results I don't understand the problem, I do not have to much experience in machine learning
Tensorflow code
# load libraries
import tensorflow as tf
# reset graph
tf.reset_default_graph()
# define variables
n_hidden1 = 200
n_outputs = 1
with tf.device("/gpu:0"):
X = tf.placeholder(tf.float32, shape=(None, n_features), name="X")
y = tf.placeholder(tf.float32, shape=(None), name="y")
with tf.name_scope("dnn"):
hidden1 = tf.layers.dense(X, n_hidden1, name="hidden1",
activation=tf.nn.leaky_relu)
logits = tf.layers.dense(hidden1, n_outputs, name="logits")
with tf.device("/cpu:0"):
with tf.name_scope("loss"):
loss = tf.reduce_mean(tf.abs(logits - y), name="loss")
with tf.device("cpu:0"):
with tf.name_scope("learning_rate"):
learning_rate = 0.001
with tf.device("/gpu:0"):
with tf.name_scope("train"):
optimizer = tf.train.AdamOptimizer(learning_rate)
training_op = optimizer.minimize(loss)
with tf.device("/gpu:0"):
with tf.name_scope("eval"):
mae = tf.reduce_mean(tf.abs(logits - y), name="mae")
def shuffle_batch(X, y, batch_size):
rnd_idx = np.random.permutation(len(X))
n_batches = len(X) // batch_size
for batch_idx in np.array_split(rnd_idx, n_batches):
X_batch, y_batch = X[batch_idx], y[batch_idx]
yield X_batch, y_batch
n_epochs = 200
batch_size = 1000
n_batches = int(np.ceil(X_train.shape[0] / batch_size))
# create graph variables initializer
init = tf.global_variables_initializer()
# create model saver
saver = tf.train.Saver()
# set device to gpu
with tf.device("/gpu:0"):
with tf.Session() as sess:
sess.run(init)
for epoch in range(n_epochs):
print("Epoch:", str(epoch) + "/" + str(n_epochs))
batch_index = 0
for X_batch, y_batch in shuffle_batch(X_train, np.array(y_train).reshape(-1), batch_size):
sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
acc_batch = mae.eval(feed_dict={X: X_batch, y: y_batch})
acc_val = mae.eval(feed_dict={X: X_test, y: np.array(y_test).reshape(-1)})
loss_batch = loss.eval(feed_dict={X: X_batch, y: y_batch})
loss_val = loss.eval(feed_dict={X: X_test, y: np.array(y_test).reshape(-1)})
print("Batch mae:", acc_batch, "Val mae:", acc_val)
print("Batch loss:", loss_batch, "Val loss:", loss_val)
Keras Code
import tensorflow as tf
from keras import layers
from keras import models
from keras import optimizers
from keras import initializers
from keras import regularizers
network = models.Sequential()
network.add(layers.Dense(units=200,
activation=tf.nn.leaky_relu,
input_shape=(X_train.shape[1], )))
# output layer
network.add(layers.Dense(units=1))
network.compile(loss="mae",
optimizer=optimizers.Adam(lr=0.001),
metrics=["mae"])
history = network.fit(X_train,
np.array(y_train).reshape(-1),
epochs=200,
verbose=1,
batch_size=1000,
validation_data=(X_test, np.array(y_test).reshape(-1)),
shuffle=True)
I am novice in Machine Learning (ML) and I'm trying to implement algorithm to understand basic syntax of ML frameworks etc. Now I am working on MNIST database of handwritten digits dataset.
I implemented just one layer (I mean: Input layer has 784 inputs, Hidden layer has 512 nodes, Output layer has 10 outputs) Neural Network using TensorFlow framework, no data preprocessing, 128 batch size, 10 epochs, ADAM optimizer. And the algorithm achieved about 0.95 accuracy on train set.
After that I tried to implement exactly the same architecture in Keras. However, the accuracy (train set) is about 0.3. I tried to find many different implementations founded on the internet but I still cannot find where is the issue. I believe that it is something stupid (as always is) :-/
I presume that the same architecture in Keras should give the same results as the implementation in TensorFlow, am I correct?
My Keras implementation is:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from keras.layers import Input, Dense
from keras.models import Model
from keras.utils.np_utils import to_categorical
df_train = pd.read_csv('datasets/MNIST_train.csv', delimiter=',', header=0)
Y_train, X_train = np.split(df_train.values, [1], axis=1)
m, n_x = X_train.shape
n_y = len(np.unique(Y_train))
n_layer1 = 512
batch_size = 128
num_epochs = 10
Y_train = to_categorical(Y_train)
X_input = Input(shape=(n_x,), name='input')
X = Dense(n_layer1, activation='relu', name='hidden')(X_input)
X = Dense(n_y, activation='softmax', name='output')(X)
model = Model(inputs=X_input, outputs=X, name='Neural Network')
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(X_train, Y_train, epochs=num_epochs, batch_size=batch_size)
My TensorFlow implementation is:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
def one_hot(a, num_classes):
return np.eye(num_classes)[a.reshape(-1)]
def get_minibatches(batch_size, m, X, Y):
output_batches = []
for index in range(0, m, batch_size):
index_end = index + batch_size
batch = [X[index:index_end], Y[index:index_end]]
output_batches.append(batch)
return output_batches
def dense_layer(input, channels_in, channels_out, activation=None):
initializer = tf.contrib.layers.xavier_initializer()
w = tf.Variable(initializer([channels_in, channels_out]), name="w")
b = tf.Variable(tf.zeros([1, channels_out]), name="b")
if (activation == 'relu'):
a = tf.nn.relu(tf.matmul(input, w) + b)
return a
else:
z = tf.matmul(input, w) + b
return z
df_train = pd.read_csv('datasets/MNIST_train.csv', delimiter=',', header=0)
Y_train, X_train = np.split(df_train.values, [1], axis=1)
m, n_x = X_train.shape
n_y = len(np.unique(Y_train))
n_layer1 = 512
batch_size = 128
num_epochs = 10
Y_train = one_hot(Y_train, n_y)
X = tf.placeholder(tf.float32, [None, n_x], name="X")
Y = tf.placeholder(tf.float32, [None, n_y], name="Y")
hidden = dense_layer(X, n_x, n_layer1, 'relu')
output = dense_layer(hidden, n_layer1, n_y)
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=output, labels=Y))
optimizer = tf.train.AdamOptimizer().minimize(loss)
predict = tf.argmax(output, 1)
correct_prediction = tf.equal(predict, tf.argmax(Y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
minibatches = get_minibatches(batch_size, m, X_train, Y_train)
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
current_cost = sess.run(loss, feed_dict={X: X_train, Y: Y_train})
train_accuracy = sess.run(accuracy, feed_dict={X: X_train, Y: Y_train})
print('Epoch: {:<4} - Loss: {:<8.3} Train Accuracy: {:<5.3} '.format(0, current_cost, train_accuracy))
for epoch in range(num_epochs):
for minibatch in minibatches:
minibatch_X, minibatch_Y = minibatch
sess.run(optimizer, feed_dict={ X: minibatch_X, Y: minibatch_Y })
current_cost = sess.run(loss, feed_dict={X: X_train, Y: Y_train})
train_accuracy = sess.run(accuracy, feed_dict={X: X_train, Y: Y_train})
print('Epoch: {:<4} - Loss: {:<8.3} Train Accuracy: {:<5.3} '.format(epoch + 1, current_cost, train_accuracy))
Could you help me and advice what I am doing wrong?
Thank you
Petr
I figured it out. At least partially. I standardized the input ((x - xmean) / xstd) and the Keras implementation has been started to return similar results as TensorFlow implementation…
I can't figure out what I'm doing wrong with this XOR neural network. Maybe I'm not computing the loss correctly? The loss improves slightly at the beginning, and then the accuracy converges to 50% very quickly. Could someone please point out what I'm doing wrong?
Here's a minimal self contained example:
import numpy as np
import tensorflow as tf
n_inputs = 2
n_hidden = 3
n_outputs = 1
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name='X')
y = tf.placeholder(tf.float32, shape=(None), name='y')
def neuron_layer(X, n_neurons, name, activation=None):
with tf.name_scope(name):
n_inputs = int(X.get_shape()[1])
stddev = 2 / np.sqrt(n_inputs)
init = tf.truncated_normal((n_inputs, n_neurons), stddev=stddev)
W = tf.Variable(init, name="weights")
b = tf.Variable(tf.zeros([n_neurons]), name="bias")
Z = tf.matmul(X, W) + b
if activation is not None:
return activation(Z)
else: return Z
with tf.name_scope('nn'):
hidden = neuron_layer(X, n_hidden, name='hidden', activation=tf.nn.sigmoid)
prediction_probabilities = neuron_layer(hidden, n_outputs, name='outputs', activation=tf.nn.sigmoid)
with tf.name_scope('loss'):
mse_loss = tf.reduce_mean(tf.squared_difference(y, prediction_probabilities), name='loss')
learning_rate = 0.1
with tf.name_scope('train'):
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
training_op = optimizer.minimize(mse_loss)
with tf.name_scope('eval'):
correct = tf.equal(tf.greater_equal(prediction_probabilities,0.5), tf.cast(y,tf.bool))
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
init = tf.global_variables_initializer()
X_train = [
(0, 0),
(0, 1),
(1, 0),
(1, 1)
]
y_train = [0,1,1,0]
with tf.Session() as sess:
init.run()
for epoch in range(500):
_, mse, acc = sess.run([training_op, mse_loss, accuracy],
feed_dict={X: np.array(X_train), y: np.array(y_train)})
print("mse: %.4f, accuracy: %.2f" % (mse, acc))
Your code is perfectly fine. The problem is with your input. You need to pass a 2D list, not 1D. Each inner list is a single dimension input, that's how tensorflow will parse them.
y_train = [[0],[1],[1],[0]]
Your code works nicely after.
...
mse: 0.0002, accuracy: 1.00