Problem in basic parameter initialization - python

Thanks for your time! After I run the code, the gradients are always zero and the loss does not update. (I guess it's because the weights are initialized to all zeros, but I don't know how to fix it.) The code is a basic neural network:
class Model(torch.nn.Module):  # class
    def __init__(self):
        super(Model, self).__init__()
        self.linear1 = torch.nn.Linear(8, 6)
        self.linear2 = torch.nn.Linear(6, 4)
        self.linear3 = torch.nn.Linear(4, 1)
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, x):
        x = self.sigmoid(self.linear1(x))
        x = self.sigmoid(self.linear2(x))
        x = self.sigmoid(self.linear3(x))
        x = F.softmax(x, dim=1)
        return x

model = Model()  # model

criterion = torch.nn.BCELoss(size_average=False)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

for epoch in range(1000):  # training
    y_pred = model(X_train.float())
    loss = criterion(y_pred, y_train.float())
    print(epoch, loss.item())
    print([x.grad for x in optimizer.param_groups[0]['params']])
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
And I get all-zero gradients:

I think you forgot to apply backpropagation. Adding loss.backward() just before your print statements will do the trick (it computes the accumulated gradients and stores them in x.grad). Note that by default your weights are not initialized to 0 here; the default initialization for linear layers is documented here.
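For reference, here is a minimal sketch of the reordered loop the answer describes (assuming X_train and y_train are defined as in the question); the only change is that loss.backward() runs before the gradients are printed:

for epoch in range(1000):
    optimizer.zero_grad()                       # clear gradients from the previous step
    y_pred = model(X_train.float())             # forward pass
    loss = criterion(y_pred, y_train.float())
    loss.backward()                             # populates .grad for every parameter
    print(epoch, loss.item())
    print([p.grad for p in optimizer.param_groups[0]['params']])  # gradients are now filled in
    optimizer.step()                            # update the weights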

Related

Why doesn't loss function decrease under 1.5?

I'm trying to do machine learning with the MNIST dataset in PyTorch, but the loss doesn't decrease below 1.5.
I want the loss to decrease below 1.
What should I do, and which part of the code should I fix?
This is my code:
BATCH_SIZE = 8

transform = torchvision.transforms.Compose([torchvision.transforms.ToTensor(),
                                             torchvision.transforms.Normalize((0.5,), (0.5,))])

trainset = torchvision.datasets.MNIST(root="./data/train", train=True, download=True, transform=transform)
trainset, valset = torch.utils.data.random_split(trainset, [50000, 10000])
trainloader = torch.utils.data.DataLoader(trainset, batch_size=BATCH_SIZE, shuffle=True)
valloader = torch.utils.data.DataLoader(valset, batch_size=BATCH_SIZE, shuffle=True)

testset = torchvision.datasets.MNIST(root="./data/test", train=False, download=True, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=BATCH_SIZE, shuffle=False)
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.relu = nn.ReLU()
        self.pool = nn.MaxPool2d(2, stride=2)
        self.conv1 = nn.Conv2d(1, 20, 5)
        self.conv2 = nn.Conv2d(20, 50, 5)
        self.conv3 = nn.Conv2d(50, 500, 4)
        self.conv4 = nn.Conv2d(500, 10, 1)
        self.soft = nn.Softmax(dim=1)

    def forward(self, x):
        x = self.conv1(x)
        x = self.pool(x)
        x = self.conv2(x)
        x = self.pool(x)
        x = self.conv3(x)
        x = self.relu(x)
        x = self.conv4(x)
        x = self.soft(x)
        x = x.view(x.size()[0], -1)
        return x
class EarlyStopping:
    """earlystopping class"""
    def __init__(self, patience=5, verbose=False, path='checkpoint_model.pth'):
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.val_loss_min = np.Inf
        self.path = path

    def __call__(self, valid_loss):
        score = -valid_loss
        if self.best_score is None:
            self.best_score = score
            self.checkpoint(valid_loss)
        elif score - self.best_score < 0.0001:
            self.counter += 1
            if self.verbose:
                print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.checkpoint(valid_loss)
            self.counter = 0

    def checkpoint(self, valid_loss):
        if self.verbose:
            print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {valid_loss:.6f}).')
        self.val_loss_min = valid_loss
earlystopping = EarlyStopping(patience=5, verbose=True)

net = Net()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.00005, momentum=0.9, weight_decay=0.005)

train_loss = []
train_acc = []
val_loss = []
val_acc = []
if __name__ == '__main__':
    BATCH_SIZE = 8
    start = time.time()

    for epoch in range(10000):
        print('epoch', epoch + 1)
        sum_loss = 0.0
        sum_correct = 0
        sum_total = 0

        # Training
        net = net.train()
        for (inputs, labels) in trainloader:
            # inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            sum_loss += loss.item()  # add loss
            _, predicted = outputs.max(1)
            sum_total += labels.size(0)
            sum_correct += (predicted == labels).sum().item()
            loss.backward()
            optimizer.step()
        print("train mean loss={}, accuracy={}"
              .format(sum_loss * BATCH_SIZE / len(trainloader.dataset), float(sum_correct / sum_total)))
        train_loss.append(sum_loss * BATCH_SIZE / len(trainloader.dataset))
        train_acc.append(float(sum_correct / sum_total))

        sum_loss = 0.0
        sum_correct = 0
        sum_total = 0

        # Validating
        net = net.eval()
        for (inputs, labels) in valloader:
            # inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            sum_loss += loss.item()
            _, predicted = outputs.max(1)
            sum_total += labels.size(0)
            sum_correct += (predicted == labels).sum().item()
            loss.backward()
            optimizer.step()
        print("valid mean loss={}, accuracy={}"
              .format(sum_loss * BATCH_SIZE / len(valloader.dataset), float(sum_correct / sum_total)))
        val_loss.append(sum_loss * BATCH_SIZE / len(valloader.dataset))
        val_acc.append(float(sum_correct / sum_total))

        # Early stopping
        earlystopping(val_loss[-1])
        if earlystopping.early_stop:
            print("Early Stopping!")
            break

    finish_time = time.time() - start

    # Test
    net.test()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for (inputs, labels) in testloader:
            output = net(inputs)
            test_loss += criterion(output, labels).item()
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(labels.view_as(pred)).sum().item()
    test_loss /= 10000
    test_acc = correct / 10000
And this is the resulting plot:
https://i.stack.imgur.com/eln4S.png
The loss converges without going below 1.5, and early stopping then terminates training.
Since I have done a similar task (with a different but comparable network), I know that the cross-entropy loss decreases well below 1.5; in my case the accuracy was ~95% and the loss ~0.200. When inspecting your picture I saw that the test loss (in the picture you named it "Loss evaluate") matches what one would expect for the train/val loss after some epochs. So there is most likely an error in how the loss is computed for the training and validation phases.
You compute the loss for a given epoch like this:
train_loss.append(sum_loss*BATCH_SIZE/len(trainloader.dataset))
len(trainloader.dataset) does not return the number of batches but the number of samples, so multiplying by BATCH_SIZE is correct and necessary here. See here.
Since CrossEntropyLoss reports the mean over the batch by default (see here), using
sum_loss*BATCH_SIZE/len(trainloader.dataset)
is correct only if the trainloader actually uses a batch size equal to the BATCH_SIZE defined in the provided code. Since you did not share the part where the trainloader was defined, I cannot be sure.
What points to this is that in the test part you don't use len(testloader.dataset) but a hardcoded 10000 to get the mean for the epoch. I suspect the dataloaders (for training, validation, and testing) may not actually use a batch size equal to BATCH_SIZE, and that it is most likely 1.
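As a side note, one way to make the epoch mean independent of the actual batch size (a sketch only, reusing net, criterion and trainloader from the question, and showing just the loss accumulation) is to weight each batch's mean loss by the number of samples in that batch:

sum_loss = 0.0
sum_total = 0
for inputs, labels in trainloader:
    outputs = net(inputs)
    loss = criterion(outputs, labels)           # CrossEntropyLoss returns the mean over this batch
    sum_loss += loss.item() * labels.size(0)    # re-weight by the real batch size
    sum_total += labels.size(0)
epoch_mean_loss = sum_loss / sum_total          # correct even if the last batch is smaller

This avoids relying on BATCH_SIZE matching the DataLoader's actual batch size.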

Why can I save VRAM? I just loaded the model twice

I made the following simple toy CNN model.
class Test(nn.Module):
    def __init__(self):
        super(Test, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(3, 300, 3),
            nn.Conv2d(300, 500, 3),
            nn.Conv2d(500, 1000, 3),
        )
        self.fc = nn.Linear(168200000, 1)

    def forward(self, x):
        out = self.conv(x)
        out = out.view(-1)
        out = self.fc(out)
        return out
and I made the following trainer script.
if __name__ == '__main__':
    device = 'cuda'

    # Once
    model = Test()
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.MSELoss()

    # Twice
    model = Test()
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    for i in range(10):
        optimizer.zero_grad()
        print('i :', i)
        x = torch.zeros((50, 3, 64, 64), device=device)
        y = torch.ones(50, device=device)
        output = model(x)
        loss = criterion(output, y)
        loss.backward()
        optimizer.step()
I have 12211 MB of VRAM.
If I load the model just once, I use 11878 MB.
But if I load the model twice, I use 9302 MB.
I save 2576 MB of memory just by loading the model twice.
Why does this happen?
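One way to see where the memory actually goes in a script like this (a measurement sketch, not part of the original code) is to print PyTorch's allocator statistics after each step; torch.cuda.memory_allocated reports bytes held by live tensors, while torch.cuda.memory_reserved reports what the caching allocator has reserved from the driver:

import torch

def report(tag):
    # memory occupied by tensors vs. memory reserved by the caching allocator, in MiB
    alloc = torch.cuda.memory_allocated() / 2**20
    reserved = torch.cuda.memory_reserved() / 2**20
    print(f'{tag}: allocated={alloc:.0f} MiB, reserved={reserved:.0f} MiB')

# e.g. call report('after first model.to(device)'), report('after second model.to(device)'),
# and report('after first optimizer.step()') at the corresponding points in the trainer script.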

Tensorflow 2.0: flat_map() to flatten Dataset of Dataset returns cardinality -2

I am trying to run the following code (as given in the TensorFlow documentation) to create windows of my data and then flatten the resulting dataset of datasets.
window_size = 5
windows = range_ds.window(window_size, shift=1)

for sub_ds in windows.take(5):
    print(sub_ds)

flat_windows = windows.flat_map(lambda x: x)
The problem is that flat_windows.cardinality().numpy() returns a cardinality of -2, which causes problems for me during training. I tried looking for ways to set the cardinality of a dataset but couldn't find anything. I also tried other ways of flattening a dataset of datasets, but again no success.
Edit 1: The problem during training is that the shape is unknown (at the Linear and Dense layers) when I train a subclassed model (given below). The model trains well when I run it eagerly (via tf.config.run_functions_eagerly(True)), but that is slow. Therefore I want the input shape to be known for model training.
Neural Network
class NeuralNetworkModel(tf.keras.Model):
    def __init__(self):
        super(NeuralNetworkModel, self).__init__()
        self.encoder = Encoder()

    def train_step(self, inputs):
        X = inputs[0]
        Y = inputs[1]
        with tf.GradientTape() as tape:
            enc_X = self.encoder(X)
            enc_Y = self.encoder(Y)
            # loss:
            loss = tf.norm(enc_Y - enc_X, axis=[0, 1], ord='fro')
        # Compute gradients
        trainable_vars = self.encoder.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)
        # Update weights
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))
        # Compute our own metrics
        loss_tracker.update_state(loss)
        # Return a dict mapping metric names to current value.
        # Note that it will include the loss (tracked in self.metrics).
        return {"loss": loss_tracker.result()}

    @property
    def metrics(self):
        # We list our `Metric` objects here so that `reset_states()` can be
        # called automatically at the start of each epoch
        # or at the start of `evaluate()`.
        # If you don't implement this property, you have to call
        # `reset_states()` yourself at the time of your choosing.
        return [loss_tracker]

    def test_step(self, inputs):
        X = inputs[0]
        Y = inputs[1]
        Psi_X = self.encoder(X)
        Psi_Y = self.encoder(Y)
        # loss:
        loss = tf.norm(Psi_Y - Psi_X, axis=[0, 1], ord='fro')
        # Compute our own metrics
        loss_tracker.update_state(loss)
        # Return a dict mapping metric names to current value.
        # Note that it will include the loss (tracked in self.metrics).
        return {"loss": loss_tracker.result()}


class Encoder(tf.keras.Model):
    def __init__(self):
        super(Encoder, self).__init__(dtype='float64', name='Encoder')
        self.input_layer = DenseLayer(128)
        self.hidden_layer1 = DenseLayer(128)
        self.hidden_layer2 = DenseLayer(64)
        self.hidden_layer3 = DenseLayer(64)
        self.output_layer = LinearLayer(64)

    def call(self, input_data, training):
        fx = self.input_layer(input_data)
        fx = self.hidden_layer1(fx)
        fx = self.hidden_layer2(fx)
        fx = self.hidden_layer3(fx)
        return self.output_layer(fx)


class LinearLayer(tf.keras.layers.Layer):
    def __init__(self, units):
        super(LinearLayer, self).__init__(dtype='float64')
        self.units = units

    def build(self, input_shape):
        input_dim = input_shape[-1]
        self.w = self.add_weight(shape=(input_dim, self.units),
                                 initializer="random_normal",
                                 trainable=True)
        self.b = self.add_weight(shape=(self.units,),
                                 initializer=tf.zeros_initializer(),
                                 trainable=True)

    def call(self, inputs):
        return tf.matmul(inputs, self.w) + self.b


class DenseLayer(tf.keras.layers.Layer):
    def __init__(self, units):
        super(DenseLayer, self).__init__(dtype='float64')
        self.units = units

    def build(self, input_shape):
        input_dim = input_shape[-1]
        self.w = self.add_weight(shape=(input_dim, self.units),
                                 initializer="random_normal",
                                 trainable=True)
        self.b = self.add_weight(shape=(self.units,),
                                 initializer=tf.zeros_initializer(),
                                 trainable=True)

    def call(self, inputs):
        x = tf.matmul(inputs, self.w) + self.b
        return tf.nn.elu(x)
I was wondering about this as well. It turns out that -2 is tf.data.UNKNOWN_CARDINALITY (https://www.tensorflow.org/api_docs/python/tf/data#UNKNOWN_CARDINALITY), which indicates that TF doesn't know how many elements flat_map returns per item.
I just asked Windowing a TensorFlow dataset without losing cardinality information? to see if anyone knows a way to window datasets without losing cardinality.
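If the number of windows is known in advance, one possible workaround (a sketch under that assumption, not from the original answer; tf.data.experimental.assert_cardinality is available in recent TF 2.x releases) is to reattach the count after flattening. Here each window is also batched into a single element, as in the TF time-series tutorial, so the element count equals the window count:

import tensorflow as tf

range_ds = tf.data.Dataset.range(20)      # stand-in for the dataset in the question
window_size = 5

windows = range_ds.window(window_size, shift=1, drop_remainder=True)
flat_windows = windows.flat_map(lambda w: w.batch(window_size))

# with drop_remainder=True and shift=1 there are len(range_ds) - window_size + 1 windows
n_windows = 20 - window_size + 1
flat_windows = flat_windows.apply(tf.data.experimental.assert_cardinality(n_windows))

print(flat_windows.cardinality().numpy())  # 16 instead of -2 (UNKNOWN_CARDINALITY)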

Debugging neural network dropout problem for the probability not lying inside [0,1]

I tried to add a dropout rate to my neural network (NN) using torch, and I get a strange error at the end. How can I fix it?
The idea is that I wrote the NN inside a function to make it easier to call. The function is the following
(I personally think the problem lies inside the NN class, but for the sake of having a working example I'm including everything):
def train_neural_network(data_train_X, data_train_Y, batch_size, learning_rate, graph=True, dropout=0.0):
    input_size = len(data_test_X.columns)
    hidden_size = 200
    num_classes = 4
    num_epochs = 120
    batch_size = batch_size
    learning_rate = learning_rate

    # The NN class
    class NeuralNet(nn.Module):
        def __init__(self, input_size, hidden_size, num_classes, p=dropout):
            super(NeuralNet, self).__init__()
            self.fc1 = nn.Linear(input_size, hidden_size)
            self.fc2 = nn.Linear(hidden_size, hidden_size)
            self.fc3 = nn.Linear(hidden_size, num_classes)

        def forward(self, x, p=dropout):
            out = F.relu(self.fc1(x))
            out = F.relu(self.fc2(out))
            out = nn.Dropout(out, p)  # drop
            out = self.fc3(out)
            return out

    # Prepare data
    X_train = torch.from_numpy(data_train_X.values).float()
    Y_train = torch.from_numpy(data_train_Y.values).float()

    # Load data
    train = torch.utils.data.TensorDataset(X_train, Y_train)
    train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size)

    net = NeuralNet(input_size, hidden_size, num_classes)

    # Loss
    criterion = nn.CrossEntropyLoss()
    # Optimiser
    optimiser = torch.optim.SGD(net.parameters(), lr=learning_rate)

    # Proper training
    total_step = len(train_loader)
    loss_values = []
    for epoch in range(num_epochs + 1):
        net.train()
        train_loss = 0.0
        for i, (predictors, results) in enumerate(train_loader, 0):
            # Forward pass
            outputs = net(predictors)
            results = results.long()
            results = results.squeeze_()
            loss = criterion(outputs, results)
            # Backward and optimise
            optimiser.zero_grad()
            loss.backward()
            optimiser.step()
            # Update loss
            train_loss += loss.item()
        loss_values.append(train_loss / batch_size)

    print('Finished Training')
    return net
And when I call the function:
net = train_neural_network(data_train_X = data_train_X, data_train_Y = data_train_Y, batch_size = batch_size, learning_rate = learning_rate, dropout = 0.1)
The error is the following:
/usr/local/lib/python3.6/dist-packages/torch/nn/modules/dropout.py in __init__(self, p, inplace)
      8     def __init__(self, p=0.5, inplace=False):
      9         super(_DropoutNd, self).__init__()
---> 10         if p < 0 or p > 1:
     11             raise ValueError("dropout probability has to be between 0 and 1, "
     12                              "but got {}".format(p))

RuntimeError: bool value of Tensor with more than one value is ambiguous
Why do you think there is an error?
Before adding the dropout rate, everything was working. Bonus points if you know how to add a bias inside my network, for example on the hidden layer; I can't find any example online.
Change your architecture to this:
class NeuralNet(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes, p=dropout):
        super(NeuralNet, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, num_classes)
        self.dropout = nn.Dropout(p=p)

    def forward(self, x):
        out = F.relu(self.fc1(x))
        out = F.relu(self.fc2(out))
        out = self.dropout(self.fc3(out))
        return out
Let me know if it works.
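The error itself comes from nn.Dropout(out, p): the constructor receives the tensor out where it expects the probability p. If you would rather not register a dropout module, the functional form is an alternative (a sketch, not the answer author's code; it assumes the probability is stored as self.p in __init__, which the original class does not do). Note also that nn.Linear layers already include a bias term by default:

import torch.nn.functional as F

def forward(self, x):
    out = F.relu(self.fc1(x))
    out = F.relu(self.fc2(out))
    # F.dropout takes the probability as a float; training=self.training ensures
    # dropout is only active in train mode (net.train() vs net.eval())
    out = F.dropout(out, p=self.p, training=self.training)
    out = self.fc3(out)
    return out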

Tensorflow: Calling prediction from a function returns 'RuntimeError: Attempted to use a closed Session'

I have implemented a simple MLP in TensorFlow. The structure is a class NeuralNet:
class NeuralNet:
    def __init__(self, **options):
        self.type = options.get('net_type')  # MLP, CNN, RNN
        self.n_class = options.get('classes')
        self.alpha = options.get('alpha')
        self.batch_size = options.get('batch_size')
        self.epoch = options.get('epochs')
        self.model = {}
It has three different methods.
Fit:
def fit(self, features, labels):
    if self.type == 'MLP':
        input_size = len(features[0])
        n_nodes_hl1 = input_size // 5
        batch_size = 50
        sess = tf.InteractiveSession()
        x = tf.placeholder(tf.float32, [None, input_size])
        y = tf.placeholder(tf.float32, [None, self.n_class])
        labels = self.labels_to_onehot(labels)
        weights = {'hidden_1': tf.Variable(tf.random_normal([input_size, n_nodes_hl1])),
                   'output': tf.Variable(tf.random_normal([n_nodes_hl1, self.n_class]))}
        biases = {'hidden_1': tf.Variable(tf.random_normal([n_nodes_hl1])),
                  'output': tf.Variable(tf.random_normal([self.n_class]))}

        def neural_network_model(data, weight, bias):
            l1 = tf.add(tf.matmul(data, weight['hidden_1']), bias['hidden_1'])
            l1 = tf.nn.relu(l1)
            output = tf.matmul(l1, weight['output']) + bias['output']
            return output

        sess.run(tf.global_variables_initializer())
        prediction = neural_network_model(x, weights, biases)
        l2 = self.alpha * tf.nn.l2_loss(weights['hidden_1']) + self.alpha * tf.nn.l2_loss(weights['output'])
        cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=prediction) + l2)
        train_step = tf.train.AdamOptimizer(0.005).minimize(cross_entropy)

        sess = tf.Session()
        sess.run(tf.global_variables_initializer())
        for epoch in range(self.epoch):
            epoch_loss = 0
            i = 0
            while i < len(features):
                start = i
                end = i + batch_size
                batch_x = np.array(features[start:end])
                batch_y = np.array(labels[start:end])
                _, c = sess.run([train_step, cross_entropy], feed_dict={x: batch_x,
                                                                        y: batch_y})
                epoch_loss += c
                i += batch_size

        self.model['session'] = sess
        self.model['y'] = y
        self.model['x'] = x
        self.model['prediction'] = prediction
Test (testing accuracy):
def test(self, test_features, test_labels):
    with self.model['session']:
        test_labels = np.eye(self.n_class)[[int(int(i) / 2) for i in test_labels]]
        correct = tf.equal(tf.argmax(self.model['prediction'], 1), tf.argmax(self.model['y'], 1))
        accuracy = tf.reduce_mean(tf.cast(correct, 'float'))
        accuracy = accuracy.eval({self.model['x']: test_features, self.model['y']: test_labels})
        print('Accuracy:', accuracy)
        return accuracy
Predict
def predict(self, test_features):
    with self.model['session']:
        pred = self.model['prediction']
        predicted = pred.eval({self.model['x']: test_features})
        return predicted
When running the predict method, it returns RuntimeError: ('Attempted to use a closed Session.')
My questions are:
Why does the test method run smoothly, while calling the session the same way in the predict method fails?
Would I have to create a tf object and evaluate it? If yes, which object should it be?
I couldn't run your code, but I have a hypothesis.
Maybe you run the test function after fitting is over.
In the test function you use the session as a context manager (a 'with' block), so my hypothesis is that the session is automatically closed once that context-manager block finishes.
def test(self, test_features, test_labels):
    with self.model['session']:
        test_labels = np.eye(self.n_class)[[int(int(i) / 2) for i in test_labels]]
        correct = tf.equal(tf.argmax(self.model['prediction'], 1), tf.argmax(self.model['y'], 1))
        accuracy = tf.reduce_mean(tf.cast(correct, 'float'))
        accuracy = accuracy.eval({self.model['x']: test_features, self.model['y']: test_labels})
        print('Accuracy:', accuracy)
        return accuracy
    # at this point, your session is probably closed
If my hypothesis is right, you can just keep the sess around, run the graph with sess.run(...), and close it manually with sess.close().
P.S. Why do you first assign an InteractiveSession() to sess and run tf.global_variables_initializer() there?
I think you only need to call tf.global_variables_initializer() once, after graph construction is finished and before training starts.
UPDATED
In my hypothesis it doesn't matter which function you run first, because both functions use a 'with' block; whichever runs first will close the session at the end of its block.
By "pass the sess" I mean something like this:
def fit(self, ~):
    # construct graph
    self.sess = tf.Session()

def test(self, ~):
    # codes will be here
    acc_val = self.sess.run([accuracy], feed_dict={~})
    return acc_val

def predict(self, ~):
    # codes will be here
    predicted = self.sess.run([pred], feed_dict={~})
    return predicted
I hope this code gives you a hint.
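A slightly more concrete sketch of the same idea (assuming TF 1.x and the self.model dict from the question): keep the session stored in fit without a with block, reuse it on every call, and close it explicitly when you are done.

def predict(self, test_features):
    # reuse the long-lived session stored during fit(); no 'with' block,
    # so the session stays open for later calls
    sess = self.model['session']
    return sess.run(self.model['prediction'],
                    feed_dict={self.model['x']: test_features})

def close(self):
    # call this once when the model is no longer needed
    self.model['session'].close()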
