PyTorch loss value does not change - Python

I wrote a module based on this article: http://www.wildml.com/2015/12/implementing-a-cnn-for-text-classification-in-tensorflow/
The idea is to pass the input through multiple streams, concatenate the outputs, and connect them to an FC layer. I divided my source code into 3 custom modules: TextClassifyCnnNet >> FlatCnnLayer >> FilterLayer
FilterLayer:
class FilterLayer(nn.Module):
    def __init__(self, filter_size, embedding_size, sequence_length, out_channels=128):
        super(FilterLayer, self).__init__()
        self.model = nn.Sequential(
            nn.Conv2d(1, out_channels, (filter_size, embedding_size)),
            nn.ReLU(inplace=True),
            nn.MaxPool2d((sequence_length - filter_size + 1, 1), stride=1)
        )

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))

    def forward(self, x):
        return self.model(x)
FlatCnnLayer:
class FlatCnnLayer(nn.Module):
    def __init__(self, embedding_size, sequence_length, filter_sizes=[3, 4, 5], out_channels=128):
        super(FlatCnnLayer, self).__init__()
        self.filter_layers = nn.ModuleList(
            [FilterLayer(filter_size, embedding_size, sequence_length, out_channels=out_channels)
             for filter_size in filter_sizes])

    def forward(self, x):
        pools = []
        for filter_layer in self.filter_layers:
            out_filter = filter_layer(x)
            # reshape from (batch_size, out_channels, h, w) to (batch_size, h, w, out_channels)
            pools.append(out_filter.view(out_filter.size()[0], 1, 1, -1))
        x = torch.cat(pools, dim=3)

        x = x.view(x.size()[0], -1)
        x = F.dropout(x, p=dropout_prob, training=True)

        return x
TextClassifyCnnNet (main module):
class TextClassifyCnnNet(nn.Module):
    def __init__(self, embedding_size, sequence_length, num_classes, filter_sizes=[3, 4, 5], out_channels=128):
        super(TextClassifyCnnNet, self).__init__()

        self.flat_layer = FlatCnnLayer(embedding_size, sequence_length, filter_sizes=filter_sizes,
                                       out_channels=out_channels)

        self.model = nn.Sequential(
            self.flat_layer,
            nn.Linear(out_channels * len(filter_sizes), num_classes)
        )

    def forward(self, x):
        x = self.model(x)
        return x
def fit(net, data, save_path):
    if torch.cuda.is_available():
        net = net.cuda()

    for param in list(net.parameters()):
        print(type(param.data), param.size())

    optimizer = optim.Adam(net.parameters(), lr=0.01, weight_decay=0.1)

    X_train, X_test = data['X_train'], data['X_test']
    Y_train, Y_test = data['Y_train'], data['Y_test']
    X_valid, Y_valid = data['X_valid'], data['Y_valid']

    n_batch = len(X_train) // batch_size

    for epoch in range(1, n_epochs + 1):  # loop over the dataset multiple times
        net.train()
        start = 0
        end = batch_size

        for batch_idx in range(1, n_batch + 1):
            # get the inputs
            x, y = X_train[start:end], Y_train[start:end]
            start = end
            end = start + batch_size

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            predicts = _get_predict(net, x)
            loss = _get_loss(predicts, y)
            loss.backward()
            optimizer.step()

            if batch_idx % display_step == 0:
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                    epoch, batch_idx * len(x), len(X_train), 100. * batch_idx / (n_batch + 1), loss.data[0]))

        # print statistics
        if epoch % display_step == 0 or epoch == 1:
            net.eval()
            valid_predicts = _get_predict(net, X_valid)
            valid_loss = _get_loss(valid_predicts, Y_valid)
            valid_accuracy = _get_accuracy(valid_predicts, Y_valid)
            print('\r[%d] loss: %.3f - accuracy: %.2f' % (epoch, valid_loss.data[0], valid_accuracy * 100))

    print('\rFinished Training\n')

    net.eval()
    test_predicts = _get_predict(net, X_test)
    test_loss = _get_loss(test_predicts, Y_test).data[0]
    test_accuracy = _get_accuracy(test_predicts, Y_test)
    print('Test loss: %.3f - Test accuracy: %.2f' % (test_loss, test_accuracy * 100))

    torch.save(net.flat_layer.state_dict(), save_path)
def _get_accuracy(predicts, labels):
    predicts = torch.max(predicts, 1)[1].data[0]
    return np.mean(predicts == labels)
def _get_predict(net, x):
    # wrap them in Variable
    inputs = torch.from_numpy(x).float()
    # convert to cuda tensors if cuda flag is true
    if torch.cuda.is_available:
        inputs = inputs.cuda()
    inputs = Variable(inputs)
    return net(inputs)
def _get_loss(predicts, labels):
    labels = torch.from_numpy(labels).long()
    # convert to cuda tensors if cuda flag is true
    if torch.cuda.is_available:
        labels = labels.cuda()
    labels = Variable(labels)
    return F.cross_entropy(predicts, labels)
It seems the parameters are only updated slightly each epoch, and the accuracy stays the same for the whole run, while the same implementation with the same parameters in TensorFlow runs correctly.
I'm new to PyTorch, so maybe something in my code is wrong; please help me find it. Thank you!
P.S.: I tried using F.nll_loss + F.log_softmax instead of F.cross_entropy. Theoretically they should return the same value, but in practice a different result is printed out (and it is still a wrong loss value).
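For reference, a quick standalone check (made-up logits, not my actual model outputs) of that equivalence:
import torch
import torch.nn.functional as F

logits = torch.randn(4, 3)            # made-up scores: 4 samples, 3 classes
targets = torch.tensor([0, 2, 1, 1])

a = F.cross_entropy(logits, targets)
b = F.nll_loss(F.log_softmax(logits, dim=1), targets)
print(torch.allclose(a, b))           # True: the two formulations agree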

I see that in your original code the weight_decay term is set to 0.1. weight_decay is used to regularize the network's parameters, and 0.1 may be so strong that the regularization overwhelms the actual loss. Try reducing the value of weight_decay.
For convolutional neural networks in computer vision tasks, the weight_decay term is usually set to 5e-4 or 5e-5. I am not familiar with text classification; these values may work for you out of the box, or you may have to tweak them a bit by trial and error.
Let me know if it works for you.
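A minimal sketch of that change against the optimizer line from the question (the exact value is something to tune, not a recommendation):
# much weaker L2 regularization than the original weight_decay=0.1
optimizer = optim.Adam(net.parameters(), lr=0.01, weight_decay=5e-4)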

I realised that the L2 penalty (weight_decay) in the Adam optimizer makes the loss value remain unchanged (I haven't tried other optimizers yet). It works when I remove it:
# optimizer = optim.Adam(net.parameters(), lr=0.01, weight_decay=0.1)
optimizer = optim.Adam(net.parameters(), lr=0.001)
=== UPDATE (See above answer for more detail!) ===
self.features = nn.Sequential(self.flat_layer)
self.classifier = nn.Linear(out_channels * len(filter_sizes), num_classes)
...
optimizer = optim.Adam([
    {'params': model.features.parameters()},
    {'params': model.classifier.parameters(), 'weight_decay': 0.1}
], lr=0.001)

In my case, I was facing the same issue. On my laptop without a GPU the training was fine. When I tried it on a GPU, the model's accuracy and loss did not change after the first epochs. I was using nn.CrossEntropyLoss() with Adam.
Replacing Adam with SGD worked for me.
I am sharing this in case anyone else runs into it.
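A minimal sketch of that swap (the learning rate and momentum here are common defaults, not values from the original post):
# optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)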

Related

Can't call numpy() on Tensor that requires grad. Use tensor.detach().numpy() instead, BUT I'm not using numpy here

I get this error in the training loop for this neural network:
class YourModel(torch.nn.Module):
    def __init__(self):
        super(YourModel, self).__init__()
        self.fc1 = nn.Linear(50, 128)
        self.sigmoid = nn.Sigmoid()
        self.fc2 = nn.Linear(128, 1)

    def forward(self, x1, x2):
        x = torch.cat((x1, x2), dim=1)
        out = self.fc1(x)
        out = self.sigmoid(out)
        out = self.fc2(out)
        return out

model = YourModel().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_fn = nn.BCELoss()
My dataloader contains 3 datasets: one with 25 features for 8000 documents, another with 25 features for 8000 queries, and the last with the relation between both (0 or 1). That's why I'm using a neural network for binary classification. (However, if you know an alternative neural network, I'm open to options.)
My batch_size is 1 right now and here is my training loop:
def train(dataloader, model, loss_fn, optimizer):
    model.train()
    train_loss = 0
    num_batches = len(dataloader)
    all_pred = []
    all_real = []
    for batch, i in enumerate(train_dataloader):  # access to each batch
        i_1 = i[0]
        i_2 = i[1]
        y = i[2].float().view(1, 1)  # find relevance
        #y = torch.clamp(y, min=0, max=1)
        #x = np.hstack((i_1, i_2))
        #x = torch.Tensor(x)
        #x = torch.clamp(x, min=0, max=1)

        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass
        y_pred = model(i_1, i_2).float()
        y_pred = torch.clamp(y_pred, min=0, max=1)
        loss = loss_fn(y_pred, y)

        # Backward pass
        loss.backward()

        # Update the parameters
        optimizer.step()

        train_loss += loss.item()  # sum the loss
        all_pred.append(y_pred)
        all_real.append(y)

        if batch > 0 and batch % 1000 == 0:
            print(f"Partial loss: {train_loss/batch}, F1: {f1_score(all_real, all_pred)}")

    train_loss /= num_batches
    print(f"Total loss: {train_loss}")  # print loss of every epoch
    return train_loss
I'm getting this error: "Can't call numpy() on Tensor that requires grad. Use tensor.detach().numpy() instead." but, as far as I know, I'm not calling numpy on any tensors. And if I use the detach method, I get an error saying that the loss cannot be computed because the tensor doesn't require grad. So it's pretty much a loop.
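One thing worth noting: sklearn's f1_score converts its inputs to NumPy arrays internally, so the error can be raised there even though numpy() is never called explicitly. A hedged sketch of keeping the graph-attached tensors for the loss while storing detached, thresholded copies for the metric (the 0.5 cutoff is an assumption, not from the question):
# f1_score() converts its inputs to NumPy internally, so store detached
# plain labels for the metric and keep y_pred / y as-is for loss.backward()
all_pred.append(int(y_pred.detach().item() > 0.5))
all_real.append(int(y.item()))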

Can't backward pass two losses in Classification Transformer Model

For my model I'm using a RoBERTa transformer model and the Trainer from the Hugging Face transformers library.
I calculate two losses:
lloss is a cross-entropy loss and dloss calculates the loss in between hierarchy layers.
The total loss is the sum of lloss and dloss. (Based on this)
When calling total_loss.backward(), however, I get the error:
RuntimeError: Trying to backward through the graph a second time, but the buffers have already been freed
Any idea why that happens? Can I force it to only call backward once? Here is the loss calculation part:
dloss = calculate_dloss(prediction, labels, 3)
lloss = calculate_lloss(prediction, labels, 3)
total_loss = lloss + dloss
total_loss.backward()

def calculate_lloss(predictions, true_labels, total_level):
    '''Calculates the layer loss.
    '''
    loss_fct = nn.CrossEntropyLoss()
    lloss = 0
    for l in range(total_level):
        lloss += loss_fct(predictions[l], true_labels[l])
    return self.alpha * lloss

def calculate_dloss(predictions, true_labels, total_level):
    '''Calculate the dependence loss.
    '''
    dloss = 0
    for l in range(1, total_level):
        current_lvl_pred = torch.argmax(nn.Softmax(dim=1)(predictions[l]), dim=1)
        prev_lvl_pred = torch.argmax(nn.Softmax(dim=1)(predictions[l-1]), dim=1)
        D_l = self.check_hierarchy(current_lvl_pred, prev_lvl_pred, l)  # just a boolean tensor
        l_prev = torch.where(prev_lvl_pred == true_labels[l-1],
                             torch.FloatTensor([0]).to(self.device),
                             torch.FloatTensor([1]).to(self.device))
        l_curr = torch.where(current_lvl_pred == true_labels[l],
                             torch.FloatTensor([0]).to(self.device),
                             torch.FloatTensor([1]).to(self.device))
        dloss += torch.sum(torch.pow(self.p_loss, D_l*l_prev) * torch.pow(self.p_loss, D_l*l_curr) - 1)
    return self.beta * dloss
There is nothing wrong with having a loss that is the sum of two individual losses, here is a small proof of principle adapted from the docs:
import torch
import numpy
from sklearn.datasets import make_blobs

class Feedforward(torch.nn.Module):
    def __init__(self, input_size, hidden_size):
        super(Feedforward, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.fc1 = torch.nn.Linear(self.input_size, self.hidden_size)
        self.relu = torch.nn.ReLU()
        self.fc2 = torch.nn.Linear(self.hidden_size, 1)
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, x):
        hidden = self.fc1(x)
        relu = self.relu(hidden)
        output = self.fc2(relu)
        output = self.sigmoid(output)
        return output

def blob_label(y, label, loc):  # assign labels
    target = numpy.copy(y)
    for l in loc:
        target[y == l] = label
    return target

x_train, y_train = make_blobs(n_samples=40, n_features=2, cluster_std=1.5, shuffle=True)
x_train = torch.FloatTensor(x_train)
y_train = torch.FloatTensor(blob_label(y_train, 0, [0]))
y_train = torch.FloatTensor(blob_label(y_train, 1, [1, 2, 3]))

x_test, y_test = make_blobs(n_samples=10, n_features=2, cluster_std=1.5, shuffle=True)
x_test = torch.FloatTensor(x_test)
y_test = torch.FloatTensor(blob_label(y_test, 0, [0]))
y_test = torch.FloatTensor(blob_label(y_test, 1, [1, 2, 3]))

model = Feedforward(2, 10)
criterion = torch.nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

model.eval()
y_pred = model(x_test)
before_train = criterion(y_pred.squeeze(), y_test)
print('Test loss before training', before_train.item())

model.train()
epoch = 20
for epoch in range(epoch):
    optimizer.zero_grad()

    # Forward pass
    y_pred = model(x_train)

    # Compute loss
    lossCE = criterion(y_pred.squeeze(), y_train)
    lossSQD = (y_pred.squeeze() - y_train).pow(2).mean()
    loss = lossCE + lossSQD
    print('Epoch {}: train loss: {}'.format(epoch, loss.item()))

    # Backward pass
    loss.backward()
    optimizer.step()
There must be a real second time that you call backward, directly or indirectly, on some variable that then traverses through your graph. It is a bit too much to ask for the complete code here; only you can check this, or at least reduce it to a minimal example (while doing so, you might already find the issue). Apart from that, I would start checking:
Does it already occur in the first iteration of training? If not: are you reusing any calculation results for the second iteration without a detach?
When you do backward on your losses individually lloss.backward() followed by dloss.backward() (this has the same effect as adding them together first as gradients are accumulated): what happens? This will let you track down for which of the two losses the error occurs.
After backward() your computation graph is freed, so for a second backward you need to create a new graph by providing the inputs again. If you want to reiterate over the same graph after backward (for some reason), you need to set the retain_graph flag in backward to True; see retain_graph here.
P.S. As the summation of Tensors is automatically differentiable, summing the losses would not cause any issue in the backward.
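To make the retain_graph point concrete, a tiny standalone example (a made-up graph, not the model from the question) showing how the flag changes the behaviour:
import torch

x = torch.randn(3, requires_grad=True)
y = (x * 2).sum()

y.backward(retain_graph=True)  # keep the graph alive for another backward
y.backward()                   # without retain_graph above, this raises
                               # "Trying to backward through the graph a second time"
print(x.grad)                  # gradients from both calls accumulate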

Linear regression using PyTorch

I have a classification problem. I am using PyTorch. My input is a sequence of length 341, and the output is one of three classes {0, 1, 2}. I want to train a linear regression model using PyTorch. I created the following class, but during training the loss values start out as numbers, then become inf, then NaN. I do not know how to fix that. I also tried to initialize the weights of the linear model, but it is the same thing. Any suggestions?
class regression(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.input_dim = input_dim
        # One layer
        self.linear = nn.Linear(input_dim, 1)

    def forward(self, x):
        y_pred = self.linear(x)
        return y_pred

criterion = torch.nn.MSELoss()

def fit(model, data_loader, optim, epochs):
    for epoch in range(epochs):
        for i, (X, y) in enumerate(data_loader):
            X = X.float()
            y = y.unsqueeze(1).float()
            X = Variable(X, requires_grad=True)
            y = Variable(y, requires_grad=True)

            # Make a prediction for the input X
            pred = model(X)
            #loss = (y-pred).pow(2).mean()
            loss = criterion(y, pred)
            optim.zero_grad()
            loss.backward()
            optim.step()
            print(loss)
            print(type(loss))

        # Give some feedback after each 5th pass through the data
        if epoch % 5 == 0:
            print("Epoch", epoch, f"loss: {loss}")
    return None

regnet = regression(input_dim=341)
optim = SGD(regnet.parameters(), lr=0.01)
fit(regnet, data_loader, optim=optim, epochs=5)
pred = regnet(torch.Tensor(test_set.data_info).float())
pred = pred.detach().numpy()
I would additionally suggest replacing MSE with cross-entropy loss, as it is better suited for multi-class classification problems.
import random

import torch
from torch import nn, optim
from matplotlib import pyplot as plt

# Generate random dataset with your shape to test
# Replace this with your own dataset
data = []
for label in [0, 1, 2]:
    for i in range(1000):
        data.append((torch.rand(341), label))

# train test split
random.shuffle(data)
train, val = data[:1500], data[1500:]

def run_gradient_descent(model, data_train, data_val, batch_size=64, learning_rate=0.01, weight_decay=0, num_epochs=10):
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

    iters, losses = [], []
    iters_sub, train_acc, val_acc = [], [], []

    train_loader = torch.utils.data.DataLoader(data_train, batch_size=batch_size, shuffle=True)

    # training
    n = 0  # the number of iterations
    for epoch in range(num_epochs):
        for xs, ts in iter(train_loader):
            if len(ts) != batch_size:
                continue
            zs = model(xs)
            loss = criterion(zs, ts)  # compute the total loss
            loss.backward()           # compute updates for each parameter
            optimizer.step()          # make the updates for each parameter
            optimizer.zero_grad()     # a clean up step for PyTorch

            # save the current training information
            iters.append(n)
            losses.append(float(loss) / batch_size)  # compute *average* loss
            if n % 10 == 0:
                iters_sub.append(n)
                train_acc.append(get_accuracy(model, data_train))
                val_acc.append(get_accuracy(model, data_val))

            # increment the iteration number
            n += 1

    # plotting
    plt.title("Training Curve (batch_size={}, lr={})".format(batch_size, learning_rate))
    plt.plot(iters, losses, label="Train")
    plt.xlabel("Iterations")
    plt.ylabel("Loss")
    plt.show()

    plt.title("Training Curve (batch_size={}, lr={})".format(batch_size, learning_rate))
    plt.plot(iters_sub, train_acc, label="Train")
    plt.plot(iters_sub, val_acc, label="Validation")
    plt.xlabel("Iterations")
    plt.ylabel("Accuracy")
    plt.legend(loc='best')
    plt.show()

    return model

def get_accuracy(model, data):
    loader = torch.utils.data.DataLoader(data, batch_size=500)

    correct, total = 0, 0
    for xs, ts in loader:
        zs = model(xs)
        pred = zs.max(1, keepdim=True)[1]  # get the index of the max logit
        correct += pred.eq(ts.view_as(pred)).sum().item()
        total += int(ts.shape[0])
    return correct / total

class MyRegression(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(MyRegression, self).__init__()
        # One layer
        self.linear = nn.Linear(input_dim, output_dim)

    def forward(self, x):
        return self.linear(x)

model = MyRegression(341, 3)
run_gradient_descent(model, train, val, batch_size=64, learning_rate=0.01, num_epochs=10)
Because of my reputation number I can't comment, so if I were you I would build it like this; I think there is something wrong with your way of defining the module.
class regression(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(regression, self).__init__()
        # function
        self.linear = nn.Linear(input_dim, output_dim)

    def forward(self, x):
        return self.linear(x)

# define the model
input_dim = 341
output_dim = 3
model = regression(input_dim, output_dim)

# Mean square error
mse = nn.MSELoss()

# Optimization
learning_rate = 0.01
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

# train the model
loss_list = []
iteration_number = X
for iteration in range(iteration_number):
    # optimization
    optimizer.zero_grad()

    # forward to get output
    results = model("input_datas_tensor")

    # loss calculate
    loss = mse(results, "outputs_datas_tensor")

    # backward propagation
    loss.backward()

    # updating parameters
    optimizer.step()

    # store loss
    loss_list.append(loss.data)

    if iteration % 5 == 0:
        print("epoch {}, loss {}".format(iteration, loss.data))

Why am I getting a low error before I did any optimization?

I am using a model training program I have built for a toy example and trying to use it on another example.
The only difference is that this model was used for regression, so I was using MSE as the error criterion, and now it is used for binary classification, so I am using BCEWithLogitsLoss.
The model is very simple:
class Model(nn.Module):
    def __init__(self, input_size, output_size):
        super(Model, self).__init__()
        self.fc1 = nn.Sequential(
            nn.Linear(input_size, 8*input_size),
            nn.PReLU()  # parametric relu - same as leaky relu except the slope is learned
        )
        self.fc2 = nn.Sequential(
            nn.Linear(8*input_size, 80*input_size),
            nn.PReLU()
        )
        self.fc3 = nn.Sequential(
            nn.Linear(80*input_size, 32*input_size),
            nn.PReLU()
        )
        self.fc4 = nn.Sequential(
            nn.Linear(32*input_size, 4*input_size),
            nn.PReLU()
        )
        self.fc = nn.Sequential(
            nn.Linear(4*input_size, output_size),
            nn.PReLU()
        )

    def forward(self, x, dropout=dropout, batchnorm=batchnorm):
        x = self.fc1(x)
        x = self.fc2(x)
        x = self.fc3(x)
        x = self.fc4(x)
        x = self.fc(x)
        return x
And this is where I run it:
model = Model(input_size, output_size)

if (loss == 'MSE'):
    criterion = nn.MSELoss()
if (loss == 'BCELoss'):
    criterion = nn.BCEWithLogitsLoss()

optimizer = torch.optim.SGD(model.parameters(), lr = lr)

model.train()
for epoch in range(num_epochs):
    # Forward pass and loss
    train_predictions = model(train_features)
    print(train_predictions)
    print(train_targets)
    loss = criterion(train_predictions, train_targets)

    # Backward pass and update
    loss.backward()
    optimizer.step()

    # zero grad before new step
    optimizer.zero_grad()

    train_size = len(train_features)
    train_loss = criterion(train_predictions, train_targets).item()
    pred = train_predictions.max(1, keepdim=True)[1]
    correct = pred.eq(train_targets.view_as(pred)).sum().item()
    #train_loss /= train_size
    accuracy = correct / train_size

    print('\nTrain set: Loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        train_loss, correct, train_size,
        100. * accuracy))
However, when I print the loss, for some reason it already starts very low (around 0.6) before I have done any backward pass! It remains this low for all subsequent epochs.
The prediction vector, however, looks like random garbage...
tensor([[-0.0447],
        [-0.0640],
        [-0.0564],
        ...,
        [-0.0924],
        [-0.0113],
        [-0.0774]], grad_fn=<PreluBackward>)
tensor([[0.],
        [0.],
        [0.],
        ...,
        [0.],
        [0.],
        [1.]])
epoch: 1, loss = 0.6842
I have no clue why it is doing that, and would appreciate any help.
Thanks!
EDIT:
I added the params in case they help anyone figure this out:
if (dataset == 'adult_train.csv'):
    input_size = 9
    print_every = 1
    output_size = 1
    lr = 0.001
    num_epochs = 10
    loss = 'BCELoss'
EDIT2: Added accuracy calculation in the middle block
BCELoss is not an error rate.
The entropy of a Bernoulli distribution with p=0.5 is -ln(0.5) = 0.693. This is the loss you would expect if either
- your data is evenly distributed and your network is guessing randomly, or
- your network always predicts a uniform distribution.
Your model is in the second case. The network is currently guessing slightly negative logits for every prediction. Those will be interpreted as class 0 predictions. Since it seems your data is imbalanced towards 0 labels, your accuracy will be the same as that of a model that always predicts 0. This is just an artifact of random weight initialization. If you keep reinitializing your model, you'll find that sometimes it will always predict 1 too.
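A quick numerical check of that 0.693 figure (made-up logits and labels, not the actual data): with logits at zero, BCEWithLogitsLoss evaluates to -ln(0.5) regardless of the targets.
import torch
import torch.nn as nn

logits = torch.zeros(8, 1)                     # an untrained net predicting ~0 logits
targets = torch.randint(0, 2, (8, 1)).float()  # arbitrary 0/1 labels
print(nn.BCEWithLogitsLoss()(logits, targets).item())  # ~0.6931 = -ln(0.5)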

PyTorch - How to set Activation Rules of neurons to increase efficiency of Neural Network?

I'm trying to make a backpropagation neural network with PyTorch. I can successfully execute it and test its accuracy, but it doesn't work very efficiently. Now, I'm supposed to increase its efficiency by setting different activation rules for neurons, so that neurons that don't contribute to the final output get excluded (pruned) from the computations, thereby improving both runtime and accuracy.
My code looks like this (extracted snippets) -
# Hyper Parameters
input_size = 20
hidden_size = 50
num_classes = 130
num_epochs = 500
batch_size = 5
learning_rate = 0.1

# normalise input data
for column in data:
    # the last column is target
    if column != data.shape[1] - 1:
        data[column] = data.loc[:, [column]].apply(lambda x: (x - x.mean()) / x.std())

# randomly split data into training set (80%) and testing set (20%)
msk = np.random.rand(len(data)) < 0.8
train_data = data[msk]
test_data = data[~msk]

# define train dataset and a data loader
train_dataset = DataFrameDataset(df=train_data)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# Neural Network
class Net(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.sigmoid = nn.Sigmoid()
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        out = self.fc1(x)
        out = self.sigmoid(out)
        out = self.fc2(out)
        return out

net = Net(input_size, hidden_size, num_classes)

# train the model by batch
for epoch in range(num_epochs):
    for step, (batch_x, batch_y) in enumerate(train_loader):
        # convert torch tensor to Variable
        X = Variable(batch_x)
        Y = Variable(batch_y.long())

        # Forward + Backward + Optimize
        optimizer.zero_grad()  # zero the gradient buffer
        outputs = net(X)
        loss = criterion(outputs, Y)
        all_losses.append(loss.data[0])
        loss.backward()
        optimizer.step()

        if epoch % 50 == 0:
            _, predicted = torch.max(outputs, 1)
            # calculate and print accuracy
            total = predicted.size(0)
            correct = predicted.data.numpy() == Y.data.numpy()
            print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Accuracy: %.2f %%' % (
                epoch + 1, num_epochs, step + 1,
                len(train_data) // batch_size + 1,
                loss.data[0], 100 * sum(correct) / total))
Can someone tell me how to do that in PyTorch, as I'm very new to it?
I'm not sure if that question is supposed to be on Stack Overflow, but I will give you a hint anyway. You are working with a sigmoid activation function at the moment, the gradient of which vanishes if the input value is too large or too small. A commonly used alternative is the ReLU activation function (which stands for rectified linear unit).
ReLU(x) is the identity on the positive domain and 0 on the negative domain; in Python that would be written as follows:
def ReLU(x):
    if x > 0:
        return x
    else:
        return 0
It is readily available in PyTorch as nn.ReLU (or torch.relu).
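For example, a sketch of dropping it into the Net class from the question (only the activation changes; everything else stays as posted):
import torch.nn as nn

class Net(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()   # replaces nn.Sigmoid()
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        return out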
