Assume that I have 77 samples to train my CNN, and my batch size is 10. Then the last batch has a batch size of 7 instead of 10. Somehow when I pass it to the loss function such as nn.MSELoss(), it gives me the error:
RuntimeError: The size of tensor a (10) must match the size of tensor
b (7) at non-singleton dimension 1
So PyTorch doesn't support batches with different sizes?
The code in question:
import numpy as np
import torch
from torch import nn
import torchvision
import torch.nn.functional as F
import torch.optim as optim
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 6, (5, 4))
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(64, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1, x.shape[1] * x.shape[2] * x.shape[3])
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x
model = Net()
batch_size = 10
# Generating artificial data
x_train = torch.randn((77, 1, 20, 20))
y_train = torch.randint(0, 10, size=(77,), dtype=torch.float)
trainset = torch.utils.data.TensorDataset(x_train, y_train)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True, num_workers=0)
# testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size, shuffle=False, num_workers=0)
criterion = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
for epoch in range(20):  # loop over the dataset multiple times
    running_loss = 0.0
    for i, data in enumerate(trainloader, 0):
        # get the inputs
        inputs, labels = data
        # zero the parameter gradients
        optimizer.zero_grad()
        # forward + backward + optimize
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        # print statistics
        running_loss += loss.item()
        if i % 10 == 0:
            print('epoch{}, step{}, loss: {}'.format(epoch + 1, i + 1, running_loss))
            # print("frac post = {}".format(frac_post))
            running_loss = 0.0
The problem is not due to the batch size, but to a failure to broadcast properly between the 10 outputs of your CNN and the single label provided in each example.
If you look at the model output and label tensor shapes during the batch where the error is thrown,
print(outputs.shape, labels.shape)
#out: torch.Size([7, 10]) torch.Size([7])
you'll see that the labels are stored in a one-dimensional tensor. According to PyTorch broadcasting rules, two tensors are broadcastable only if they are compatible in all trailing dimensions. In this case, the trailing dimension of the model output (10) is incompatible with that of the labels (7).
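A quick standalone illustration of that rule (my own snippet, not from your code):

import torch

a = torch.randn(7, 10)
b = torch.randn(7)     # trailing dims 10 vs 7: not broadcastable
c = torch.randn(7, 1)  # trailing dim 1 broadcasts against 10

print((a - c).shape)   # torch.Size([7, 10])
# (a - b) raises: RuntimeError: The size of tensor a (10) must match
# the size of tensor b (7) at non-singleton dimension 1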
To fix, either add a dummy dimension to the label (assuming you actually want to broadcast the labels to match your ten network outputs), or define a network with scalar outputs. For example:
y_train = torch.randint(0,10,size=(77,1),dtype=torch.float)
results in
print(outputs.shape, labels.shape)
#out: torch.Size([7, 10]) torch.Size([7, 1])
# these are broadcastable
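And if you instead want one scalar prediction per sample (the usual setup for nn.MSELoss), here is a minimal sketch of the second option; it changes only the final layer size and the label shape relative to your code:

import torch
from torch import nn

fc3 = nn.Linear(84, 1)                # was nn.Linear(84, 10) in the question's Net
features = torch.randn(7, 84)         # stand-in for the activations entering fc3
outputs = fc3(features)               # (7, 1)
labels = torch.randint(0, 10, (7, 1), dtype=torch.float)
loss = nn.MSELoss()(outputs, labels)  # shapes match exactly; no broadcasting needed
print(outputs.shape, labels.shape)    # torch.Size([7, 1]) torch.Size([7, 1])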
Related
I'm trying to train an NN to develop a relationship between a certain range of inputs and some mathematical coefficients, but I'm struggling to figure out why the loss is not decreasing over the course of training. I've made a simple custom MLP and dataset to interface with the PyTorch module. The dataset takes in a 6 x m matrix for the inputs (m for however many different combinations I have) and an r x m matrix for the coefficients. However, when running the training script, the training loss doesn't decrease over the course of 100 epochs. All the shapes match when entering and exiting the NN, so I'm lost as to what might be going wrong. These are the shapes of the input and coefficients matrices for the training and test phase respectively:
inputs_train = (6, 581)
coefficients_train = (259, 581)
inputs_test = (6, 145)
coefficients_test = (259, 145)
All the relevant code is shown below.
import torch
import torch.nn as nn
from torch.utils.data import Dataset

class MLP(nn.Module):
    def __init__(self, r):
        super(MLP, self).__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(6, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, r),
            nn.ReLU()
        )

    def forward(self, x):
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits
class ExpansionCoefficientsDataset(Dataset):
    def __init__(self, inputs, alphas):
        self.geom_params = inputs
        self.expansion_coefficients = alphas

    def __len__(self):
        return self.expansion_coefficients.shape[1]

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        sample = {'inputs': self.geom_params[:, idx], 'alphas': self.expansion_coefficients[:, idx]}
        return sample
train_dataset = ExpansionCoefficientsDataset(inputs_train, coefficients_train)
test_dataset = ExpansionCoefficientsDataset(inputs_test, coefficients_test)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=16)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=16)
#%%
model = MLP(r_ideal).double()
criterion = torch.nn.L1Loss()
num_epochs = 100
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
batch_size = len(train_loader)
# Training loop starts
for epoch in range(num_epochs):
    print(f"-------------------------------\nEpoch {epoch+1}\n-------------------------------")
    size = len(train_loader.dataset)
    # Iterating over the training dataset
    for i_batch, sample_batched in enumerate(train_loader):
        pred = model(sample_batched['inputs'])
        # Calculating loss
        loss = criterion(pred, sample_batched['alphas'])
        # Updating weights according to the calculated loss
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if i_batch % 8 == 0:
            loss, current = loss.item(), i_batch * sample_batched['inputs'].size()[0]
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")
I'm trying to use PyTorch on the IMDB dataset to predict positive and negative reviews. When I get to the training stage, the criterion function raises the following error:
ValueError: Target size (torch.Size([64])) must be the same as input size (torch.Size([1136, 64, 1]))
After some research, I saw that the error arises because the model outputs a tensor of size [1136, 64, 1], while the criterion expects only one value per batch element.
However, I don't know how to solve this error.
My code:
import torch
import spacy
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import sys
import csv
import re
import random
import nltk
from nltk.corpus import stopwords
import pandas as pd
from torchtext import vocab
from torchtext.legacy import data
from torchtext.legacy import datasets
from torchtext.legacy.data import Field

SEED = 1234
torch.manual_seed(SEED)  # for reproducibility
torch.backends.cudnn.deterministic = True
class MLP(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.hidden_fc = nn.Linear(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        # text = [sent len, batch size]
        embedded = self.embedding(text)
        h_1 = F.relu(self.hidden_fc(embedded))
        # assert torch.equal(output[-1,:,:], h_1.squeeze(0))
        # [batch size, output dim]
        return self.fc(h_1.squeeze(0))
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """
    # round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))  # 0.75 --> 1, 0.4 --> 0
    correct = (rounded_preds == y).float()  # convert into float for division
    acc = correct.sum() / len(correct)
    return acc
def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0
    model.train()  # train mode is on
    for batch in iterator:
        optimizer.zero_grad()  # reset the gradients
        predictions = model(batch.text)  # forward propagation
        print(predictions.shape)
        loss = criterion(predictions, batch.label)
        acc = binary_accuracy(predictions, batch.label)
        loss.backward()  # backward propagation / calculate gradients
        optimizer.step()  # update parameters
        epoch_loss += loss.item()
        epoch_acc += acc.item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)
def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0
    model.eval()  # evaluation mode is on
    with torch.no_grad():
        for batch in iterator:
            predictions = model(batch.text).squeeze(1)
            loss = criterion(predictions, batch.label)
            acc = binary_accuracy(predictions, batch.label)
            epoch_loss += loss.item()
            epoch_acc += acc.item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)
TEXT = data.Field(tokenize='spacy',
                  tokenizer_language='en_core_web_sm',
                  lower=True)
LABEL = data.LabelField(dtype=torch.float)

train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)  # IMDB reviews dataset
train_data, valid_data = train_data.split(random_state=random.seed(SEED))

MAX_VOCAB_SIZE = 25_000
TEXT.build_vocab(train_data, max_size=MAX_VOCAB_SIZE)  # build the vocabulary from the 25k most frequent words
LABEL.build_vocab(train_data)

BATCH_SIZE = 64
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE)
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1
model = MLP(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)
optimizer = optim.SGD(model.parameters(), lr=1e-3)
criterion = nn.BCEWithLogitsLoss()

N_EPOCHS = 5
best_valid_loss = float('inf')
for epoch in range(N_EPOCHS):
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
To summarize your problem: you have reviews you want to classify as positive or negative. To do so, you train an embedding space to map each word to a vector, then output one logit per sentence, which is supervised with the corresponding label using a binary cross-entropy loss, nn.BCEWithLogitsLoss.
Your current model is comprised of:
- nn.Embedding: embeds each word in the sequence independently, converting the input tensor shape from (seq_len, batch_size) to (seq_len, batch_size, embedding_dim), where seq_len is the number of tokens in your input sequence.
- An nn.Linear layer reduces the dimensionality by projecting the features; the tensor shape is converted from (seq_len, batch_size, embedding_dim) to (seq_len, batch_size, hidden_dim).
- A non-linearity is applied to the sequence of word vectors. Note how the structure of the sentence is retained. Finally, a second linear layer maps from (seq_len, batch_size, hidden_dim) to (seq_len, batch_size, output_dim), still with the sentence structure (cf. dim=0 with seq_len).
This is the reason why you are getting (1136, 64, 1) as the predictions shape: 1136 must be your sequence length, 64 is BATCH_SIZE, while 1 is OUTPUT_DIM.
Yet you are trying to classify each sequence as a whole; what you need instead is a single value per sentence, i.e. a shape of (1, 64, 1). This means reducing the first dimension, the sequence dimension, to a single value.
A straightforward way to do this reduction, such that the whole sentence is represented by a single vector, is to apply an average pool over the sentence: the average of the word vectors in each sentence should carry the overall positive/negative sentiment of that sentence. You can apply this operator before the final projection so as to remain in a relatively high dimension, either with nn.AdaptiveAvgPool1d with an output size of 1, or simply with torch.Tensor.mean.
Here is a possible implementation with nn.AdaptiveAvgPool1d:
class MLP(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.hidden_fc = nn.Linear(embedding_dim, hidden_dim)
        self.avg = nn.AdaptiveAvgPool1d(1)  # reduces (N, C, L_in) to (N, C, 1)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        # (seq_len, batch_size) = (1136, 64)
        embedded = self.embedding(text)
        # (seq_len, batch_size, embedding_dim) = (1136, 64, 100)
        h_1 = F.relu(self.hidden_fc(embedded))
        # (seq_len, batch_size, hidden_dim) = (1136, 64, 256)
        avg = self.avg(h_1.permute(1, 2, 0))
        # (batch_size, hidden_dim, 1) = (64, 256, 1)
        out = self.fc(avg.squeeze(-1))
        # (batch_size, output_dim) = (64, 1)
        return out
Or with torch.Tensor.mean:
class MLP(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.hidden_fc = nn.Linear(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        # (seq_len, batch_size) = (1136, 64)
        embedded = self.embedding(text)
        # (seq_len, batch_size, embedding_dim) = (1136, 64, 100)
        h_1 = F.relu(self.hidden_fc(embedded))
        # (seq_len, batch_size, hidden_dim) = (1136, 64, 256)
        avg = h_1.mean(0)
        # (batch_size, hidden_dim) = (64, 256)
        out = self.fc(avg)
        # (batch_size, output_dim) = (64, 1)
        return out
Alternative methods involve using more sophisticated neural network layers, such as recurrent neural network blocks (nn.RNN, nn.LSTM, nn.GRU); a sketch follows.
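For illustration, here is a minimal sketch of the recurrent alternative (the class name and hyper-parameters are mine, not from your code): replace the average pool with an nn.LSTM and classify from its final hidden state.

import torch
import torch.nn as nn

class LSTMClassifier(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)  # expects (seq_len, batch, features) by default
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        # text: (seq_len, batch_size)
        embedded = self.embedding(text)    # (seq_len, batch_size, embedding_dim)
        _, (h_n, _) = self.lstm(embedded)  # h_n: (num_layers, batch_size, hidden_dim)
        return self.fc(h_n[-1])            # (batch_size, output_dim)

# quick shape check with dummy data (vocab size 25002 is assumed)
model = LSTMClassifier(input_dim=25002, embedding_dim=100, hidden_dim=256, output_dim=1)
dummy = torch.randint(0, 25002, (1136, 64))
print(model(dummy).shape)  # torch.Size([64, 1])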
I'm having trouble understanding why this is throwing an error. The code is pulled directly from the PyTorch documentation for an NN classifier on the Fashion-MNIST dataset. However, when I flip it over to the MNIST handwritten digits dataset, it produces the following error:
RuntimeError: The size of tensor a (10) must match the size of tensor b (64) at non-singleton dimension 1
The error occurs when the loss function is called inside the training loop. Can anyone help me understand why this is happening? Thanks!
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor, Lambda
import torchvision.models as models
import matplotlib.pyplot as plt
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#device = "cpu"
print(f"Using {device} device")
training_data = datasets.MNIST(
    root="data",
    train=True,
    download=True,
    transform=ToTensor()
)
test_data = datasets.MNIST(
    root="data",
    train=False,
    download=True,
    transform=ToTensor()
)
train_dataloader = DataLoader(training_data, batch_size=64)
test_dataloader = DataLoader(test_data, batch_size=64)
class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(28*28, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 10),
        )

    def forward(self, x):
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits
def train_loop(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    for batch, (X, y) in enumerate(dataloader):
        # Compute prediction and loss
        X, y = X.to(device), y.to(device)
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 100 == 0:
            loss, current = loss.item(), batch * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")
def test_loop(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
def save_checkpoint(state, filename="checkpoint.pth.tar"):
    print("=> Saving checkpoint")
    torch.save(state, filename)
model = NeuralNetwork().to(device)
learning_rate = 1e-3
batch_size = 64
epochs = 10
# Initialize the loss function
loss_fn = nn.MSELoss()
optimiser = torch.optim.Adam(model.parameters(), lr=learning_rate)
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train_loop(train_dataloader, model, loss_fn, optimiser)
    test_loop(test_dataloader, model, loss_fn)
print("Done!")
torch.nn.MSELoss is an implementation of mean squared error. You can't measure the difference between two tensors if they're different shapes (MSELoss does not allow for broadcasting). So if you're using MSELoss, the predictions and the targets must have the same shape. In your case, preds is a tensor of shape [64, 10], while y is a tensor of shape [64].
The reason y has shape [64] rather than [64, 10] is that most classification dataset implementations represent targets as integer labels rather than one-hot encoded vectors. In theory, you could convert these integer labels to one-hot targets.
But in reality, since this is a classification problem, you should probably be using something like nn.CrossEntropyLoss rather than nn.MSELoss. The former is the conventional classification loss, and it accepts integer labels rather than one-hot labels, so simply swapping MSELoss for CrossEntropyLoss should solve your problem. MSELoss is better suited to regression tasks.
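Both options, as a quick sketch (assuming 10 classes and a batch of 64 integer labels, as in your error message):

import torch
import torch.nn as nn
import torch.nn.functional as F

preds = torch.randn(64, 10)      # model logits, shape (batch, num_classes)
y = torch.randint(0, 10, (64,))  # integer class labels, shape (batch,)

# Option 1 (recommended): the conventional classification loss takes integer labels directly
loss_ce = nn.CrossEntropyLoss()(preds, y)

# Option 2 (not recommended): one-hot encode the targets so the MSELoss shapes match
y_onehot = F.one_hot(y, num_classes=10).float()  # (64, 10)
loss_mse = nn.MSELoss()(preds, y_onehot)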
The following code (an MNIST MLP in PyTorch) delivers approximately the same training loss regardless of whether the last line of the forward pass returns:
1. F.log_softmax(x)
2. x
Option 1 should be incorrect, because I already use criterion = nn.CrossEntropyLoss(), and yet the results are almost identical. Am I missing anything?
import torch
import numpy as np
import time
from torchvision import datasets
import torchvision.transforms as transforms
# number of subprocesses to use for data loading
num_workers = 0
# how many samples per batch to load
batch_size = 20
# convert data to torch.FloatTensor
transform = transforms.ToTensor()
# choose the training and test datasets
train_data = datasets.MNIST(root='data', train=True,
                            download=True, transform=transform)
test_data = datasets.MNIST(root='data', train=False,
                           download=True, transform=transform)
# prepare data loaders
train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size,
                                           num_workers=num_workers)
test_loader = torch.utils.data.DataLoader(test_data, batch_size=batch_size,
                                          num_workers=num_workers)
import torch.nn as nn
import torch.nn.functional as F
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        # linear layer (784 -> 512 hidden nodes)
        self.fc1 = nn.Linear(28 * 28, 512)
        self.dropout1 = nn.Dropout(p=0.2, inplace=False)
        self.fc2 = nn.Linear(512, 256)
        self.dropout2 = nn.Dropout(p=0.2, inplace=False)
        self.dropout = nn.Dropout(p=0.2, inplace=False)
        self.fc3 = nn.Linear(256, 10)

    def forward(self, x):
        # flatten image input
        x = x.view(-1, 28 * 28)
        # add hidden layers, with relu activation function
        x = F.relu(self.fc1(x))
        x = self.dropout1(x)
        x = F.relu(self.fc2(x))
        x = self.dropout2(x)
        x = self.fc3(x)
        # return F.log_softmax(x)
        return x
# initialize the NN
model = Net()
print(model)
model.to('cuda')
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
n_epochs = 10
model.train() # prep model for training
for epoch in range(n_epochs):
    # monitor training loss
    train_loss = 0.0
    start = time.time()
    for data, target in train_loader:
        data, target = data.to('cuda'), target.to('cuda')
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        train_loss += loss.item() * data.size(0)
    train_loss = train_loss / len(train_loader.dataset)
    print('Epoch: {} \tTraining Loss: {:.6f} \ttime: {:.6f}'.format(
        epoch + 1,
        train_loss,
        time.time() - start
    ))
For numerical stability, nn.CrossEntropyLoss() has the log-softmax computation built in, so you should NOT apply a (log-)softmax layer in your forward pass.
From the docs (https://pytorch.org/docs/stable/nn.html#crossentropyloss):
This criterion combines nn.LogSoftmax() and nn.NLLLoss() in one single class.
Using a softmax layer in the forward pass will lead to worse metrics, because the gradient magnitudes are lowered (and thus the weight updates are lowered too). I learned this the hard way!
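A quick standalone check of what the docs state (my own snippet, not from the question's code):

import torch
import torch.nn as nn
import torch.nn.functional as F

logits = torch.randn(4, 10)
targets = torch.randint(0, 10, (4,))

ce = nn.CrossEntropyLoss()(logits, targets)
nll = nn.NLLLoss()(F.log_softmax(logits, dim=1), targets)
print(torch.allclose(ce, nll))  # True: CrossEntropyLoss == LogSoftmax + NLLLoss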
I guess your point is that the losses are similar at the beginning of training; by the end of training, they should not be. It is usually a good sanity check to try to overfit your model on a single batch of data: the model should reach 100% accuracy if the batch is small enough. If the model takes too long to train, you probably have a bug somewhere.
Hope that helps =)
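A minimal sketch of that sanity check, reusing the names from your script (model, criterion, optimizer, train_loader):

# Grab one small batch and train on it repeatedly; the loss should approach 0.
data, target = next(iter(train_loader))
data, target = data.to('cuda'), target.to('cuda')
for step in range(500):
    optimizer.zero_grad()
    loss = criterion(model(data), target)
    loss.backward()
    optimizer.step()
    if step % 100 == 0:
        print(step, loss.item())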
I copied the CIFAR10 sample network from the PyTorch tutorial and added more layers, including BN. Even after 45 epochs, the network plateaus at 68% classification accuracy on the test set.
The network consists of:
- 2 convolutional layers with 3x3 kernels (input size reduces from 32px to 28px)
- one max-pooling layer (input size reduces from 28px to 14px)
- 3 convolutional layers with 3x3 kernels (input size reduces from 14px to 8px)
- a fully connected network with 3 layers of 256 -> 256 -> 10 neurons
- batch normalization applied to all layers, including the convolutional ones, except for the last FC layer
- ReLU applied to all the convolutional layers and all the hidden FC layers
Did I build/use anything improperly?
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1_1 = nn.Conv2d(3, 16, 3)   # 32 -> 30
        self.bn1_1 = nn.BatchNorm2d(16)
        self.conv1_2 = nn.Conv2d(16, 16, 3)  # 30 -> 28
        self.bn1_2 = nn.BatchNorm2d(16)
        self.pool = nn.MaxPool2d(2, 2)       # 28 -> 14
        self.conv2_1 = nn.Conv2d(16, 16, 3)  # 14 -> 12
        self.bn2_1 = nn.BatchNorm2d(16)
        self.conv2_2 = nn.Conv2d(16, 16, 3)  # 12 -> 10
        self.bn2_2 = nn.BatchNorm2d(16)
        self.conv2_3 = nn.Conv2d(16, 16, 3)  # 10 -> 8
        self.bn2_3 = nn.BatchNorm2d(16)
        self.fc1 = nn.Linear(16 * 8 * 8, 256)
        self.bn4 = nn.BatchNorm1d(256)
        self.fc2 = nn.Linear(256, 256)
        self.bn5 = nn.BatchNorm1d(256)
        self.fc3 = nn.Linear(256, 10)

    def forward(self, x):
        x = F.relu(self.bn1_1(self.conv1_1(x)))
        x = self.pool(F.relu(self.bn1_2(self.conv1_2(x))))
        x = F.relu(self.bn2_1(self.conv2_1(x)))
        x = F.relu(self.bn2_2(self.conv2_2(x)))
        x = F.relu(self.bn2_3(self.conv2_3(x)))
        x = x.view(-1, 16 * 8 * 8)
        x = F.relu(self.bn4(self.fc1(x)))
        x = F.relu(self.bn5(self.fc2(x)))
        x = self.fc3(x)
        return x
net = Net()
device = 'cuda:0'
net.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
transform = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                        download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=8,
                                          shuffle=True, num_workers=2)
for epoch in range(128):  # loop over the dataset multiple times
    for i, data in enumerate(trainloader, 0):
        # get the inputs
        inputs, labels = data
        inputs, labels = inputs.to(device), labels.to(device)
        # zero the parameter gradients
        optimizer.zero_grad()
        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
Use sigmoid activation for the last layer.