GRU loss decreases to about 0.9 but no further (PyTorch)

This is the code that I am using to experiment with a GRU.
import torch
import torch.nn as nn
import torch.nn.functional as F
from collections import *
class N(nn.Module):
    """Embedding -> 4-layer GRU -> three batch-normalised linear layers -> softmax.

    The input is a ``(batch, 4)`` tensor of token ids in ``[0, 5)``; the
    output is a ``(batch, 4, 5)`` tensor of per-position class probabilities.

    Notes:
        * ``BatchNorm1d(4)`` normalises over dimension 1 of its input, which
          here is the sequence axis, so the sequence length must be 4.
        * ``forward`` returns probabilities, not logits. Loss functions that
          apply log-softmax themselves (e.g. ``F.cross_entropy``) will
          therefore apply softmax twice when fed this output.
    """

    def __init__(self):
        # nn.Module has to be initialised before sub-modules are assigned;
        # without this call the first ``self.embed = ...`` raises.
        super().__init__()
        self.embed = nn.Embedding(5, 2)
        self.layers = 4
        self.gru = nn.GRU(2, 512, self.layers, batch_first=True)
        self.bat = nn.BatchNorm1d(4)
        self.bat1 = nn.BatchNorm1d(4)
        self.bat2 = nn.BatchNorm1d(4)
        self.fc = nn.Linear(512, 100)
        self.fc1 = nn.Linear(100, 100)
        self.fc2 = nn.Linear(100, 5)
        self.s = nn.Softmax(dim=-1)

    def forward(self, x):
        """Return class probabilities of shape ``(batch, seq, 5)`` for token ids ``x``."""
        # Fresh zero initial hidden state for every call, on the input's device.
        h0 = torch.zeros(
            self.layers, x.size(0), 512, device=x.device
        ).requires_grad_()
        x = self.embed(x)
        x, hn = self.gru(x, h0)
        x = self.bat(x)
        x = self.fc(x)
        x = nn.functional.relu(x)
        x = self.bat1(x)
        x = self.fc1(x)
        x = nn.functional.relu(x)
        x = self.bat2(x)
        x = self.fc2(x)
        softmaxed = self.s(x)
        return softmaxed
# Toy data: four sequences of length 4; labels in `out` are 1-based (1..4).
inp = torch.tensor([[4, 3, 2, 1], [2, 3, 4, 1], [4, 1, 2, 3], [1, 2, 3, 4]])
out = torch.tensor([[3, 2, 1, 4], [3, 2, 4, 1], [1, 2, 3, 4], [2, 3, 4, 1]])
# Flatten once, outside the loop: one label per (sample, position).
out = out.view(-1)

k = 0
n = N()
opt = torch.optim.Adam(n.parameters(), lr=0.0001)
while k < 10000:
    o = n(inp)
    # (batch, seq, classes) -> (batch * seq, classes)
    o = o.view(-1, o.size(-1))
    # cross_entropy expects 0-based class indices, hence the `- 1`.
    loss = nn.functional.cross_entropy(o, out - 1)
    acc = (torch.argmax(o, dim=1) == (out - 1)).sum().item() / out.size(0)
    # The update step and the counter increment were missing from the
    # listing; without them the loop never terminates and nothing is learned.
    opt.zero_grad()
    loss.backward()
    opt.step()
    k += 1

# The original `if k==10000` check sat inside `while k<10000` and could never
# fire, so the final report is printed after the loop instead.
print(loss, acc)
print(torch.argmax(o, dim=1))
Shortened output:
torch.Size([4, 4])
tensor(0.9593, grad_fn=<NllLossBackward>) 0.9375
torch.Size([4, 4])
tensor(0.9593, grad_fn=<NllLossBackward>) 0.9375
tensor([4.8500e-01, 9.7813e-06, 5.1498e-01, 6.2428e-06, 7.5929e-06],
The loss is 0.9593 and the accuracy reaches 0.9375. For such simple input data, this GRU loss is surprisingly large. What is the reason? Is there anything wrong in this code? I used cross_entropy as the loss function and Adam as the optimizer. The learning rate is 0.001. I tried multiple learning rates, but all gave the same final result. I added batch normalization; it sped up training, but the loss and accuracy stayed the same. Why does the loss not decrease to 0.2 or so?

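I think it's because you are using the cross-entropy loss function, which in PyTorch combines log-softmax and negative log-likelihood. Since your model already applies softmax before returning its output, you end up computing the negative log-likelihood of a softmax of a softmax. Because the first softmax confines the values passed to the loss to the range [0, 1], the loss can never fall below log(1 + 4/e) ≈ 0.905 for 5 classes, which matches the plateau you observe. Try removing the final softmax from your model.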
PyTorch documentation for cross entropy loss:


Linear Regression in PyTorch

It's a simple regression problem, but no matter how much I try, I can't get the answer I want. I expect the prediction to be 32 (4 * 8), but the code returns about 25. Why is that?
This is my full source code:
import torch
import torch.nn as nn
import torch.optim as op
# Training inputs: three samples with two features each (float32).
X = torch.tensor([[1.0, 2.0], [2.0, 4.0], [3.0, 6.0]], dtype=torch.float32)
# Regression targets, one per row of X; each equals the product of that row's
# two features.
Y = torch.tensor([[2.0], [8.0], [18.0]], dtype=torch.float32)
class TEST(nn.Module):
    """Single affine layer mapping 2 input features to 1 output."""

    def __init__(self):
        # Required before registering sub-modules; without it assigning
        # ``self.l1`` raises.
        super().__init__()
        self.l1 = nn.Linear(2, 1)

    def forward(self, input):
        """Return ``l1(input)``, i.e. ``input @ W.T + b`` with shape ``(..., 1)``."""
        return self.l1(input)
epochs = 2000
lr = 0.001
model = TEST()
loss_func = nn.MSELoss()
optimizer = op.SGD(model.parameters(), lr=lr)
for epoch in range(epochs):
    output = model(X)
    loss = loss_func(output, Y)
    # The gradient step was missing from the listing; without it the weights
    # never change and the loss cannot decrease.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    if epoch % 10 == 0:
        print('loss[{}] : {}'.format(epoch, loss))

# Query the trained model on an unseen point; the posted output ends with the
# printed prediction tensor.
XX = torch.FloatTensor([[4., 8.]])
print(model(XX))
This is the output of the code:
loss[1920] : 0.8891088366508484
loss[1930] : 0.8890921473503113
loss[1940] : 0.8890781402587891
loss[1950] : 0.8890655636787415
loss[1960] : 0.8890505433082581
loss[1970] : 0.8890388011932373
loss[1980] : 0.889029324054718
loss[1990] : 0.8890181183815002
tensor([[25.3124]], grad_fn=<AddmmBackward>)
You are trying to approximate y = x1*x2 but are using a single linear layer, i.e. a purely linear model. What you end up learning are weights a and b (plus a bias c) such that y = a*x1 + b*x2 + c. Such a model cannot represent the mapping (x1, x2) -> x1*x2.

tensorflow 2 : loss using hidden layers output

I am trying to implement the OSME MAMC model described in the article.
I'm stuck at the point where I have to add a cost that depends not on y_true and y_pred, but on hidden-layer outputs and y_true.
It cannot be written as a regular TensorFlow custom loss, because those take only y_true and y_pred.
I wrote the model as a class, then tried to use a gradient tape to add the NPairLoss to the softmax output loss, but the gradient is NaN during training.
I think my approach isn't good, but I have no idea how to design or write it.
Here my model :
# Keras model wrapping a ResNet50 backbone; `call` returns three values.
# NOTE(review): this listing looks truncated -- several names used below
# (self.input_tensor, self.s_1, self.s_2, self.d2, self.fl1, self.fl2, self.fc,
# x_1, x_2, ret) are never assigned in the lines shown.
class OSME_network(tf.keras.Model):
def __init__(self, nbrclass=10, weight="imagenet",input_tensor=(32,32,3)):
super(OSME_network, self).__init__()
self.nbrclass = nbrclass
self.weight = weight
# NOTE(review): `self.input_tensor` is read here but the `input_tensor`
# argument is not stored on self in the lines shown -- confirm.
self.Resnet_50=ResNet50(include_top=False, weights=self.weight, input_shape=self.input_tensor)
# Splits its input into two equal parts along the last axis.
self.split=Lambda(lambda x: tf.split(x,num_or_size_splits=2,axis=-1))
self.d1=tf.keras.layers.Dense(1024, name='fc1')
def call(self,x): # used to build the model sequentially
xx_1 = self.s_1(x_1)
xx_2 = self.s_2(x_2)
xxx_1 = self.d1(xx_1)
xxx_2 = self.d2(xx_2)
xxxx_1 = self.fl1(xxx_1)
xxxx_2 = self.fl2(xxx_2)
fc = self.fc([xxxx_1,xxxx_2]) #fc1 + fc2
# NOTE(review): `fc` is computed above but `ret` is what gets returned;
# `ret` is not defined in the lines shown.
return xxxx_1,xxxx_2,ret
# Custom Keras layer taking `ch` and `ratio` constructor arguments.
# NOTE(review): heavily truncated listing -- `__init__` has no body, and
# `squeeze` and `scale` are not defined in the lines shown.
class OSME_Layer(tf.keras.layers.Layer):
def __init__(self,ch,ratio):
def call(self,inputs):
# NOTE(review): the tuple literal below is unterminated in this listing.
se_shape = (1, 1,
se = Reshape(se_shape)(squeeze)
return scale
class NPairLoss():
    """N-pair style loss over per-sample attention-part features.

    Every (sample, part) feature vector is compared with every other one via
    an inner product. Pairs are grouped by whether they share the class and/or
    the part index, and three terms are accumulated:

    * same part, same class      vs. all remaining pairs,
    * same part, different class vs. different part, different class,
    * different part, same class vs. different part, different class.
    """

    # ln(9e6). Bounding the exponent caps exp(neg - pos) at 9e6. Clipping the
    # *result* of exp instead yields NaN gradients once exp overflows to inf
    # (zero gradient from the clip multiplied by an infinite exp derivative).
    _MAX_EXPONENT = 16.0127352

    def __init__(self):
        self._inputs = None

    @staticmethod
    def _group_loss(pos, neg):
        """Return ``sum_p log(1 + sum_n exp(neg_n - pos_p))`` for 1-D ``pos`` and ``neg``."""
        n_pos = tf.shape(pos)[0]
        n_neg = tf.shape(neg)[0]
        # Build (n_pos, n_neg) grids so every positive meets every negative.
        pos = tf.transpose(tf.broadcast_to(pos, [n_neg, n_pos]))
        neg = tf.broadcast_to(neg, [n_pos, n_neg])
        exp = tf.math.exp(tf.minimum(neg - pos, NPairLoss._MAX_EXPONENT))
        return tf.reduce_sum(tf.math.log(1 + tf.reduce_sum(exp, axis=1)))

    def __call__(self, inputs, y):
        """Compute the loss.

        Args:
            inputs: tensor unpacked as ``(b, p, _)`` -- ``b`` samples with
                ``p`` part features each.
            y: per-sample class scores / one-hot labels; the class id is taken
                as ``argmax`` over axis 1.

        Returns:
            Scalar tensor: the sum of the three group losses divided by ``b * p``.
        """
        targets = tf.argmax(y, axis=1)
        b, p, _ = inputs.shape
        n = b * p
        inputs = tf.reshape(inputs, [n, -1])
        # One class id and one part id per flattened feature row.
        targets = tf.repeat(targets, repeats=p)
        parts = tf.tile(tf.range(p), [b])
        # NOTE(review): `prod` was used but never defined in the original
        # listing; it is presumed to be the matrix of pairwise inner products
        # between feature rows -- confirm against the reference implementation.
        prod = tf.matmul(inputs, inputs, transpose_b=True)

        same_class_mask = tf.math.equal(
            tf.broadcast_to(targets, [n, n]),
            tf.transpose(tf.broadcast_to(targets, (n, n))),
        )
        same_atten_mask = tf.math.equal(
            tf.broadcast_to(parts, [n, n]),
            tf.transpose(tf.broadcast_to(parts, (n, n))),
        )
        s_sasc = same_class_mask & same_atten_mask
        s_sadc = (~same_class_mask) & same_atten_mask
        s_dasc = same_class_mask & (~same_atten_mask)
        s_dadc = (~same_class_mask) & (~same_atten_mask)

        loss_sasc = 0
        loss_sadc = 0
        loss_dasc = 0
        for i in range(n):
            row = prod[i]
            # same attention, same class vs. everything else
            loss_sasc += self._group_loss(
                row[s_sasc[i]], row[s_sadc[i] | s_dasc[i] | s_dadc[i]]
            )
            # same attention, different class vs. different attention, different class
            loss_sadc += self._group_loss(row[s_sadc[i]], row[s_dadc[i]])
            # different attention, same class vs. different attention, different class
            loss_dasc += self._group_loss(row[s_dasc[i]], row[s_dadc[i]])
        return (loss_sasc + loss_sadc + loss_dasc) / n
then, for training :
# Computes the combined loss for one batch and returns it.
# NOTE(review): the listing appears truncated -- `y_pred` and `layerLoss` are
# not computed in the lines shown, and no gradient is taken from `tape` or
# applied to the model.
def train_step(x,y):
with tf.GradientTape() as tape:
# Categorical cross-entropy plus the auxiliary layer loss weighted by 0.001.
loss=cce(y, y_pred) +0.001*layerLoss
return loss
# Build the network for 10 classes with ImageNet weights and 32x32x3 inputs.
model=OSME_network(weight="imagenet",nbrclass=10,input_tensor=(32, 32, 3))
# NOTE(review): the trailing `,32,32,3))` looks like the remains of a second
# statement lost in extraction; `opt` and `categorical_crossentropy` are not
# defined in the lines shown.
model.compile(optimizer=opt, loss=categorical_crossentropy, metrics=["acc"]),32,32,3))
# Loss object used by train_step; from_logits=True means predictions are
# treated as raw logits rather than probabilities.
cce = tf.keras.losses.CategoricalCrossentropy(from_logits=True,name='categorical_crossentropy')
# Pseudo-code placeholder for the batch loop (not valid Python).
for each batch :
Thanks for any help :)
You can use tensorflow's add_loss.
model.compile() loss functions in Tensorflow always take two parameters y_true and y_pred. Using model.add_loss() has no such restriction and allows you to write much more complex losses that depend on many other tensors, but it has the inconvenience of being more dependent on the model, whereas the standard loss functions work with just any model.
You can find the official documentation of add_loss here. It adds loss tensor(s), potentially dependent on layer inputs. This method can be used inside a subclassed layer's or model's call function, in which case losses should be a Tensor or a list of Tensors. There are a few examples in the documentation that explain add_loss.
This method can also be called directly on a Functional Model during construction. In this case, any loss Tensors passed to this Model must be symbolic and be able to be traced back to the model's Inputs. These losses become part of the model's topology and are tracked in get_config.
Example :
# Functional model: 10 input features -> Dense(10) -> Dense(1).
inputs = tf.keras.Input(shape=(10,))
x = tf.keras.layers.Dense(10)(inputs)
outputs = tf.keras.layers.Dense(1)(x)
model = tf.keras.Model(inputs, outputs)
# Activity regularization.
# NOTE(review): the statement this comment introduces (presumably a
# model.add_loss(...) call) is missing from the listing.
You can call self.add_loss(loss_value) from inside the call method of a custom layer. Here's a simple example that adds activity regularization.
class ActivityRegularizationLayer(layers.Layer):
    """Identity layer that registers an activity-regularisation loss.

    On every call it adds ``0.1 * sum(inputs)`` to the layer's losses and
    returns the inputs unchanged.
    """

    def call(self, inputs):
        penalty = tf.reduce_sum(inputs) * 0.1
        self.add_loss(penalty)
        # Pass-through: the activations themselves are not modified.
        return inputs
# Functional model: 784 inputs -> Dense(64, relu) -> activity regularisation
# -> Dense(64, relu) -> Dense(10) predictions.
inputs = keras.Input(shape=(784,), name='digits')
x = layers.Dense(64, activation='relu', name='dense_1')(inputs)
# Insert activity regularization as a layer
x = ActivityRegularizationLayer()(x)
x = layers.Dense(64, activation='relu', name='dense_2')(x)
outputs = layers.Dense(10, name='predictions')(x)
model = keras.Model(inputs=inputs, outputs=outputs)
# The displayed loss will be much higher than before
# due to the regularization component.
# NOTE(review): the compile/fit calls these two comments refer to appear to
# have been lost from the listing.
You can find good examples of using add_loss, with explanations, here and here.
Hope this answers your question. Happy Learning.

Why can't I learn XOR function with this network and constraints?

Let's say I have the following constraints and the network:
The architecture is fixed (see this image) (note that there are no biases)
Activation function for the hidden layer is ReLU
There's no activation function for the output layer (should just return the sum of the inputs it receive).
I tried to implement this in pytorch with various initialization schemes and different data sets but I failed (the code is at the bottom).
My questions are:
Is there anything wrong with my NN training process?
Is this a feasible problem? If yes, how?
If this is doable, can we still achieve that by constraining the weights to be in the set {-1, 0, 1}
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data_utils
class Network(nn.Module):
    """Bias-free 2-2-1 network: linear -> ReLU -> linear.

    The output layer has no activation, so the result is the weighted sum of
    the two rectified hidden units.
    """

    def __init__(self):
        super().__init__()
        # Neither layer has a bias term.
        self.fc1 = nn.Linear(2, 2, bias=False)
        self.fc2 = nn.Linear(2, 1, bias=False)
        self.rl = nn.ReLU()

    def forward(self, x):
        """Map a ``(..., 2)`` input to a ``(..., 1)`` output."""
        hidden = self.rl(self.fc1(x))
        return self.fc2(hidden)
# Create an XOR data set to train on: the label is 1 when exactly one of the
# two coordinates is positive.
rng = np.random.RandomState(0)
X = rng.randn(200, 2)
y = np.logical_xor(X[:, 0] > 0, X[:, 1] > 0).astype('int32')

# test data set
X_test = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])

# The second argument was cut off in the original listing. Targets get an
# explicit trailing dimension so they match the network's (batch, 1) output;
# otherwise MSELoss would broadcast (batch, 1) against (batch,).
train = data_utils.TensorDataset(
    torch.from_numpy(X).float(),
    torch.from_numpy(y).float().unsqueeze(1),
)
train_loader = data_utils.DataLoader(train, batch_size=50, shuffle=True)
test = torch.from_numpy(X_test).float()

# training the network
num_epoch = 10000
net = Network()
# NOTE(review): the original line carried the residue of two further calls
# ending in `max=1)` (possibly weight clamping); they could not be recovered
# and are omitted here -- confirm against the original post.

# define loss and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(net.parameters())

for epoch in range(num_epoch):
    running_loss = 0  # loss per epoch
    for batch_x, batch_y in train_loader:
        # make the grads zero
        optimizer.zero_grad()
        # forward propagate
        out = net(batch_x)
        # calculate loss and update
        loss = criterion(out, batch_y)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    if epoch % 500 == 0:
        print("Epoch: {0} Loss: {1}".format(epoch, running_loss))
The loss doesn't improve. It gets stuck at some value after a few epochs (I'm not sure how to make this reproducible, as I get different values every time).
net(test) returns a set of predictions that are nowhere close to the XOR output.
You need to use a nonlinear activation function such as sigmoid in your hidden and output layers, because XOR is not linearly separable. Also, biases are required.

PyTorch - How to set Activation Rules of neurons to increase efficiency of Neural Network?

I'm trying to build a back-propagation neural network with PyTorch. I can successfully run it and test its accuracy, but it doesn't work very efficiently. Now I'm supposed to increase its efficiency by setting different activation rules for neurons, so that neurons that don't contribute to the final output get excluded (pruned) from the computations, thereby improving the running time and accuracy.
My code looks like this (extracted snippets) -
# Hyper Parameters
input_size = 20
hidden_size = 50
num_classes = 130
num_epochs = 500
batch_size = 5
learning_rate = 0.1

# Normalise input data: every column except the last (the target) is
# standardised to zero mean and unit standard deviation.
# NOTE(review): `data` is not defined in this excerpt; presumably a pandas
# DataFrame with integer column labels -- confirm.
for column in data:
    # the last column is target
    if column != data.shape[1] - 1:
        data[column] = data.loc[:, [column]].apply(lambda x: (x - x.mean()) / x.std())

# randomly split data into training set (80%) and testing set (20%)
msk = np.random.rand(len(data)) < 0.8
train_data = data[msk]
test_data = data[~msk]

# define train dataset and a data loader
train_dataset = DataFrameDataset(df=train_data)
# The callee and first argument were lost from the original line
# (`train_loader =, batch_size=...`); restored as a standard DataLoader over
# the dataset built just above.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# Neural Network
class Net(nn.Module):
    """Two-layer classifier: linear -> sigmoid -> linear.

    Args:
        input_size: number of input features.
        hidden_size: width of the hidden layer.
        num_classes: number of output scores per sample.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.sigmoid = nn.Sigmoid()
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return the raw ``fc2`` outputs (no final activation)."""
        hidden = self.sigmoid(self.fc1(x))
        return self.fc2(hidden)
net = Net(input_size, hidden_size, num_classes)

# Train the model by batch.
# NOTE(review): `criterion` and `optimizer` are not defined in this excerpt;
# they are assumed to be created in code that was not posted.
for epoch in range(num_epochs):
    for step, (batch_x, batch_y) in enumerate(train_loader):
        # Plain tensors are used directly; the legacy Variable wrapper is
        # unnecessary on PyTorch >= 0.4.
        X = batch_x
        Y = batch_y.long()

        # Forward + Backward + Optimize
        optimizer.zero_grad()  # zero the gradient buffer
        outputs = net(X)
        loss = criterion(outputs, Y)
        # The backward pass and parameter update were missing from the listing.
        loss.backward()
        optimizer.step()

        if epoch % 50 == 0:
            # Predicted class = index of the largest score in each row.
            _, predicted = torch.max(outputs, 1)
            # calculate and print accuracy
            total = predicted.size(0)
            correct = (predicted == Y).sum().item()
            print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Accuracy: %.2f %%' % (epoch + 1, num_epochs, step + 1, len(train_data) // batch_size + 1, loss.item(), 100 * correct / total))
Can someone tell me how to do that in PyTorch as I'm very new to PyTorch.
I'm not sure whether this question belongs on Stack Overflow, but I will give you a hint anyway. You are currently using a sigmoid activation function, whose gradient vanishes if the input value is too large or too small. A commonly used alternative is the ReLU activation function (ReLU stands for rectified linear unit).
ReLU(x) is the identity for the positive domain and 0 for the negative domain, in Python that would be written as follows:
def ReLU(x):
    """Return ``x`` when it is strictly positive, otherwise ``0``."""
    return x if x > 0 else 0
It should be readily available in PyTorch

PyTorch: Extract learned weights correctly

I am trying to extract the weights from a linear layer, but they do not appear to change, although the error is dropping monotonically (i.e. training is happening). When I print the sum of the weights, it stays constant:
Here are the code snippets:
def train(epochs):
    """Run ``epochs`` passes over ``train_loader``, computing the loss per batch.

    Only the forward pass and the loss computation appear in this excerpt; no
    backward pass or optimizer step is shown.
    """
    for _epoch in range(epochs):
        # Train on train set
        for data, target in train_loader:
            inputs = Variable(data)
            # The target yielded by the loader is replaced by a wrapped copy
            # of the input. NOTE(review): presumably intentional (the model
            # reconstructs its input) -- confirm.
            target = Variable(data)
            output = model(inputs)
            loss = criterion(output, target)
# Define model
class Net(nn.Module):
    """Four stacked linear layers (100 -> 80 -> 87 -> 94 -> 100).

    The first layer has no bias and no activation; the remaining three are
    each followed by ReLU, so the output is non-negative.
    """

    def __init__(self):
        super().__init__()
        # an affine operation: y = Wx + b (fc1 itself has no bias)
        self.fc1 = nn.Linear(100, 80, bias=False)
        # In-place initialiser; the un-suffixed `init.normal` is deprecated.
        nn.init.normal_(self.fc1.weight, mean=0, std=1)
        self.fc2 = nn.Linear(80, 87)
        self.fc3 = nn.Linear(87, 94)
        self.fc4 = nn.Linear(94, 100)

    def forward(self, x):
        """Map a ``(..., 100)`` input to a ``(..., 100)`` non-negative output."""
        x = self.fc1(x)
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = F.relu(self.fc4(x))
        return x
Maybe I am looking at the wrong parameters, although I checked the docs. Thanks for your help!
Use model.parameters() to get the trainable weights of any model or layer. Remember to wrap it in list(), or you cannot print it out.
The following code snippet worked:
>>> import torch
>>> import torch.nn as nn
>>> l = nn.Linear(3,5)
>>> w = list(l.parameters())
>>> w
