Training Graph Neural Network (GNN) to create Embeddings using spektral - python

I am working to create a Graph Neural Network (GNN) which can create embeddings of the input graph for its usage in other applications like Reinforcement Learning.
I have started with example from the spektral library TUDataset classification with GIN and modified it to divide the network into two parts. The first part to produce embeddings and second part to produce classification. My goal is to train this network using supervised learning on dataset with graph labels e.g. TUDataset and use the first part (embedding generation) once trained in other applications.
I am getting different results from my approach on two different datasets. The TUDataset shows improved loss and accuracy with this new approach, whereas the other (local) dataset shows a significant increase in the loss.
Can I get any feedback if my approach to create embedding is appropriate or any suggestions for further improvement?
here is my code used to generate graph embeddings:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import categorical_accuracy
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.optimizers import Adam
from import DisjointLoader
from spektral.datasets import TUDataset
from spektral.layers import GINConv, GlobalAvgPool
learning_rate = 1e-3 # Learning rate
channels = 128 # Hidden units
layers = 3 # GIN layers
epochs = 300 # Number of training epochs
batch_size = 32 # Batch size
dataset = TUDataset("PROTEINS", clean=True)
# Parameters
F = dataset.n_node_features # Dimension of node features
n_out = dataset.n_labels # Dimension of the target
# Train/test split
idxs = np.random.permutation(len(dataset))
split = int(0.9 * len(dataset))
idx_tr, idx_te = np.split(idxs, [split])
dataset_tr, dataset_te = dataset[idx_tr], dataset[idx_te]
loader_tr = DisjointLoader(dataset_tr, batch_size=batch_size, epochs=epochs)
loader_te = DisjointLoader(dataset_te, batch_size=batch_size, epochs=1)
class GIN0(Model):
    """Graph encoder: stacked GIN convolutions, global average pooling, dense layer.

    Calling the model returns one vector of size ``channels`` per graph in the
    batch, which is what the rest of the script uses as the graph embedding.

    Args:
        channels: Number of hidden units used by every GINConv layer, by the
            MLP inside each GINConv, and by the final dense layer.
        n_layers: Total number of GINConv layers (the first one plus
            ``n_layers - 1`` additional ones).
    """

    def __init__(self, channels, n_layers):
        # The Keras base class must be initialised before layers are assigned
        # as attributes, otherwise they cannot be registered on the model.
        super().__init__()
        self.conv1 = GINConv(channels, epsilon=0, mlp_hidden=[channels, channels])
        # Store the remaining convolutions; building them without keeping a
        # reference would leave the model with a single GIN layer.
        self.convs = [
            GINConv(channels, epsilon=0, mlp_hidden=[channels, channels])
            for _ in range(1, n_layers)
        ]
        self.pool = GlobalAvgPool()
        self.dense1 = Dense(channels, activation="relu")

    def call(self, inputs):
        """Return the embedding produced for ``inputs``.

        ``inputs`` is unpacked into three items ``(x, a, i)``; presumably node
        features, adjacency matrix and graph-membership index as produced by
        the disjoint loader -- TODO confirm against the loader in use.
        """
        x, a, i = inputs
        x = self.conv1([x, a])
        for conv in self.convs:
            x = conv([x, a])
        # Pool node representations into one vector per graph using index i.
        x = self.pool([x, i])
        return self.dense1(x)
# Build model
model = GIN0(channels, layers)
model_op = Sequential()
model_op.add(Dropout(0.5, input_shape=(channels,)))
model_op.add(Dense(n_out, activation="softmax"))
opt = Adam(lr=learning_rate)
loss_fn = CategoricalCrossentropy()
#tf.function(input_signature=loader_tr.tf_signature(), experimental_relax_shapes=True)
def train_step(inputs, target):
    """Run one optimisation step over the encoder and the classification head.

    The embedding model (``model``) and the classifier (``model_op``) are
    chained inside a single gradient tape, so one backward pass yields the
    gradients for both and the optimizer is applied exactly once per step.

    Args:
        inputs: Batch passed unchanged to ``model``.
        target: Labels compared against the classifier output by ``loss_fn``.

    Returns:
        Tuple ``(loss, acc)`` with the batch loss (including ``model.losses``)
        and the mean categorical accuracy of the batch.
    """
    with tf.GradientTape() as tape:
        node2vec = model(inputs, training=True)
        predictions = model_op(node2vec, training=True)
        loss = loss_fn(target, predictions)
        loss += sum(model.losses)
    # Collect the variables after the forward pass so that variables created
    # lazily on the first call are included.
    variables = model.trainable_variables + model_op.trainable_variables
    gradients = tape.gradient(loss, variables)
    # A single apply_gradients call keeps the optimizer's internal step
    # counter in sync with the number of training steps.
    opt.apply_gradients(zip(gradients, variables))
    acc = tf.reduce_mean(categorical_accuracy(target, predictions))
    return loss, acc
print("Fitting model")
current_batch = 0
model_lss = model_acc = 0
for batch in loader_tr:
lss, acc = train_step(*batch)
model_lss += lss.numpy()
model_acc += acc.numpy()
current_batch += 1
if current_batch == loader_tr.steps_per_epoch:
model_lss /= loader_tr.steps_per_epoch
model_acc /= loader_tr.steps_per_epoch
print("Loss: {}. Acc: {}".format(model_lss, model_acc))
model_lss = model_acc = 0
current_batch = 0
def tolist(predictions):
    """Convert a batch of prediction rows into a list of tuples of floats.

    Every value of every row is kept, so the result has one entry per class
    whatever the number of classes is (two-column input gives 2-tuples, as
    before).

    Args:
        predictions: Iterable of rows, each row an iterable of numeric values.

    Returns:
        A list with one tuple of Python floats per row.
    """
    return [tuple(float(score) for score in row) for row in predictions]
loss_data = []
print("Testing model")
model_lss = model_acc = 0
for batch in loader_te:
inputs, target = batch
node2vec = model(inputs, training=False)
predictions = model_op(node2vec, training=False)
predictions_list = tolist(predictions)
model_lss += loss_fn(target, predictions)
model_acc += tf.reduce_mean(categorical_accuracy(target, predictions))
model_lss /= loader_te.steps_per_epoch
model_acc /= loader_te.steps_per_epoch
print("Done. Test loss: {}. Test acc: {}".format(model_lss, model_acc))
for batchi in loss_data:
for batchi in loss_data:

Your approach to generate graph embeddings is correct, the GIN0 model will return a vector given a graph.
This code here, however, seems weird:
gradients = tape.gradient(loss, model.trainable_variables)
opt.apply_gradients(zip(gradients, model.trainable_variables))
gradients2 = tape.gradient(loss, model_op.trainable_variables)
opt.apply_gradients(zip(gradients2, model_op.trainable_variables))
What you're doing here is that you're updating the weights of model twice, and the weights of model_op once.
When you compute the loss in the context of a tf.GradientTape, all computations that went into computing the final value are tracked. This means that if you call loss = foo(bar(x)) and then compute the training step using that loss, the weights of both foo and bar will be updated.
Besides this, I don't see issues with the code so it will mostly depend on the local dataset that you are using.


Linear regression using Pytorch

I have a classification problem. I am using PyTorch; my input is a sequence of length 341 and the output is one of three classes {0,1,2}. I want to train a linear regression model using PyTorch. I created the following class, but during training the loss values start out as finite numbers, then become inf, and then NaN. I do not know how to fix that. I also tried initializing the weights of the linear model, but the result is the same. Any suggestions?
class regression(nn.Module):
    """Single linear layer mapping ``input_dim`` features to one output value.

    Args:
        input_dim: Number of input features per sample.
    """

    def __init__(self, input_dim):
        # nn.Module must be initialised before sub-modules are assigned;
        # otherwise registering self.linear raises an AttributeError.
        super().__init__()
        self.input_dim = input_dim
        # One layer
        self.linear = nn.Linear(input_dim, 1)

    def forward(self, x):
        """Return the linear layer's output for ``x``."""
        y_pred = self.linear(x)
        return y_pred
criterion = torch.nn.MSELoss()
def fit(model, data_loader, optim, epochs):
    """Train ``model`` on ``data_loader`` for ``epochs`` passes using ``optim``.

    The loss is computed with the module-level ``criterion``. After every
    fifth epoch (starting with epoch 0) the most recent batch loss is printed.

    Args:
        model: Callable module producing predictions for a batch ``X``.
        data_loader: Iterable yielding ``(X, y)`` batches.
        optim: Optimizer holding the parameters of ``model``.
        epochs: Number of passes over ``data_loader``.

    Returns:
        None.
    """
    loss = None  # stays None only if data_loader yields no batches
    for epoch in range(epochs):
        for X, y in data_loader:
            X = X.float()
            # Add a trailing dimension to the targets; assumes the model emits
            # one value per sample, i.e. predictions of shape (batch, 1).
            y = y.unsqueeze(1).float()
            # Make a prediction for the input X
            pred = model(X)
            loss = criterion(pred, y)
            # Without these three calls the parameters are never updated.
            optim.zero_grad()
            loss.backward()
            optim.step()
        # Give some feedback after each 5th pass through the data
        if epoch % 5 == 0:
            print("Epoch", epoch, f"loss: {loss}")
    return None
regnet = regression(input_dim=341)
optim = SGD(regnet.parameters(), lr=0.01)
fit(regnet, data_loader, optim=optim, epochs=5)
pred = regnet(torch.Tensor(test_set.data_info).float())
pred = pred.detach().numpy()
I would additionally suggest replacing MSE with cross-entropy loss, as it is better suited for multi-class classification problems.
import random
import torch
from torch import nn, optim
from matplotlib import pyplot as plt
# Generate random dataset with your shape to test
# Replace this with your own dataset
data = []
for label in [0, 1, 2]:
for i in range(1000):
data.append((torch.rand(341), label))
# train test split
train, val = data[:1500], data[1500:]
def run_gradient_descent(model, data_train, data_val, batch_size=64, learning_rate=0.01, weight_decay=0, num_epochs=10):
    """Train ``model`` with SGD and cross-entropy loss, then plot the training curves.

    Args:
        model: Module mapping a batch of inputs to per-class logits.
        data_train: Dataset of ``(input, label)`` pairs used for training.
        data_val: Dataset of ``(input, label)`` pairs used for validation accuracy.
        batch_size: Mini-batch size; trailing batches of a different size are skipped.
        learning_rate: SGD learning rate.
        weight_decay: SGD weight decay.
        num_epochs: Number of passes over ``data_train``.

    Returns:
        The trained ``model`` (the same object that was passed in).
    """
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    iters, losses = [], []
    iters_sub, train_acc, val_acc = [], [], []
    train_loader = torch.utils.data.DataLoader(data_train, batch_size=batch_size, shuffle=True)

    # training
    n = 0  # the number of iterations
    for epoch in range(num_epochs):
        for xs, ts in train_loader:
            if len(ts) != batch_size:
                # Skip the trailing partial batch so every recorded loss is
                # scaled by the same batch size.
                continue
            zs = model(xs)
            loss = criterion(zs, ts)  # compute the total loss
            loss.backward()  # compute updates for each parameter
            optimizer.step()  # make the updates for each parameter
            optimizer.zero_grad()  # a clean up step for PyTorch
            # save the current training information
            iters.append(n)
            # NOTE(review): the criterion's default reduction already averages
            # over the batch, so this value is scaled down once more.
            losses.append(float(loss) / batch_size)
            if n % 10 == 0:
                # Record the iteration so the accuracy curves have an x axis
                # of matching length.
                iters_sub.append(n)
                train_acc.append(get_accuracy(model, data_train))
                val_acc.append(get_accuracy(model, data_val))
            # increment the iteration number
            n += 1

    # plotting
    plt.title("Training Curve (batch_size={}, lr={})".format(batch_size, learning_rate))
    plt.plot(iters, losses, label="Train")
    plt.show()  # finish the loss figure before drawing the accuracy figure
    plt.title("Training Curve (batch_size={}, lr={})".format(batch_size, learning_rate))
    plt.plot(iters_sub, train_acc, label="Train")
    plt.plot(iters_sub, val_acc, label="Validation")
    plt.legend(loc="best")
    plt.show()
    return model
def get_accuracy(model, data):
    """Return the fraction of samples in ``data`` that ``model`` classifies correctly.

    Args:
        model: Callable mapping a batch of inputs to per-class scores with the
            class dimension at index 1.
        data: Dataset of ``(input, label)`` pairs.

    Returns:
        Accuracy as a float in ``[0, 1]``; ``0.0`` when ``data`` is empty.
    """
    loader = torch.utils.data.DataLoader(data, batch_size=500)
    correct, total = 0, 0
    # Evaluation only: gradients are not needed here.
    with torch.no_grad():
        for xs, ts in loader:
            zs = model(xs)
            pred = zs.max(1, keepdim=True)[1]  # get the index of the max logit
            correct += pred.eq(ts.view_as(pred)).sum().item()
            total += int(ts.shape[0])
    # An empty dataset would otherwise divide by zero.
    return correct / total if total else 0.0
class MyRegression(nn.Module):
    """One linear layer producing ``output_dim`` scores per input sample.

    Args:
        input_dim: Number of input features per sample.
        output_dim: Number of output values (one per class when the result is
            fed to a cross-entropy loss).
    """

    def __init__(self, input_dim, output_dim):
        super(MyRegression, self).__init__()
        # One layer
        self.linear = nn.Linear(input_dim, output_dim)

    def forward(self, x):
        """Return the raw linear scores for ``x`` (no activation applied)."""
        return self.linear(x)
model = MyRegression(341, 3)
run_gradient_descent(model, train, val, batch_size=64, learning_rate=0.01, num_epochs=10)
Because of my reputation score I can't comment, so I am posting this as an answer. If I were you, I would build it like this; I think there is something wrong with the way you define your Module.
class regression(nn.Module):
def __init__(self,input_dim,output_dim):
def forward(self,x):
return self.linear(x)
#define the model
# Mean square error
#train the model
for iteration in range(iteration_number):
#forward to get output
#loss calculate
#backward propagation
#updating parameters
#store loss
if(iteration %5==0):
print("epoch{} ,loss{}".format(iteration,

Torch: How to inspect weights after training?

I am wondering what I am doing wrong when looking to see how the weights changed during training.
My loss goes down considerably but it appears that the initialized weights are the same as trained weights. Am I looking in the wrong location? I would appreciate any insight that you might have!
import torch
import numpy as np
from torchvision import datasets, transforms
from import DataLoader
import torch.nn.functional as F
# setup GPU/CPU processing
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# initialize model
class mlp1(torch.nn.Module):
    """Multi-layer perceptron with one sigmoid hidden layer.

    Args:
        num_features: Size of each flattened input sample.
        num_hidden: Number of units in the hidden layer.
        num_classes: Number of output classes.
    """

    def __init__(self, num_features, num_hidden, num_classes):
        super(mlp1, self).__init__()
        self.num_classes = num_classes
        self.input_layer = torch.nn.Linear(num_features, num_hidden)
        self.out_layer = torch.nn.Linear(num_hidden, num_classes)

    def forward(self, x):
        """Return ``(logits, probas)`` for the batch ``x``.

        ``logits`` are the raw outputs of the last layer; ``probas`` is their
        softmax taken over dimension 1.
        """
        x = self.input_layer(x)
        x = torch.sigmoid(x)
        logits = self.out_layer(x)
        probas = torch.softmax(logits, dim=1)
        return logits, probas
# instantiate model
model = mlp1(num_features=28*28, num_hidden=100, num_classes=10).to(device)
# check initial weights
weight_check_pre = model.state_dict()['input_layer.weight'][0][0:25]
# optim
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
# download data
train_dataset = datasets.MNIST(root='data',
# data loader
train_dataloader = DataLoader(dataset=train_dataset,
# train
for epoch in range(NUM_EPOCHS):
for batch_idx, (features, targets) in enumerate(train_dataloader):
# send data to device
features = features.view(-1, 28*28).to(device)
targets =
# forward
logits, probas = model(features)
# loss
loss = F.cross_entropy(logits, targets)
# now update weights
if not batch_idx % 50:
print ('Epoch: %03d/%03d | Batch %03d/%03d | Loss: %.4f'
%(epoch+1, NUM_EPOCHS, batch_idx,
len(train_dataloader), loss))
# check post training
weight_check_post = model.state_dict()['input_layer.weight'][0][0:25]
# compare
weight_check_pre == weight_check_post # all equal
That is because both variables reference the same underlying object in memory (`state_dict()` returns references to the model's tensors, not copies), so they will always be equal to each other.
You can do this to get actual copies of the state_dict.
import copy
# check initial weights
weight_check_pre = copy.deepcopy(model.state_dict()['input_layer.weight'][0][0:25])
# check post training
weight_check_post = copy.deepcopy(model.state_dict()['input_layer.weight'][0][0:25])

Loss on dev set is always increasing unlike training set loss

I designed a network for a text classification problem. To do this, I'm using the BERT model from Hugging Face Transformers with a linear layer on top of it for fine-tuning. My problem is that the loss on the training set is decreasing, which is fine, but when I evaluate on the development set after each epoch, the loss increases with the epochs. I'm posting my code so you can check whether there's something wrong with it.
for epoch in range(1, args.epochs + 1):
total_train_loss = 0
for step, batch in enumerate(train_dataloader):
loss = trainer.step(batch)
total_train_loss += loss
avg_train_loss = total_train_loss / len(train_dataloader)'Training loss for epoch %d/%d: %4.2f') % (epoch, args.epochs, avg_train_loss))
print("\n-------------------------------")'Start validation ...')
y_hat = list()
y = list()
total_dev_loss = 0
for step, batch_val in enumerate(dev_dataloader):
true_labels_ids, predicted_labels_ids, loss = trainer.validate(batch_val)
total_dev_loss += loss
avg_dev_loss = total_dev_loss / len(dev_dataloader)
print(("\n-Total dev loss: %4.2f on epoch %d/%d\n") % (avg_dev_loss, epoch, args.epochs))
print("Training terminated!")
Following is the trainer file, which I use for doing a forward pass on a given batch and then backpropagate accordingly.
class Trainer(object):
def __init__(self, args, model, device, data_points, is_test=False, train_stats=None):
self.args = args
self.model = model
self.device = device
self.loss = nn.CrossEntropyLoss(reduction='none')
if is_test:
# Should load the model from checkpoint
self.model.load_state_dict(torch.load(args.saved_model))'Loaded saved model from %s' % args.saved_model)
self.optim = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
total_steps = data_points * self.args.epochs
self.scheduler = get_linear_schedule_with_warmup(self.optim, num_warmup_steps=0,
def step(self, batch):
batch = tuple( for t in batch)
batch_input_ids, batch_input_masks, batch_labels = batch
outputs = self.model(batch_input_ids,
loss = self.loss(outputs, batch_labels)
loss = loss.sum()
(loss / loss.numel()).backward()
torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
return loss
def validate(self, batch):
batch = tuple( for t in batch)
batch_input_ids, batch_input_masks, batch_labels = batch
with torch.no_grad():
model_output = self.model(batch_input_ids,
predicted_label_ids = self._predict(model_output)
label_ids ='cpu').numpy()
loss = self.loss(model_output, batch_labels)
loss = loss.sum()
return label_ids, predicted_label_ids, loss
def _predict(self, logits):
return np.argmax('cpu').numpy(), axis=1)
Finally, the following is my model (i.e., Classifier) class:
import torch.nn as nn
from transformers import BertModel
class Classifier(nn.Module):
def __init__(self, args, is_eval=False):
super(Classifier, self).__init__()
self.bert_model = BertModel.from_pretrained(
self.is_eval_mode = is_eval
self.linear = nn.Linear(768, 2) # binary classification
def switch_state(self):
self.is_eval_mode = not self.is_eval_mode
def forward(self, input_ids, attention_mask=None, labels=None):
bert_outputs = self.bert_model(input_ids,
# Should give the logits to the the linear layer
model_output = self.linear(bert_outputs[1])
return model_output
For visualization the loss throughout the epochs:
When I've used BERT for text classification, my model has generally behaved as you describe. In part this is expected, because pre-trained models tend to require few epochs to fine-tune; in fact, if you check the BERT paper, the number of epochs recommended for fine-tuning is between 2 and 4.
On the other hand, I've usually found the optimum at just 1 or 2 epochs, which coincides with your case also. My guess is: there is a trade-off when fine-tuning pre-trained models between fitting to your downstream task and forgetting the weights learned at pre-training. Depending on the data you have, the equilibrium point may happen sooner or later and overfitting starts after that. But this paragraph is speculation based on my experience.
When validation loss increases it means your model is overfitting

How to use smac for hyper-parameter optimization of Convolution Neural Network?

Note: Long Post. Please bear with me
I have implemented a convolution neural network in PyTorch on KMNIST dataset. I need to use SMAC to optimize the learning rate and the momentum of Stochastic Gradient Descent of the CNN. I am new in hyperparameter optimization and what I learnt from the smac documentation is,
SMAC evaluates the algorithm to be optimized by invoking it through a Target Algorithm Evaluator (TAE).
We need a Scenario-object to configure the optimization process.
run_obj parameter in Scenario object specifies what SMAC is supposed to optimize.
My Ultimate goal is to get a good accuracy or low loss
This is what I have done so far:
Convolution Neural Network
import numpy as np
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from torch.autograd import Variable
from datasets import *
import torch.nn.functional as F
import matplotlib.pyplot as plt
# Create the model class
class CNN(nn.Module):
    """Two-stage convolutional classifier for 28x28 single-channel images.

    Each stage is convolution -> batch normalisation -> ReLU -> 2x2 max
    pooling. The pooled feature maps are flattened and passed through two
    fully connected layers with dropout in between, giving 10 output scores.
    """

    def __init__(self):
        super(CNN, self).__init__()  # inherit the features of nn.Module
        # in_channels = 1 because of grey scale image
        # kernel_size = feature_size
        # padding = 1 because for same padding = [(filter_size - 1) / 2]
        # the output size of the 8 feature maps is
        # [(input_size - filter_size + 2 * padding) / stride] + 1
        self.cnn1 = nn.Conv2d(in_channels=1, out_channels=8, kernel_size=3, stride=1, padding=1)
        # Batch Normalization
        self.batchnorm1 = nn.BatchNorm2d(8)
        # A single ReLU module is shared by every activation in forward().
        self.relu = nn.ReLU()
        self.maxpool1 = nn.MaxPool2d(kernel_size=2)
        # After maxpooling, the output of each feature map is 28/2 = 14
        self.cnn2 = nn.Conv2d(in_channels=8, out_channels=32, kernel_size=5, stride=1, padding=2)
        # Batch Normalization
        self.batchnorm2 = nn.BatchNorm2d(32)
        self.maxpool2 = nn.MaxPool2d(kernel_size=2)
        # After maxpooling, the output of each feature map is 14/2 = 7
        # Flatten the feature maps. You have 32 feature maps, each of them is
        # of size 7x7 --> 32*7*7 = 1568
        self.fc1 = nn.Linear(in_features=1568, out_features=600)
        self.dropout = nn.Dropout(p=0.5)
        self.fc2 = nn.Linear(in_features=600, out_features=10)

    def forward(self, x):
        """Return the 10 class scores for each image in the batch ``x``."""
        out = self.cnn1(x)
        out = self.batchnorm1(out)
        out = self.relu(out)
        out = self.maxpool1(out)
        out = self.cnn2(out)
        out = self.batchnorm2(out)
        out = self.relu(out)
        out = self.maxpool2(out)
        # Flatten to (batch_size, 1568) while keeping the batch dimension
        # fixed, so a wrongly sized input fails in fc1 instead of being
        # silently regrouped into a different number of rows.
        out = out.view(out.size(0), -1)
        # Then we forward through our fully connected layers
        out = self.fc1(out)
        out = self.relu(out)
        out = self.dropout(out)
        out = self.fc2(out)
        return out
def train(model, train_loader, optimizer, epoch, CUDA, loss_fn):
    """Train ``model`` for one pass over ``train_loader`` and return the mean batch loss.

    Args:
        model: Network to optimise.
        train_loader: Iterable yielding ``(images, labels)`` batches; a channel
            dimension is inserted at position 1 of ``images`` before the
            forward pass.
        optimizer: Optimizer holding the parameters of ``model``.
        epoch: Index of the current epoch (accepted for the caller's
            convenience; not used in the computation).
        CUDA: When true, every batch is moved to the GPU before the forward pass.
        loss_fn: Callable ``loss_fn(outputs, labels)`` returning a scalar tensor.

    Returns:
        The loss averaged over the processed batches as a Python float, or
        ``0.0`` if ``train_loader`` yields no batches.
    """
    cum_loss = 0.0
    n_batches = 0
    for images, labels in train_loader:
        # Float inputs match the default parameter dtype of the model, and
        # integer class indices are what a cross-entropy style loss expects.
        images = images.unsqueeze(1).float()
        labels = labels.long()
        if CUDA:
            images = images.cuda()
            labels = labels.cuda()
        optimizer.zero_grad()
        outputs = model(images)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()
        # .item() detaches the value so the autograd graph of each batch can
        # be freed instead of accumulating across the epoch.
        cum_loss += loss.item()
        n_batches += 1
    return cum_loss / n_batches if n_batches else 0.0
batch_size = 100
epochs = 5
e = range(epochs)
#Load datasets
train_images = variable_name.images
train_images = torch.from_numpy(train_images)
train_labels = variable_name.labels
train_labels = torch.from_numpy(train_labels)
train_dataset =, train_labels)
# Make the dataset iterable
train_load = = train_dataset, batch_size = batch_size, shuffle = True)
print('There are {} images in the training set' .format(len(train_dataset)))
print('There are {} images in the loaded training set' .format(len(train_load)))
def net(learning_rate, Momentum):
    """Build a CNN, train it with Nesterov SGD and return the training artefacts.

    Uses the module-level ``epochs`` and ``train_load``.

    Args:
        learning_rate: SGD learning rate.
        Momentum: SGD momentum.

    Returns:
        Tuple ``(optimizer, loss_fn, model, total_loss)`` where ``total_loss``
        is a list with one float per epoch, as returned by ``train``.
    """
    model = CNN()
    CUDA = torch.cuda.is_available()
    if CUDA:
        model = model.cuda()
    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=Momentum, nesterov=True)
    # Collect one loss value per epoch so that the returned name is defined.
    total_loss = []
    for epoch in range(epochs):
        ave_loss = train(model, train_load, optimizer, epoch, CUDA, loss_fn)
        # float() accepts both a Python number and a scalar tensor.
        total_loss.append(float(ave_loss))
    return optimizer, loss_fn, model, total_loss
optimizer, loss_fn, model, total_loss = net(learning_rate= 0.01, Momentum = 0.09)
# Print model's state_dict
print("Model's state_dict:")
for param_tensor in model.state_dict():
print(param_tensor, "\t", model.state_dict()[param_tensor].size())
#print("Optimizer's state_dict:")
#for var_name in optimizer.state_dict():
# print(var_name, "\t", optimizer.state_dict()[var_name]), "")
plt.plot(e, (np.array(total_loss)))
plt.xlabel("# Epoch")
smac hyperparameter optimization:
from smac.configspace import ConfigurationSpace
from ConfigSpace.hyperparameters import CategoricalHyperparameter, \
UniformFloatHyperparameter, UniformIntegerHyperparameter
from smac.configspace.util import convert_configurations_to_array
#from ConfigSpace.conditions import InCondition
# Import SMAC-utilities
from smac.tae.execute_func import ExecuteTAFuncDict
from smac.scenario.scenario import Scenario
from smac.facade.smac_facade import SMAC
# Build Configuration Space which defines all parameters and their ranges
cs = ConfigurationSpace()
# Search ranges for the SGD learning rate and momentum. The defaults are
# given as numbers, matching the float bounds of each hyperparameter.
# NOTE(review): the earlier string defaults ('1e-2', '0.09') are a plausible
# source of the "ufunc 'isfinite' not supported" TypeError -- confirm.
lr = UniformFloatHyperparameter('learning_rate', 1e-4, 1e-1, default_value=1e-2)
momentum = UniformFloatHyperparameter('Momentum', 0.01, 0.1, default_value=0.09)
cs.add_hyperparameters([lr, momentum])
def kmnist_from_cfg(cfg):
    """Target function for SMAC: train with ``cfg`` and return the final loss.

    The optimiser needs a single number to minimise, and the caller formats
    the result with ``%.2f``, so only the last epoch's loss is returned rather
    than the whole tuple produced by ``net``.

    Args:
        cfg: Mapping-like configuration providing ``learning_rate`` and
            ``Momentum``.

    Returns:
        The loss of the last training epoch as a float.
    """
    # Drop unset/falsy entries before reading the configuration.
    cfg = {k: cfg[k] for k in cfg if cfg[k]}
    print('Config is', cfg)
    # Train with the sampled hyperparameters instead of fixed debug values,
    # otherwise every configuration would be evaluated identically.
    optimizer, loss_fn, model, total_loss = net(learning_rate=cfg["learning_rate"], Momentum=cfg["Momentum"])
    return float(total_loss[-1])
# Scenario object
scenario = Scenario({"run_obj": "quality", # we optimize quality (alternatively runtime)
"runcount-limit": 200, # maximum function evaluations
"cs": cs, # configuration space
"deterministic": "true"
#def_value = kmnist_from_cfg(cs.get_default_configuration())
#print("Default Value: %.2f" % (def_value))
# Optimize, using a SMAC-object
print("Optimizing! Depending on your machine, this might take a few minutes.")
smac = SMAC(scenario=scenario,tae_runner=kmnist_from_cfg) #rng=np.random.RandomState(42)
smac.solver.intensifier.tae_runner.use_pynisher = False
print("SMAC", smac)
incumbent = smac.optimize()
inc_value = kmnist_from_cfg(incumbent)
print("Optimized Value: %.2f" % (inc_value))
When I give loss as the run_obj parameter, I get the error message
ArgumentError: argument --run-obj/--run_obj: invalid choice: 'total_loss' (choose from 'runtime', 'quality')
To be honest, I do not know what "quality" means. Anyway, when I give quality as the run_obj parameter, I get the error message
TypeError: ufunc 'isfinite' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''
If I understood it correctly, the above error message is obtained when an int is expected but str is given. To check whether the problem was with configuration space, I tried
optimizer, loss_fn, model, total_loss = net(learning_rate= 0.02, Momentum= 0.05)
instead of these:
optimizer, loss_fn, model, total_loss = net(**cfg)
optimizer, loss_fn, model, total_loss = net(learning_rate= cfg["learning_rate"], Momentum= cfg["Momentum"])
the error remains the same.
Any ideas on how to use smac to optimize hyperparameters of CNN and why do I get this error message? I tried looking for similar problems online. This post was a little helpful. Unfortunately, since there is no implementation of smac on NN (at least I did not find it), I cannot figure out the solution. I ran out of all ideas.
Any help, ideas or useful link is appreciated.
Thank you!
I believe the tae_runner (kmnist_from_cfg in your case) has to be a callable that takes a configuration space point, which you correctly provide, and outputs a single number. You output a tuple of things. Perhaps only return the total_loss on the validation set? I am basing this on the svm example in the smac github at

Why can't I learn XOR function with this network and constraints?

Let's say I have the following constraints and the network:
The architecture is fixed (see this image) (note that there are no biases)
Activation function for the hidden layer is ReLU
There's no activation function for the output layer (should just return the sum of the inputs it receive).
I tried to implement this in pytorch with various initialization schemes and different data sets but I failed (the code is at the bottom).
My questions are:
Is there anything wrong with my NN training process?
Is this a feasible problem? If yes, how?
If this is doable, can we still achieve that by constraining the weights to be in the set {-1, 0, 1}
import torch
import torch.nn as nn
import torch.optim as optim
import as data_utils
import numpy as np
class Network(nn.Module):
    """Bias-free 2-2-1 network with a ReLU hidden layer and a linear output."""

    def __init__(self):
        super(Network, self).__init__()
        # Both layers are created without bias terms.
        self.fc1 = nn.Linear(2, 2, bias=False)
        self.fc2 = nn.Linear(2, 1, bias=False)
        self.rl = nn.ReLU()

    def forward(self, x):
        """Return the single output value for each row of ``x``."""
        x = self.fc1(x)
        x = self.rl(x)
        # No activation on the output layer: the result is the weighted sum
        # of the hidden activations.
        x = self.fc2(x)
        return x
#create an XOR data set to train
rng = np.random.RandomState(0)
X = rng.randn(200, 2)
y = np.logical_xor(X[:, 0] > 0, X[:, 1] > 0).astype('int32')
# test data set
X_test = np.array([[0,0],[0,1], [1,0], [1,1]])
train = data_utils.TensorDataset(torch.from_numpy(X).float(), \
train_loader = data_utils.DataLoader(train, batch_size=50, shuffle=True)
test = torch.from_numpy(X_test).float()
# training the network
num_epoch = 10000
net = Network(), max=1), max=1)
# define loss and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(net.parameters())
for epoch in range(num_epoch):
running_loss = 0 # loss per epoch
for (X, y)in train_loader:
# make the grads zero
# forward propagate
out = net(X)
# calculate loss and update
loss = criterion(out, y)
running_loss +=
if epoch%500== 0:
print("Epoch: {0} Loss: {1}".format(epoch, running_loss))
The loss doesn't improve. It gets stuck at some value after a few epochs (I'm not sure how to make this reproducible, as I'm getting different values every time).
net(test) returns a set of predictions that are no way close to XOR output.
You need to use a nonlinear activation function such as sigmoid in your hidden and output layers, because XOR is not linearly separable. Also, biases are required.
