how "data" and "target" are choosen in a federated learning? (PySyft) - python

i can't understand how in function train() below, the variable (data, target) are choosen.
def train(args, model, device, federated_train_loader, optimizer, epoch):
model.train()
for batch_idx, (data, target) in enumerate(federated_train_loader): # <-- now it is a distributed dataset
model.send(data.location) # <-- NEW: send the model to the right location`
i guess they are 2 tensor representing 2 random images of dataset train, but then the loss function
loss = F.nll_loss(output, target)
is calculated at every interaction with different target?
Also i have different question: i trained the network with images of cats, then i test it with images of cars and the accuracy reached is 97%. How is this possible? is a proper value or i'm doing something wrong?
here is the entire code:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
import syft as sy # <-- NEW: import the Pysyft library
hook = sy.TorchHook(torch) # <-- NEW: hook PyTorch ie add extra functionalities to support Federated Learning
bob = sy.VirtualWorker(hook, id="bob") # <-- NEW: define remote worker bob
alice = sy.VirtualWorker(hook, id="alice") # <-- NEW: and alice
class Arguments():
def __init__(self):
self.batch_size = 64
self.test_batch_size = 1000
self.epochs = 2
self.lr = 0.01
self.momentum = 0.5
self.no_cuda = False
self.seed = 1
self.log_interval = 30
self.save_model = False
args = Arguments()
use_cuda = not args.no_cuda and torch.cuda.is_available()
torch.manual_seed(args.seed)
device = torch.device("cuda" if use_cuda else "cpu")
kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
federated_train_loader = sy.FederatedDataLoader( # <-- this is now a FederatedDataLoader
datasets.MNIST("C:\\users...\\train", train=True, download=True,
transform=transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))
]))
.federate((bob, alice)), # <-- NEW: we distribute the dataset across all the workers, it's now a FederatedDataset
batch_size=args.batch_size, shuffle=True, **kwargs)
test_loader = torch.utils.data.DataLoader(
datasets.MNIST("C:\\Users...\\test", train=False, download=True, transform=transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))
])),
batch_size=args.test_batch_size, shuffle=True, **kwargs)
class Net(nn.Module):
def __init__(self):
super(Net, self).__init__()
self.conv1 = nn.Conv2d(1, 20, 5, 1)
self.conv2 = nn.Conv2d(20, 50, 5, 1)
self.fc1 = nn.Linear(4*4*50, 500)
self.fc2 = nn.Linear(500, 10)
def forward(self, x):
x = F.relu(self.conv1(x))
x = F.max_pool2d(x, 2, 2)
x = F.relu(self.conv2(x))
x = F.max_pool2d(x, 2, 2)
x = x.view(-1, 4*4*50)
x = F.relu(self.fc1(x))
x = self.fc2(x)
return F.log_softmax(x, dim=1)
def train(args, model, device, federated_train_loader, optimizer, epoch):
model.train()
for batch_idx, (data, target) in enumerate(federated_train_loader): # <-- now it is a distributed dataset
model.send(data.location) # <-- NEW: send the model to the right location
data, target = data.to(device), target.to(device)
optimizer.zero_grad()
output = model(data)
loss = F.nll_loss(output, target)
loss.backward()
optimizer.step()
model.get() # <-- NEW: get the model back
if batch_idx % args.log_interval == 0:
loss = loss.get() # <-- NEW: get the loss back
print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
epoch, batch_idx * args.batch_size, len(federated_train_loader) * args.batch_size,
100. * batch_idx / len(federated_train_loader), loss.item()))
def test(args, model, device, test_loader):
model.eval()
test_loss = 0
correct = 0
with torch.no_grad():
for data, target in test_loader:
data, target = data.to(device), target.to(device)
output = model(data)
test_loss += F.nll_loss(output, target, reduction='sum').item() # sum up batch loss
pred = output.argmax(1, keepdim=True) # get the index of the max log-probability
correct += pred.eq(target.view_as(pred)).sum().item()
test_loss /= len(test_loader.dataset)
print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
test_loss, correct, len(test_loader.dataset),
100. * correct / len(test_loader.dataset)))
model = Net().to(device)
optimizer = optim.SGD(model.parameters(), lr=args.lr) # TODO momentum is not supported at the moment
for epoch in range(1, args.epochs + 1):
train(args, model, device, federated_train_loader, optimizer, epoch)
test(args, model, device, test_loader)
if (args.save_model):
torch.save(model.state_dict(), "mnist_cnn.pt")

Consider it like this. When you hook torch, all your torch tensors will get additional functionality - methods like .send(), .federate(), and attributes like .location and ._objects. Your data and target, which were once torch tensors, became pointers to tensors residing in different VirtualWorker objects due to .federate((bob, alice)).
Now data and target have additional attributes that includes .location, which will return the location of that tensor - data/target pointed by the pointer called data/target.
Federated learning sends the global model to this location, as seen in model.send(data.location).
Now, model is a pointer residing at the same location and data is also a pointer residing there. Hence when you take the output as output = model(data), output will also reside there and all we (the central server or in other words, the VirtualWorker called 'me') will get is a pointer to that output.
Now, regarding your doubt on loss calculation, since output and target are both residing in that same location, calculation of loss will also happen there. Same goes for backprop and step.
Finally, you can see model.get(), here is where the central server pulls the remote model using the pointer called model. (I'm not sure if it should be model = model.get() though).
So anything with .get() will be pulled from that worker and will be returned in our python statement. Also note that .get() will remove that object from it's location when called. Hence use .copy().get() if you are going to need it further.

Related

Can't backward pass two losses in Classification Transformer Model

For my model I'm using a roberta transformer model and the Trainer from the Huggingface transformer library.
I calculate two losses:
lloss is a Cross Entropy Loss and dloss calculates the loss inbetween hierarchy layers.
The total loss is the sum of lloss and dloss. (Based on this)
When calling total_loss.backwards() however, I get the error:
RuntimeError: Trying to backward through the graph a second time, but the buffers have already been freed
Any idea why that happens? Can I force it to only call backwards once? Here is the loss calculation part:
dloss = calculate_dloss(prediction, labels, 3)
lloss = calculate_lloss(predeiction, labels, 3)
total_loss = lloss + dloss
total_loss.backward()
def calculate_lloss(predictions, true_labels, total_level):
'''Calculates the layer loss.
'''
loss_fct = nn.CrossEntropyLoss()
lloss = 0
for l in range(total_level):
lloss += loss_fct(predictions[l], true_labels[l])
return self.alpha * lloss
def calculate_dloss(predictions, true_labels, total_level):
'''Calculate the dependence loss.
'''
dloss = 0
for l in range(1, total_level):
current_lvl_pred = torch.argmax(nn.Softmax(dim=1)(predictions[l]), dim=1)
prev_lvl_pred = torch.argmax(nn.Softmax(dim=1)(predictions[l-1]), dim=1)
D_l = self.check_hierarchy(current_lvl_pred, prev_lvl_pred, l) #just a boolean tensor
l_prev = torch.where(prev_lvl_pred == true_labels[l-1], torch.FloatTensor([0]).to(self.device), torch.FloatTensor([1]).to(self.device))
l_curr = torch.where(current_lvl_pred == true_labels[l], torch.FloatTensor([0]).to(self.device), torch.FloatTensor([1]).to(self.device))
dloss += torch.sum(torch.pow(self.p_loss, D_l*l_prev)*torch.pow(self.p_loss, D_l*l_curr) - 1)
return self.beta * dloss
There is nothing wrong with having a loss that is the sum of two individual losses, here is a small proof of principle adapted from the docs:
import torch
import numpy
from sklearn.datasets import make_blobs
class Feedforward(torch.nn.Module):
def __init__(self, input_size, hidden_size):
super(Feedforward, self).__init__()
self.input_size = input_size
self.hidden_size = hidden_size
self.fc1 = torch.nn.Linear(self.input_size, self.hidden_size)
self.relu = torch.nn.ReLU()
self.fc2 = torch.nn.Linear(self.hidden_size, 1)
self.sigmoid = torch.nn.Sigmoid()
def forward(self, x):
hidden = self.fc1(x)
relu = self.relu(hidden)
output = self.fc2(relu)
output = self.sigmoid(output)
return output
def blob_label(y, label, loc): # assign labels
target = numpy.copy(y)
for l in loc:
target[y == l] = label
return target
x_train, y_train = make_blobs(n_samples=40, n_features=2, cluster_std=1.5, shuffle=True)
x_train = torch.FloatTensor(x_train)
y_train = torch.FloatTensor(blob_label(y_train, 0, [0]))
y_train = torch.FloatTensor(blob_label(y_train, 1, [1,2,3]))
x_test, y_test = make_blobs(n_samples=10, n_features=2, cluster_std=1.5, shuffle=True)
x_test = torch.FloatTensor(x_test)
y_test = torch.FloatTensor(blob_label(y_test, 0, [0]))
y_test = torch.FloatTensor(blob_label(y_test, 1, [1,2,3]))
model = Feedforward(2, 10)
criterion = torch.nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr = 0.01)
model.eval()
y_pred = model(x_test)
before_train = criterion(y_pred.squeeze(), y_test)
print('Test loss before training' , before_train.item())
model.train()
epoch = 20
for epoch in range(epoch):
optimizer.zero_grad() # Forward pass
y_pred = model(x_train) # Compute Loss
lossCE= criterion(y_pred.squeeze(), y_train)
lossSQD = (y_pred.squeeze()-y_train).pow(2).mean()
loss=lossCE+lossSQD
print('Epoch {}: train loss: {}'.format(epoch, loss.item())) # Backward pass
loss.backward()
optimizer.step()
There must be a real second time that you call directly or indirectly backward on some varaible that then traverses through your graph. It is a bit too much to ask for the complete code here, only you can check this or at least reduce it to a minimal example (while doing so, you might already find the issue). Apart from that, I would start checking:
Does it already occur in the first iteration of training? If not: are you reusing any calculation results for the second iteration without a detach?
When you do backward on your losses individually lloss.backward() followed by dloss.backward() (this has the same effect as adding them together first as gradients are accumulated): what happens? This will let you track down for which of the two losses the error occurs.
After backward() your comp. graph is freed so for the second backward you need to create a new graph by providing inputs again. If you want to reiterate the same graph after backward (for some reason) you need to specify retain_graph flag in backward as True. see retain_graph here.
P.S. As the summation of Tensors is automatically differentiable, summing the losses would not cause any issue in the backward.

Torch: How to inspect weights after training?

I am wondering what I am doing wrong when looking to see how the weights changed during training.
My loss goes down considerably but it appears that the initialized weights are the same as trained weights. Am I looking in the wrong location? I would appreciate any insight that you might have!
import torch
import numpy as np
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import torch.nn.functional as F
# setup GPU/CPU processing
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# initialize model
class mlp1(torch.nn.Module):
def __init__(self, num_features, num_hidden, num_classes):
super(mlp1, self).__init__()
self.num_classes = num_classes
self.input_layer = torch.nn.Linear(num_features, num_hidden)
self.out_layer = torch.nn.Linear(num_hidden, num_classes)
def forward(self, x):
x = self.input_layer(x)
x = torch.sigmoid(x)
logits = self.out_layer(x)
probas = torch.softmax(logits, dim=1)
return logits, probas
# instantiate model
model = mlp1(num_features=28*28, num_hidden=100, num_classes=10).to(device)
# check initial weights
weight_check_pre = model.state_dict()['input_layer.weight'][0][0:25]
# optim
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
# download data
train_dataset = datasets.MNIST(root='data',
train=True,
transform=transforms.ToTensor(),
download=True)
# data loader
train_dataloader = DataLoader(dataset=train_dataset,
batch_size=100,
shuffle=True)
# train
NUM_EPOCHS = 1
for epoch in range(NUM_EPOCHS):
model.train()
for batch_idx, (features, targets) in enumerate(train_dataloader):
# send data to device
features = features.view(-1, 28*28).to(device)
targets = targets.to(device)
# forward
logits, probas = model(features)
# loss
loss = F.cross_entropy(logits, targets)
optimizer.zero_grad()
loss.backward()
# now update weights
optimizer.step()
### LOGGING
if not batch_idx % 50:
print ('Epoch: %03d/%03d | Batch %03d/%03d | Loss: %.4f'
%(epoch+1, NUM_EPOCHS, batch_idx,
len(train_dataloader), loss))
# check post training
weight_check_post = model.state_dict()['input_layer.weight'][0][0:25]
# compare
weight_check_pre == weight_check_post # all equal
That is because both variables are referencing the same object (dictionary) in memory and so will always equal to each other.
You can do this to get actual copies of the state_dict.
import copy
# check initial weights
weight_check_pre = copy.deepcopy(model.state_dict()['input_layer.weight'][0][0:25])
...
# check post training
weight_check_post = copy.deepcopy(model.state_dict()['input_layer.weight'][0][0:25])

How to use smac for hyper-parameter optimization of Convolution Neural Network?

Note: Long Post. Please bear with me
I have implemented a convolution neural network in PyTorch on KMNIST dataset. I need to use SMAC to optimize the learning rate and the momentum of Stochastic Gradient Descent of the CNN. I am new in hyperparameter optimization and what I learnt from the smac documentation is,
SMAC evaluates the algorithm to be optimized by invoking it through a Target Algorithm Evaluator (TAE).
We need a Scenario-object to configure the optimization process.
run_obj parameter in Scenario object specifies what SMAC is supposed to optimize.
My Ultimate goal is to get a good accuracy or low loss
This is what I have done so far:
Convolution Neural Network
import numpy as np
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from torch.autograd import Variable
from datasets import *
import torch.utils.data
import torch.nn.functional as F
import matplotlib.pyplot as plt
# Create the model class
class CNN(nn.Module):
def __init__(self):
super(CNN, self).__init__() # to inherent the features of nn.Module
self.cnn1 = nn.Conv2d(in_channels = 1, out_channels = 8, kernel_size = 3, stride = 1, padding =1)
# in_channels =1 because of grey scale image
# kernel_size = feature_size
# padding = 1 because for same padding = [(filter_size -1)/2]
# the output size of the 8 feature maps is [(input_size - filter_size +2(padding)/stride)+1]
#Batch Normalization
self.batchnorm1 = nn.BatchNorm2d(8)
# RELU
self.relu = nn.ReLU()
self.maxpool1 = nn.MaxPool2d(kernel_size =2)
# After maxpooling, the output of each feature map is 28/2 =14
self.cnn2 = nn.Conv2d(in_channels = 8, out_channels = 32, kernel_size = 5, stride = 1, padding =2)
#Batch Normalization
self.batchnorm2 = nn.BatchNorm2d(32)
# RELU
#self.relu = nn.ReLU()
self.maxpool2 = nn.MaxPool2d(kernel_size =2)
# After maxpooling , the output of each feature map is 14/2 =7of them is of size 7x7 --> 32*7*7=1568
# Flatten the feature maps. You have 32 feature maps, each
self.fc1 = nn.Linear(in_features=1568, out_features = 600)
self.dropout = nn.Dropout(p=0.5)
self.fc2 = nn.Linear(in_features=600, out_features = 10)
def forward(self,x):
out = self.cnn1(x)
#out = F.relu(self.cnn1(x))
out = self.batchnorm1(out)
out = self.relu(out)
out = self.maxpool1(out)
out = self.cnn2(out)
out = self.batchnorm2(out)
out = self.relu(out)
out = self.maxpool2(out)
#Now we have to flatten the output. This is where we apply the feed forward neural network as learned
#before!
#It will the take the shape (batch_size, 1568) = (100, 1568)
out = out.view(-1, 1568)
#Then we forward through our fully connected layer
out = self.fc1(out)
out = self.relu(out)
out = self.dropout(out)
out = self.fc2(out)
return out
def train(model, train_loader, optimizer, epoch, CUDA, loss_fn):
model.train()
cum_loss=0
iter_count = 0
for i, (images, labels) in enumerate(train_load):
if CUDA:
images = Variable(images.cuda())
images = images.unsqueeze(1)
images = images.type(torch.FloatTensor)
images = images.cuda()
labels = Variable(labels.cuda())
labels = labels.type(torch.LongTensor)
labels = labels.cuda()
else:
images = Variable(images)
images = images.unsqueeze(1)
images = images.type(torch.DoubleTensor)
labels = Variable(labels)
labels = labels.type(torch.DoubleTensor)
optimizer.zero_grad()
outputs = model(images)
loss = loss_fn(outputs, labels)
loss.backward()
optimizer.step()
cum_loss += loss
if (i+1) % batch_size == 0:
correct = 0
total = 0
acc = 0
_, predicted = torch.max(outputs.data,1)
total += labels.size(0)
if CUDA:
correct += (predicted.cpu()==labels.cpu()).sum()
else:
correct += (predicted==labels).sum()
accuracy = 100*correct/total
if i % len(train_load) == 0:
iter_count += 1
ave_loss = cum_loss/batch_size
return ave_loss
batch_size = 100
epochs = 5
e = range(epochs)
#print(e)
#Load datasets
variable_name=KMNIST()
train_images = variable_name.images
train_images = torch.from_numpy(train_images)
#print(train_images.shape)
#print(type(train_images))
train_labels = variable_name.labels
train_labels = torch.from_numpy(train_labels)
#print(train_labels.shape)
#print(type(train_labels))
train_dataset = torch.utils.data.TensorDataset(train_images, train_labels)
# Make the dataset iterable
train_load = torch.utils.data.DataLoader(dataset = train_dataset, batch_size = batch_size, shuffle = True)
print('There are {} images in the training set' .format(len(train_dataset)))
print('There are {} images in the loaded training set' .format(len(train_load)))
def net(learning_rate, Momentum):
model = CNN()
CUDA = torch.cuda.is_available()
if CUDA:
model = model.cuda()
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr = learning_rate,momentum = Momentum, nesterov= True)
iteration = 0
total_loss=[]
for epoch in range(epochs):
ave_loss = train(model, train_load, optimizer, epoch, CUDA, loss_fn)
total_loss.append(ave_loss)
return optimizer, loss_fn, model, total_loss
optimizer, loss_fn, model, total_loss = net(learning_rate= 0.01, Momentum = 0.09)
# Print model's state_dict
print("---------------")
print("Model's state_dict:")
for param_tensor in model.state_dict():
print(param_tensor, "\t", model.state_dict()[param_tensor].size())
print("---------------")
#print("Optimizer's state_dict:")
#for var_name in optimizer.state_dict():
# print(var_name, "\t", optimizer.state_dict()[var_name])
torch.save(model.state_dict(), "kmnist_cnn.pt")
plt.plot(e, (np.array(total_loss)))
plt.xlabel("# Epoch")
plt.ylabel("Loss")
plt.show()
print('Done!')
smac hyperparameter optimization:
from smac.configspace import ConfigurationSpace
from ConfigSpace.hyperparameters import CategoricalHyperparameter, \
UniformFloatHyperparameter, UniformIntegerHyperparameter
from smac.configspace.util import convert_configurations_to_array
#from ConfigSpace.conditions import InCondition
# Import SMAC-utilities
from smac.tae.execute_func import ExecuteTAFuncDict
from smac.scenario.scenario import Scenario
from smac.facade.smac_facade import SMAC
# Build Configuration Space which defines all parameters and their ranges
cs = ConfigurationSpace()
# We define a few possible types of SVM-kernels and add them as "kernel" to our cs
lr = UniformFloatHyperparameter('learning_rate', 1e-4, 1e-1, default_value='1e-2')
momentum = UniformFloatHyperparameter('Momentum', 0.01, 0.1, default_value='0.09')
cs.add_hyperparameters([lr, momentum])
def kmnist_from_cfg(cfg):
cfg = {k : cfg[k] for k in cfg if cfg[k]}
print('Config is', cfg)
#optimizer, loss_fn, model, total_loss = net(**cfg)
#optimizer, loss_fn, model, total_loss = net(learning_rate= cfg["learning_rate"], Momentum= cfg["Momentum"])
optimizer, loss_fn, model, total_loss = net(learning_rate= 0.02, Momentum= 0.05)
return optimizer, loss_fn, model, total_loss
# Scenario object
scenario = Scenario({"run_obj": "quality", # we optimize quality (alternatively runtime)
"runcount-limit": 200, # maximum function evaluations
"cs": cs, # configuration space
"deterministic": "true"
})
#def_value = kmnist_from_cfg(cs.get_default_configuration())
#print("Default Value: %.2f" % (def_value))
# Optimize, using a SMAC-object
print("Optimizing! Depending on your machine, this might take a few minutes.")
smac = SMAC(scenario=scenario,tae_runner=kmnist_from_cfg) #rng=np.random.RandomState(42)
smac.solver.intensifier.tae_runner.use_pynisher = False
print("SMAC", smac)
incumbent = smac.optimize()
inc_value = kmnist_from_cfg(incumbent)
print("Optimized Value: %.2f" % (inc_value))
When I give loss as the run_obj parameter, I get the error message
ArgumentError: argument --run-obj/--run_obj: invalid choice: 'total_loss' (choose from 'runtime', 'quality')
To be honest, I do not know what does "quality" means. Anyways, when I give quality as the run_obj parameter, I get the error message
TypeError: ufunc 'isfinite' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''
If I understood it correctly, the above error message is obtained when an int is expected but str is given. To check whether the problem was with configuration space, I tried
optimizer, loss_fn, model, total_loss = net(learning_rate= 0.02, Momentum= 0.05)
instead of these:
optimizer, loss_fn, model, total_loss = net(**cfg)
optimizer, loss_fn, model, total_loss = net(learning_rate= cfg["learning_rate"], Momentum= cfg["Momentum"])
the error remains the same.
Any ideas on how to use smac to optimize hyperparameters of CNN and why do I get this error message? I tried looking for similar problems online. This post was a little helpful. Unfortunately, since there is no implementation of smac on NN (at least I did not find it), I cannot figure out the solution. I ran out of all ideas.
Any help, ideas or useful link is appreciated.
Thank you!
I believe the tae_runner (kmnist_from_cfg in your case) has to be a callable that takes a configuration space point, which you correctly provide, and outputs a single number. You output a tuple of things. Perhaps only return the total_loss on the validation set? I am basing this on the svm example in the smac github at https://github.com/automl/SMAC3/blob/master/examples/svm.py.

If we combine one trainable parameters with a non-trainable parameter, is the original trainable param trainable?

I have two nets and I combine their parameters in some fancy way using only pytorch operations. I store the result in a third net which has its parameters set to non-trainable. Then I proceed and pass data through this new net. The new net is just a placeholder for:
placeholder_net.W = Op( not_trainable_net.W, trainable_net.W )
Then I pass data:
output = placeholder_net(input)
I am concerned that since the parameters of the placeholder net are set to non-trainable that it won’t actually train the variable that it should train. Will this happen? Or what is the result when you combine a trainable param with and non-trainable param (and then set that where the param is not trainable)?
Current solution:
del net3.conv0.weight
net3.conv0.weight = net.conv0.weight + net2.conv0.weight
import torch
from torch import nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from collections import OrderedDict
import copy
def dont_train(net):
'''
set training parameters to false.
'''
for param in net.parameters():
param.requires_grad = False
return net
def get_cifar10():
transform = transforms.Compose(
[transforms.ToTensor(),
transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=4,shuffle=True, num_workers=2)
classes = ('plane', 'car', 'bird', 'cat','deer', 'dog', 'frog', 'horse', 'ship', 'truck')
return trainloader,classes
def combine_nets(net_train, net_no_train, net_place_holder):
'''
Combine nets in a way train net is trainable
'''
params_train = net_train.named_parameters()
dict_params_place_holder = dict(net_place_holder.named_parameters())
dict_params_no_train = dict(net_no_train.named_parameters())
for name, param_train in params_train:
if name in dict_params_place_holder:
layer_name, param_name = name.split('.')
param_no_train = dict_params_no_train[name]
## get place holder layer
layer_place_holder = getattr(net_place_holder, layer_name)
delattr(layer_place_holder, param_name)
## get new param
W_new = param_train + param_no_train # notice addition is just chosen for the sake of an example
## store param in placehoder net
setattr(layer_place_holder, param_name, W_new)
return net_place_holder
def combining_nets_lead_to_error():
'''
Intention is to only train the net with trainable params.
Placeholder rnet is a dummy net, it doesn't actually do anything except hold the combination of params and its the
net that does the forward pass on the data.
'''
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
''' create three musketeers '''
net_train = nn.Sequential(OrderedDict([
('conv1', nn.Conv2d(1,20,5)),
('relu1', nn.ReLU()),
('conv2', nn.Conv2d(20,64,5)),
('relu2', nn.ReLU())
])).to(device)
net_no_train = copy.deepcopy(net_train).to(device)
net_place_holder = copy.deepcopy(net_train).to(device)
''' prepare train, hyperparams '''
trainloader,classes = get_cifar10()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net_train.parameters(), lr=0.001, momentum=0.9)
''' train '''
net_train.train()
net_no_train.eval()
net_place_holder.eval()
for epoch in range(2): # loop over the dataset multiple times
running_loss = 0.0
for i, (inputs, labels) in enumerate(trainloader, 0):
optimizer.zero_grad() # zero the parameter gradients
inputs, labels = inputs.to(device), labels.to(device)
# combine nets
net_place_holder = combine_nets(net_train,net_no_train,net_place_holder)
#
outputs = net_place_holder(inputs)
loss = criterion(outputs, labels)
loss.backward()
optimizer.step()
# print statistics
running_loss += loss.item()
if i % 2000 == 1999: # print every 2000 mini-batches
print('[%d, %5d] loss: %.3f' %
(epoch + 1, i + 1, running_loss / 2000))
running_loss = 0.0
''' DONE '''
print('Done \a')
if __name__ == '__main__':
combining_nets_lead_to_error()
First, do not use eval() mode for any network. Set requires_grad flag to false to make the parameters non-trainable for only the second network and train the placeholder network.
If this doesn't work, you can try the following approach which I prefer.
Instead of using multiple networks, you can use a single network and use a non-trainable layer as a parallel connection after every trainable layer before non-linearity.
For example look at this image:
Set requires_grad flag to false to make the parameters non-trainable. Do not use eval() and train the network.
Combining outputs of the layers before non-linearity is important. Initialize the parameters of the parallel layer and choose the post-operation such that it gives the same result as when you combine the parameters.
I'm not sure if this is what you want to know.
But when I understand you correct - you want to know if the results of operations with non-trainable and trainable variables are still trainable?
If so, this is indeed the case, here is an example:
>>> trainable = torch.ones(1, requires_grad=True)
>>> non_trainable = torch.ones(1, requires_grad=False)
>>> result = trainable + non_trainable
>>> result.requires_grad
True
Maybe you might also find torch.set_grad_enabled useful, with some examples given here (PyTorch Migration Guide for version 0.4.0):
https://pytorch.org/2018/04/22/0_4_0-migration-guide.html
Original answer below, here I address the added code you've uploaded.
in your combine_nets functions you needlessly try to remove and set the attribute, when you can simply copy your required value like this:
def combine_nets(net_train, net_no_train, net_place_holder):
'''
Combine nets in a way train net is trainable
'''
params_train = net_no_train.named_parameters()
dict_params_place_holder = dict(net_place_holder.named_parameters())
dict_params_no_train = dict(net_train.named_parameters())
for name, param_train in params_train:
if name in dict_params_place_holder:
param_no_train = dict_params_no_train[name]
W_new = param_train + param_no_train
dict_params_no_train[name].data.copy_(W_new.data)
return net_place_holder
Due to other errors in the code you've supplied I couldn't make it run without further changes so I attach below an updated version of the code I've given you earlier:
import torch
from torch import nn
from torch.autograd import Variable
import torch.optim as optim
# toy feed-forward net
class Net(nn.Module):
def __init__(self):
super(Net, self).__init__()
self.fc1 = nn.Linear(10, 5)
self.fc2 = nn.Linear(5, 5)
self.fc3 = nn.Linear(5, 1)
def forward(self, x):
x = self.fc1(x)
x = self.fc2(x)
x = self.fc3(x)
return x
def combine_nets(net_train, net_no_train, net_place_holder):
'''
Combine nets in a way train net is trainable
'''
params_train = net_no_train.named_parameters()
dict_params_place_holder = dict(net_place_holder.named_parameters())
dict_params_no_train = dict(net_train.named_parameters())
for name, param_train in params_train:
if name in dict_params_place_holder:
param_no_train = dict_params_no_train[name]
W_new = param_train + param_no_train
dict_params_no_train[name].data.copy_(W_new.data)
return net_place_holder
# define random data
random_input1 = Variable(torch.randn(10,))
random_target1 = Variable(torch.randn(1,))
random_input2 = Variable(torch.rand(10,))
random_target2 = Variable(torch.rand(1,))
random_input3 = Variable(torch.randn(10,))
random_target3 = Variable(torch.randn(1,))
# define net
net1 = Net()
net_place_holder = Net()
net2 = Net()
# train the net1
criterion = nn.MSELoss()
optimizer = optim.SGD(net1.parameters(), lr=0.1)
for i in range(100):
net1.zero_grad()
output = net1(random_input1)
loss = criterion(output, random_target1)
loss.backward()
optimizer.step()
# train the net2
criterion = nn.MSELoss()
optimizer = optim.SGD(net2.parameters(), lr=0.1)
for i in range(100):
net2.zero_grad()
output = net2(random_input2)
loss = criterion(output, random_target2)
loss.backward()
optimizer.step()
# train the net2
criterion = nn.MSELoss()
optimizer = optim.SGD(net_place_holder.parameters(), lr=0.1)
for i in range(100):
net_place_holder.zero_grad()
output = net_place_holder(random_input3)
loss = criterion(output, random_target3)
loss.backward()
optimizer.step()
print('#'*50)
print('Weights before combining')
print('')
print('net1 fc2 weight after train:')
print(net1.fc3.weight)
print('net2 fc2 weight after train:')
print(net2.fc3.weight)
combine_nets(net1, net2, net_place_holder)
print('#'*50)
print('')
print('Weights after combining')
print('net1 fc2 weight after train:')
print(net1.fc3.weight)
print('net2 fc2 weight after train:')
print(net2.fc3.weight)
# train the net
criterion = nn.MSELoss()
optimizer1 = optim.SGD(net1.parameters(), lr=0.1)
for i in range(100):
net1.zero_grad()
net2.zero_grad()
output1 = net1(random_input3)
output2 = net2(random_input3)
loss1 = criterion(output1, random_target3)
loss2 = criterion(output2, random_target3)
loss = loss1 + loss2
loss.backward()
optimizer1.step()
print('#'*50)
print('Weights after further training')
print('')
print('net1 fc2 weight after freeze:')
print(net1.fc3.weight)
print('net2 fc2 weight after freeze:')
print(net2.fc3.weight)

How to train a Pytorch net

I'm using this Pytorch implementation of Segnet with pretrained values I found for object segmentation, and it works fine.
Now I want to resume the training from the values I have, using a new dataset with similar images.
How can I do that?
I guess I have to use the "train.py" file found in the repository, but I don't know what to write in order to replace the "fill the batch" comment.
Here is that portion of the code:
def train(epoch):
model.train()
# update learning rate
lr = args.lr * (0.1 ** (epoch // 30))
for param_group in optimizer.param_groups:
param_group['lr'] = lr
# define a weighted loss (0 weight for 0 label)
weights_list = [0]+[1 for i in range(17)]
weights = np.asarray(weights_list)
weigthtorch = torch.Tensor(weights_list)
if(USE_CUDA):
loss = nn.CrossEntropyLoss(weight=weigthtorch).cuda()
else:
loss = nn.CrossEntropyLoss(weight=weigthtorch)
total_loss = 0
# iteration over the batches
batches = []
for batch_idx,batch_files in enumerate(tqdm(batches)):
# containers
batch = np.zeros((args.batch_size,input_nbr, imsize, imsize), dtype=float)
batch_labels = np.zeros((args.batch_size,imsize, imsize), dtype=int)
# fill the batch
# ...
# What should I write here?
batch_th = Variable(torch.Tensor(batch))
target_th = Variable(torch.LongTensor(batch_labels))
if USE_CUDA:
batch_th =batch_th.cuda()
target_th = target_th.cuda()
# initilize gradients
optimizer.zero_grad()
# predictions
output = model(batch_th)
# Loss
output = output.view(output.size(0),output.size(1), -1)
output = torch.transpose(output,1,2).contiguous()
output = output.view(-1,output.size(2))
target = target.view(-1)
l_ = loss(output.cuda(), target)
total_loss += l_.cpu().data.numpy()
l_.cuda()
l_.backward()
optimizer.step()
return total_loss/len(files)
If I had to guess he probablly made some Dataloader feeder that extended the Pytorch Dataloader class. See
https://pytorch.org/tutorials/beginner/data_loading_tutorial.html
Near the bottom of the page you can see an example in which they loop over their data loader
for i_batch, sample_batched in enumerate(dataloader):
What this would like like for images for example is:
trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=False, transform=transform_train)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=batchSize, shuffle=True, num_workers=2)
for batch_idx, (inputs, targets) in enumerate(trainloader):
# Using the pytorch data loader the inputs and targets are given
# automatically
inputs, targets = inputs.cuda(), targets.cuda()
optimizer.zero_grad()
inputs, targets = Variable(inputs), Variable(targets)
How exactly the author loads his files I don't know. You could follow the procedure from: https://pytorch.org/tutorials/beginner/data_loading_tutorial.html to make your own Dataloader though.

Categories