PyTorch: GPU execution time - Python

I am following various PyTorch tutorials and trying to take advantage of GPU speed, but there is a large gap between the execution time announced on the website and what I actually get.
https://pytorch.org/tutorials/beginner/transfer_learning_tutorial.html
The code is at the end of that page, and the tutorial states that the program finishes in about 1 min 30 s, against 6-7 min on my computer. Could you try it and tell me how long it takes for you? I am very confused to see such a large difference when running the same code as the website.
This will help me understand whether the problem comes from my GPU or not :)
My config:
GTX 1080 Ti
Windows 10
CUDA 9.1
PyTorch 0.4.0
The code:
if __name__ == "__main__":
plt.ion() # interactive mode
######################################################################
# Load Data
# ---------
#
# We will use torchvision and torch.utils.data packages for loading the
# data.
#
# The problem we're going to solve today is to train a model to classify
# **ants** and **bees**. We have about 120 training images each for ants and bees.
# There are 75 validation images for each class. Usually, this is a very
# small dataset to generalize upon, if trained from scratch. Since we
# are using transfer learning, we should be able to generalize reasonably
# well.
#
# This dataset is a very small subset of imagenet.
#
# .. Note ::
# Download the data from
# `here <https://download.pytorch.org/tutorial/hymenoptera_data.zip>`_
# and extract it to the current directory.
# Data augmentation and normalization for training
# Just normalization for validation
data_transforms = {
'train': transforms.Compose([
transforms.RandomResizedCrop(224),
transforms.RandomHorizontalFlip(),
transforms.ToTensor(),
transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
]),
'val': transforms.Compose([
transforms.Resize(256),
transforms.CenterCrop(224),
transforms.ToTensor(),
transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
]),
}
data_dir = 'hymenoptera_data'
image_datasets = {x: datasets.ImageFolder(os.path.join(data_dir, x),
data_transforms[x])
for x in ['train', 'val']}
dataloaders = {x: torch.utils.data.DataLoader(image_datasets[x], batch_size=4,
shuffle=True, num_workers=4)
for x in ['train', 'val']}
dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'val']}
class_names = image_datasets['train'].classes
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
######################################################################
# Visualize a few images
# ^^^^^^^^^^^^^^^^^^^^^^
# Let's visualize a few training images so as to understand the data
# augmentations.
def imshow(inp, title=None):
"""Imshow for Tensor."""
inp = inp.numpy().transpose((1, 2, 0))
mean = np.array([0.485, 0.456, 0.406])
std = np.array([0.229, 0.224, 0.225])
inp = std * inp + mean
inp = np.clip(inp, 0, 1)
plt.imshow(inp)
if title is not None:
plt.title(title)
plt.pause(0.001) # pause a bit so that plots are updated
# Get a batch of training data
inputs, classes = next(iter(dataloaders['train']))
# Make a grid from batch
out = torchvision.utils.make_grid(inputs)
imshow(out, title=[class_names[x] for x in classes])
######################################################################
# Training the model
# ------------------
#
# Now, let's write a general function to train a model. Here, we will
# illustrate:
#
# - Scheduling the learning rate
# - Saving the best model
#
# In the following, parameter ``scheduler`` is an LR scheduler object from
# ``torch.optim.lr_scheduler``.
def train_model(model, criterion, optimizer, scheduler, num_epochs=25):
since = time.time()
best_model_wts = copy.deepcopy(model.state_dict())
best_acc = 0.0
for epoch in range(num_epochs):
print('Epoch {}/{}'.format(epoch, num_epochs - 1))
print('-' * 10)
# Each epoch has a training and validation phase
for phase in ['train', 'val']:
if phase == 'train':
scheduler.step()
model.train() # Set model to training mode
else:
model.eval() # Set model to evaluate mode
running_loss = 0.0
running_corrects = 0
# Iterate over data.
for inputs, labels in dataloaders[phase]:
inputs = inputs.to(device)
labels = labels.to(device)
# zero the parameter gradients
optimizer.zero_grad()
# forward
# track history if only in train
with torch.set_grad_enabled(phase == 'train'):
outputs = model(inputs)
_, preds = torch.max(outputs, 1)
loss = criterion(outputs, labels)
# backward + optimize only if in training phase
if phase == 'train':
loss.backward()
optimizer.step()
# statistics
running_loss += loss.item() * inputs.size(0)
running_corrects += torch.sum(preds == labels.data)
epoch_loss = running_loss / dataset_sizes[phase]
epoch_acc = running_corrects.double() / dataset_sizes[phase]
print('{} Loss: {:.4f} Acc: {:.4f}'.format(
phase, epoch_loss, epoch_acc))
# deep copy the model
if phase == 'val' and epoch_acc > best_acc:
best_acc = epoch_acc
best_model_wts = copy.deepcopy(model.state_dict())
print()
time_elapsed = time.time() - since
print('Training complete in {:.0f}m {:.0f}s'.format(
time_elapsed // 60, time_elapsed % 60))
print('Best val Acc: {:4f}'.format(best_acc))
# load best model weights
model.load_state_dict(best_model_wts)
return model
######################################################################
# Visualizing the model predictions
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
#
# Generic function to display predictions for a few images
#
def visualize_model(model, num_images=6):
was_training = model.training
model.eval()
images_so_far = 0
fig = plt.figure()
with torch.no_grad():
for i, (inputs, labels) in enumerate(dataloaders['val']):
inputs = inputs.to(device)
labels = labels.to(device)
outputs = model(inputs)
_, preds = torch.max(outputs, 1)
for j in range(inputs.size()[0]):
images_so_far += 1
ax = plt.subplot(num_images//2, 2, images_so_far)
ax.axis('off')
ax.set_title('predicted: {}'.format(class_names[preds[j]]))
imshow(inputs.cpu().data[j])
if images_so_far == num_images:
model.train(mode=was_training)
return
model.train(mode=was_training)
######################################################################
# Finetuning the convnet
# ----------------------
#
# Load a pretrained model and reset final fully connected layer.
#
model_ft = models.resnet18(pretrained=True)
num_ftrs = model_ft.fc.in_features
model_ft.fc = nn.Linear(num_ftrs, 2)
model_ft = model_ft.to(device)
criterion = nn.CrossEntropyLoss()
# Observe that all parameters are being optimized
optimizer_ft = optim.SGD(model_ft.parameters(), lr=0.001, momentum=0.9)
# Decay LR by a factor of 0.1 every 7 epochs
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)
######################################################################
# Train and evaluate
# ^^^^^^^^^^^^^^^^^^
#
# It should take around 15-25 min on CPU. On GPU though, it takes less than a
# minute.
#
model_ft = train_model(model_ft, criterion, optimizer_ft, exp_lr_scheduler,
num_epochs=25)
######################################################################
#
visualize_model(model_ft)
######################################################################
# ConvNet as fixed feature extractor
# ----------------------------------
#
# Here, we need to freeze all the network except the final layer. We need
# to set ``requires_grad == False`` to freeze the parameters so that the
# gradients are not computed in ``backward()``.
#
# You can read more about this in the documentation
# `here <http://pytorch.org/docs/notes/autograd.html#excluding-subgraphs-from-backward>`__.
#
model_conv = torchvision.models.resnet18(pretrained=True)
for param in model_conv.parameters():
param.requires_grad = False
# Parameters of newly constructed modules have requires_grad=True by default
num_ftrs = model_conv.fc.in_features
model_conv.fc = nn.Linear(num_ftrs, 2)
model_conv = model_conv.to(device)
criterion = nn.CrossEntropyLoss()
# Observe that only the parameters of the final layer are being optimized, as
# opposed to before.
optimizer_conv = optim.SGD(model_conv.fc.parameters(), lr=0.001, momentum=0.9)
# Decay LR by a factor of 0.1 every 7 epochs
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_conv, step_size=7, gamma=0.1)
######################################################################
# Train and evaluate
# ^^^^^^^^^^^^^^^^^^
#
# On CPU this will take about half the time compared to previous scenario.
# This is expected as gradients don't need to be computed for most of the
# network. However, forward does need to be computed.
#
model_conv = train_model(model_conv, criterion, optimizer_conv,
exp_lr_scheduler, num_epochs=25)
######################################################################
#
visualize_model(model_conv)
plt.ioff()
plt.show()
Thanks everybody!

A bit late, I know, but it might be that you are missing
model.to(cuda)
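For reference, a minimal sketch of what that looks like with the tutorial's names (model_ft, dataloaders and device come from the code above; the torch.cuda.synchronize() calls are only there to make the timing honest, they are not part of the tutorial):
import time
import torch

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model_ft = model_ft.to(device)                      # the model parameters must live on the GPU
print(next(model_ft.parameters()).is_cuda)          # quick check: True means the model really is on the GPU

inputs, labels = next(iter(dataloaders['train']))
inputs, labels = inputs.to(device), labels.to(device)   # every batch has to be moved as well

torch.cuda.synchronize()                            # wait for pending GPU work before starting the clock
start = time.time()
outputs = model_ft(inputs)
torch.cuda.synchronize()                            # wait for the forward pass to actually finish
print('forward pass: {:.4f}s'.format(time.time() - start))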

Related

invalid data type 'numpy.str_' when training BERT model with PyTorch

My data is divided into two parts, training and validation. I used the load_dataset and DataLoader functions, and I convert the data in the dataset to torch format using traindataset.set_format.
When starting training I get the error
new(): invalid data type 'numpy.str_'
on this line:
for step,batch in enumerate(train_dataloader):
So how can I fix this error?
model= MixModel()
#model.load_state_dict(torch.load(r"/media/sh/saved_weightscnnbert.pt"))
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
traindataset = load_dataset('csv', data_files='/content/drive//My Drive/Colab Notebooks/newdataset/newdata_train2',split='train')
testdataset = load_dataset('csv', data_files='/content/drive//My Drive/Colab Notebooks/newdataset/newdata_valid2',split='train')
traindataset =traindataset.map(encode)
testdataset1 = testdataset.map(encode)
traindataset =traindataset.map(lambda examples: {'labels': examples['symptoms']}, batched=True)
testdataset =testdataset1.map(lambda examples: {'labels': examples['symptoms']}, batched=True)
traindataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
testdataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
train_dataloader = torch.utils.data.DataLoader(traindataset, batch_size= 64)
test_dataloader = torch.utils.data.DataLoader(testdataset, batch_size= 64)
# function to train the model
def train():
model.train()
total_loss, total_accuracy = 0, 0
# empty list to save model predictions
total_preds=[]
Labels=[]
# iterate over batches
for step,batch in enumerate(train_dataloader):
# progress update after every 50 batches.
if step % 100 == 0 and not step == 0:
print(' Batch {:>5,} of {:>5,}.'.format(step, len(train_dataloader)))
sent_id, mask, labels = batch['input_ids'],batch['attention_mask'],batch['labels']
# clear previously calculated gradients
model.zero_grad()
# get model predictions for the current batch
preds = model(sent_id, mask, labels)
# compute the loss between actual and predicted values
alpha=0.25
gamma=2
ce_loss = loss_fn(preds, labels)
#pt = torch.exp(-ce_loss)
#focal_loss = (alpha * (1-pt)**gamma * ce_loss).mean() # mean over the batch
# add on to the total loss
total_loss = total_loss + ce_loss.item()
# backward pass to calculate the gradients
ce_loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
# update parameters
optimizer.step()
preds =torch.argmax(preds, dim=1)
total_preds.append(preds)
total_accuracy += (preds == labels).float().sum()
# compute the training loss of the epoch
avg_loss = total_loss / len(train_dataloader)
avg_accuracy = total_accuracy / len(traindataset)
# predictions are in the form of (no. of batches, size of batch, no. of classes).
# reshape the predictions in form of (number of samples, no. of classes)
total_preds = np.concatenate(total_preds, axis=0)
#returns the loss and predictions
return avg_loss, total_preds, avg_accuracy
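This error usually means a string-valued column (for example labels that are still text rather than integer class ids) reaches the default collate function. A quick, hedged way to check which column is responsible, using the dataset defined above:
print(traindataset.features)   # a column still typed as string (e.g. the 'labels' mapped from 'symptoms') is what breaks the default collate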

how "data" and "target" are choosen in a federated learning? (PySyft)

I can't understand how, in the function train() below, the variables (data, target) are chosen.
def train(args, model, device, federated_train_loader, optimizer, epoch):
model.train()
for batch_idx, (data, target) in enumerate(federated_train_loader): # <-- now it is a distributed dataset
model.send(data.location) # <-- NEW: send the model to the right location
I guess they are two tensors representing two random images from the training dataset, but then is the loss function
loss = F.nll_loss(output, target)
calculated at every iteration with a different target?
I also have a different question: I trained the network with images of cats, then I tested it with images of cars, and the accuracy reached is 97%. How is this possible? Is that a proper value, or am I doing something wrong?
Here is the entire code:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
import syft as sy # <-- NEW: import the Pysyft library
hook = sy.TorchHook(torch) # <-- NEW: hook PyTorch ie add extra functionalities to support Federated Learning
bob = sy.VirtualWorker(hook, id="bob") # <-- NEW: define remote worker bob
alice = sy.VirtualWorker(hook, id="alice") # <-- NEW: and alice
class Arguments():
def __init__(self):
self.batch_size = 64
self.test_batch_size = 1000
self.epochs = 2
self.lr = 0.01
self.momentum = 0.5
self.no_cuda = False
self.seed = 1
self.log_interval = 30
self.save_model = False
args = Arguments()
use_cuda = not args.no_cuda and torch.cuda.is_available()
torch.manual_seed(args.seed)
device = torch.device("cuda" if use_cuda else "cpu")
kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
federated_train_loader = sy.FederatedDataLoader( # <-- this is now a FederatedDataLoader
datasets.MNIST("C:\\users...\\train", train=True, download=True,
transform=transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))
]))
.federate((bob, alice)), # <-- NEW: we distribute the dataset across all the workers, it's now a FederatedDataset
batch_size=args.batch_size, shuffle=True, **kwargs)
test_loader = torch.utils.data.DataLoader(
datasets.MNIST("C:\\Users...\\test", train=False, download=True, transform=transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))
])),
batch_size=args.test_batch_size, shuffle=True, **kwargs)
class Net(nn.Module):
def __init__(self):
super(Net, self).__init__()
self.conv1 = nn.Conv2d(1, 20, 5, 1)
self.conv2 = nn.Conv2d(20, 50, 5, 1)
self.fc1 = nn.Linear(4*4*50, 500)
self.fc2 = nn.Linear(500, 10)
def forward(self, x):
x = F.relu(self.conv1(x))
x = F.max_pool2d(x, 2, 2)
x = F.relu(self.conv2(x))
x = F.max_pool2d(x, 2, 2)
x = x.view(-1, 4*4*50)
x = F.relu(self.fc1(x))
x = self.fc2(x)
return F.log_softmax(x, dim=1)
def train(args, model, device, federated_train_loader, optimizer, epoch):
model.train()
for batch_idx, (data, target) in enumerate(federated_train_loader): # <-- now it is a distributed dataset
model.send(data.location) # <-- NEW: send the model to the right location
data, target = data.to(device), target.to(device)
optimizer.zero_grad()
output = model(data)
loss = F.nll_loss(output, target)
loss.backward()
optimizer.step()
model.get() # <-- NEW: get the model back
if batch_idx % args.log_interval == 0:
loss = loss.get() # <-- NEW: get the loss back
print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
epoch, batch_idx * args.batch_size, len(federated_train_loader) * args.batch_size,
100. * batch_idx / len(federated_train_loader), loss.item()))
def test(args, model, device, test_loader):
model.eval()
test_loss = 0
correct = 0
with torch.no_grad():
for data, target in test_loader:
data, target = data.to(device), target.to(device)
output = model(data)
test_loss += F.nll_loss(output, target, reduction='sum').item() # sum up batch loss
pred = output.argmax(1, keepdim=True) # get the index of the max log-probability
correct += pred.eq(target.view_as(pred)).sum().item()
test_loss /= len(test_loader.dataset)
print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
test_loss, correct, len(test_loader.dataset),
100. * correct / len(test_loader.dataset)))
model = Net().to(device)
optimizer = optim.SGD(model.parameters(), lr=args.lr) # TODO momentum is not supported at the moment
for epoch in range(1, args.epochs + 1):
train(args, model, device, federated_train_loader, optimizer, epoch)
test(args, model, device, test_loader)
if (args.save_model):
torch.save(model.state_dict(), "mnist_cnn.pt")
Consider it like this. When you hook torch, all your torch tensors get additional functionality - methods like .send() and .federate(), and attributes like .location and ._objects. Your data and target, which were once torch tensors, became pointers to tensors residing in different VirtualWorker objects because of .federate((bob, alice)).
Now data and target have additional attributes, including .location, which returns the location of that tensor - the tensor pointed to by the pointer called data/target.
Federated learning sends the global model to this location, as seen in model.send(data.location).
Now, model is a pointer residing at the same location and data is also a pointer residing there. Hence when you take the output as output = model(data), output will also reside there and all we (the central server or in other words, the VirtualWorker called 'me') will get is a pointer to that output.
Now, regarding your doubt on loss calculation, since output and target are both residing in that same location, calculation of loss will also happen there. Same goes for backprop and step.
Finally, you can see model.get(); this is where the central server pulls the remote model back using the pointer called model. (I'm not sure if it should be model = model.get(), though.)
So anything with .get() will be pulled from that worker and returned in our Python statement. Also note that .get() removes that object from its location when called, so use .copy().get() if you are going to need it further.
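To make the pointer behaviour concrete, here is a tiny sketch assuming the same PySyft 0.2-style API used in the question (bob is the VirtualWorker defined above):
import torch
import syft as sy

hook = sy.TorchHook(torch)
bob = sy.VirtualWorker(hook, id="bob")

x = torch.tensor([1., 2., 3.])
x_ptr = x.send(bob)          # x_ptr is only a pointer; the data now lives on bob
print(x_ptr.location)        # the VirtualWorker 'bob'

y_ptr = x_ptr + 1            # the addition is executed on bob; we only hold a pointer to the result
y = y_ptr.get()              # pull the result back; it is removed from bob
print(y)                     # tensor([2., 3., 4.])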

PyTorch: accuracy of validation set greater than 100% during training

1) Problem
I observe an odd behaviour during training where my validation-accuracy is above 100% right from the start.
Epoch 0/3
----------
100%|██████████| 194/194 [00:50<00:00, 3.82it/s]
train Loss: 1.8653 Acc: 0.4796
100%|██████████| 194/194 [00:32<00:00, 5.99it/s]
val Loss: 1.7611 Acc: 1.2939
Epoch 1/3
----------
100%|██████████| 194/194 [00:42<00:00, 4.61it/s]
train Loss: 0.8704 Acc: 0.7467
100%|██████████| 194/194 [00:31<00:00, 6.11it/s]
val Loss: 1.0801 Acc: 1.4694
The output indicates that one epoch iterates over 194 batches, which does seem to be correct for the training data (which has a length of 6186; with a batch_size of 32, 32*194 = 6208 ≈ 6186) but does not match the size of the validation data (length of 3447, batch_size = 32).
Hence I would expect my validation loop to generate 108 (3447 / 32 ≈ 108) batches instead of 194.
I thought this behaviour was handled within my for-loop at:
for dataset in tqdm(dataloaders[phase]):
But somehow I can't figure out what is wrong here. See point 3) below for my entire code.
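For what it's worth, the batch counts can be printed directly from the loaders defined in my pipeline below; with the sizes above, a correctly built val loader should report about 108 batches:
print(len(dataloaders['train']))   # number of training batches, ~194 for 6186 images at batch_size 32
print(len(dataloaders['val']))     # should be ~108 for 3447 images at batch_size 32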
2) Question
If my assumption above is correct, i.e. that this error stems from the for-loop in my code, then I would like to know the following:
How do I need to adjust the for-loop during the validation phase so that it uses the correct number of validation batches?
3) Background:
Following two tutorials, one on how to do transfer learning (https://discuss.pytorch.org/t/transfer-learning-using-vgg16/20653) and one on how to do data loading (https://pytorch.org/tutorials/beginner/data_loading_tutorial.html) in PyTorch, I am trying to customize the code so that I can perform transfer learning on a new custom dataset which I want to provide via pandas dataframes.
As such, my training and validation data are provided via two dataframes (df_train & df_val), which both contain two columns, one for the path and one for the target. E.g. like this:
url target
0 C:/Users/aaron/Desktop/pics/4ebd... 9
1 C:/Users/aaron/Desktop/pics/7153... 3
2 C:/Users/aaron/Desktop/pics/3ee6... 3
3 C:/Users/aaron/Desktop/pics/4652... 16
4 C:/Users/aaron/Desktop/pics/28ce... 15
...
And their respective lengths:
print(len(df_train))
print(len(df_val))
>> 6186
>> 3447
My pipeline looks like this:
class CustomDataset(Dataset):
def __init__(self, df, transform=None):
self.dataframe = df_train
self.transform = transform
def __len__(self):
return len(self.dataframe)
def __getitem__(self, idx):
img_name = self.dataframe.iloc[idx, 0]
img = Image.open(img_name)
img_normalized = self.transform(img)
landmarks = self.dataframe.iloc[idx, 1]
sample = {'data': img_normalized, 'label': int(landmarks)}
return sample
train_dataset = CustomDataset(df_train,transform=transforms.Compose([
transforms.Resize(224),
transforms.ToTensor(),transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])]))
val_dataset = CustomDataset(df_val,transform=transforms.Compose([
transforms.Resize(224),
transforms.ToTensor(),transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])]))
train_loader = torch.utils.data.DataLoader(train_dataset,batch_size=32,shuffle=True, num_workers=0)
val_loader = torch.utils.data.DataLoader(val_dataset,batch_size=32,shuffle=True, num_workers=0)
dataloaders = {'train': train_loader, 'val': val_loader}
dataset_sizes = {'train': len(df_train) ,'val': len(df_val)}
################### Training
from tqdm import tqdm
def train_model(model, criterion, optimizer, scheduler, num_epochs=25):
since = time.time()
best_model_wts = copy.deepcopy(model.state_dict())
best_acc = 0.0
for epoch in range(num_epochs):
print('Epoch {}/{}'.format(epoch, num_epochs - 1))
print('-' * 10)
# Each epoch has a training and validation phase
for phase in ['train', 'val']:
if phase == 'train':
scheduler.step()
model.train() # Set model to training mode
else:
model.eval() # Set model to evaluate mode
running_loss = 0.0
running_corrects = 0
# Iterate over data.
for dataset in tqdm(dataloaders[phase]):
inputs, labels = dataset["data"], dataset["label"]
#print(inputs.type())
inputs = inputs.to(device, dtype=torch.float)
labels = labels.to(device,dtype=torch.long)
# zero the parameter gradients
optimizer.zero_grad()
# forward
# track history if only in train
with torch.set_grad_enabled(phase == 'train'):
outputs = model(inputs)
_, preds = torch.max(outputs, 1)
loss = criterion(outputs, labels)
# backward + optimize only if in training phase
if phase == 'train':
loss.backward()
optimizer.step()
# statistics
running_loss += loss.item() * inputs.size(0)
running_corrects += torch.sum(preds == labels.data)
epoch_loss = running_loss / dataset_sizes[phase]
epoch_acc = running_corrects.double() / dataset_sizes[phase]
print('{} Loss: {:.4f} Acc: {:.4f}'.format(
phase, epoch_loss, epoch_acc))
# deep copy the model
if phase == 'val' and epoch_acc > best_acc:
best_acc = epoch_acc
best_model_wts = copy.deepcopy(model.state_dict())
print()
time_elapsed = time.time() - since
print('Training complete in {:.0f}m {:.0f}s'.format(
time_elapsed // 60, time_elapsed % 60))
print('Best val Acc: {:4f}'.format(best_acc))
# load best model weights
model.load_state_dict(best_model_wts)
return model
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model_ft = models.resnet18(pretrained=True)
num_ftrs = model_ft.fc.in_features
model_ft.fc = nn.Linear(num_ftrs, len(le.classes_))
model_ft = model_ft.to(device)
criterion = nn.CrossEntropyLoss()
# Observe that all parameters are being optimized
optimizer_ft = optim.SGD(model_ft.parameters(), lr=0.001, momentum=0.9)
# Decay LR by a factor of 0.1 every 7 epochs
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)
model_ft = train_model(model_ft, criterion, optimizer_ft, exp_lr_scheduler,
num_epochs=4)
Your problem appears to be here:
class CustomDataset(Dataset):
def __init__(self, df, transform=None):
>>>>> self.dataframe = df_train
This should be
self.dataframe = df
In your case, you are inadvertently setting both the train and val CustomDataset to df_train ...
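With that fix applied, the constructor of the question's CustomDataset becomes:
class CustomDataset(Dataset):
    def __init__(self, df, transform=None):
        self.dataframe = df        # use the dataframe that was passed in, not the global df_train
        self.transform = transform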

Neural networks in PyTorch

I am very new to PyTorch and am implementing my own image-classifier network. However, I see that for each epoch the training accuracy is very good but the validation accuracy is 0; I noted this up to the 5th epoch. I am using the Adam optimizer with a learning rate of 0.001, and I am also resampling the whole dataset after each epoch into training and validation sets. Please help me find where I am going wrong.
Here is my code:
### where is data?
data_dir_train = '/home/sup/PycharmProjects/deep_learning/CNN_Data/training_set'
data_dir_test = '/home/sup/PycharmProjects/deep_learning/CNN_Data/test_set'
# Define your batch_size
batch_size = 64
allData = datasets.ImageFolder(root=data_dir_train,transform=transformArr)
# We need to further split our training dataset into training and validation sets.
def split_train_validation():
# Define the indices
num_train = len(allData)
indices = list(range(num_train)) # start with all the indices in training set
split = int(np.floor(0.2 * num_train)) # define the split size
#train_idx, valid_idx = indices[split:], indices[:split]
# Random, non-contiguous split
validation_idx = np.random.choice(indices, size=split, replace=False)
train_idx = list(set(indices) - set(validation_idx))
# define our samplers -- we use a SubsetRandomSampler because it will return
# a random subset of the split defined by the given indices without replacement
train_sampler = SubsetRandomSampler(train_idx)
validation_sampler = SubsetRandomSampler(validation_idx)
#train_loader = DataLoader(allData,batch_size=batch_size,sampler=train_sampler,shuffle=False,num_workers=4)
#validation_loader = DataLoader(dataset=allData,batch_size=1, sampler=validation_sampler)
return (train_sampler,validation_sampler)
Training
from torch.optim import Adam
import torch
import createNN
import torch.nn as nn
import loadData as ld
from torch.autograd import Variable
from torch.utils.data import DataLoader
# check if cuda - GPU support available
cuda = torch.cuda.is_available()
#create model, optimizer and loss function
model = createNN.ConvNet(class_num=2)
optimizer = Adam(model.parameters(),lr=.001,weight_decay=.0001)
loss_func = nn.CrossEntropyLoss()
if cuda:
model.cuda()
# function to save model
def save_model(epoch):
torch.save(model.load_state_dict(),'imageClassifier_{}.model'.format(epoch))
print('saved model at epoch',epoch)
def exp_lr_scheduler ( epoch , init_lr = args.lr, weight_decay = args.weight_decay, lr_decay_epoch = cf.lr_decay_epoch):
lr = init_lr * ( 0.5 ** (epoch // lr_decay_epoch))
def train(num_epochs):
best_acc = 0.0
for epoch in range(num_epochs):
print('\n\nEpoch {}'.format(epoch))
train_sampler, validation_sampler = ld.split_train_validation()
train_loader = DataLoader(ld.allData, batch_size=30, sampler=train_sampler, shuffle=False)
validation_loader = DataLoader(dataset=ld.allData, batch_size=1, sampler=validation_sampler)
model.train()
acc = 0.0
loss = 0.0
total = 0
# train model with training data
for i,(images,labels) in enumerate(train_loader):
# if cuda then move to GPU
if cuda:
images = images.cuda()
labels = labels.cuda()
# Variable class wraps a tensor and we can calculate grad
images = Variable(images)
labels = Variable(labels)
# reset accumulated gradients for each batch
optimizer.zero_grad()
# pass images to the model, which returns a prediction
output = model(images)
#calculate the loss based on prediction and actual
loss = loss_func(output,labels)
# backpropagate the loss and compute gradient
loss.backward()
# update weights as per the computed gradients
optimizer.step()
# prediction class
predVal , predClass = torch.max(output.data, 1)
acc += torch.sum(predClass == labels.data)
loss += loss.cpu().data[0]
total += labels.size(0)
# print the statistics
train_acc = acc/total
train_loss = loss / total
print('Mean train acc = {} over epoch = {}'.format(epoch,acc))
print('Mean train loss = {} over epoch = {}'.format(epoch, loss))
# Validate model with validation data
model.eval()
acc = 0.0
loss = 0.0
total = 0
for i,(images,labels) in enumerate(validation_loader):
# if cuda then move to GPU
if cuda:
images = images.cuda()
labels = labels.cuda()
# Variable class wraps a tensor and we can calculate grad
images = Variable(images)
labels = Variable(labels)
# reset accumulated gradients for each batch
optimizer.zero_grad()
# pass images to the model, which returns a prediction
output = model(images)
#calculate the loss based on prediction and actual
loss = loss_func(output,labels)
# backpropagate the loss and compute gradient
loss.backward()
# update weights as per the computed gradients
optimizer.step()
# prediction class
predVal, predClass = torch.max(output.data, 1)
acc += torch.sum(predClass == labels.data)
loss += loss.cpu().data[0]
total += labels.size(0)
# print the statistics
valid_acc = acc / total
valid_loss = loss / total
print('Mean train acc = {} over epoch = {}'.format(epoch, valid_acc))
print('Mean train loss = {} over epoch = {}'.format(epoch, valid_loss))
if(best_acc<valid_acc):
best_acc = valid_acc
save_model(epoch)
# at 30th epoch we save the model
if (epoch == 30):
save_model(epoch)
train(20)
I think you did not take into account that acc += torch.sum(predClass == labels.data) returns a tensor instead of a float value. Depending on the version of PyTorch you are using, I think you should change it to:
acc += torch.sum(predClass == labels.data).cpu().data[0] #pytorch 0.3
acc += torch.sum(predClass == labels.data).item() #pytorch 0.4
Although your code seems to work with the old PyTorch version, I would recommend you upgrade to version 0.4.
Also, I noticed other problems/typos in your code.
You are loading the dataset for every epoch.
for epoch in range(num_epochs):
print('\n\nEpoch {}'.format(epoch))
train_sampler, validation_sampler = ld.split_train_validation()
train_loader = DataLoader(ld.allData, batch_size=30, sampler=train_sampler, shuffle=False)
validation_loader = DataLoader(dataset=ld.allData, batch_size=1, sampler=validation_sampler)
...
That should not happen; it is enough to load it once:
train_sampler, validation_sampler = ld.split_train_validation()
train_loader = DataLoader(ld.allData, batch_size=30, sampler=train_sampler, shuffle=False)
validation_loader = DataLoader(dataset=ld.allData, batch_size=1, sampler=validation_sampler)
for epoch in range(num_epochs):
print('\n\nEpoch {}'.format(epoch))
...
In the training part you have (this does not happen in the validation):
train_acc = acc/total
train_loss = loss / total
print('Mean train acc = {} over epoch = {}'.format(epoch,acc))
print('Mean train loss = {} over epoch = {}'.format(epoch, loss))
Where you are printing acc instead of train_acc.
Also, in the validation part I noticed that you are printing print('Mean train acc = {} over epoch = {}'.format(epoch, valid_acc)) when it should say something like 'Mean val acc'.
Changing these lines of code and using a standard model I created with the CIFAR dataset, the training seems to converge: accuracy increases at every epoch while the mean loss value decreases.
I hope I could help you!
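For completeness, the statistics block with those fixes applied might look roughly like this (a sketch for PyTorch 0.4, reusing the variable names from the question):
        # inside the batch loop
        predVal, predClass = torch.max(output.data, 1)
        acc += torch.sum(predClass == labels.data).item()   # .item() converts the 0-dim tensor to a Python number
        total += labels.size(0)

    # after the loop
    train_acc = acc / total
    print('Mean train acc = {} over epoch = {}'.format(train_acc, epoch))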

How to train a PyTorch net

I'm using this PyTorch implementation of SegNet, with pretrained values I found, for object segmentation, and it works fine.
Now I want to resume the training from the values I have, using a new dataset with similar images.
How can I do that?
I guess I have to use the "train.py" file found in the repository, but I don't know what to write in order to replace the "fill the batch" comment.
Here is that portion of the code:
def train(epoch):
model.train()
# update learning rate
lr = args.lr * (0.1 ** (epoch // 30))
for param_group in optimizer.param_groups:
param_group['lr'] = lr
# define a weighted loss (0 weight for 0 label)
weights_list = [0]+[1 for i in range(17)]
weights = np.asarray(weights_list)
weigthtorch = torch.Tensor(weights_list)
if(USE_CUDA):
loss = nn.CrossEntropyLoss(weight=weigthtorch).cuda()
else:
loss = nn.CrossEntropyLoss(weight=weigthtorch)
total_loss = 0
# iteration over the batches
batches = []
for batch_idx,batch_files in enumerate(tqdm(batches)):
# containers
batch = np.zeros((args.batch_size,input_nbr, imsize, imsize), dtype=float)
batch_labels = np.zeros((args.batch_size,imsize, imsize), dtype=int)
# fill the batch
# ...
# What should I write here?
batch_th = Variable(torch.Tensor(batch))
target_th = Variable(torch.LongTensor(batch_labels))
if USE_CUDA:
batch_th =batch_th.cuda()
target_th = target_th.cuda()
# initialize gradients
optimizer.zero_grad()
# predictions
output = model(batch_th)
# Loss
output = output.view(output.size(0),output.size(1), -1)
output = torch.transpose(output,1,2).contiguous()
output = output.view(-1,output.size(2))
target = target.view(-1)
l_ = loss(output.cuda(), target)
total_loss += l_.cpu().data.numpy()
l_.cuda()
l_.backward()
optimizer.step()
return total_loss/len(files)
If I had to guess, he probably made some DataLoader feeder that extended the PyTorch DataLoader class. See
https://pytorch.org/tutorials/beginner/data_loading_tutorial.html
Near the bottom of the page you can see an example in which they loop over their data loader
for i_batch, sample_batched in enumerate(dataloader):
What this would look like for images, for example, is:
trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=False, transform=transform_train)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=batchSize, shuffle=True, num_workers=2)
for batch_idx, (inputs, targets) in enumerate(trainloader):
# Using the pytorch data loader the inputs and targets are given
# automatically
inputs, targets = inputs.cuda(), targets.cuda()
optimizer.zero_grad()
inputs, targets = Variable(inputs), Variable(targets)
How exactly the author loads his files I don't know. You could follow the procedure from https://pytorch.org/tutorials/beginner/data_loading_tutorial.html to make your own DataLoader, though.
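If the repository has no ready-made loader, a minimal custom Dataset could look like the sketch below; the folder layout, file naming and preprocessing are assumptions for illustration, not something taken from the SegNet repository:
import os
from PIL import Image
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader

class SegmentationDataset(Dataset):
    """Loads (image, label-mask) pairs from two parallel folders with matching file names."""
    def __init__(self, image_dir, mask_dir, imsize=224):
        self.image_dir = image_dir
        self.mask_dir = mask_dir
        self.files = sorted(os.listdir(image_dir))
        self.imsize = imsize

    def __len__(self):
        return len(self.files)

    def __getitem__(self, idx):
        name = self.files[idx]
        img = Image.open(os.path.join(self.image_dir, name)).convert('RGB').resize((self.imsize, self.imsize))
        mask = Image.open(os.path.join(self.mask_dir, name)).resize((self.imsize, self.imsize), Image.NEAREST)
        img = torch.from_numpy(np.array(img, dtype=np.float32).transpose(2, 0, 1) / 255.0)   # CHW float image
        mask = torch.from_numpy(np.array(mask, dtype=np.int64))                              # HxW class indices
        return img, mask

# The "fill the batch" loop can then be replaced by iterating over a DataLoader, e.g.:
# train_loader = DataLoader(SegmentationDataset('images/train', 'masks/train'), batch_size=4, shuffle=True)
# for batch_th, target_th in train_loader:
#     ...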
