How to get activation values of a layer in pytorch - python

I have a pytorch-lightning model that has a dense layer like so:
def __init__(...)
...
self.dense = nn.Linear(channels[-1], 64, bias=True)
...
For my project, I need to get the activation values of this layer as a list.
I have tried this code which I found on the pytorch discussion forum:
# Maps a layer name to the most recent output captured by its forward hook.
activation = {}


def get_activation(name):
    """Build a forward hook that records a module's output under ``name``.

    The returned callable takes ``(model, input, output)`` and, on every
    call, overwrites ``activation[name]`` with the detached ``output``.
    """
    def hook(model, input, output):
        # detach() so the stored tensor is cut off from the autograd graph.
        activation[name] = output.detach()

    return hook
# Load one image, resize it to 128x128 and move axis 2 to the front.
test_img = cv.imread(f'digimage/100.jpg')
test_img = cv.resize(test_img, (128, 128))
test_img = np.moveaxis(test_img, 2, 0)

# Build the model, replace its classifier head, load weights, and attach the
# forward hook to the ``dense`` layer.
modelftr = load_feature_model(**model_dict)
num_ftrs = modelftr.fc.in_features
modelftr.fc = torch.nn.Linear(num_ftrs, 228)
modelftr.load_state_dict(torch.load('...'))
modelftr.dense.register_forward_hook(get_activation('dense'))

with torch.no_grad():
    # NOTE(review): the hook is registered on ``modelftr``, but ``modelatt``
    # is switched to eval mode and ``modelcat`` is the model that is run.
    # Neither is defined in this snippet; if they are different objects the
    # hook never fires and ``activation`` stays empty -- confirm.
    modelatt.to('cpu')
    modelatt.eval()
    test_img = torch.tensor(test_img).view(-1, 3, 128, 128).float()
    output = modelcat(test_img)
    print(activation['dense'])
But this gives a keyerror:
8 test_img = torch.tensor(test_img).view(-1, 3, 128, 128).float()
9 output = modelcat(test_img)
---> 10 print(activation['dense'])
KeyError: 'dense'
Update:
This is my full model code.
As you can see there is a linear layer named dense
class FAtNet(pl.LightningModule):
    """Five-stage backbone (``s0``-``s4``) followed by average pooling, a
    64-unit ``dense`` layer and a bias-free classifier ``fc``.

    Args:
        image_size: ``(height, width)`` pair; the stages are built for the
            resolutions ``//2``, ``//4``, ``//8``, ``//16`` and ``//32``.
        in_channels: channel count of the input tensor.
        num_blocks: number of blocks per stage (indices 0-4 are read).
        channels: output channels per stage (indices 0-4 are read); the last
            entry is the input width of ``dense``.
        num_classes: output width of ``fc``.
        block_types: ``'C'`` (``MBConv``) or ``'T'`` (``Transformer``) for
            each of the stages ``s1``-``s4``.
        lr: learning rate handed to Adam in ``configure_optimizers``.
        loss_function: callable ``(prediction, target) -> loss``.
    """

    # block_types defaults to a tuple so the default is not a shared mutable
    # list; it is only indexed, so lists passed by callers still work.
    def __init__(self, image_size, in_channels, num_blocks, channels,
                 num_classes=20, block_types=('C', 'C', 'T', 'T'), lr=0.0001,
                 loss_function=nn.CrossEntropyLoss()):
        super().__init__()
        self.lr = lr
        self.loss_function = loss_function
        ih, iw = image_size
        block = {'C': MBConv, 'T': Transformer}

        self.s0 = self._make_layer(
            conv_3x3_bn, in_channels, channels[0], num_blocks[0], (ih // 2, iw // 2))
        self.s1 = self._make_layer(
            block[block_types[0]], channels[0], channels[1], num_blocks[1], (ih // 4, iw // 4))
        self.s2 = self._make_layer(
            block[block_types[1]], channels[1], channels[2], num_blocks[2], (ih // 8, iw // 8))
        self.s3 = self._make_layer(
            block[block_types[2]], channels[2], channels[3], num_blocks[3], (ih // 16, iw // 16))
        self.s4 = self._make_layer(
            block[block_types[3]], channels[3], channels[4], num_blocks[4], (ih // 32, iw // 32))

        self.pool = nn.AvgPool2d(ih // 32, 1)
        self.dense = nn.Linear(channels[-1], 64, bias=True)
        self.fc = nn.Linear(64, num_classes, bias=False)

    def forward(self, x):
        """Run the five stages, pool, flatten, then apply ``dense`` and ``fc``."""
        x = self.s0(x)
        x = self.s1(x)
        x = self.s2(x)
        x = self.s3(x)
        x = self.s4(x)
        # Flatten the pooled map to (-1, channels) before the linear layers.
        x = self.pool(x).view(-1, x.shape[1])
        x = self.dense(x)
        x = self.fc(x)
        return x

    def _make_layer(self, block, inp, oup, depth, image_size):
        """Stack ``depth`` blocks; only the first maps ``inp`` -> ``oup`` and downsamples."""
        layers = []
        for i in range(depth):
            if i == 0:
                layers.append(block(inp, oup, image_size, downsample=True))
            else:
                layers.append(block(oup, oup, image_size))
        return nn.Sequential(*layers)

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters with ``self.lr``."""
        return optim.Adam(self.parameters(), lr=self.lr)

    def training_step(self, batch, batch_idx):
        """Compute and log ``train_loss`` for one ``(X, y)`` batch."""
        X, y = batch
        y_hat = self(X)
        loss = self.loss_function(y_hat, y)
        self.log('train_loss', loss)
        return loss

    def test_step(self, batch, batch_idx):
        """Compute and log ``test_loss`` for one ``(X, y)`` batch."""
        X, y = batch
        y_hat = self(X)
        loss = self.loss_function(y_hat, y)
        self.log('test_loss', loss)
        return loss

    ### custom prediction function ###
    def predict(self, dm):
        """Run the model over ``dm.X_test`` and return the first output on the CPU.

        The module is put in eval mode and moved to CUDA for the loop, then
        moved back to the CPU and returned to train mode before returning.
        """
        X_test = dm.X_test
        self.eval()
        X_test = torch.tensor(X_test).float()
        self.to(device='cuda')
        pred = []
        with torch.no_grad():
            # NOTE(review): ``data`` is not moved to CUDA although the module
            # is -- confirm this runs as intended.
            for data in X_test:
                output = self(data)
                pred.append(output)
        # NOTE(review): only the first collected output is returned; the
        # remaining entries of ``pred`` are discarded -- confirm.
        pred = pred[0].detach()
        pred = pred.cpu()
        self.to(device='cpu')
        self.train()
        return pred
def count_parameters(model):
    """Return the total element count of ``model``'s parameters that require gradients."""
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

It seems like your model does not have a 'dense' layer, only 'fc'.
Try:
modelftr.fc.register_forward_hook(get_activation('fc'))

Related

RuntimeError: The expanded size of the tensor (1024) must match the existing size (256) at non-singleton dimension 1

I want to use Fast.ai inference code from the following source:
https://www.kaggle.com/code/thedevastator/inference-fastai-baseline/notebook
The size of the tiles sz is defined as 256.
# Inference configuration shared by the dataset, the predictor and the
# prediction loop below.
bs = 64  # batch size passed to the DataLoader
sz = 256 # the size of tiles
reduce = 4 # reduce the original images by 4 times
TH = 0.225 # threshold for positive predictions
DATA = '../input/hubmap-organ-segmentation/test_images/'
# One row per test image; the prediction loop reads each row's 'id'.
df_sample = pd.read_csv('../input/hubmap-organ-segmentation/sample_submission.csv')
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#iterator like wrapper that returns predicted masks
class Model_pred:
    """Iterator-like wrapper that yields ``(prediction, index)`` pairs.

    Args:
        models: models whose sigmoid outputs are averaged.
        dl: data loader yielding ``(x, y)`` batches; entries with ``y < 0``
            are skipped.
        tta: when true, also average predictions over x, y and xy flips.
        half: when true, cast the input batch to half precision.
    """

    def __init__(self, models, dl, tta:bool=True, half:bool=False):
        self.models = models
        self.dl = dl
        self.tta = tta
        self.half = half

    def __iter__(self):
        count=0
        with torch.no_grad():
            for x,y in iter(self.dl):
                print('Yusuf', x.shape, y.shape)
                if ((y>=0).sum() > 0): #exclude empty images
                    x = x[y>=0].to(device)
                    y = y[y>=0]
                    if self.half: x = x.half()
                    # Accumulate sigmoid probabilities over all models.
                    py = None
                    for model in self.models:
                        p = model(x)
                        p = torch.sigmoid(p).detach()
                        if py is None: py = p
                        else: py += p
                    if self.tta:
                        #x,y,xy flips as TTA
                        flips = [[-1],[-2],[-2,-1]]
                        for f in flips:
                            xf = torch.flip(x,f)
                            for model in self.models:
                                p = model(xf)
                                # Undo the flip so predictions line up with x.
                                p = torch.flip(p,f)
                                py += torch.sigmoid(p).detach()
                        py /= (1+len(flips))
                    py /= len(self.models)
                    print('Yusuf2', py.shape)
                    # NOTE(review): the upsample by ``reduce`` and the
                    # channels-last permute are disabled below, so yielded
                    # tiles keep the network's output resolution. A caller
                    # that allocates masks for upsampled tiles will hit a
                    # shape mismatch -- confirm this is intended.
                    # py = F.upsample(py, scale_factor=reduce, mode="bilinear")
                    # py = py.permute(0,2,3,1).float().cpu()
                    py = py.float().cpu()
                    py = np.argmax(py, axis=1)
                    batch_size = len(py)
                    for i in range(batch_size):
                        yield py[i],y[i]
                        count += 1

    def __len__(self):
        """Number of items in the wrapped loader's dataset."""
        return len(self.dl.dataset)
Models
class FPN(nn.Module):
    """Project several feature maps with small conv stacks, upsample them and
    concatenate them with ``last_layer`` along the channel axis.

    Args:
        input_channels: channel count of each incoming feature map.
        output_channels: channel count each map is projected to.
    """

    def __init__(self, input_channels:list, output_channels:list):
        super().__init__()
        # One conv -> ReLU -> BatchNorm -> conv stack per input feature map.
        self.convs = nn.ModuleList(
            [nn.Sequential(nn.Conv2d(in_ch, out_ch*2, kernel_size=3, padding=1),
                           nn.ReLU(inplace=True), nn.BatchNorm2d(out_ch*2),
                           nn.Conv2d(out_ch*2, out_ch, kernel_size=3, padding=1))
             for in_ch, out_ch in zip(input_channels, output_channels)])

    def forward(self, xs:list, last_layer):
        # The i-th map is upsampled by 2**(number_of_convs - i), so earlier
        # entries of ``xs`` are enlarged the most.
        hcs = [F.interpolate(c(x),scale_factor=2**(len(self.convs)-i),mode='bilinear')
               for i,(c,x) in enumerate(zip(self.convs, xs))]
        hcs.append(last_layer)
        return torch.cat(hcs, dim=1)
class UnetBlock(Module):
    """Decoder block: pass ``up_in`` through ``PixelShuffle_ICNR`` (halving
    its channels), concatenate it with the batch-normalised skip input and
    apply two conv layers.

    Args:
        up_in_c: channels of the tensor coming from the deeper level.
        x_in_c: channels of the skip-connection tensor.
        nf: output channels; defaults to ``max(up_in_c//2, 32)``.
        blur: forwarded to ``PixelShuffle_ICNR``.
        self_attention: when true, ``conv2`` gets a ``SelfAttention`` extra.
        **kwargs: forwarded to ``PixelShuffle_ICNR`` and both ``ConvLayer``s.
    """

    def __init__(self, up_in_c:int, x_in_c:int, nf:int=None, blur:bool=False,
                 self_attention:bool=False, **kwargs):
        super().__init__()
        self.shuf = PixelShuffle_ICNR(up_in_c, up_in_c//2, blur=blur, **kwargs)
        self.bn = nn.BatchNorm2d(x_in_c)
        ni = up_in_c//2 + x_in_c
        nf = nf if nf is not None else max(up_in_c//2,32)
        self.conv1 = ConvLayer(ni, nf, norm_type=None, **kwargs)
        self.conv2 = ConvLayer(nf, nf, norm_type=None,
                               xtra=SelfAttention(nf) if self_attention else None, **kwargs)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, up_in:Tensor, left_in:Tensor) -> Tensor:
        s = left_in
        up_out = self.shuf(up_in)
        # Only the skip input is batch-normalised before concatenation.
        cat_x = self.relu(torch.cat([up_out, self.bn(s)], dim=1))
        return self.conv2(self.conv1(cat_x))
class _ASPPModule(nn.Module):
def __init__(self, inplanes, planes, kernel_size, padding, dilation, groups=1):
super().__init__()
self.atrous_conv = nn.Conv2d(inplanes, planes, kernel_size=kernel_size,
stride=1, padding=padding, dilation=dilation, bias=False, groups=groups)
self.bn = nn.BatchNorm2d(planes)
self.relu = nn.ReLU()
self._init_weight()
def forward(self, x):
x = self.atrous_conv(x)
x = self.bn(x)
return self.relu(x)
def _init_weight(self):
for m in self.modules():
if isinstance(m, nn.Conv2d):
torch.nn.init.kaiming_normal_(m.weight)
elif isinstance(m, nn.BatchNorm2d):
m.weight.data.fill_(1)
m.bias.data.zero_()
class ASPP(nn.Module):
    """Parallel dilated-convolution branches plus a global-pooling branch,
    concatenated and fused by a 1x1 convolution.

    Args:
        inplanes: input channels.
        mid_c: channels produced by every branch.
        dilations: dilation (and padding) of each 3x3 branch.
        out_c: output channels; defaults to ``mid_c``.
    """

    # dilations defaults to a tuple so the default is not a shared mutable
    # list; it is only iterated and measured, so lists still work.
    def __init__(self, inplanes=512, mid_c=256, dilations=(6, 12, 18, 24), out_c=None):
        super().__init__()
        # One 1x1 branch plus one grouped 3x3 branch per dilation rate.
        branches = [_ASPPModule(inplanes, mid_c, 1, padding=0, dilation=1)]
        branches += [_ASPPModule(inplanes, mid_c, 3, padding=d, dilation=d, groups=4)
                     for d in dilations]
        self.aspps = nn.ModuleList(branches)
        self.global_pool = nn.Sequential(nn.AdaptiveMaxPool2d((1, 1)),
                                         nn.Conv2d(inplanes, mid_c, 1, stride=1, bias=False),
                                         nn.BatchNorm2d(mid_c), nn.ReLU())
        out_c = out_c if out_c is not None else mid_c
        # Fused input width: global-pool branch + 1x1 branch + dilated branches.
        self.out_conv = nn.Sequential(nn.Conv2d(mid_c*(2+len(dilations)), out_c, 1, bias=False),
                                      nn.BatchNorm2d(out_c), nn.ReLU(inplace=True))
        # NOTE(review): conv1 is not used in forward(); it is kept so the set
        # of registered parameters is unchanged.
        self.conv1 = nn.Conv2d(mid_c*(2+len(dilations)), out_c, 1, bias=False)
        self._init_weight()

    def forward(self, x):
        x0 = self.global_pool(x)
        xs = [aspp(x) for aspp in self.aspps]
        # Resize the pooled 1x1 map to the spatial size of the branches.
        x0 = F.interpolate(x0, size=xs[0].size()[2:], mode='bilinear', align_corners=True)
        x = torch.cat([x0] + xs, dim=1)
        return self.out_conv(x)

    def _init_weight(self):
        """Kaiming-normal init for convs; BatchNorm weight set to 1 and bias to 0."""
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                torch.nn.init.kaiming_normal_(m.weight)
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()
from torchvision.models.resnet import ResNet, Bottleneck
class UneXt50(nn.Module):
    """U-Net style network: a ResNet ``Bottleneck`` encoder (groups=32,
    width_per_group=4), an ASPP bottleneck, four ``UnetBlock`` decoder
    stages and an FPN head.

    Args:
        stride: multiplier for the ASPP dilation rates (1x .. 4x).
        **kwargs: accepted but not used in this constructor.
    """

    def __init__(self, stride=1, **kwargs):
        super().__init__()
        #encoder
        m = ResNet(Bottleneck, [3, 4, 6, 3], groups=32, width_per_group=4)
        #m = torch.hub.load('facebookresearch/semi-supervised-ImageNet1K-models',
        #                   'resnext50_32x4d_ssl')
        self.enc0 = nn.Sequential(m.conv1, m.bn1, nn.ReLU(inplace=True))
        self.enc1 = nn.Sequential(nn.MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1),
                                  m.layer1) #256
        self.enc2 = m.layer2 #512
        self.enc3 = m.layer3 #1024
        self.enc4 = m.layer4 #2048
        #aspp with customized dilatations
        self.aspp = ASPP(2048,256,out_c=512,dilations=[stride*1,stride*2,stride*3,stride*4])
        self.drop_aspp = nn.Dropout2d(0.5)
        #decoder
        self.dec4 = UnetBlock(512,1024,256)
        self.dec3 = UnetBlock(256,512,128)
        self.dec2 = UnetBlock(128,256,64)
        self.dec1 = UnetBlock(64,64,32)
        self.fpn = FPN([512,256,128,64],[16]*4)
        self.drop = nn.Dropout2d(0.1)
        # 32 decoder channels + 4 FPN branches of 16 channels -> 1 output channel.
        self.final_conv = ConvLayer(32+16*4, 1, ks=1, norm_type=None, act_cls=None)

    def forward(self, x):
        enc0 = self.enc0(x)
        enc1 = self.enc1(enc0)
        enc2 = self.enc2(enc1)
        enc3 = self.enc3(enc2)
        enc4 = self.enc4(enc3)
        enc5 = self.aspp(enc4)
        # Each decoder stage consumes the previous stage and the matching skip.
        dec3 = self.dec4(self.drop_aspp(enc5),enc3)
        dec2 = self.dec3(dec3,enc2)
        dec1 = self.dec2(dec2,enc1)
        dec0 = self.dec1(dec1,enc0)
        x = self.fpn([enc5, dec3, dec2, dec1], dec0)
        x = self.final_conv(self.drop(x))
        x = F.interpolate(x,scale_factor=2,mode='bilinear')
        return x
# Build the list of models handed to Model_pred.
models = []
MODELS = [FPN, UnetBlock, _ASPPModule, ASPP]
# NOTE(review): ``path`` is never used and no weights are loaded, so every
# entry is a freshly initialised UneXt50. ``MODELS`` lists classes, not
# checkpoint paths -- confirm this is intended.
for path in MODELS:
    model = UneXt50()
    model.float()
    model.eval()
    model.to(device)
    models.append(model)
Prediction
# For every row of the sample submission: predict all tiles, stitch them into
# one mask, crop the padding and run-length encode the result.
names,preds = [],[]
for idx,row in tqdm(df_sample.iterrows(),total=len(df_sample)):
    idx = str(row['id'])
    ds = HuBMAPDataset(idx)
    #rasterio cannot be used with multiple workers
    dl = DataLoader(ds,bs,num_workers=0,shuffle=False,pin_memory=True)
    mp = Model_pred(models,dl)
    #generate masks
    mask = torch.zeros(len(ds),ds.sz,ds.sz,dtype=torch.int8)
    # Each yielded pair is (tile prediction, tile index); threshold and store.
    for p,i in iter(mp): mask[i.item()] = p.squeeze(-1) > TH

    #reshape tiled masks into a single mask and crop padding
    mask = mask.view(ds.n0max,ds.n1max,ds.sz,ds.sz).\
        permute(0,2,1,3).reshape(ds.n0max*ds.sz,ds.n1max*ds.sz)
    mask = mask[ds.pad0//2:-(ds.pad0-ds.pad0//2) if ds.pad0 > 0 else ds.n0max*ds.sz,
                ds.pad1//2:-(ds.pad1-ds.pad1//2) if ds.pad1 > 0 else ds.n1max*ds.sz]

    #convert to rle
    rle = rle_encode_less_memory(mask.numpy())
    names.append(idx)
    preds.append(rle)
    # Free the per-image objects before the next iteration.
    del mask, ds, dl
    gc.collect()
Traceback:
0%| | 0/1 [00:00<?, ?it/s]
Yusuf torch.Size([4, 3, 256, 256]) torch.Size([4])
0%| | 0/1 [00:06<?, ?it/s]
Yusuf2 torch.Size([4, 1, 256, 256])
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
/tmp/ipykernel_33/1431546850.py in <module>
8 #generate masks
9 mask = torch.zeros(len(ds),ds.sz,ds.sz,dtype=torch.int8)
---> 10 for p,i in iter(mp): mask[i.item()] = p.squeeze(-1) > TH
11
12 #reshape tiled masks into a single mask and crop padding
RuntimeError: The expanded size of the tensor (1024) must match the existing size (256) at non-singleton dimension 1. Target sizes: [1024, 1024]. Tensor sizes: [256, 256]

How to plot Receptive Fields, for a CNN/fashionMNIST?

I created my CNN with PyTorch Lightning, and I am actually looking for plotting the Receptive Fields.
Do you have any suggestions about it?
I have looked at several solutions, but I can't get any of them to work with PyTorch Lightning.
Is it possible to visualize the Receptive fields directly inside Tensorboard?
I'll share with you my Dataset:
# FashionMNIST training data, split 80/20 into train and validation subsets.
train_dataset = torchvision.datasets.FashionMNIST('classifier_data', train=True, download=True, transform=transforms.ToTensor())
train, val = train_test_split(train_dataset, test_size = .2)
train_loader = DataLoader(train, batch_size = 32)
# The validation loader must read the held-out ``val`` split, not ``train``.
val_loader = DataLoader(val, batch_size = 32)
test_dataset = torchvision.datasets.FashionMNIST('classifier_data', train=False, download=True, transform=transforms.ToTensor())
test_loader = DataLoader(test_dataset, batch_size = 32)
and CNN:
def __init__(self, dropout, learn_rate, momentum, weight_decay, optimizer):
    """Build two conv layers, three linear layers, dropout, loss and metric.

    Args:
        dropout: probability passed to ``nn.Dropout``.
        learn_rate: stored for ``configure_optimizers``.
        momentum: stored on the instance.
        weight_decay: stored for ``configure_optimizers``.
        optimizer: optimizer class/factory, called in ``configure_optimizers``.
    """
    #def __init__(self, dropout, learn_rate, weight_decay, optimizer):
    super().__init__()
    self.conv1 = nn.Conv2d(in_channels = 1, out_channels = 6, kernel_size = 5)
    self.conv2 = nn.Conv2d(in_channels = 6, out_channels = 12 , kernel_size = 5)
    self.fc1 = nn.Linear(in_features = 12*4*4, out_features = 120)
    self.fc2 = nn.Linear(in_features = 120, out_features = 60)
    self.out = nn.Linear(in_features = 60, out_features = 10)
    self.do = nn.Dropout(dropout) #for overfitting issues
    self.loss = nn.CrossEntropyLoss()
    self.accuracy = torchmetrics.Accuracy()
    self.learn_rate = learn_rate
    self.momentum = momentum #with Adam we don't have momentum. To Check best Optimizer with Optune, please comment this line.
    self.weight_decay = weight_decay
    self.optimizer = optimizer
    self.train_loss = []
    self.val_loss = []
    self.train_acc = []
    self.test_acc = []
    #plot into tensorboard
    log_dir = pathlib.Path.cwd() / "lightning_logs"
    self.writer = SummaryWriter(log_dir)
#forward step
#I add each layer to the histogram. It's plotted into tensorboard
def forward(self, x, additional_out=False):
    """Run the network and write a histogram of each layer's raw output.

    ``additional_out`` is accepted but not used in this method.
    """
    #conv1
    x = self.conv1(x)
    self.writer.add_histogram("First convolutional layer CNN", x)
    x = F.relu(x)
    x = F.max_pool2d(x, kernel_size = 2, stride = 2)
    #conv2
    x = self.conv2(x)
    self.writer.add_histogram("Second convolutional layer CNN", x)
    x = F.relu(x)
    x = F.max_pool2d(x, kernel_size = 2, stride = 2)
    #fuly connected 1
    # Flatten to the 12*4*4 features expected by fc1.
    x = x.reshape(-1, 12*4*4)
    x = self.fc1(x)
    self.writer.add_histogram("First linear layer CNN", x)
    x = F.relu(x)
    x = self.do(x)
    #fully connected 2
    x=self.fc2(x)
    self.writer.add_histogram("Second linear layer CNN", x)
    x = F.relu(x)
    x = self.do(x)
    #output
    x = self.out(x)
    self.writer.add_histogram("Output layer CNN", x)
    return x
#optimizer
def configure_optimizers(self):
    """Instantiate ``self.optimizer`` over all parameters with the stored
    learning rate and weight decay, and return it."""
    #optimizer = self.optimizer(self.parameters(), lr = self.learn_rate, momentum = self.momentum, weight_decay = self.weight_decay)
    optimizer = self.optimizer(self.parameters(), lr = self.learn_rate, weight_decay = self.weight_decay)
    return optimizer
#training step
def training_step(self, batch, batch_idx):
    """Compute loss and accuracy for one batch, log both, return the loss."""
    x, y = batch
    b = x.size(0)
    # Reshape to (batch, channels, 28, 28) before the forward pass.
    x = x.view(b, -1, 28, 28)
    logit = self(x)
    J = self.loss(logit, y) #loss
    #self.train_loss.append(J) #no need to append
    acc = self.accuracy(logit, y) #accuracy
    #self.train_acc.append(acc) #no need to append
    self.log("train_loss_cnn", J.item())
    self.log("train_acc_cnn", acc.item())
    return {'loss': J}
#Since I used Tensorboard, it don't have to append to loss
def test_step(self, batch, batch_idx):
    """Compute loss and accuracy for one test batch and log both."""
    p, q = batch
    b = p.size(0)
    # Reshape to (batch, channels, 28, 28) before the forward pass.
    p = p.view(b, -1, 28, 28)
    logit = self(p)
    J = self.loss(logit, q) #loss
    acc_test = self.accuracy(logit, q) #accuracy
    #self.train_acc.append(acc_test) #no need to append
    #self.train_loss.append(J) #no need to append
    self.log("test_acc_cnn", acc_test.item())
    self.log("test_loss_cnn", J.item())
def validation_step(self, batch, batch_idx=None):
    """Compute and log validation loss/accuracy; return loss, logits and
    targets so ``validation_epoch_end`` can aggregate them."""
    u, v = batch
    b = u.size(0)
    # Reshape to (batch, channels, 28, 28) before the forward pass.
    u = u.view(b, -1, 28, 28)
    logit = self(u)
    J = self.loss(logit, v) #loss
    #self.val_loss.append(J) #no need to append
    acc_val = self.accuracy(logit, v) #accuracy
    #self.train_acc.append(acc_val) #no need to append
    self.log("val_loss_cnn", J.item())
    self.log("val_acc_cnn", acc_val.item())
    return {"loss": J, "pred": logit, "target": v}
#Once saves from validation step, I take with me the returned elements, and I can plot the Confusion Matrix inside Tensorboard
def validation_epoch_end(self, outputs):
    """Build a 10-class confusion matrix from the collected validation
    outputs and log it as a heatmap figure."""
    preds = torch.cat([tmp['pred'] for tmp in outputs])
    targets = torch.cat([tmp['target'] for tmp in outputs])
    conf_mat = confusion_matrix(preds, targets, num_classes=10)
    df_cm = pd.DataFrame(conf_mat.numpy(), index = range(10), columns=range(10))
    plt.figure(figsize = (10,7))
    fig_ = sns.heatmap(df_cm, annot=True, cmap='Spectral').get_figure()
    # The figure is closed in pyplot before the Figure object is logged.
    plt.close(fig_)
    self.logger.experiment.add_figure("Confusion matrix CNN", fig_, self.current_epoch)

Pytorch network not training

I am trying to train an activity recognition system using PyTorch, but the network is not training and the loss is not dropping, even though I have a similar model working perfectly in Keras. I have provided the code for the training loop, the model class, and the dataset class below. Can you help me understand why the loss is not dropping (and the accuracy is not increasing)?
main training loop
# create dataset
dataset = IMU_dataset()
batch_size = 40
train_loader = DataLoader(dataset=dataset,
                          batch_size=batch_size,
                          shuffle=True,
                          num_workers=2)
num_epochs = 100
total_samples = len(dataset)
# Iterations per epoch must be derived from the batch size the loader uses.
n_iterations = math.ceil(total_samples / batch_size)
print(total_samples, n_iterations)
input_shape = 3
output_index = 6
device = torch.device('cpu')
model = HARmodel(input_shape, output_index).to(device)
model.float()
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
for epoch in range(num_epochs):
    for i, (inputs, labels) in enumerate(train_loader):
        # origin shape: [40, 3, 400]
        labels = labels.to(device)
        # Forward pass
        outputs = model(inputs.to(device).float())
        loss = criterion(outputs, labels.long())
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # if (i+1) % 5 == 0:
        #     print(f'loss: {loss.item()}')
    # NOTE(review): indentation was lost in the source; reporting once per
    # epoch on the full dataset is assumed -- confirm.
    print(model.calculate_accuracy(dataset.x_data, dataset.y_data), model.calculate_loss(dataset.x_data, dataset.y_data, criterion))
Here is the model class.
class HARmodel(nn.Module):
    """Model for human-activity-recognition.

    Two groups of three 1-D convolutions (kernel size 10) separated by
    max-pooling and dropout, followed by a pooling step over the whole
    remaining length and a linear classifier that returns raw logits.

    Args:
        input_size: number of input channels.
        num_classes: number of output logits.
    """

    def __init__(self, input_size, num_classes):
        super().__init__()
        # Extract features, 1D conv layers
        self.layer_1 = nn.Conv1d(input_size, 100, 10, stride=1)
        self.activation_relu = nn.ReLU()
        self.layer_2 = nn.Conv1d(100, 100, 10, stride=1)
        self.layer_3 = nn.Conv1d(100, 100, 10, stride=1)
        self.layer_4 = nn.MaxPool1d(2, stride=3)
        self.layer_5 = nn.Dropout(p=0.2)
        self.layer_6 = nn.Conv1d(100, 160, 10, stride=1)
        self.layer_7 = nn.Conv1d(160, 160, 10, stride=1)
        self.layer_8 = nn.Conv1d(160, 160, 10, stride=1)
        # Averages over the full remaining length, whatever it is; this
        # replaces building a new nn.AvgPool1d on every forward call.
        self.layer_9 = nn.AdaptiveAvgPool1d(1)
        self.layer_10 = nn.Dropout(p=0.5)
        # Use num_classes rather than a hard-coded 6.
        self.layer_11 = nn.Linear(160, num_classes)
        # Not applied in forward(): CrossEntropyLoss expects raw logits.
        self.activation_softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = self.layer_1(x)
        x = self.activation_relu(x)
        x = self.layer_2(x)
        x = self.activation_relu(x)
        x = self.layer_3(x)
        x = self.activation_relu(x)
        x = self.layer_4(x)
        x = self.layer_5(x)
        x = self.layer_6(x)
        x = self.activation_relu(x)
        x = self.layer_7(x)
        x = self.activation_relu(x)
        x = self.layer_8(x)
        x = self.activation_relu(x)
        x = self.layer_9(x)
        x = self.layer_10(x)
        # Drop the length-1 axis left by the pooling step.
        y = self.layer_11(x.view(x.shape[0], x.shape[1]))
        # y = self.activation_softmax(y)
        return y

    def _eval_logits(self, X):
        """Return logits for ``X`` with dropout disabled and no gradients,
        restoring the previous train/eval mode afterwards."""
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                return self(X.float())
        finally:
            self.train(was_training)

    def calculate_accuracy(self, X, y):
        """Return the fraction of rows of ``X`` whose arg-max logit equals ``y``
        as a NumPy scalar array."""
        output = self._eval_logits(X)
        max_index = output.max(dim=1)[1]
        true_output = y.type(torch.LongTensor)
        result = (max_index == true_output).sum().float() / y.shape[0]
        return result.detach().numpy()

    def calculate_loss(self, X, y, crit):
        """Return ``crit(logits, y)`` for ``X`` as a Python float."""
        output = self._eval_logits(X)
        true_output = y.type(torch.LongTensor)
        return crit(output, true_output).item()
Here is the dataset class:
class IMU_dataset(Dataset):
    """Dataset over the module-level arrays ``X`` and ``y``.

    ``X`` is reshaped to ``(-1, 3, 400)``; items are ``(x_data[i], y_data[i])``.
    """

    def __init__(self):
        self.x_data = torch.from_numpy(X.reshape(-1,3,400))
        self.y_data = torch.from_numpy(y)
        # Take the length from the reshaped tensor so it always matches what
        # __getitem__ can index (previously X.shape[0]).
        self.n = self.x_data.shape[0]

    def __getitem__(self, index):
        return self.x_data[index], self.y_data[index]

    def __len__(self):
        return self.n
EDIT 1:
I learned that I need to remove the softmax layer, and I have done so.
I have also tried a lower learning rate, but I still have the same problem.

PyTorch Transformer: ValueError: Expected target size (2, 256), got torch.Size([2, 8, 256])

I am relatively new to transformers and thought that programming one from scratch with PyTorch would be a good exercise. I have already tested the model's forward pass and it worked. However, when implementing training for an English-French translation task, I get the error in the title when computing the loss.
The code of the training function is as follows:
def train_(self, x, y, lr, steps, path=None):
    """Train with Adam and cross-entropy for ``steps`` passes over the
    paired batches of ``x`` and ``y``.

    ``path`` is accepted but not used in this version of the method.
    """
    self.train()
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(self.parameters(), lr=lr)
    for epoch in range(steps):
        for batch_id, (batch_x, batch_y) in enumerate(zip(x, y)):
            # NOTE(review): ``is_cuda`` is not set anywhere in the code
            # shown; unless it is defined elsewhere this raises
            # AttributeError when CUDA is available -- confirm.
            if torch.cuda.is_available() and self.is_cuda:
                batch_x = batch_x.cuda()
                batch_y = batch_y.cuda()
            out = self(batch_x, batch_y)
            print(out.shape, batch_y.shape)
            # Embed batch_y so result is comparable
            batch_y = self.decoder.word_embedding(batch_y)
            print(batch_y.shape)
            # NOTE(review): the criterion receives the embedded target,
            # which has the same shape as ``out``; this is the call that
            # raises the reported ValueError.
            loss = criterion(out, batch_y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            print(f"Training: epoch {epoch} batch {batch_id} loss {loss}")
The prints of the shapes give the following output:
torch.Size([2, 8, 256]) torch.Size([2, 8])
torch.Size([2, 8, 256])
In terms of dimensionality, I'm using an embedding size of 256.
If needed, I can also provide the entire code.
Thank you.
Edit:
Here's the whole code
# Imports
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
# Hyper-parameters
# Learning rate passed to ``Transformer.train_`` (forwarded to Adam there).
lr = 0.001
# Number of passes ``Transformer.train_`` makes over the batches.
steps = 1000
# Attention head
# Attention head
class AttentionHead(nn.Module):
    """Single attention head over inputs of width ``head_dim``.

    Args:
        embed_dim: full embedding width; its square root scales the scores.
        head_dim: width of the per-head slices this head receives.
    """

    def __init__(self, embed_dim, head_dim):
        super(AttentionHead, self).__init__()
        self.embed_dim = embed_dim
        self.values_layer = nn.Linear(head_dim, head_dim, bias=False)
        self.keys_layer = nn.Linear(head_dim, head_dim, bias=False)
        self.queries_layer = nn.Linear(head_dim, head_dim, bias=False)

    def forward(self, values, keys, queries, mask=None):
        # Send them through the linear layers
        values = self.values_layer(values)
        keys = self.keys_layer(keys)
        queries = self.queries_layer(queries)

        # Multiply queries and keys to score matrix
        scores = torch.einsum("nah,nbh->nab", queries, keys)
        # Keys shape: (n, m, head_dim)
        # Queries shape: (n, m, heads_dim)
        # Score shape: (n, m, m)  -- axis 1 indexes queries, axis 2 keys

        # If needed, then mask the score matrix
        if mask is not None:
            scores = scores.masked_fill(mask == 0, float("-1e20"))

        # Scale the (masked) score matrix
        scaled = scores / (self.embed_dim ** (1 / 2))

        # Normalize over the key axis: the einsum below sums over ``b``, so
        # each query's weights must sum to 1 across keys. Normalizing over
        # axis 1 (queries) would also defeat the causal mask.
        attention = torch.softmax(scaled, dim=-1)

        # Multiply scores and values to output
        out = torch.einsum("nab,nbh->nah", attention, values)
        # attention shape: (n, m, m)
        # values shape: (n, m, head_dim)
        # out shape: (n, m, head_dim)
        return out
# Multi head attention mechanism
class MultiHeadAttentionBlock(nn.Module):
    """Split the embedding into ``head_num`` slices, run one ``AttentionHead``
    per slice, concatenate the results and apply a final linear layer.

    Args:
        embed_dim: embedding width; must be divisible by ``head_num``.
        head_num: number of attention heads.
    """

    def __init__(self, embed_dim, head_num):
        super(MultiHeadAttentionBlock, self).__init__()
        self.embed_dim = embed_dim
        self.head_num = head_num
        self.head_dim = embed_dim // head_num
        assert (head_num * self.head_dim == embed_dim), \
            "Embed size is required to be dividable by heads."
        self.heads = nn.ModuleList(
            [AttentionHead(embed_dim, self.head_dim)
             for _ in range(head_num)]
        )
        self.out_layer = nn.Linear(embed_dim, embed_dim)

    def forward(self, values, keys, queries, mask=None):
        n = values.shape[0]  # Number of examples / batch size
        v_dim = values.shape[1]  # Quantity of embeddings
        k_dim = keys.shape[1]
        q_dim = queries.shape[1]

        # Split up the values, keys and queries
        values = values.reshape(n, v_dim, self.head_num, self.head_dim)
        keys = keys.reshape(n, k_dim, self.head_num, self.head_dim)
        queries = queries.reshape(n, q_dim, self.head_num, self.head_dim)

        # Run every head on its slice. The results are kept in a local list
        # instead of module globals, so calls do not leak state.
        head_outputs = [
            head(values[:, :, i, :], keys[:, :, i, :], queries[:, :, i, :], mask)
            for i, head in enumerate(self.heads)
        ]
        # each entry shape: (n, m, head_dim)

        # Concatenate the output of each head
        out = torch.cat(head_outputs, dim=2)
        # Out shape: (n, m, head_num * head_dim / embed_dim)

        # Send output through a last linear layer and return the outcome
        out = self.out_layer(out)
        return out
# Transformer block
class TransformerBlock(nn.Module):
    """Multi-head attention and a feed-forward network, each followed by a
    residual connection, LayerNorm and dropout.

    Args:
        embed_dim: embedding width.
        head_num: number of attention heads.
        dropout: dropout probability.
        forward_expansion: hidden-width multiplier of the feed-forward net.
    """

    def __init__(self, embed_dim, head_num, dropout, forward_expansion):
        super(TransformerBlock, self).__init__()
        self.attention = MultiHeadAttentionBlock(embed_dim, head_num)
        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.feed_forward = nn.Sequential(
            nn.Linear(embed_dim, forward_expansion * embed_dim),
            nn.ReLU(),
            nn.Linear(forward_expansion * embed_dim, embed_dim)
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, values, keys, queries, mask=None):
        attention = self.attention(values, keys, queries, mask)
        # The residual is taken from the queries.
        x = self.dropout(self.norm1(attention + queries))
        forward = self.feed_forward(x)
        x = self.dropout(self.norm2(forward + x))
        return x
# Encoder
class Encoder(nn.Module):
    """Word + position embeddings followed by ``block_num`` transformer blocks.

    Args:
        src_vocab_dim: size of the source vocabulary.
        embed_dim: embedding width.
        head_num: attention heads per block.
        block_num: number of ``TransformerBlock``s.
        dropout: dropout probability.
        forward_expansion: feed-forward width multiplier.
        max_length: number of position embeddings.
        device: device the position indices are created on.
    """

    def __init__(self, src_vocab_dim, embed_dim, head_num, block_num, dropout, forward_expansion, max_length, device):
        super(Encoder, self).__init__()
        self.device = device
        self.embed_dim = embed_dim
        self.word_embedding = nn.Embedding(src_vocab_dim, embed_dim)
        self.position_embedding = nn.Embedding(max_length, embed_dim)  # max_length: max word length of all data
        self.blocks = nn.ModuleList(
            [TransformerBlock(embed_dim, head_num, dropout, forward_expansion)
             for _ in range(block_num)]
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        n, seq_length = x.shape  # (batch size, max word length of that batch)
        positions = torch.arange(0, seq_length).expand(n, seq_length).to(self.device)  # 0 - seq_length along dim 1
        x = self.dropout(self.word_embedding(x) + self.position_embedding(positions))
        # Self-attention: the same tensor serves as values, keys and queries.
        for block in self.blocks:
            x = block(x, x, x)
        return x
# Decoder block
class DecoderBlock(nn.Module):
    """Masked self-attention over the decoder input, followed by a
    ``TransformerBlock`` that attends to the supplied values and keys.

    Args:
        embed_dim: embedding width.
        head_num: number of attention heads.
        dropout: dropout probability.
        forward_expansion: feed-forward width multiplier.
    """

    def __init__(self, embed_dim, head_num, dropout, forward_expansion):
        super(DecoderBlock, self).__init__()
        self.attention = MultiHeadAttentionBlock(embed_dim, head_num)
        self.norm = nn.LayerNorm(embed_dim)
        self.transformer_block = TransformerBlock(embed_dim, head_num, dropout, forward_expansion)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, values, keys, mask):
        attention = self.attention(x, x, x, mask)
        # As the outputs of the decoder's first self attention block are the queries, the encoder's
        # output can be of different size. Only keys and values have to be identical in size.
        queries = self.dropout(self.norm(attention + x))
        x = self.transformer_block(values, keys, queries)
        return x
# Decoder
class Decoder(nn.Module):
    """Word + position embeddings, ``block_num`` decoder blocks and a final
    linear layer.

    Args:
        trg_vocab_dim: size of the target vocabulary.
        embed_dim: embedding width.
        head_num: attention heads per block.
        block_num: number of ``DecoderBlock``s.
        dropout: dropout probability.
        forward_expansion: feed-forward width multiplier.
        max_length: number of position embeddings.
        device: device the position indices are created on.
    """

    def __init__(self, trg_vocab_dim, embed_dim, head_num, block_num, dropout, forward_expansion, max_length, device):
        super(Decoder, self).__init__()
        self.device = device
        self.embed_dim = embed_dim
        self.word_embedding = nn.Embedding(trg_vocab_dim, embed_dim)
        self.position_embedding = nn.Embedding(max_length, embed_dim)
        self.blocks = nn.ModuleList(
            [DecoderBlock(embed_dim, head_num, dropout, forward_expansion)
             for _ in range(block_num)]
        )
        self.dropout = nn.Dropout(dropout)
        # NOTE(review): the output is projected to ``embed_dim`` rather than
        # ``trg_vocab_dim``, so the model does not emit one logit per target
        # word -- confirm this is intended before computing a
        # classification loss on it.
        self.out_layer = nn.Linear(embed_dim, embed_dim)  # changed embed_dim (second time in bracket) from trg_vocab_dim

    def forward(self, x, enc_out, mask):
        n, seq_length = x.shape
        positions = torch.arange(0, seq_length).expand(n, seq_length).to(self.device)  # 0 - seq_length along dim 1
        x = self.dropout(self.word_embedding(x) + self.position_embedding(positions))
        # The encoder output is used as both values and keys in every block.
        for block in self.blocks:
            x = block(x, enc_out, enc_out, mask)
        x = self.out_layer(x)
        return x
# Transformer
class Transformer(nn.Module):
    """Encoder-decoder model with a lower-triangular mask on the decoder's
    self-attention.

    Args:
        src_vocab_dim, trg_vocab_dim: vocabulary sizes.
        embed_dim: embedding width.
        head_num: attention heads per block.
        block_num_enc, block_num_dec: number of encoder / decoder blocks.
        dropout: dropout probability.
        forward_expansion: feed-forward width multiplier.
        max_length: number of position embeddings.
        device: device used for masks and position indices.
    """

    def __init__(self, src_vocab_dim, trg_vocab_dim, embed_dim, head_num, block_num_enc, block_num_dec,
                 dropout, forward_expansion, max_length, device):
        super(Transformer, self).__init__()
        self.device = device
        self.encoder = Encoder(src_vocab_dim, embed_dim, head_num, block_num_enc, dropout, forward_expansion, max_length, device)
        self.decoder = Decoder(trg_vocab_dim, embed_dim, head_num, block_num_dec, dropout, forward_expansion, max_length, device)

    def make_mask(self, y):
        """Return an ``(n, m, m)`` lower-triangular mask of ones for ``y`` of shape ``(n, m)``."""
        n, m = y.shape
        mask = torch.tril(torch.ones((m, m))).expand(n, m, m)
        return mask.to(self.device)

    def forward(self, x, y):
        mask = self.make_mask(y)
        out_enc = self.encoder(x)
        out_dec = self.decoder(y, out_enc, mask)
        return out_dec

    def train_(self, x, y, lr, steps, path=None):
        """Train with Adam and cross-entropy for ``steps`` passes over the
        paired batches of ``x`` and ``y``; save the model to ``path`` if given."""
        self.train()
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(self.parameters(), lr=lr)
        for epoch in range(steps):
            for batch_id, (batch_x, batch_y) in enumerate(zip(x, y)):
                # NOTE(review): ``is_cuda`` is not set anywhere in this
                # class; unless it is defined elsewhere this raises
                # AttributeError when CUDA is available -- confirm.
                if torch.cuda.is_available() and self.is_cuda:
                    batch_x = batch_x.cuda()
                    batch_y = batch_y.cuda()
                out = self(batch_x, batch_y)
                print(out.shape, batch_y.shape)
                # Embed batch_y so result is comparable
                batch_y = self.decoder.word_embedding(batch_y)
                print(batch_y.shape)
                # NOTE(review): the criterion receives the embedded target,
                # which has the same shape as ``out``; this is the call that
                # raises the reported ValueError.
                loss = criterion(out, batch_y)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                print(f"Training: epoch {epoch} batch {batch_id} loss {loss}")
        # NOTE(review): indentation was lost in the source; saving once after
        # training finishes is assumed -- confirm.
        if path is not None:
            torch.save(self, path)
# Run
if __name__ == "__main__":
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # use for normal run
    # x = torch.tensor([[1, 5, 6, 4, 3, 9, 5, 2, 0], [1, 8, 7, 3, 4, 5, 6, 7, 2]]).to(device)  # input
    # y = torch.tensor([[1, 7, 4, 3, 5, 9, 2, 0], [1, 5, 6, 2, 4, 7, 6, 2]]).to(device)  # target
    # added one bracket for training so this is one batch
    x = torch.tensor([[[1, 5, 6, 4, 3, 9, 5, 2, 0], [1, 8, 7, 3, 4, 5, 6, 7, 2]]]).to(device)  # input
    y = torch.tensor([[[1, 7, 4, 3, 5, 9, 2, 0], [1, 5, 6, 2, 4, 7, 6, 2]]]).to(device)  # target
    src_vocab_dim = 10
    trg_vocab_dim = 10
    model = Transformer(src_vocab_dim, trg_vocab_dim, embed_dim=256, head_num=8, block_num_enc=6, block_num_dec=6,
                        dropout=0, forward_expansion=4, max_length=100, device=device)
    model.train_(x, y, lr, steps)
Here's the whole error trace:
Traceback (most recent call last):
File "C:/Users/user/PycharmProjects/Transformer/Code.py", line 310, in <module>
model.train_(x, y, lr, steps)
File "C:/Users/user/PycharmProjects/Transformer/Code.py", line 279, in train_
loss = criterion(out, batch_y)
File "C:\Users\user\Anaconda3\envs\Transformer\lib\site-packages\torch\nn\modules\module.py", line 722, in _call_impl
result = self.forward(*input, **kwargs)
File "C:\Users\user\Anaconda3\envs\Transformer\lib\site-packages\torch\nn\modules\loss.py", line 948, in forward
ignore_index=self.ignore_index, reduction=self.reduction)
File "C:\Users\user\Anaconda3\envs\Transformer\lib\site-packages\torch\nn\functional.py", line 2422, in cross_entropy
return nll_loss(log_softmax(input, 1), target, weight, None, ignore_index, None, reduction)
File "C:\Users\user\Anaconda3\envs\Transformer\lib\site-packages\torch\nn\functional.py", line 2228, in nll_loss
out_size, target.size()))
ValueError: Expected target size (2, 256), got torch.Size([2, 8, 256])
Concerning your question:
My thoughts were that the loss function cannot compare the output with the target if the target is not embedded as well. With the embedding missing, this gives the following shapes:
torch.Size([2, 8]) # target
torch.Size([2, 8, 256]) # output

Error: When subclassing the `Model` class, you should implement a `call` method. on tensorflow custom model

I am trying to train my custom model on Cifar 10 dataset.
My model's code is below: -
# NOTE(review): indentation was lost in this listing. Given the reported
# NotImplementedError, `call` appears to have been nested inside `__init__`
# in the original -- confirm before reusing this code.
class cifar10Model(keras.Model):
def __init__(self):
super(cifar10Model, self).__init__()
# Three Conv2D -> MaxPool2D -> BatchNormalization -> Dropout groups.
self.conv1 = keras.layers.Conv2D(32, 3, activation='relu', input_shape=(32, 32, 3))
self.pool1 = keras.layers.MaxPool2D((3, 3))
self.batch_norm1 = keras.layers.BatchNormalization()
self.dropout1 = keras.layers.Dropout(0.1)
self.conv2 = keras.layers.Conv2D(64, 3, activation='relu')
self.pool2 = keras.layers.MaxPool2D((3, 3))
self.batch_norm2 = keras.layers.BatchNormalization()
self.dropout2 = keras.layers.Dropout(0.2)
self.conv3 = keras.layers.Conv2D(128, 3, activation='relu')
self.pool3 = keras.layers.MaxPool2D((3, 3))
self.batch_norm3 = keras.layers.BatchNormalization()
self.dropout3 = keras.layers.Dropout(0.3)
# Classifier head: flatten, 128-unit ReLU layer, 10 raw outputs.
self.flatten = keras.layers.Flatten()
self.dense1 = keras.layers.Dense(128, activation='relu')
self.dense2 = keras.layers.Dense(10)
def call(self, x):
x = self.conv1(x)
x = self.pool1(x)
# NOTE(review): `X` (capital) is not defined in this scope; probably meant `x`.
x = self.batch_norm1(X)
x = self.dropout1(x)
x = self.conv2(x)
x = self.pool2(x)
# NOTE(review): `X` (capital) again; probably meant `x`.
x = self.batch_norm2(X)
x = self.dropout2(x)
x = self.conv3(x)
x = self.pool3(x)
x = self.batch_norm3(x)
x = self.dropout3(x)
x = self.flatten(x)
x = self.dense1(x)
return self.dense2(x)
model = cifar10Model()
When I run this code, it gives me no error.
Then i defined my training loop
# The loss takes raw logits (from_logits=True), matching the model's final
# Dense(10) layer, which has no activation.
loss_object = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = keras.optimizers.Adam()
# Running metrics; they are reset at the start of every epoch in the loop below.
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')
test_loss = tf.keras.metrics.Mean(name='test_loss')
test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='test_accuracy')
#tf.function
def train_step(images, labels):
    """Run one optimisation step on a batch and update the training metrics."""
    # Record the forward pass so gradients of the loss can be taken.
    with tf.GradientTape() as tape:
        predictions = model(images, training=True)
        loss = loss_object(labels, predictions)
    grad = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grad, model.trainable_variables))
    train_loss(loss)
    train_accuracy(labels, predictions)
#tf.function
def test_step(images, labels):
    """Evaluate one batch and update the test metrics."""
    # training=False makes inference mode explicit for the model's
    # BatchNormalization and Dropout layers.
    predictions = model(images, training=False)
    t_loss = loss_object(labels, predictions)
    test_loss(t_loss)
    test_accuracy(labels, predictions)
epochs = 10
for epoch in range(epochs):
    # Start every epoch with fresh metric accumulators.
    train_loss.reset_states()
    train_accuracy.reset_states()
    test_loss.reset_states()
    test_accuracy.reset_states()

    for images, labels in train_dataset:
        train_step(images, labels)

    for images, labels in test_dataset:
        test_step(images, labels)

    template = 'Epoch {}, Loss: {}, Accuracy: {}, Test Loss: {}, Test Accuracy: {}'
    print(template.format(epoch + 1,
                          train_loss.result(),
                          train_accuracy.result() * 100,
                          test_loss.result(),
                          test_accuracy.result() * 100))
When i run this code, i get the following error
NotImplementedError: When subclassing the `Model` class, you should implement a `call` method.
I am currently running my code on google colab.
My colab link is https://colab.research.google.com/drive/1sOlbRpPRdyOCJI0zRFfIA-Trj1vrIbWY?usp=sharing
My tensorflow version on colab is 2.2.0
Also, when i tried to predict labels from untrained model by this code :-
print(model(train_images))
This also gives me the same error.
The error says that I have not implemented the call method on the model,
but I have defined the call method.
I also tried renaming the call method to __call__,
but it still gives me the same error.
Thanks in advance :-
The problem is with indentation. You've defined the call method inside __init__. Try defining it outside the __init__ method, at class level, as follows:
class cifar10Model(keras.Model):
    """Three conv/pool/batch-norm/dropout groups followed by a dense head
    that returns 10 raw outputs.

    ``call`` is defined at class level (not nested inside ``__init__``) so
    that Keras can find it.
    """

    def __init__(self):
        super(cifar10Model, self).__init__()
        # NOTE(review): this version uses Conv3D / MaxPool3D while the
        # question used the 2D layers; confirm this matches the rank of the
        # input batches.
        self.conv1 = keras.layers.Conv3D(32, 3, activation='relu', input_shape=(32, 32, 3))
        self.pool1 = keras.layers.MaxPool3D((3, 3, 3))
        self.batch_norm1 = keras.layers.BatchNormalization()
        self.dropout1 = keras.layers.Dropout(0.1)
        self.conv2 = keras.layers.Conv3D(64, 3, activation='relu')
        self.pool2 = keras.layers.MaxPool3D((3, 3, 3))
        self.batch_norm2 = keras.layers.BatchNormalization()
        self.dropout2 = keras.layers.Dropout(0.2)
        self.conv3 = keras.layers.Conv3D(128, 3, activation='relu')
        self.pool3 = keras.layers.MaxPool3D((3, 3, 3))
        self.batch_norm3 = keras.layers.BatchNormalization()
        self.dropout3 = keras.layers.Dropout(0.3)
        self.flatten = keras.layers.Flatten()
        self.dense1 = keras.layers.Dense(128, activation='relu')
        self.dense2 = keras.layers.Dense(10)

    def call(self, x):
        x = self.conv1(x)
        x = self.pool1(x)
        # Batch-norm layers take the running tensor ``x``; the capitalised
        # ``X`` used before is undefined here and raised a NameError.
        x = self.batch_norm1(x)
        x = self.dropout1(x)
        x = self.conv2(x)
        x = self.pool2(x)
        x = self.batch_norm2(x)
        x = self.dropout2(x)
        x = self.conv3(x)
        x = self.pool3(x)
        x = self.batch_norm3(x)
        x = self.dropout3(x)
        x = self.flatten(x)
        x = self.dense1(x)
        return self.dense2(x)
model = cifar10Model()
Hope this helps.

Categories