Using Captum with nn.Embedding: getting a RuntimeError

I am using the Captum library and getting the following error. Here is the complete code to reproduce it:
RuntimeError: One of the differentiated Tensors appears to not have been used in the graph. Set allow_unused=True if this is the desired behavior.
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from captum.attr import IntegratedGradients

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

vocab_size = 1024
embedding_dim = 32
seq_len = 128
num_classes = 5
hidden_dim = 256

class predictor(nn.Module):
    def __init__(self):
        super().__init__()
        self.seq_len = seq_len
        self.num_classes = num_classes
        self.hidden_dim = hidden_dim
        self.vocab_size, self.embedding_dim = vocab_size, embedding_dim
        self.embedding = nn.Embedding(self.vocab_size, self.embedding_dim)
        self.linear = nn.Linear(self.seq_len * self.embedding_dim, self.num_classes)

    def forward(self, x):
        x = self.embedding(x.long())
        x = x.reshape(-1, self.seq_len * self.embedding_dim)
        x = F.relu(self.linear(x))
        return x

class wrapper_predictor(nn.Module):
    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, x):
        x = self.model(x)
        x = F.softmax(x, dim=1)
        return x

indexes = torch.Tensor(np.random.randint(0, vocab_size, (seq_len))).to(device)
model = predictor().to(device)
wrapper_model = wrapper_predictor(model).to(device)
ig = IntegratedGradients(wrapper_model)
attributions, delta = ig.attribute(inputs=indexes, target=0, n_steps=1, return_convergence_delta=True)

I resolved the issue with LayerIntegratedGradients. The underlying cause is that IntegratedGradients differentiates with respect to the raw inputs, but integer token indices cannot carry gradients through nn.Embedding, hence the "differentiated Tensors appears to not have been used in the graph" error; LayerIntegratedGradients attributes with respect to the embedding layer's output instead.
Here is a link with more details and other possible solutions: https://captum.ai/tutorials/IMDB_TorchText_Interpret
It uses an instance of LayerIntegratedGradients with the model's forward function and the embedding layer, as in the example given in the link.
Here is sample code using LayerIntegratedGradients with nn.Embedding:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from captum.attr import IntegratedGradients, LayerIntegratedGradients
from torchsummary import summary

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

vocab_size = 1024
embedding_dim = 1
seq_len = 128
num_classes = 5
hidden_dim = 256

class predictor(nn.Module):
    def __init__(self):
        super(predictor, self).__init__()
        self.seq_len = seq_len
        self.num_classes = num_classes
        self.hidden_dim = hidden_dim
        self.vocab_size, self.embedding_dim = vocab_size, embedding_dim
        self.embedding = nn.Sequential(
            nn.Embedding(self.vocab_size, self.embedding_dim),
        )
        # Re-initialize the embedding weights; index [0] addresses the
        # nn.Embedding inside the nn.Sequential, and the weight must stay
        # an nn.Parameter to remain trainable.
        self.embedding[0].weight = nn.Parameter(torch.randn(self.vocab_size, self.embedding_dim))
        self.fc = nn.Sequential(
            nn.Linear(self.seq_len * self.embedding_dim, self.hidden_dim, device=device, bias=False),
            nn.Linear(self.hidden_dim, self.num_classes, device=device, bias=False),
        )

    def forward(self, x):
        x = self.embedding(x.long())
        x = x.view(-1, self.seq_len * self.embedding_dim)
        x = self.fc(x)
        return x

class wrapper_predictor(nn.Module):
    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, x):
        x = self.model(x)
        x = F.softmax(x, dim=1)  # keep softmax out of forward if the attribution scores are too low
        return x

model = predictor().to(device)
indexes = torch.Tensor(np.random.randint(0, vocab_size, (seq_len))).to(device)
input_size = indexes.shape
summary(model=model, input_size=input_size, batch_size=-1, device=device.type)
wrapper_model = wrapper_predictor(model).to(device)
# the softmax wrapper is optional here; attribution is taken on the raw model
lig = LayerIntegratedGradients(model, model.embedding)
attributions, delta = lig.attribute(inputs=indexes, target=0, n_steps=1, return_convergence_delta=True)
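A usage note (my addition, following the post-processing convention in the linked tutorial): per-token attribution scores can be obtained by summing the embedding-level attributions over the embedding dimension and normalizing. With the toy shapes above:

# Sum over the embedding dimension to get one score per token, then normalize.
scores = attributions.sum(dim=-1)
scores = scores / torch.norm(scores)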

Related

LSTM + Linear layer Encoder/Decoder always predicts a blank value

I am trying to use Seq2Seq to convert an analog signal to a discrete time series.
The raw time-series signal contains the firing wave sound, the echo from the marker (in short, marker), and the end of the tube.
The current state of my Encoder/Decoder + Linear layer + MSELoss() is a blank prediction: it does not convert the bumps into a discrete signal. The label encoding is:
0 = blank
1 = end of tube (wall)
2 = noise echo
3 = echo from marker
4 = firing kick
class PLMixin:
    def configure_optimizers(self) -> torch.optim.Adam:
        return torch.optim.Adam(self.parameters(), lr=1e-3)
        # return torch.optim.SGD(self.parameters(), lr=1e-3)
        # return torch.optim.RMSprop(self.parameters(), lr=1e-3)

    def train_loader(self) -> DataLoader:
        """Return train_loader."""
        training_dataset = TimeseriesLiquidLevelDataset(
            text_files_dir=Path("digitized_dataset/training")
        )
        training_dataloader = DataLoader(training_dataset, batch_size=1, shuffle=True, num_workers=0)
        return training_dataloader

    def val_loader(self) -> DataLoader:
        """Return validation loader."""
        val_dataset = TimeseriesLiquidLevelDataset(
            text_files_dir=Path("digitized_dataset/training")
        )
        val_dataloader = DataLoader(val_dataset, batch_size=1, shuffle=False, num_workers=0)
        return val_dataloader

    def test_loader(self) -> DataLoader:
        """Return test loader."""
        test_dataset = TimeseriesLiquidLevelDataset(
            text_files_dir=Path("digitized_dataset/testing")
        )
        test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=False, num_workers=0)
        return test_dataloader
"""Time series experiment with embedding function. On freeze because it predict variety of token."""
import multiprocessing
import random
import typing as typ
from os.path import exists
from pathlib import Path

import joblib
import numpy as np
import pytorch_lightning as pl
import torch
import torch.nn as nn
from pytorch_lightning.loggers import TensorBoardLogger

import seq2seq_gadgets as gadgets
from lstm_compare_solution import clean, conclude_correctness

NUM_PROCS = multiprocessing.cpu_count()
seed = 181993
torch.manual_seed(seed)
random.seed(seed)
np.random.seed(seed)
device = "cuda" if torch.cuda.is_available() else "cpu"

class Encoder(nn.Module):
    """Simple encoder class."""

    def __init__(self, input_size: int, hidden_size: int, num_layers: int):
        """Init the instance."""
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers=num_layers, bidirectional=True, batch_first=True)

    def forward(self, x: torch.Tensor) -> typ.Tuple[torch.Tensor, typ.Tuple[torch.Tensor, torch.Tensor]]:
        """Run forward computing."""
        output, (hn, cn) = self.lstm(x)
        return output, (hn, cn)

class Decoder(nn.Module):
    def __init__(
        self,
        input_size: int,
        hidden_size: int,
        output_size: int,
        num_layers: int,
    ) -> None:
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers=num_layers, bidirectional=True, batch_first=True)
        # Convert the tensor dimension
        self.layers = nn.Sequential(
            nn.Dropout(p=0.5),
            nn.Linear(2 * hidden_size, hidden_size),  # *2 because of bidirectional=True
            nn.ReLU(),
            nn.Dropout(p=0.5),
            nn.Linear(hidden_size, output_size),
            nn.ReLU(),
        )

    def forward(self, x: torch.Tensor, cell: typ.Tuple[torch.Tensor, torch.Tensor]) -> torch.Tensor:
        """Run forward computing."""
        hn, cn = cell
        output, (hn, cn) = self.lstm(x, (hn, cn))
        output = self.layers(output)
        return output

class EncoderDecoder(gadgets.PLMixin, pl.LightningModule):
    """Lightning module of a simple LSTM encoder/decoder."""

    def __init__(self, encoder: Encoder, decoder: Decoder, n_classes: int = 4):
        """Init the instance."""
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.n_classes = n_classes
        self._loss = nn.MSELoss()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run the forward pass."""
        _, (hidden, cell) = self.encoder.forward(x)
        output = self.decoder.forward(x, (hidden, cell))
        return output

    def training_step(
        self,
        train_batch: typ.Tuple[torch.Tensor, torch.Tensor],
        batch_idx: int,
    ) -> typ.Dict:
        """Training step."""
        x, y = train_batch
        x_hat = self.forward(x)
        loss = self._loss(x_hat, y.float())
        self.log("train_loss", loss)
        return {"loss": loss}

    def validation_step(self, val_batch: typ.Tuple[torch.Tensor, torch.Tensor], batch_idx: int) -> typ.Dict:
        """Validate the batch."""
        x, y = val_batch
        x_hat = self.forward(x)
        loss = self._loss(x_hat, y.float())
        self.log("val_loss", loss)
        analog_file = f"my_val_{batch_idx}.joblib"
        predicted_file = f"my_val_{batch_idx}_predicted.joblib"
        solution_file = f"my_val_{batch_idx}_solution.joblib"
        joblib.dump(x, analog_file)
        joblib.dump(x_hat, predicted_file)
        joblib.dump(y, solution_file)
        return {"val_loss": loss}

def main() -> None:
    """Run main function."""
    logger = TensorBoardLogger("lightning_logs", name="digitized")
    trainer = pl.Trainer(fast_dev_run=False, max_epochs=5, logger=logger)

    encoder_input_size: int = 20000
    encoder_hidden_size: int = 1000
    encoder_num_layers: int = 1
    decoder_n_classes: int = 5
    decoder_input_size: int = encoder_input_size
    decoder_hidden_size: int = 1000
    decoder_output_size: int = encoder_input_size
    decoder_num_layers: int = encoder_num_layers

    encoder = Encoder(encoder_input_size, encoder_hidden_size, encoder_num_layers)
    decoder = Decoder(
        decoder_input_size,
        decoder_hidden_size,
        decoder_output_size,
        decoder_num_layers,
    )

    chk_point = Path("./lightning_logs/digitized/version_1/checkpoints/epoch=9-step=570.ckpt")
    if exists(chk_point):
        model = EncoderDecoder.load_from_checkpoint(chk_point, encoder=encoder, decoder=decoder)
        model.eval()
    else:
        model = EncoderDecoder(encoder, decoder, decoder_n_classes)
        trainer.fit(model, train_dataloaders=model.train_loader())

    clean()  # keep the pytorch-lightning output directory clean
    trainer.validate(model, dataloaders=model.train_loader())
    # trainer.test(model, dataloaders=model.test_loader())
    conclude_correctness("my_val")
    # conclude_correctness("my_test")

if __name__ == "__main__":
    main()
Question: Is an Encoder/Decoder capable of doing analog-to-discrete conversion?
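Aside, a sketch of an alternative loss setup (my assumption, not from the original post): the targets are discrete class labels in {0, ..., 4}, so per-timestep classification with CrossEntropyLoss is the usual fit; MSELoss on raw label values tends to pull predictions toward the dominant blank class. Assuming the decoder were changed to emit class logits of shape (batch, seq_len, n_classes), the loss wiring would look like:

import torch
import torch.nn as nn

batch, seq_len, n_classes = 1, 20000, 5
logits = torch.randn(batch, seq_len, n_classes, requires_grad=True)  # hypothetical decoder output
targets = torch.randint(0, n_classes, (batch, seq_len))              # discrete labels 0..4

loss_fn = nn.CrossEntropyLoss()
# CrossEntropyLoss expects the class dimension at position 1: (N, C, d1, ...)
loss = loss_fn(logits.permute(0, 2, 1), targets)
loss.backward()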

A PyTorch question: backward() executes only the first time or two, then no further progress is made

(A diagram of the model appeared here in the original post.)
So we've translated this diagram into code:
class pre_h(nn.Module):
    def __init__(self, vocab_size, hidden_size, window_size):
        super().__init__()
        self.window_size = window_size
        self.in_layer = nn.Embedding(vocab_size, hidden_size)
        self.weight_list = []

    def forward(self, contexts):
        for i in range(2 * self.window_size):
            weight = self.in_layer(contexts[:, i])
            self.weight_list.append(weight)
        h = sum(self.weight_list)
        h = h / len(self.weight_list)
        return h

class UnigramSampler:
    def __init__(self, corpus, power, sample_size):
        super().__init__()
        self.sample_size = sample_size
        self.vocab_size = None
        self.word_p = None
        counts = collections.Counter()
        for word_id in corpus:
            counts[word_id] += 1
        vocab_size = len(counts)
        self.vocab_size = vocab_size
        self.word_p = np.zeros(vocab_size)
        for i in range(vocab_size):
            self.word_p[i] = counts[i]
        self.word_p = np.power(self.word_p, power)
        self.word_p /= np.sum(self.word_p)

    def get_negative_sample(self, target):
        batch_size = target.shape[0]
        negative_sample = np.random.choice(self.vocab_size, size=(batch_size, self.sample_size), replace=True, p=self.word_p)
        negative_sample = torch.tensor(negative_sample)
        return negative_sample

class NegativeSampling(nn.Module):
    def __init__(self, vocab_size, hidden_size, corpus, power=0.75, sample_size=5):
        super().__init__()
        self.sample_size = sample_size
        self.sampler = UnigramSampler(corpus, power, sample_size)
        self.embedding = nn.Embedding(vocab_size, hidden_size)
        self.sigmoid = nn.Sigmoid()
        self.loss_layer = nn.CrossEntropyLoss()
        self.hidden_size = hidden_size

    def forward(self, h, target):
        loss_data = 0
        w = self.embedding(target)
        w = torch.squeeze(w).reshape(self.hidden_size, -1)
        loss = h @ w
        loss = self.sigmoid(loss)
        correct_label = torch.ones_like(loss)
        loss = self.loss_layer(loss, correct_label)
        loss_data += loss
        negative_target = self.sampler.get_negative_sample(target)
        for i in range(self.sample_size):
            w = self.embedding(negative_target[:, i])
            w = torch.squeeze(w).reshape(self.hidden_size, -1)
            loss = h @ w
            loss = self.sigmoid(loss)
            negative_label = torch.zeros_like(loss)
            loss = self.loss_layer(loss, negative_label)
            loss_data += loss
        return loss_data

class sample_cbow(nn.Module):
    def __init__(self, vocab_size, hidden_size, corpus, window_size, power=0.75, sample_size=5):
        super().__init__()
        self.model_1 = pre_h(vocab_size, hidden_size, window_size)
        self.model_2 = NegativeSampling(vocab_size, hidden_size, corpus, power, sample_size)

    def forward(self, contexts, target):
        out = self.model_1(contexts)
        out = self.model_2(out, target)
        return out

vocab_size = len(id_to_word)
model = sample_cbow(vocab_size, 100, corpus, 1)
learning_rate = 0.001
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)  # set up the Adam optimizer

i = 1
for contexts, target in dataloader:
    print('===================================')
    print(i, 'experiment')
    contexts = contexts.to(device)
    target = target.to(device)
    model.to(device)
    output = model(contexts, target)
    optimizer.zero_grad()
    output.backward()
    optimizer.step()
    i += 1
Here is the dataset I used. I'm leaving this just in case.
class Corpus_Dataset(Dataset):
    def __init__(self, corpus, window_size=1):
        contexts, target = create_contexts_target(corpus, window_size)
        self.contexts = contexts
        self.target = target

    def __len__(self):
        return len(self.contexts)

    def __getitem__(self, idx):
        contexts = torch.tensor(self.contexts[idx], dtype=torch.long)
        target = torch.tensor(self.target[idx], dtype=torch.long)
        return contexts, target

dataset = Corpus_Dataset(corpus, window_size=2)
dataloader = DataLoader(dataset, batch_size=100)
RuntimeError: Trying to backward through the graph a second time (or directly access saved tensors after they have already been freed). Saved intermediate values of the graph are freed when you call .backward() or autograd.grad(). Specify retain_graph=True if you need to backward through the graph a second time or if you need to access saved tensors after calling backward
We know that once backward() is called, the graph is freed and cannot be used again.
But I only call backward() once per batch, and I still get this error even though I never reuse the graph.
If backward() had failed from the very beginning, I would have suspected a problem with the model, but it runs normally only the first time or two and then does not proceed any further.
I used clone() and detach() on various tensors, but this did not resolve the error.
I have searched and thought about what the problem might be, but I can't think of a solution anymore.
I'm sorry for my poor English.
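For context, here is a minimal sketch of one common way this error arises (an assumption about the failure mode, not the asker's code): a buffer that is never cleared keeps tensors from a previous iteration's graph alive, so a later backward() walks back into the already-freed graph.

import torch

w = torch.randn(3, requires_grad=True)
buffer = []  # never cleared between iterations

for step in range(2):
    buffer.append(w * w)      # iteration 0's graph-connected tensor stays in the list
    loss = sum(buffer).sum()  # iteration 1's loss still depends on iteration 0's graph
    loss.backward()           # second iteration raises "Trying to backward through the graph a second time"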

Train a model to output the weights of another model, and use the other model just for function evaluation

I have 2 models, A and B.
A(x1) = weights of B
B(x2) = final output
A is trainable.
B is not trainable (I just want to load the outputs of A into B and infer).
The problem I am facing: the output of A is a torch.Tensor. While setting the weights of B, I had to slice the output tensor of A. However, I am losing the gradient flow from the final loss to the weights of A, hence no training is happening. How do I implement the idea or correct my code?
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

class Hyper_Model(nn.Module):
    def __init__(self):
        super(Hyper_Model, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(1, 32),
            nn.ReLU(),
            nn.Linear(32, 32),
            nn.ReLU(),
            nn.Linear(32, 32),
            nn.ReLU(),
            nn.Linear(32, 32),
            nn.ReLU(),
            nn.Linear(32, 177),
        )

    def forward(self, param):
        param_ = self.layers(param)
        return param_

class Main_Model(nn.Module):
    def __init__(self):
        super(Main_Model, self).__init__()
        self.linear1 = nn.Linear(2, 8)
        self.linear2 = nn.Linear(8, 8)
        self.linear3 = nn.Linear(8, 8)
        self.out = nn.Linear(8, 1)

    def forward(self, param_, x):
        self.linear1.weight = torch.nn.Parameter(param_[0, :16].view(8, 2))
        self.linear2.weight = torch.nn.Parameter(param_[0, 24:88].view(8, 8))
        self.linear3.weight = torch.nn.Parameter(param_[0, 96:160].view(8, 8))
        self.linear1.bias = torch.nn.Parameter(param_[0, 16:24].view(8))
        self.linear2.bias = torch.nn.Parameter(param_[0, 88:96].view(8))
        self.linear3.bias = torch.nn.Parameter(param_[0, 160:168].view(8))
        self.out.weight = torch.nn.Parameter(param_[0, 168:176].view(1, 8))
        self.out.bias = torch.nn.Parameter(param_[0, 176:].view(1))
        self.linear1.weight.requires_grad = False
        self.linear2.weight.requires_grad = False
        self.linear3.weight.requires_grad = False
        self.linear1.bias.requires_grad = False
        self.linear2.bias.requires_grad = False
        self.linear3.bias.requires_grad = False
        self.out.weight.requires_grad = False
        self.out.bias.requires_grad = False
        x = F.relu(self.linear1(x))
        x = F.relu(self.linear2(x))
        x = F.relu(self.linear3(x))
        x = self.out(x)
        return x

x = torch.tensor([1.0, 2.0, 3.0], requires_grad=True).view(3, 1)
t = torch.tensor([1.0, 1.5, 2.0], requires_grad=True).view(3, 1)
param = torch.tensor([-0.01]).view(1, 1)
X = torch.cat([x, t], dim=1)
Y = torch.tensor([5.0, 6.0, 9.0]).view(3, 1)

h = Hyper_Model()
m = Main_Model()

opt = torch.optim.Adam(list(h.parameters()), lr=0.001)
loss_func = nn.MSELoss()

for i in range(10):
    opt.zero_grad()
    param_ = h(param)
    out = m(param_, X)
    loss = loss_func(out, Y)
    print(i, loss)
    loss.backward()
    opt.step()
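One way to preserve the gradient flow (a sketch of a possible fix, not code from the question): wrapping slices of param_ in nn.Parameter creates fresh leaf tensors, which cuts the graph back to Hyper_Model. Applying the slices directly with torch.nn.functional.linear keeps them inside the autograd graph:

import torch.nn.functional as F

# Hypothetical functional rewrite of Main_Model.forward: the slices of param_
# remain part of the graph, so the loss backpropagates into Hyper_Model.
def main_model_functional(param_, x):
    x = F.relu(F.linear(x, param_[0, :16].view(8, 2), param_[0, 16:24]))
    x = F.relu(F.linear(x, param_[0, 24:88].view(8, 8), param_[0, 88:96]))
    x = F.relu(F.linear(x, param_[0, 96:160].view(8, 8), param_[0, 160:168]))
    return F.linear(x, param_[0, 168:176].view(1, 8), param_[0, 176:].view(1))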

Debugging a neural network dropout problem: the probability does not lie inside [0, 1]

I tried to add a dropout rate to my neural network (NN) using torch, and I got a strange error at the end. How can I fix it?
The idea is that I wrote the NN inside a function to make it easier to call. The function is the following (I personally think the problem lies inside the class of the NN, but for the sake of having a working example I'm including everything):
def train_neural_network(data_train_X, data_train_Y, batch_size, learning_rate, graph=True, dropout=0.0):
    input_size = len(data_test_X.columns)
    hidden_size = 200
    num_classes = 4
    num_epochs = 120
    batch_size = batch_size
    learning_rate = learning_rate

    # The class of NN
    class NeuralNet(nn.Module):
        def __init__(self, input_size, hidden_size, num_classes, p=dropout):
            super(NeuralNet, self).__init__()
            self.fc1 = nn.Linear(input_size, hidden_size)
            self.fc2 = nn.Linear(hidden_size, hidden_size)
            self.fc3 = nn.Linear(hidden_size, num_classes)

        def forward(self, x, p=dropout):
            out = F.relu(self.fc1(x))
            out = F.relu(self.fc2(out))
            out = nn.Dropout(out, p)  # drop
            out = self.fc3(out)
            return out

    # Prepare data
    X_train = torch.from_numpy(data_train_X.values).float()
    Y_train = torch.from_numpy(data_train_Y.values).float()

    # Loading data
    train = torch.utils.data.TensorDataset(X_train, Y_train)
    train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size)

    net = NeuralNet(input_size, hidden_size, num_classes)

    # Loss
    criterion = nn.CrossEntropyLoss()

    # Optimiser
    optimiser = torch.optim.SGD(net.parameters(), lr=learning_rate)

    # Proper training
    total_step = len(train_loader)
    loss_values = []

    for epoch in range(num_epochs + 1):
        net.train()
        train_loss = 0.0
        for i, (predictors, results) in enumerate(train_loader, 0):
            # Forward pass
            outputs = net(predictors)
            results = results.long()
            results = results.squeeze_()
            loss = criterion(outputs, results)
            # Backward and optimise
            optimiser.zero_grad()
            loss.backward()
            optimiser.step()
            # Update loss
            train_loss += loss.item()
        loss_values.append(train_loss / batch_size)

    print('Finished Training')
    return net
And when I call the function:
net = train_neural_network(data_train_X = data_train_X, data_train_Y = data_train_Y, batch_size = batch_size, learning_rate = learning_rate, dropout = 0.1)
The error is the following:

/usr/local/lib/python3.6/dist-packages/torch/nn/modules/dropout.py in __init__(self, p, inplace)
      8     def __init__(self, p=0.5, inplace=False):
      9         super(_DropoutNd, self).__init__()
---> 10         if p < 0 or p > 1:
     11             raise ValueError("dropout probability has to be between 0 and 1, "
     12                              "but got {}".format(p))

RuntimeError: bool value of Tensor with more than one value is ambiguous
Why do you think there is an error? Before adding the dropout rate, everything was working. Bonus points if you know how to implement a bias inside my network, for example on the hidden layer; I can't find any example online.
Change your architecture to this. nn.Dropout is a module: it must be constructed with the probability p and then called on a tensor. Passing the activation tensor into its constructor makes PyTorch evaluate p < 0 on a tensor, hence the ambiguous-bool error.
class NeuralNet(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes, p=dropout):
        super(NeuralNet, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, num_classes)
        self.dropout = nn.Dropout(p=p)

    def forward(self, x):
        out = F.relu(self.fc1(x))
        out = F.relu(self.fc2(out))
        out = self.dropout(out)  # apply dropout before the final layer, as in your original forward
        out = self.fc3(out)
        return out
Let me know if it works. As for the bias: nn.Linear already includes a bias term by default (bias=True), so your hidden layers have one unless you explicitly disable it.
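Alternatively (a sketch I'm adding, assuming the probability is stored as self.p in __init__), dropout can be applied functionally without a module:

import torch.nn.functional as F

def forward(self, x):
    out = F.relu(self.fc1(x))
    out = F.relu(self.fc2(out))
    # F.dropout takes the tensor and the probability directly; pass
    # self.training so dropout is active only in train mode.
    out = F.dropout(out, p=self.p, training=self.training)
    return self.fc3(out)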

How to do fully connected batch norm in PyTorch?

torch.nn has the classes BatchNorm1d, BatchNorm2d, and BatchNorm3d, but it doesn't have a fully connected BatchNorm class. What is the standard way of doing normal batch norm in PyTorch?
OK, I figured it out. BatchNorm1d can also handle rank-2 tensors, so it is possible to use BatchNorm1d for the normal fully-connected case.
So for example:
import torch.nn as nn
import torch.nn.functional as F  # needed for F.relu below

class Policy(nn.Module):
    def __init__(self, num_inputs, action_space, hidden_size1=256, hidden_size2=128):
        super(Policy, self).__init__()
        self.action_space = action_space
        num_outputs = action_space
        self.linear1 = nn.Linear(num_inputs, hidden_size1)
        self.linear2 = nn.Linear(hidden_size1, hidden_size2)
        self.linear3 = nn.Linear(hidden_size2, num_outputs)
        self.bn1 = nn.BatchNorm1d(hidden_size1)
        self.bn2 = nn.BatchNorm1d(hidden_size2)

    def forward(self, inputs):
        x = inputs
        x = self.bn1(F.relu(self.linear1(x)))
        x = self.bn2(F.relu(self.linear2(x)))
        out = self.linear3(x)
        return out
BatchNorm1d normally comes before the ReLU, and the preceding Linear's bias is redundant (BatchNorm subtracts the batch mean and adds its own learned shift), so:
import torch.nn as nn
import torch.nn.functional as F

class Policy(nn.Module):
    def __init__(self, num_inputs, action_space, hidden_size1=256, hidden_size2=128):
        super(Policy, self).__init__()
        self.action_space = action_space
        num_outputs = action_space
        self.linear1 = nn.Linear(num_inputs, hidden_size1, bias=False)
        self.linear2 = nn.Linear(hidden_size1, hidden_size2, bias=False)
        self.linear3 = nn.Linear(hidden_size2, num_outputs)
        self.bn1 = nn.BatchNorm1d(hidden_size1)
        self.bn2 = nn.BatchNorm1d(hidden_size2)

    def forward(self, inputs):
        x = inputs
        x = F.relu(self.bn1(self.linear1(x)))
        x = F.relu(self.bn2(self.linear2(x)))
        out = self.linear3(x)
        return out
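A quick check of the rank-2 claim (my addition, not part of the original answer):

import torch
import torch.nn as nn

bn = nn.BatchNorm1d(128)
x = torch.randn(32, 128)  # a batch of 32 plain feature vectors
print(bn(x).shape)        # torch.Size([32, 128])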
