I'm using PyTorch Lightning and PyTorch for LSTM classification. Whenever I train the model, this error shows:
File "<string>", line 1, in <module>
File "C:\Users\hisha\anaconda3\envs\FYP\lib\multiprocessing\spawn.py", line 116, in spawn_main
exitcode = _main(fd, parent_sentinel)
File "C:\Users\hisha\anaconda3\envs\FYP\lib\multiprocessing\spawn.py", line 126, in _main
self = reduction.pickle.load(from_parent)
AttributeError: Can't get attribute 'GaitDataset' on <module '__main__' (built-in)>
I'm implementing time series classification using sequences. The original model and data module code is below:
class GaitDataset(Dataset):
    def __init__(self, sequences):
        self.sequences = sequences

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        sequence, label = self.sequences[idx]
        return dict(
            sequence=torch.Tensor(sequence.to_numpy()),
            label=torch.tensor(label).long()
        )

class GaitDataModule(pl.LightningDataModule):
    def __init__(self, train_sequences, test_sequences, batch_size):
        super().__init__()
        self.train_sequences = train_sequences
        self.test_sequences = test_sequences
        self.batch_size = batch_size

    def setup(self, stage=None):
        self.train_dataset = GaitDataset(self.train_sequences)
        self.test_dataset = GaitDataset(self.test_sequences)

    def train_dataloader(self):
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=cpu_count()
        )

    def val_dataloader(self):
        return DataLoader(
            self.test_dataset,
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=cpu_count()
        )

    def test_dataloader(self):
        return DataLoader(
            self.test_dataset,
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=2
        )
N_EPOCHS = 250
BATCH_SIZE = 64
data_module = GaitDataModule(train_sequences, test_sequences, BATCH_SIZE)
class SequenceModel(nn.Module):
    def __init__(self, n_features, n_classes, n_hidden=256, n_layers=3):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=n_features,
            hidden_size=n_hidden,
            num_layers=n_layers,
            batch_first=True,
            dropout=0.5
        )
        self.classifier = nn.Linear(n_hidden, n_classes)

    def forward(self, x):
        self.lstm.flatten_parameters()
        _, (hidden, _) = self.lstm(x)
        out = hidden[-1]
        return self.classifier(out)

class GaitPredictor(pl.LightningModule):
    def __init__(self, n_features: int, n_classes: int):
        super().__init__()
        self.model = SequenceModel(n_features, n_classes)
        self.criterion = nn.CrossEntropyLoss()

    def forward(self, x, labels=None):
        output = self.model(x)
        loss = 0
        if labels is not None:
            loss = self.criterion(output, labels)
        return loss, output

    def training_step(self, batch, batch_idx):
        sequences = batch['sequence']
        labels = batch['label']
        loss, outputs = self(sequences, labels)
        predictions = torch.argmax(outputs, dim=1)
        step_accuracy = accuracy(predictions, labels)
        self.log('train_loss', loss, prog_bar=True, logger=True)
        self.log('train_accuracy', step_accuracy, prog_bar=True, logger=True)
        return {'loss': loss, 'accuracy': step_accuracy}

    def validation_step(self, batch, batch_idx):
        sequences = batch['sequence']
        labels = batch['label']
        loss, outputs = self(sequences, labels)
        predictions = torch.argmax(outputs, dim=1)
        step_accuracy = accuracy(predictions, labels)
        self.log('val_loss', loss, prog_bar=True, logger=True)
        self.log('val_accuracy', step_accuracy, prog_bar=True, logger=True)
        return {'loss': loss, 'accuracy': step_accuracy}

    def test_step(self, batch, batch_idx):
        sequences = batch['sequence']
        labels = batch['label']
        loss, outputs = self(sequences, labels)
        predictions = torch.argmax(outputs, dim=1)
        step_accuracy = accuracy(predictions, labels)
        self.log('test_loss', loss, prog_bar=True, logger=True)
        self.log('test_accuracy', step_accuracy, prog_bar=True, logger=True)
        return {'loss': loss, 'accuracy': step_accuracy}

    def configure_optimizers(self):
        return optim.Adam(self.parameters(), lr=0.0001)

model = GaitPredictor(
    n_features=len(FeatureColumns),
    n_classes=len(label_encoder.classes_)
)

checkpoint_callback = ModelCheckpoint(
    dirpath='checkpoints',
    filename='best-checkpoint',
    save_top_k=1,
    verbose=True,
    monitor='val_loss',
    mode='min'
)

logger = TensorBoardLogger('lightning_logs', name='Gait')

trainer = pl.Trainer(
    checkpoint_callback=checkpoint_callback,
    logger=logger,
    max_epochs=N_EPOCHS,
    gpus=1,
    progress_bar_refresh_rate=30
)

trainer.fit(model, data_module)
How do I fix this error? I am using the GaitDataset class to describe gait sequences for training, but it seems that Python is unable to import the class properly.
I followed this tutorial from YouTube:
https://youtu.be/PCgrgHgy26c
Environment:
python 3.9.7
pytorch 1.11.0
pytorch-lightning 1.5.10
Conda environment configured in Pycharm
This happens because the DataLoader workers are spawned as new processes (the default start method on Windows) that have to unpickle the dataset, and the spawned interpreter cannot resolve 'GaitDataset' on __main__. I suspect this is actually a bug in Lightning, since it should use the fully qualified name to resolve the class.
Funny enough, if you import the class in your main file it will be found.
Add
from your.package.here import GaitDataset
It gets funny if you have other wrapper logic happening.
Edit: See https://stackoverflow.com/a/68279928/1615430 for the cause of this error
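In practice that means defining GaitDataset in a module the spawned worker processes can import, rather than in __main__ (or a notebook cell), and guarding the entry point. A minimal sketch, assuming a hypothetical module name gait_data.py:

# gait_data.py -- hypothetical module holding the dataset definition
import torch
from torch.utils.data import Dataset

class GaitDataset(Dataset):
    def __init__(self, sequences):
        self.sequences = sequences

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        sequence, label = self.sequences[idx]
        return dict(
            sequence=torch.Tensor(sequence.to_numpy()),
            label=torch.tensor(label).long()
        )

# train.py -- the spawn start method re-imports this file, so the
# training code must sit behind the __main__ guard
from gait_data import GaitDataset  # now resolvable by the worker processes

if __name__ == '__main__':
    data_module = GaitDataModule(train_sequences, test_sequences, BATCH_SIZE)
    trainer.fit(model, data_module)

Setting num_workers=0 in the DataLoaders also sidesteps the error, at the cost of single-process data loading.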
Related
I have some code with which I wanted to train a neural network and save the finished model to a file. But I am getting an error caused by an incorrect split of the data into training and validation sets. I can't understand why:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

class ChatBot(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden):
        out, hidden = self.lstm(x, hidden)
        out = self.fc(out[:, -1, :])
        return out, hidden

    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        hidden = (weight.new(self.num_layers, batch_size, self.hidden_size).zero_(),
                  weight.new(self.num_layers, batch_size, self.hidden_size).zero_())
        return hidden

class ChatDataset(torch.utils.data.Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        return self.data[index]

def train(model, train_loader, loss_fn, optimizer, device):
    model.train()
    for inputs, targets in train_loader:
        inputs = inputs.to(device)
        targets = targets.to(device)
        hidden = model.init_hidden(inputs.size(0))
        hidden = tuple([each.data for each in hidden])
        optimizer.zero_grad()
        outputs, _ = model(inputs, hidden)
        loss = loss_fn(outputs.view(-1), targets.view(-1))
        loss.backward()
        optimizer.step()

def evaluate(model, val_loader, loss_fn, device):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for inputs, targets in val_loader:
            inputs = inputs.to(device)
            targets = targets.to(device)
            hidden = model.init_hidden(inputs.size(0))
            hidden = tuple([each.data for each in hidden])
            outputs, _ = model(inputs, hidden)
            total_loss += loss_fn(outputs, targets).item()
    return total_loss / len(val_loader)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

input_size = 500
hidden_size = 128
num_layers = 2
output_size = 500

model = ChatBot(input_size, hidden_size, num_layers, output_size)
model = model.to(device)

data = [("Hi, how are you?", "I'm doing well, thank you for asking."),
        ("What's your name?", "I'm a chatbot, I don't have a name."),
        ("What's the weather like?", "I'm not sure, I don't have access to current weather information."),
        ("What's the time?", "I'm not sure, I don't have access to the current time.")]

dataset = ChatDataset(data)
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [int(0.8 * len(dataset)), int(0.2 * len(dataset))])
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=32, shuffle=False)

loss_fn = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 100
for epoch in range(num_epochs):
    train(model, train_loader, loss_fn, optimizer, device)
    val_loss = evaluate(model, val_loader, loss_fn, device)
    print("Epoch [{}/{}], Validation Loss: {:.4f}".format(epoch+1, num_epochs, val_loss))

torch.save(model.state_dict(), 'chatbot_model.pt')
But when I run this code, I get an error:
ValueError                                Traceback (most recent call last)
<ipython-input-8-ae2a6dd1bc7c> in <module>
     78 dataset = ChatDataset(data)
     79
---> 80 train_dataset, val_dataset = torch.utils.data.random_split(dataset, [int(0.8 * len(dataset)), int(0.2 * len(dataset))])
     81
     82 train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)

/usr/local/lib/python3.8/dist-packages/torch/utils/data/dataset.py in random_split(dataset, lengths, generator)
    345     # Cannot verify that dataset is Sized
    346     if sum(lengths) != len(dataset):    # type: ignore[arg-type]
--> 347         raise ValueError("Sum of input lengths does not equal the length of the input dataset!")
    348
    349     indices = randperm(sum(lengths), generator=generator).tolist()  # type: ignore[call-overload]

ValueError: Sum of input lengths does not equal the length of the input dataset!
I don't know why this error occurs. Everything seems to be correct.
Typecasting the values to integers makes the sum of the two split lengths differ from the total number of samples in the dataset.
Not the most ideal code, but replacing the split with the following will work:
num_train_images = int(0.8 * len(dataset))
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [num_train_images, len(dataset) - num_train_images])
I suspect there could be a loss of precision in this calculation,
[int(0.8 * len(dataset)), int(0.2 * len(dataset))]
so the number of records in the dataset is not fully accounted for.
For example: int(.8 * 56) + int(.2 * 56) = 44 + 11 = 55.
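As an aside (assuming a reasonably recent PyTorch, 1.13 or later), random_split also accepts fractions directly and takes care of the rounding itself, so the lengths always sum to the dataset size:

# Fractions are floored and the remainder is distributed, so no samples are lost.
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [0.8, 0.2])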
I have been trying to train a model on vulnerability detection through source code. After a little bit of searching, I thought a very good starting point could be using a pre-trained transformer model from HuggingFace with PyTorch and PyTorch Lightning. I chose DistilBERT because it was the fastest one.
I have an imbalanced dataset, approximately 70% non-vulnerable and 30% vulnerable functions.
However, my results have been very bad. The model does not seem to learn and generalize: during training the train loss oscillates heavily, accuracy sits around 70 percent, and recall is extremely low (implying that the model always predicts one label).
I was wondering if there is anything obviously problematic with my code. This is the first time I am using a pre-trained model and PyTorch Lightning, and I cannot really tell what might have gone wrong.
class Model(pl.LightningModule):
    def __init__(self, n_classes, n_training_steps, n_warmup_steps, lr, fine_tune=False):
        super().__init__()
        self.save_hyperparameters()
        self.bert = DistilBert.from_pretrained(BERT_MODEL_NAME, return_dict=True)
        for name, param in self.bert.named_parameters():
            param.requires_grad = False
        self.classifier = nn.Linear(self.bert.config.hidden_size, self.hparams.n_classes)
        self.criterion = nn.BCELoss()

    def finetune(self):
        self.fine_tune = True
        for name, param in self.bert.named_parameters():
            if 'layer.5' in name:
                param.requires_grad = True

    def forward(self, input_ids, attention_mask, labels=None):
        x = self.bert(input_ids, attention_mask=attention_mask)
        x = x.last_hidden_state[:, 0, :]
        x = self.classifier(x)
        x = torch.sigmoid(x)
        x = x.squeeze(dim=-1)
        loss = 0
        if labels is not None:
            loss = self.criterion(x, labels.float())
        return loss, x

    def training_step(self, batch, batch_idx):
        enc, labels = batch
        input_ids, attention_mask = enc
        loss, outputs = self.forward(input_ids, attention_mask, labels)
        self.log("train_loss", loss, prog_bar=True, logger=True)
        return {'loss': loss, 'predictions': outputs, 'labels': labels}

    def validation_step(self, batch, batch_idx):
        enc, labels = batch
        input_ids, attention_mask = enc
        loss, outputs = self.forward(input_ids, attention_mask, labels)
        r = recall(outputs[:], labels[:])
        self.log("val_loss", loss, prog_bar=True, logger=True)
        self.log("val_recall", r, prog_bar=True, logger=True)
        return {'loss': loss, 'predictions': outputs, 'labels': labels}

    def test_step(self, batch, batch_idx):
        enc, labels = batch
        input_ids, attention_mask = enc
        loss, outputs = self.forward(input_ids, attention_mask, labels)
        self.log("test_loss", loss, prog_bar=True, logger=True)
        return {'loss': loss, 'predictions': outputs, 'labels': labels}

    def training_epoch_end(self, outputs):
        labels = []
        predictions = []
        for o in outputs:
            for o_labels in o['labels'].detach().cpu():
                labels.append(o_labels)
            for o_preds in o['predictions'].detach().cpu():
                predictions.append(o_preds)
        labels = torch.stack(labels).int()
        predictions = torch.stack(predictions)
        class_recall = recall(predictions[:], labels[:])
        self.logger.experiment.add_scalar("recall/Train", class_recall, self.current_epoch)

    def validation_epoch_end(self, outputs):
        labels = []
        predictions = []
        for o in outputs:
            for o_labels in o['labels'].detach().cpu():
                labels.append(o_labels)
            for o_preds in o['predictions'].detach().cpu():
                predictions.append(o_preds)
        labels = torch.stack(labels).int()
        predictions = torch.stack(predictions)
        class_recall = recall(predictions[:], labels[:])
        self.logger.experiment.add_scalar("recall/Validation", class_recall, self.current_epoch)

    def test_epoch_end(self, outputs):
        labels = []
        predictions = []
        for o in outputs:
            for o_labels in o['labels'].detach().cpu():
                labels.append(o_labels)
            for o_preds in o['predictions'].detach().cpu():
                predictions.append(o_preds)
        labels = torch.stack(labels).int()
        predictions = torch.stack(predictions)
        class_recall = recall(predictions[:], labels[:])
        self.logger.experiment.add_scalar("recall/Test", class_recall, self.current_epoch)

    def configure_optimizers(self):
        optimizer = AdamW(self.parameters(), lr=self.hparams.lr if self.hparams.fine_tune == False else self.hparams.lr // 100)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.hparams.n_warmup_steps,
            num_training_steps=self.hparams.n_training_steps
        )
        return dict(
            optimizer=optimizer,
            lr_scheduler=dict(
                scheduler=scheduler,
                interval='step'
            )
        )

if __name__ == "__main__":
    data_module = SourceCodeDataModule(batch_size=BATCH_SIZE)

    steps_per_epoch = len(train_loader) // BATCH_SIZE
    total_training_steps = steps_per_epoch * N_EPOCHS
    warmup_steps = total_training_steps // 5

    model = Model(
        n_classes=1,
        n_warmup_steps=warmup_steps,
        n_training_steps=total_training_steps,
        lr=2e-5
    )

    logger = TensorBoardLogger("lightning_logs", name="bert_predictor")
    early_stopping_callback = EarlyStopping(monitor='val_loss', patience=2)

    trainer = pl.Trainer(
        logger=logger,
        checkpoint_callback=checkpoint_callback,
        callbacks=[early_stopping_callback],
        max_epochs=N_EPOCHS,
        gpus=1 if str(device).startswith('cuda') else 0,
        progress_bar_refresh_rate=30
    )

    # First just train the final layer.
    trainer.fit(model, datamodule=data_module)
    result = trainer.test(model, datamodule=data_module)
    print(f"Result when training classifier only: {result}")

    # Then train the whole model
    model = Model.load_from_checkpoint(trainer.checkpoint_callback.best_model_path)
    model.finetune()
    trainer.fit(model, datamodule=data_module)
    result = trainer.test(model, datamodule=data_module)
    print(f"Result when fine tuning: {result}")
Here,

def finetune(self):
    self.fine_tune = True
    for name, param in self.bert.named_parameters():
        if 'layer.5' in name:
            param.requires_grad = True

try unfreezing more layers at the end of the network; maybe the frozen weights are saturated and the classifier head alone cannot learn enough. Also, pay attention to the loss you are using, as well as the activation function at the output.
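On the loss/activation point: with a roughly 70/30 class imbalance, one common option (a sketch of the idea, not a guaranteed fix) is to drop the explicit torch.sigmoid and feed raw logits to nn.BCEWithLogitsLoss with a pos_weight, which is more numerically stable and upweights the minority class:

import torch
import torch.nn as nn

# pos_weight ~ (# negatives) / (# positives); roughly 70/30 here
# (assumed from the class split described in the question).
criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor(70.0 / 30.0))

# In forward(), return the raw logits (no sigmoid) and compute:
#     loss = criterion(logits, labels.float())
# Apply sigmoid only where probabilities are needed, e.g. for the recall metric:
#     preds = torch.sigmoid(logits)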
I am trying to train my Q&A model with pytorch_lightning. However, while running the command trainer.fit(model, data_module) I am getting the following error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-72-b9cdaa88efa7> in <module>()
----> 1 trainer.fit(model,data_module)
4 frames
/usr/local/lib/python3.7/dist-packages/pytorch_lightning/trainer/trainer.py in _call_setup_hook(self)
1488
1489 if self.datamodule is not None:
-> 1490 self.datamodule.setup(stage=fn)
1491 self._call_callback_hooks("setup", stage=fn)
1492 self._call_lightning_module_hook("setup", stage=fn)
TypeError: setup() got an unexpected keyword argument 'stage'
I have installed and imported pytorch_lightning.
Also I have defined data_module = BioQADataModule(train_df, val_df, tokenizer, batch_size = BATCH_SIZE) where BATCH_SIZE = 2, N_EPOCHS = 6.
The model I have used is as follows:
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, return_dict=True)
Also, I have defined the class for the model as follows:
class BioQAModel(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, return_dict=True)

    def forward(self, input_ids, attention_mask, labels=None):
        output = self.model(
            input_ids=encoding["input_ids"],
            attention_mask=encoding["attention_mask"],
            labels=labels
        )
        return output.loss, output.logits

    def training_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        loss, outputs = self(input_ids, attention_mask, labels)
        self.log("train_loss", loss, prog_bar=True, logger=True)
        return loss

    def validation_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        loss, outputs = self(input_ids, attention_mask, labels)
        self.log("val_loss", loss, prog_bar=True, logger=True)
        return loss

    def test_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        loss, outputs = self(input_ids, attention_mask, labels)
        self.log("test_loss", loss, prog_bar=True, logger=True)
        return loss

    def configure_optimizers(self):
        return AdamW(self.parameters(), lr=0.0001)
For any additional information required, please specify.
Edit 1: Adding BioQADataModule:
class BioQADataModule(pl.LightningDataModule):
    def __init__(
        self,
        train_df: pd.DataFrame,
        test_df: pd.DataFrame,
        tokenizer: T5Tokenizer,
        batch_size: int = 8,
        source_max_token_len=396,
        target_max_token_len=32
    ):
        super().__init__()
        self.batch_size = batch_size
        self.train_df = train_df
        self.test_df = test_df
        self.tokenizer = tokenizer
        self.source_max_token_len = source_max_token_len
        self.target_max_token_len = target_max_token_len

    def setup(self):
        self.train_dataset = BioQADataset(
            self.train_df,
            self.tokenizer,
            self.source_max_token_len,
            self.target_max_token_len
        )
        self.test_dataset = BioQADataset(
            self.test_df,
            self.tokenizer,
            self.source_max_token_len,
            self.target_max_token_len
        )

    def train_dataloader(self):
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=4
        )

    def val_dataloader(self):
        return DataLoader(
            self.train_dataset,
            batch_size=1,
            shuffle=True,
            num_workers=4
        )

    def test_dataloader(self):
        return DataLoader(
            self.train_dataset,
            batch_size=1,
            shuffle=True,
            num_workers=4
        )
You need to add an extra stage argument to your setup method. The trainer invokes the hook as self.datamodule.setup(stage=fn) (visible in the traceback), so the override must accept it:

def setup(self, stage=None):
    self.train_dataset = BioQADataset(
        self.train_df,
        self.tokenizer,
        self.source_max_token_len,
        self.target_max_token_len
    )
    self.test_dataset = BioQADataset(
        self.test_df,
        self.tokenizer,
        self.source_max_token_len,
        self.target_max_token_len
    )
I've played with Pytorch Lightning myself for multi-GPU training here. Although some of the code is a bit outdated (metrics are a standalone module now), you might find it useful.
I am training resnet34 for handwriting recognition and want to speed up training using GradScaler(). I initialize AMP in the train_loop function, which runs the training session. My outputs should be enc_pad_texts, output_lenghts, text_lens, but feeding them to loss_fn gives me an error in forward. How can I rewrite forward or change loss_fn?
A function that combines images and target texts into a batch:
def collate_fn(batch):
    images, texts, enc_texts = zip(*batch)
    images = torch.stack(images, 0)
    text_lens = torch.LongTensor([len(text) for text in texts])
    enc_pad_texts = pad_sequence(enc_texts, batch_first=True, padding_value=0)
    return images, texts, enc_pad_texts, text_lens

from torch.cuda.amp import GradScaler
from torch.cuda.amp import autocast

loss_fn = nn.CrossEntropyLoss()

def train_loop(data_loader, model, criterion, optimizer, epoch):
    torch.autograd.set_detect_anomaly(False)
    torch.autograd.profiler.profile(False)
    torch.autograd.profiler.emit_nvtx(False)
    scaler = GradScaler()
    torch.backends.cudnn.benchmark = True
    loss_avg = AverageMeter()
    model.train()
    for images, texts, enc_pad_texts, text_lens in data_loader:
        model.zero_grad()
        images = images.to(DEVICE)
        batch_size = len(texts)
        with autocast():
            output = model(images)
            output_lenghts = torch.full(
                size=(output.size(1),),
                fill_value=output.size(0),
                dtype=torch.long
            )
            loss = criterion(output, enc_pad_texts, output_lenghts, text_lens)
            loss_avg.update(loss.item(), batch_size)
        loss = loss_fn(output, enc_pad_texts, output_lenghts, text_lens)
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), 2)
        scaler.update()
    for param_group in optimizer.param_groups:
        lr = param_group['lr']
    print(f'\nEpoch {epoch}, Loss: {loss_avg.avg:.5f}, LR: {lr:.7f}')
    return loss_avg.avg
def get_resnet34_backbone(pretrained=True):
    m = torchvision.models.resnet34(pretrained=True)
    input_conv = nn.Conv2d(3, 64, 7, 1, 3)
    blocks = [input_conv, m.bn1, m.relu,
              m.maxpool, m.layer1, m.layer2, m.layer3]
    return nn.Sequential(*blocks)

class BiLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, dropout=0.1):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size, hidden_size, num_layers,
            dropout=dropout, batch_first=True, bidirectional=True)

    def forward(self, x):
        out, _ = self.lstm(x)
        return out

class CRNN(nn.Module):
    def __init__(
        self, number_class_symbols, time_feature_count=256, lstm_hidden=256,
        lstm_len=2,
    ):
        super().__init__()
        self.feature_extractor = get_resnet34_backbone(pretrained=True)
        self.avg_pool = nn.AdaptiveAvgPool2d(
            (time_feature_count, time_feature_count))
        self.bilstm = BiLSTM(time_feature_count, lstm_hidden, lstm_len)
        self.classifier = nn.Sequential(
            nn.Linear(lstm_hidden * 2, time_feature_count),
            nn.GELU(),
            nn.Dropout(0.01),
            nn.Linear(time_feature_count, number_class_symbols)
        )

    def forward(self, x):
        x = self.feature_extractor(x)
        b, c, h, w = x.size()
        x = x.view(b, c * h, w)
        x = self.avg_pool(x)
        x = x.transpose(1, 2)
        x = self.bilstm(x)
        x = self.classifier(x)
        x = nn.functional.log_softmax(x, dim=2).permute(1, 0, 2)
        return x
<ipython-input-39-09956edc294f> in train(config)
122 for epoch in range(config['num_epochs']):
123
--> 124 loss_avg = train_loop(train_loader, model, criterion, optimizer, epoch)
125 acc_avg = val_loop(val_loader, model, tokenizer, DEVICE)
126 scheduler.step(acc_avg)
<ipython-input-39-09956edc294f> in train_loop(data_loader, model, criterion, optimizer, epoch)
42 loss_avg.update(loss.item(), batch_size)
43
---> 44 loss = loss_fn(output, enc_pad_texts, output_lenghts, text_lens)
45
46 scaler.scale(loss).backward()
/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
1049 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1050 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1051 return forward_call(*input, **kwargs)
1052 # Do not call functions when jit is used
1053 full_backward_hooks, non_full_backward_hooks = [], []
TypeError: forward() takes 3 positional arguments but 5 were given
The train function:
def train(config):
    tokenizer = Tokenizer(config['alphabet'])
    os.makedirs(config['save_dir'], exist_ok=True)
    train_loader, val_loader = get_loaders(tokenizer, config)
    model = CRNN(number_class_symbols=tokenizer.get_num_chars())
    model.load_state_dict(torch.load("/content/drive/MyDrive/model-1-0.6960.ckpt"))
    model.to(DEVICE)
    criterion = torch.nn.CTCLoss(blank=0, reduction='mean', zero_infinity=True)
    optimizer = torch.optim.AdamW(model.parameters(), lr=0.0001,
                                  weight_decay=0.1)
    scheduler = torch.optim.lr_scheduler.OneCycleLR(
        optimizer=optimizer,
        epochs=config.get('num_epochs'),
        steps_per_epoch=len(train_loader),
        max_lr=0.01,
        pct_start=0.07,
        anneal_strategy='cos',
        final_div_factor=10 ** 3
    )
    best_acc = -np.inf
    acc_avg = val_loop(val_loader, model, tokenizer, DEVICE)
    for epoch in range(config['num_epochs']):
        loss_avg = train_loop(train_loader, model, criterion, optimizer, epoch)
        acc_avg = val_loop(val_loader, model, tokenizer, DEVICE)
        scheduler.step(acc_avg)
        print(f"Epoch: {epoch} Loss_avg: {loss_avg} Acc_avg: {acc_avg} Step {scheduler.step(acc_avg)}")
        if acc_avg > best_acc:
            best_acc = acc_avg
            model_save_path = os.path.join(
                config['save_dir'], f'model-{epoch}-{acc_avg:.4f}.ckpt')
            torch.save(model.state_dict(), model_save_path)
            print('Model weights saved')
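For context on the error itself: nn.CrossEntropyLoss.forward accepts only (input, target), so the four-argument call loss_fn(output, enc_pad_texts, output_lenghts, text_lens) raises "forward() takes 3 positional arguments but 5 were given". The four-argument signature belongs to nn.CTCLoss, which train() already constructs as criterion. A minimal sketch of the difference (not a definitive fix for this code):

import torch.nn as nn

ce = nn.CrossEntropyLoss()   # called as ce(input, target) -- two arguments only
ctc = nn.CTCLoss(blank=0, reduction='mean', zero_infinity=True)
# called as ctc(log_probs, targets, input_lengths, target_lengths),
# i.e. ctc(output, enc_pad_texts, output_lenghts, text_lens) here --
# which is exactly what the criterion call inside autocast already does.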
I am trying to implement a custom ROC AUC callback for Keras. The callback I wrote is below. I have to achieve a target of 0.90.
AUROC callback:
class rocback(Callback):
    def __init__(self, validation_data):
        super(rocback, self).__init__()
        # self.training_data = training_data
        self.validation_data = validation_data

    def on_train_begin(self, logs={}):
        return

    def on_epoch_end(self, epoch, logs={}):
        probs = self.model.predict(self.validation_data[0])
        probs = np.round(probs)
        y_true = self.validation_data[1]
        y_true = np.round(y_true)
        score = roc_auc_score(y_true, probs, average='micro')
        logs['auc'] = score
Hence, the next callback I wrote stops training once the target is achieved:
class scoreTarget(Callback):
    def __init__(self, target):
        super(scoreTarget, self).__init__()
        self.target = target

    def on_epoch_end(self, epoch, logs={}):
        acc = logs['auc']
        if acc >= self.target:
            self.model.stop_training = True
The list of callbacks used is below:
roc_callback = rocback((X_test_pooled_output, y_test))
early_stopping = EarlyStopping(patience=5)
tensorboard = TensorBoard()
reduce_lr = ReduceLROnPlateau(patience=3)
target = scoreTarget(0.90)

callbacks = [
    roc_callback,
    early_stopping,
    tensorboard,
    reduce_lr,
    target,
]
The classifier I wrote is below:
class ReviewClassifier(Model):
    def __init__(self):
        super(ReviewClassifier, self).__init__()
        self.dense_1 = Dense(64, activation='relu')
        self.dense_2 = Dense(32, activation='relu')
        self.dense_3 = Dense(16, activation='relu')
        self.classify = Dense(1, activation='sigmoid')
        self.dropout_1 = Dropout(0.2)
        self.dropout_2 = Dropout(0.2)
        self.dropout_3 = Dropout(0.2)

    def call(self, inputs):
        x = self.dense_1(inputs)
        x = self.dropout_1(x)
        x = self.dense_2(x)
        x = self.dropout_2(x)
        x = self.dense_3(x)
        x = self.dropout_3(x)
        x = self.classify(x)
        return x

review_classifier = ReviewClassifier()
review_classifier.build((None, 768))
review_classifier.summary()
The compile call I wrote is this:

review_classifier.compile(loss='binary_crossentropy',
                          optimizer='adam', metrics=[rocback])
fit function:
!rm -rf ./logs/*
history = review_classifier.fit(X_train_pooled_output, y_train,
                                batch_size=32, epochs=100,
                                callbacks=callbacks,
                                validation_data=(X_test_pooled_output, y_test))
The error received is:
Epoch 1/100
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-54-fd7595a2c88c> in <module>()
1 get_ipython().system('rm -rf ./logs/*')
----> 2 history = review_classifier.fit(X_train_pooled_output, y_train, batch_size=32, epochs=100,callbacks=callbacks, validation_data=(X_test_pooled_output, y_test))
9 frames
/usr/local/lib/python3.7/dist-packages/tensorflow/python/framework/func_graph.py in wrapper(*args, **kwargs)
975 except Exception as e: # pylint:disable=broad-except
976 if hasattr(e, "ag_error_metadata"):
--> 977 raise e.ag_error_metadata.to_exception(e)
978 else:
979 raise
TypeError: in user code:
/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/engine/training.py:805 train_function *
return step_function(self, iterator)
/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/engine/training.py:795 step_function **
outputs = model.distribute_strategy.run(run_step, args=(data,))
/usr/local/lib/python3.7/dist-packages/tensorflow/python/distribute/distribute_lib.py:1259 run
return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
/usr/local/lib/python3.7/dist-packages/tensorflow/python/distribute/distribute_lib.py:2730 call_for_each_replica
return self._call_for_each_replica(fn, args, kwargs)
/usr/local/lib/python3.7/dist-packages/tensorflow/python/distribute/distribute_lib.py:3417 _call_for_each_replica
return fn(*args, **kwargs)
/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/engine/training.py:788 run_step **
outputs = model.train_step(data)
/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/engine/training.py:758 train_step
self.compiled_metrics.update_state(y, y_pred, sample_weight)
/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/engine/compile_utils.py:408 update_state
metric_obj.update_state(y_t, y_p, sample_weight=mask)
/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/utils/metrics_utils.py:90 decorated
update_op = update_state_fn(*args, **kwargs)
/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/metrics.py:177 update_state_fn
return ag_update_state(*args, **kwargs)
/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/metrics.py:618 update_state **
matches = ag_fn(y_true, y_pred, **self._fn_kwargs)
TypeError: __init__() takes 2 positional arguments but 3 were given
Any idea what I am doing wrong in the custom callback I created for ROC AUC? Let me know what more inputs you need.
As mentioned in the comments, you can use the built-in tf.keras.metrics.AUC while compiling the model: easy, fast, efficient. But it's also fine to use sklearn.metrics.roc_auc_score, although computing it in a callback may slow down your training. Try this:
class ROAUCMetrics(tf.keras.callbacks.Callback):
    def __init__(self, val_data):
        super().__init__()
        self.valid_x = val_data[0]
        self.valid_y = val_data[1]

    def on_train_begin(self, logs={}):
        self.val_aucs = []

    def on_epoch_end(self, epoch, logs={}):
        pred = self.model.predict(self.valid_x)
        val_auc = roc_auc_score(self.valid_y, pred, average='micro')
        print('\nval-roc-auc: %s' % (str(round(val_auc, 4))), end=100*' '+'\n')
        self.val_aucs.append(val_auc)

# sklearn auc
roc = ROAUCMetrics(val_data=(x_val, y_val))

# tf.keras auc
model.compile(.., ..., metrics=["AUC"])

# running
model.fit(x_train, y_train, batch_size=1024,
          epochs=..., callbacks=[roc],
          validation_data=(x_val, y_val))

# get the values of auc, computed using sklearn auc
roc.val_aucs
However, note that the two compute AUC differently: tf.keras approximates the ROC curve with a Riemann sum over a fixed set of thresholds, while sklearn computes it from the exact ranked scores. I've tested them on one example and they were pretty comparable, but sometimes they aren't.
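If the approximation matters, the built-in metric can be tightened by raising its threshold count; a small sketch using the standard tf.keras API:

import tensorflow as tf

# More thresholds -> finer Riemann sum -> closer to sklearn's exact value.
auc = tf.keras.metrics.AUC(num_thresholds=1000, curve='ROC')
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[auc])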
Based on your 1st comment, your setup should look like this:

# (1)
# compile with no metrics - the AUC comes from the custom callback instead.
# (Passing the rocback class itself in metrics=[...] is what raised the
# TypeError: Keras tried to call it as a metric function.)
review_classifier.compile(loss='binary_crossentropy', optimizer='adam')

# (2)
# or, we can add other metrics, e.g. 'accuracy' or whatever -
# here we use the built-in AUC
review_classifier.compile(loss='binary_crossentropy',
                          optimizer='adam', metrics=['AUC'])

# sklearn auc
roc = ROAUCMetrics(val_data=(x_val, y_val))
early_stopping = EarlyStopping(patience=5)
tensorboard = TensorBoard()
reduce_lr = ReduceLROnPlateau(patience=3)
target = scoreTarget(0.90)

callbacks = [
    roc,
    early_stopping,
    tensorboard,
    reduce_lr,
    target,
]

# fitting
model.fit(x_train, y_train, batch_size=1024,
          epochs=..., callbacks=callbacks,
          validation_data=(x_val, y_val))