L-BFGS Optimizer Not Changing Loss But Adam Is - python

I am building a physics-informed neural network to approximate PDEs. I am getting okay results with just the Adam optimizer, but I want to improve them. My plan is to use Adam for, say, 10,000 iterations and then the L-BFGS optimizer (PyTorch) for the last 1,000.
However, when using the L-BFGS optimizer the loss of the network never changes and remains constant. Here is the closure function used in my PINN for L-BFGS:
def closure(self):
    lbfgs_optim.zero_grad()
    train_loss = PINN.loss(xt_train_ICBC, u_train_ICBC, xt_resid, f_hat_train)
    train_loss.backward()
    return train_loss
Here are my optimizer parameters for both Adam and L-BFGS:
epochs_adam, epochs_lbfgs = 10000, 1000
adam_optim = torch.optim.Adam(PINN.parameters(), lr=lr, weight_decay=1e-5)
lbfgs_optim = torch.optim.LBFGS(PINN.parameters(), lr=lr, history_size=20,
                                max_iter=50, line_search_fn="strong_wolfe")
I am using one for loop for Adam and then another for loop for L-BFGS. Here is how Adam is used in my code, which works:
train_loss = PINN.loss(xt_train_ICBC, u_train_ICBC, xt_resid, f_hat_train)
... # print losses, append to lists
adam_optim.zero_grad()
train_loss.backward()
adam_optim.step()
Then for L-BFGS, which does not seem to work, all I call in my epoch loop is
lbfgs_optim.step(PINN.closure)
and I don't see any change in the loss. Why is that?
Versions being used: Python 3.9.12, PyTorch 1.11.0 and NumPy 1.21.5
EDIT: PINN code, and optimization/training code
class NN(nn.Module):
    # Heat Equation PDE
    def __init__(self, layers):
        super().__init__()
        self.activation = nn.Sigmoid()
        self.loss_function = nn.MSELoss(reduction='mean')
        self.linears = nn.ModuleList([nn.Linear(layers[i], layers[i+1]) for i in range(len(layers)-1)])
        for i in range(len(layers)-1):
            nn.init.xavier_normal_(self.linears[i].weight.data, gain=1.0)
            nn.init.zeros_(self.linears[i].bias.data)

    def forward(self, x):
        a = x.float()
        for i in range(0, len(layers)-2):
            z = self.linears[i](a)
            a = self.activation(z)
        a = self.linears[-1](a)
        return a

    def lossICBC(self, x_ICBC, u_ICBC):
        """MSE losses for boundary and initial conditions"""
        loss_ICBC = self.loss_function(self.forward(x_ICBC), u_ICBC)
        return loss_ICBC

    def lossPDE(self, xt_residual, f_hat):
        """Residual loss for collocation points"""
        g = xt_residual.clone().float()
        g.requires_grad = True
        f = self.forward(g)
        f_xt = autograd.grad(f, g, torch.ones(g.shape[0], 1).to(device), create_graph=True)[0]
        f_xx_tt = autograd.grad(f_xt, g, torch.ones(g.shape).to(device), create_graph=True)[0]
        f_t = f_xt[:, [1]]      # extract just the t values
        f_xx = f_xx_tt[:, [0]]  # extract just the x values
        f = f_t - k*f_xx
        return self.loss_function(f, f_hat)

    def closure(self):
        lbfgs_optim.zero_grad()
        train_loss = PINN.loss(xt_train_ICBC, u_train_ICBC, xt_resid, f_hat_train)
        train_loss.backward()
        return train_loss

    def loss(self, x_ICBC, u_ICBC, xt_residual, f_hat):
        """Total loss"""
        loss_ICBC = self.lossICBC(x_ICBC, u_ICBC)
        loss_PDE = self.lossPDE(xt_residual, f_hat)  # f_hat = torch.zeros()
        return loss_PDE + loss_ICBC
lr_adam = 0.001
lr_lbfgs = 1
epochs_adam = 20000
adam_optim = torch.optim.Adam(PINN.parameters(), lr=lr)
epochs_lbfgs = 100
lbfgs_optim = torch.optim.LBFGS(PINN.parameters(), lr=15, history_size=20,
                                max_iter=50, line_search_fn="strong_wolfe")
Training loops
for i in range(0, epochs_adam+1):
    train_loss = PINN.loss(xt_train_ICBC, u_train_ICBC, xt_resid, f_hat_train)
    adam_optim.zero_grad()
    train_loss.backward()
    adam_optim.step()

for i in range(0, epochs_lbfgs+1):
    train_loss = lbfgs_optim.step(PINN.closure)
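For comparison, here is a minimal, self-contained L-BFGS closure example (a toy fit, not my PINN; the model and data below are made up just to show the step(closure) pattern I am trying to follow):

import torch

# Toy setup (illustrative only): a small network fitted to random data
model = torch.nn.Sequential(torch.nn.Linear(2, 20), torch.nn.Tanh(), torch.nn.Linear(20, 1))
x, y = torch.rand(100, 2), torch.rand(100, 1)

lbfgs = torch.optim.LBFGS(model.parameters(), lr=1.0, max_iter=50,
                          history_size=20, line_search_fn="strong_wolfe")

def closure():
    lbfgs.zero_grad()                                 # clear gradients from the previous evaluation
    loss = torch.nn.functional.mse_loss(model(x), y)  # re-evaluate the loss
    loss.backward()                                   # the closure must recompute gradients
    return loss

for epoch in range(10):
    loss = lbfgs.step(closure)                        # L-BFGS may call closure several times per step
    print(loss.item())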

Related

The pytorch training model cannot be created successfully

I would like to build a neural network for regression analysis using Optuna, based on this site.
I would like to create a model with two 1D arrays as input and one 1D array as output, trained in batches.
x is the training data and y is the target data.
class Model(nn.Module):
    # Constructor (initialization when an instance is created)
    def __init__(self, trial, mid_units1, mid_units2):
        super(Model, self).__init__()
        self.linear1 = nn.Linear(2, mid_units1)
        self.bn1 = nn.BatchNorm1d(mid_units1)
        self.linear2 = nn.Linear(mid_units1, mid_units2)
        self.bn2 = nn.BatchNorm1d(mid_units2)
        self.linear3 = nn.Linear(mid_units2, 1)
        self.activation = trial_activation(trial)

    def forward(self, x):
        x = self.linear1(x)
        x = self.bn1(x)
        x = self.activation(x)
        x = self.linear2(x)
device = "cuda" if torch.cuda.is_available() else "cpu"
EPOCH = 100
x = torch.from_numpy(a[0].astype(np.float32)).to(device)
y = torch.from_numpy(a[1].astype(np.float32)).to(device)
def train_epoch(model, optimizer, criterion):
    model.train()
    optimizer.zero_grad()                         # reset gradients to zero
    y_pred = model(x)                             # prediction
    loss = criterion(y_pred.reshape(y.shape), y)  # compute the loss (align shapes)
    loss.backward()                               # compute the gradients
    optimizer.step()                              # apply the gradient update
    return loss.item()
def trial_activation(trial):
    activation_names = ['ReLU', 'logsigmoid']
    activation_name = trial.suggest_categorical('activation', activation_names)
    if activation_name == activation_names[0]:
        activation = F.relu
    else:
        activation = F.logsigmoid
    return activation
def objective(trial):
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # Trial values for the number of units in the hidden layers
    mid_units1 = int(trial.suggest_discrete_uniform("mid_units1", 1024*2, 1024*4, 64*2))
    mid_units2 = int(trial.suggest_discrete_uniform("mid_units2", 1024, 1024*2, 64*2))
    net = Model(trial, mid_units1, mid_units2).to(device)
    criterion = nn.MSELoss()
    # Trial value for the optimization method
    optimizer = trial_optimizer(trial, net)
    train_loss = 0
    for epoch in range(EPOCH):
        train_loss = train_epoch(net, optimizer, criterion, device)
    torch.save(net.state_dict(), str(trial.number) + "new1.pth")
    return train_loss
strage_name = "a.sql"
study_name = 'a'
study = optuna.create_study(
    study_name=study_name,
    storage='sqlite:///' + strage_name,
    load_if_exists=True,
    direction='minimize')
TRIAL_SIZE = 100
study.optimize(objective, n_trials=TRIAL_SIZE)
Error message:
---> 28 loss = criterion(y_pred.reshape(y.shape), y)  # compute the loss (align shapes)
     29 loss.backward()                               # compute the gradients
     30 optimizer.step()                              # apply the gradient update
AttributeError: 'NoneType' object has no attribute 'reshape'
Because of the above error, I checked the value of y_pred and found it to be None.
model.train()
optimizer.zero_grad()
I am thinking that these two lines may be wrong, but I don't know how to solve this problem.
With PyTorch, when you call y_pred = model(x), it calls the forward function defined in the Model class.
So y_pred gets the result of the forward function; in your case the forward function returns nothing, which is why you get a None value. You can change the forward function as below:
def forward(self, x):
    x = self.linear1(x)
    x = self.bn1(x)
    x = self.activation(x)
    x = self.linear2(x)
    return x
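Since your __init__ also defines bn2 and linear3, you presumably want the full forward pass to use them as well; a possible sketch (assuming the same layer names from your class) would be:

def forward(self, x):
    x = self.linear1(x)
    x = self.bn1(x)
    x = self.activation(x)
    x = self.linear2(x)
    x = self.bn2(x)
    x = self.activation(x)
    x = self.linear3(x)   # final layer maps to the single regression output
    return x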

Implementing Early Stopping in Pytorch without Torchsample

As a PyTorch newbie (coming from TensorFlow), I am unsure of how to implement early stopping. My research has led me to discover that PyTorch does not have a native way to do this. I have also discovered torchsample, but am unable to install it in my conda environment for whatever reason. Is there a simple way to apply early stopping without it? Here is my current setup:
class RegressionDataset(Dataset):
    def __init__(self, X_data, y_data):
        self.X_data = X_data
        self.y_data = y_data

    def __getitem__(self, index):
        return self.X_data[index], self.y_data[index]

    def __len__(self):
        return len(self.X_data)
train_dataset = RegressionDataset(torch.from_numpy(X_train).float(), torch.from_numpy(y_train).float())
val_dataset = RegressionDataset(torch.from_numpy(X_val).float(), torch.from_numpy(y_val).float())
test_dataset = RegressionDataset(torch.from_numpy(X_test).float(), torch.from_numpy(y_test).float())
# Model Params
EPOCHS = 100
BATCH_SIZE = 1000
LEARNING_RATE = 0.001
NUM_FEATURES = np.shape(X_test)[1]
# Initialize Dataloader
train_loader = DataLoader(dataset = train_dataset, batch_size=BATCH_SIZE, shuffle = True)
val_loader = DataLoader(dataset = val_dataset, batch_size=BATCH_SIZE)
test_loader = DataLoader(dataset = test_dataset, batch_size=BATCH_SIZE)
# Define Neural Network Architecture
class MultipleRegression(nn.Module):
    def __init__(self, num_features):
        super(MultipleRegression, self).__init__()
        # Define architecture
        self.layer_1 = nn.Linear(num_features, 16)
        self.layer_2 = nn.Linear(16, 32)
        self.layer_3 = nn.Linear(32, 25)
        self.layer_4 = nn.Linear(25, 20)
        self.layer_5 = nn.Linear(20, 16)
        self.layer_out = nn.Linear(16, 1)
        self.relu = nn.ReLU()  # ReLU applied to all layers
        # Initialize weights and biases
        nn.init.xavier_uniform_(self.layer_1.weight)
        nn.init.zeros_(self.layer_1.bias)
        nn.init.xavier_uniform_(self.layer_2.weight)
        nn.init.zeros_(self.layer_2.bias)
        nn.init.xavier_uniform_(self.layer_3.weight)
        nn.init.zeros_(self.layer_3.bias)
        nn.init.xavier_uniform_(self.layer_4.weight)
        nn.init.zeros_(self.layer_4.bias)
        nn.init.xavier_uniform_(self.layer_5.weight)
        nn.init.zeros_(self.layer_5.bias)
        nn.init.xavier_uniform_(self.layer_out.weight)
        nn.init.zeros_(self.layer_out.bias)

    def forward(self, inputs):
        x = self.relu(self.layer_1(inputs))
        x = self.relu(self.layer_2(x))
        x = self.relu(self.layer_3(x))
        x = self.relu(self.layer_4(x))
        x = self.relu(self.layer_5(x))
        x = self.layer_out(x)
        return x

    def predict(self, test_inputs):
        x = self.relu(self.layer_1(test_inputs))
        x = self.relu(self.layer_2(x))
        x = self.relu(self.layer_3(x))
        x = self.relu(self.layer_4(x))
        x = self.relu(self.layer_5(x))
        x = self.layer_out(x)
        return x
# Check for GPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
model = MultipleRegression(NUM_FEATURES)
model.to(device)
print(model)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr = LEARNING_RATE)
# define dictionary to store loss/epochs for training and validation
loss_stats = {
    "train": [],
    "val": []
}
# begin training
print("Begin Training")
for e in tqdm(range(1, EPOCHS+1)):
    # Training
    train_epoch_loss = 0
    model.train()
    for X_train_batch, y_train_batch in train_loader:
        X_train_batch, y_train_batch = X_train_batch.to(device), y_train_batch.to(device)
        optimizer.zero_grad()
        y_train_pred = model(X_train_batch)
        train_loss = criterion(y_train_pred, y_train_batch.unsqueeze(1))
        train_loss.backward()
        optimizer.step()
        train_epoch_loss += train_loss.item()
    # validation
    with torch.no_grad():
        val_epoch_loss = 0
        model.eval()
        for X_val_batch, y_val_batch in val_loader:
            X_val_batch, y_val_batch = X_val_batch.to(device), y_val_batch.to(device)
            y_val_pred = model(X_val_batch)
            val_loss = criterion(y_val_pred, y_val_batch.unsqueeze(1))
            val_epoch_loss += val_loss.item()
    loss_stats["train"].append(train_epoch_loss/len(train_loader))
    loss_stats["val"].append(val_epoch_loss/len(val_loader))
    print(f"Epoch {e}: Train loss: {train_epoch_loss/len(train_loader):.5f} | Val loss: {val_epoch_loss/len(val_loader):.5f}")
# Visualize loss and accuracy
train_val_loss_df = pd.DataFrame.from_dict(loss_stats).reset_index().melt(id_vars=["index"]).rename(columns = {"index":"epochs"})
plt.figure()
sns.lineplot(data = train_val_loss_df, x = "epochs", y = "value", hue = "variable").set_title("Train-Val Loss/Epoch")
# Test model
y_pred_list = []
with torch.no_grad():
    model.eval()
    for X_batch, _ in test_loader:
        X_batch = X_batch.to(device)
        y_test_pred = model(X_batch)
        y_pred_list.append(y_test_pred.cpu().numpy())
y_pred_list = [a.squeeze().tolist() for a in y_pred_list]
y_pred_list = [item for sublist in y_pred_list for item in sublist]
y_pred_list = np.array(y_pred_list)
mse = mean_squared_error(y_test, y_pred_list)
r_square = r2_score(y_test, y_pred_list)
print("Mean Squared Error :", mse)
print("R^2 :", r_square)
A basic way to do this is to keep track of the best validation loss obtained so far.
You can have a variable best_loss = float('inf') initialized before your loop over epochs (or you could do other things like best loss per epoch, etc.).
After each validation pass then do:
if val_loss < best_loss:
    best_loss = val_loss
    # At this point also save a snapshot of the current model
    torch.save(model, 'my_model_best_loss.pth')
Then, if best_loss does not improve significantly after some number of training steps, or by the end of the epoch, or if val_loss gets worse, break out of the loop and terminate training there.
For implementing algorithms like early stopping (and your training loop in general) you may find it easier to give PyTorch Lightning a try (no affiliation, but it's much easier than trying to roll everything by hand).
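To make this concrete, here is a minimal sketch of patience-based early stopping wired into a loop shaped like yours (val_epoch_loss, val_loader, EPOCHS and model are assumed from your code; patience is an illustrative setting):

best_val_loss = float("inf")
patience = 10               # epochs without improvement to tolerate before stopping
epochs_no_improve = 0

for e in range(1, EPOCHS + 1):
    # ... training pass and validation pass exactly as in your loop ...
    avg_val_loss = val_epoch_loss / len(val_loader)

    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        epochs_no_improve = 0
        torch.save(model.state_dict(), "best_model.pth")  # snapshot of the best model so far
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= patience:
            print(f"Stopping early at epoch {e}; no improvement for {patience} epochs")
            break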

how to save Keras Model instance of class

I am using a Seq2Seq project from Google that uses an encoder/decoder. Here are the two classes, the encoder and the decoder:
#ENCODER
class EncoderNetwork(tf.keras.Model):
    def __getstate__(self):
        d = self.__dict__.copy()
        d.pop('_parents', None)
        return d

    def __init__(self, input_vocab_size, embedding_dims, rnn_units):
        super().__init__()
        self.encoder_embedding = tf.keras.layers.Embedding(input_dim=input_vocab_size,
                                                           output_dim=embedding_dims)
        self.encoder_rnnlayer = tf.keras.layers.LSTM(rnn_units, return_sequences=True,
                                                     return_state=True)
        encoder_embedding = self.encoder_embedding
        encoder_rnnlayer = self.encoder_rnnlayer
#DECODER
class DecoderNetwork(tf.keras.Model):
    def __getstate__(self):
        d = self.__dict__.copy()
        d.pop('_parents', None)
        return d

    def __init__(self, output_vocab_size, embedding_dims, rnn_units):
        super().__init__()
        self.decoder_embedding = tf.keras.layers.Embedding(input_dim=output_vocab_size,
                                                           output_dim=embedding_dims)
        self.dense_layer = tf.keras.layers.Dense(output_vocab_size)
        self.decoder_rnncell = tf.keras.layers.LSTMCell(rnn_units)
        # Sampler
        self.sampler = tfa.seq2seq.sampler.TrainingSampler()
        # Create attention mechanism with memory = None
        self.attention_mechanism = self.build_attention_mechanism(dense_units, None, BATCH_SIZE*[Tx])
        self.rnn_cell = self.build_rnn_cell(BATCH_SIZE)
        self.decoder = tfa.seq2seq.BasicDecoder(self.rnn_cell, sampler=self.sampler,
                                                output_layer=self.dense_layer)

    def build_attention_mechanism(self, units, memory, memory_sequence_length):
        return tfa.seq2seq.LuongAttention(units, memory=memory,
                                          memory_sequence_length=memory_sequence_length)
        #return tfa.seq2seq.BahdanauAttention(units, memory=memory, memory_sequence_length=memory_sequence_length)

    # wrap the decoder RNN cell
    def build_rnn_cell(self, batch_size):
        rnn_cell = tfa.seq2seq.AttentionWrapper(self.decoder_rnncell, self.attention_mechanism,
                                                attention_layer_size=dense_units)
        return rnn_cell

    def build_decoder_initial_state(self, batch_size, encoder_state, Dtype):
        decoder_initial_state = self.rnn_cell.get_initial_state(batch_size=batch_size,
                                                                dtype=Dtype)
        decoder_initial_state = decoder_initial_state.clone(cell_state=encoder_state)
        return decoder_initial_state
I create instances of EncoderNetwork and DecoderNetwork with my arguments and use the loss_function and train_step defined below to train my model:
def loss_function(y_pred, y):
    # shape of y      [batch_size, Ty]
    # shape of y_pred [batch_size, Ty, output_vocab_size]
    sparsecategoricalcrossentropy = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True,
                                                                                  reduction='none')
    loss = sparsecategoricalcrossentropy(y_true=y, y_pred=y_pred)
    mask = tf.logical_not(tf.math.equal(y, 0))  # output 0 for y=0, else output 1
    mask = tf.cast(mask, dtype=loss.dtype)
    loss = mask * loss
    loss = tf.reduce_mean(loss)
    return loss
def train_step(input_batch, output_batch, encoder_initial_cell_state):
    # initialize loss = 0
    loss = 0
    with tf.GradientTape() as tape:
        encoder_emb_inp = encoderNetwork.encoder_embedding(input_batch)
        a, a_tx, c_tx = encoderNetwork.encoder_rnnlayer(encoder_emb_inp,
                                                        initial_state=encoder_initial_cell_state)
        # [last step activations, last memory_state] of encoder passed as input to decoder network
        # Prepare correct decoder input & output sequence data
        decoder_input = output_batch[:, :-1]   # ignore <end>
        # compare logits with the timestep +1 version of decoder_input
        decoder_output = output_batch[:, 1:]   # ignore <start>
        # Decoder embeddings
        decoder_emb_inp = decoderNetwork.decoder_embedding(decoder_input)
        # Set up decoder memory from encoder output and zero state for AttentionWrapperState
        decoderNetwork.attention_mechanism.setup_memory(a)
        decoder_initial_state = decoderNetwork.build_decoder_initial_state(BATCH_SIZE,
                                                                           encoder_state=[a_tx, c_tx],
                                                                           Dtype=tf.float32)
        # BasicDecoderOutput
        outputs, _, _ = decoderNetwork.decoder(decoder_emb_inp, initial_state=decoder_initial_state,
                                               sequence_length=BATCH_SIZE*[Ty-1])
        logits = outputs.rnn_output
        # Calculate loss
        loss = loss_function(logits, decoder_output)

    # Returns the list of all layer variables / weights.
    variables = encoderNetwork.trainable_variables + decoderNetwork.trainable_variables
    # differentiate loss wrt variables
    gradients = tape.gradient(loss, variables)
    # grads_and_vars – list of (gradient, variable) pairs
    grads_and_vars = zip(gradients, variables)
    optimizer.apply_gradients(grads_and_vars)
    return loss
The training does not use the fit() method; instead it goes like this:
epochs = 20
for i in range(1, epochs+1):
    encoder_initial_cell_state = initialize_initial_state()
    total_loss = 0.0
    for (batch, (input_batch, output_batch)) in enumerate(dataset.take(steps_per_epoch)):
        batch_loss = train_step(input_batch, output_batch, encoder_initial_cell_state)
        total_loss += batch_loss
        if (batch+1) % 5 == 0:
            print("total loss: {} epoch {} batch {} ".format(batch_loss.numpy(), i, batch+1))
The results are fine and the predict function (a custom predict function) works perfectly, but how can I save the model? I tried pickle and keras.save() but it doesn't work. Any ideas?
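One approach that generally works for subclassed tf.keras.Model instances like these is to save only the weights (for example via a tf.train.Checkpoint) and rebuild the networks before restoring, rather than pickling the whole objects. A sketch, assuming your encoderNetwork, decoderNetwork and optimizer objects:

# Sketch: checkpoint the trained variables instead of pickling the model objects
checkpoint = tf.train.Checkpoint(encoder=encoderNetwork,
                                 decoder=decoderNetwork,
                                 optimizer=optimizer)
checkpoint.save("training_checkpoints/ckpt")

# Later: recreate EncoderNetwork/DecoderNetwork with the same constructor arguments,
# then restore the saved variables into them
checkpoint.restore(tf.train.latest_checkpoint("training_checkpoints"))

# Per-model weight files are another option once the variables have been built:
encoderNetwork.save_weights("encoder_weights")
decoderNetwork.save_weights("decoder_weights")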

LSTM implementation / overfitting

I am having a problem with an implementation of an LSTM. I am not sure if I have the right implementation or whether this is just an overfitting problem. I am doing essay grading using an LSTM, scoring text with a score from 0 to 10 (or another range). I am using the ASAP Kaggle competition data as one of the training sets.
However, the main goal is to achieve good performance on a private dataset with around 500 samples. The 500 samples include the validation and training sets. I have previously done some experiments and got the model to work, but after fiddling with something the model doesn't fit anymore. The model does not improve at all. I have also re-implemented the code in a cleaner, much more object-oriented manner and still can't reproduce my previous results.
However, I am getting the model to fit my data, just with tremendous overfitting. I am not sure if this is an implementation problem of some sort or just overfitting, but I cannot get the model to work. The maximum I can get with the LSTM on ASAP essay set 1 is 0.35 kappa. For some bizarre reason, I can get a single-layer fully connected model to 0.75 kappa. I think this is an implementation problem, but I am not sure.
Here is my old code:
train.py
import gensim
import numpy as np
import pandas as pd
import torch
from sklearn.metrics import cohen_kappa_score
from torch import nn
import torch.utils.data as data_utils
from torch.optim import Adam
from dataset import AESDataset
from network import Network
from optimizer import Ranger
from qwk import quadratic_weighted_kappa, kappa
batch_size = 32
device = "cuda:0"
torch.manual_seed(1000)
# Load data from csv
file_name = "data/data_new.csv"
data = pd.read_csv(file_name)
arr = data.to_numpy()
text = arr[:, :2]
text = [str(line[0]) + str(line[1]) for line in text]
text = [gensim.utils.simple_preprocess(line) for line in text]
score = arr[:,2]
score = [sco*6 for sco in score]
score = np.asarray(score, dtype=int)
train_dataset = AESDataset(text_arr=text[:400], scores=score[:400])
test_dataset = AESDataset(text_arr=text[400:], scores=score[400:])
score = torch.tensor(score).view(-1,1).long().to(device)
train_loader = data_utils.DataLoader(train_dataset,shuffle=True, batch_size=batch_size, drop_last=True)
test_loader = data_utils.DataLoader(test_dataset,shuffle=True,batch_size=batch_size, drop_last=True)
out_class = 61
epochs = 1000
model = Network(out_class).to(device)
model.load_state_dict(torch.load("model/best_model"))
y_onehot = torch.FloatTensor(batch_size, out_class).to(device)
optimizer = Adam(model.parameters())
criti = torch.nn.CrossEntropyLoss()
# model, optimizer = amp.initialize(model, optimizer, opt_level="O2")
step = 0
for i in range(epochs):
    #Testing
    if i % 1 == 0:
        total_loss = 0
        total_kappa = 0
        total_batches = 0
        model.eval()
        for (text, score) in test_loader:
            out = model(text)
            out_score = torch.argmax(out, 1)
            y_onehot.zero_()
            y_onehot.scatter_(1, score, 1)
            kappa_l = cohen_kappa_score(score.view(batch_size).tolist(), out_score.view(batch_size).tolist())
            score = score.view(-1)
            loss = criti(out, score.view(-1))
            total_loss += loss
            total_kappa += kappa_l
            total_batches += 1
        print(f"Epoch {i} Testing kappa {total_kappa/total_batches} loss {total_loss/total_batches}")
        with open(f"model/epoch_{i}", "wb") as f:
            torch.save(model.state_dict(), f)
        model.train()
    #Training
    for (text, score) in train_loader:
        optimizer.zero_grad()
        step += 1
        out = model(text)
        out_score = torch.argmax(out, 1)
        y_onehot.zero_()
        y_onehot.scatter_(1, score, 1)
        kappa_l = cohen_kappa_score(score.view(batch_size).tolist(), out_score.view(batch_size).tolist())
        loss = criti(out, score.view(-1))
        print(f"Epoch {i} step {step} kappa {kappa_l} loss {loss}")
        loss.backward()
        optimizer.step()
dataset.py
import gensim
import torch
import numpy as np
class AESDataset(torch.utils.data.Dataset):
    def __init__(self, text_arr, scores):
        self.data = text_arr
        self.scores = scores
        self.w2v_model = ("w2vec_model_all")
        self.max_len = 500

    def __getitem__(self, item):
        vector = []
        essay = self.data[item]
        pad_vec = [1 for i in range(300)]
        for i in range(self.max_len - len(essay)):
            vector.append(pad_vec)
        for word in essay:
            word_vec = pad_vec
            try:
                word_vec = self.w2v_model[word]
            except:
                #print(f"Skipping word as word {word} not in dictionary")
                word_vec = pad_vec
            vector.append(word_vec)
        #print(len(vector))
        vector = np.stack(vector)
        tensor = torch.tensor(vector[:self.max_len]).float().to("cuda")
        score = self.scores[item]
        score = torch.tensor(score).long().to("cuda").view(1)
        return tensor, score

    def __len__(self):
        return len(self.scores)
network.py
import torch.nn as nn
import torch
import torch.nn.functional as F
class Network(nn.Module):
    def __init__(self, output_size):
        super(Network, self).__init__()
        self.lstm = nn.LSTM(300, 500, 1, batch_first=True)
        self.dropout = nn.Dropout(p=0.5)
        #self.l2 = nn.L2
        self.linear = nn.Linear(500, output_size)

    def forward(self, x):
        x, _ = self.lstm(x)
        x = x[:, -1, :]
        x = self.dropout(x)
        x = self.linear(x)
        return x
My new code: https://github.com/Clement-Hui/EssayGrading
I think the problem is in the training code. Since you are using an LSTM, you are supposed to re-initialize the hidden and cell state at every epoch and detach them from the computation graph after each batch.
network.py
import torch.nn as nn
import torch
import torch.nn.functional as F
class Network(nn.Module):
    def __init__(self, output_size):
        super(Network, self).__init__()
        self.lstm = nn.LSTM(300, 500, 1, batch_first=True)
        self.dropout = nn.Dropout(p=0.5)
        #self.l2 = nn.L2
        self.linear = nn.Linear(500, output_size)

    def forward(self, x, hidden):
        x, hidden = self.lstm(x, hidden)
        x = x.contiguous().view(-1, 500)
        x = self.dropout(x)
        x = self.linear(x)
        return x, hidden

    def init_hidden(self, batch_size):
        weights = next(self.parameters()).data
        hidden = (weights.new(1, batch_size, 500).zero_().cuda(),
                  weights.new(1, batch_size, 500).zero_().cuda())
        return hidden
train.py
# your code for initializing the model, the data, and everything else
for i in range(epochs):
    #Testing
    if i % 1 == 0:
        total_loss = 0
        total_kappa = 0
        total_batches = 0
        model.eval()
        val_h = model.init_hidden(batch_size)  # initialize the hidden state
        for (text, score) in test_loader:
            # Creating new variables for the hidden state, otherwise
            # we'd backprop through the entire training history
            val_h = tuple([each.data for each in val_h])
            out, val_h = model(text, val_h)
            out_score = torch.argmax(out, 1)
            y_onehot.zero_()
            y_onehot.scatter_(1, score, 1)
            kappa_l = cohen_kappa_score(score.view(batch_size).tolist(), out_score.view(batch_size).tolist())
            score = score.view(-1)
            loss = criti(out, score.view(-1))
            total_loss += loss
            total_kappa += kappa_l
            total_batches += 1
        print(f"Epoch {i} Testing kappa {total_kappa/total_batches} loss {total_loss/total_batches}")
        with open(f"model/epoch_{i}", "wb") as f:
            torch.save(model.state_dict(), f)
        model.train()
    #Training
    h = model.init_hidden(batch_size)  # initialize the hidden state
    for (text, score) in train_loader:
        optimizer.zero_grad()
        step += 1
        # Creating new variables for the hidden state, otherwise
        # we'd backprop through the entire training history
        h = tuple([each.data for each in h])
        out, h = model(text, h)
        out_score = torch.argmax(out, 1)
        y_onehot.zero_()
        y_onehot.scatter_(1, score, 1)
        kappa_l = cohen_kappa_score(score.view(batch_size).tolist(), out_score.view(batch_size).tolist())
        loss = criti(out, score.view(-1))
        print(f"Epoch {i} step {step} kappa {kappa_l} loss {loss}")
        loss.backward()
        optimizer.step()
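As a small aside, a version of that detaching step using detach() instead of .data (the more common idiom in recent PyTorch, and equivalent here) would be:

    # cut the hidden state out of the autograd graph between batches
    h = tuple(each.detach() for each in h)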
Do let me know whether the changes mentioned work or not.

Generator and Discriminator Loss in Image Colorization

I have been following image colorization with a GAN. I have tried to implement the research paper myself, but I ran into a problem: the generator and discriminator losses do not converge after a certain point, so I guess there is a problem in their implementation. Here is the code:
def generator_loss(self, fake_output_discri, generated_image_from_generator, actual_image, regularizer_lambda=0.01):
    mse = tf.reduce_mean(regularizer_lambda*tf.keras.losses.mean_absolute_error(generated_image_from_generator, actual_image))
    return tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.ones_like(fake_output_discri), logits=fake_output_discri) + mse

def discriminator_loss(self, generated_image_from_generator, actual_image):
    actual_loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.ones_like(actual_image), logits=actual_image)
    fake_loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.zeros_like(generated_image_from_generator), logits=generated_image_from_generator)
    return actual_loss + fake_loss
Assuming that the parameters passed to them are correct, am I doing anything wrong? I tried printing the generator and discriminator losses, and they remain constant after a certain number of epochs! Here is the train function that does most of the job:
def train(self, gray_scale_image_dataset, color_image_dataset, test_image):
    generator = self.generator_model()
    discriminator = self.discriminator_model()
    gen_optimizer = tf.train.AdamOptimizer(self.learning_rate)
    dis_optimizer = tf.train.AdamOptimizer(self.learning_rate)
    for eachEpoch in range(self.epochs):
        with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
            for i in range(20):
                random.shuffle(gray_scale_image_dataset)
                random.shuffle(color_image_dataset)
                gray_scale_dataset_image = gray_scale_image_dataset[:self.batch_size]
                color_dataset_image_batch = color_image_dataset[:self.batch_size]
                generated_image = generator(gray_scale_dataset_image)
                real_output = discriminator(color_dataset_image_batch)
                fake_output = discriminator(generated_image)
                gen_loss = self.generator_loss(fake_output, generated_image, color_dataset_image_batch)
                dis_loss = self.discriminator_loss(fake_output, real_output)
                print("generator = {} discriminator = {}".format(gen_loss, dis_loss))
                gen_gradients = gen_tape.gradient(gen_loss, generator.trainable_variables)
                disc_gradients = disc_tape.gradient(dis_loss, discriminator.trainable_variables)
                print("APPLYING GRADIENTS")
                gen_optimizer.apply_gradients(zip(gen_gradients, generator.trainable_variables))
                dis_optimizer.apply_gradients(zip(disc_gradients, discriminator.trainable_variables))
        print("EPOCHS COMPLETED = {} ".format(eachEpoch))
        # for drawing test_image
        self.draw_images(generator, test_image)
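For reference, a scalar-reduced sketch of the two loss functions (same idea as above, with each cross-entropy term wrapped in tf.reduce_mean so the returned losses are scalars; the discriminator arguments are renamed here to make clear they are discriminator logits rather than images):

def generator_loss(self, fake_output_discri, generated_image_from_generator, actual_image, regularizer_lambda=0.01):
    # L1 reconstruction term between the generated and the real image
    mae = tf.reduce_mean(tf.keras.losses.mean_absolute_error(generated_image_from_generator, actual_image))
    # adversarial term: the generator wants the discriminator to predict "real" (1) for fakes
    adv = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(
        labels=tf.ones_like(fake_output_discri), logits=fake_output_discri))
    return adv + regularizer_lambda * mae

def discriminator_loss(self, fake_output_discri, real_output_discri):
    # the discriminator wants "real" (1) on real images and "fake" (0) on generated ones
    real_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(
        labels=tf.ones_like(real_output_discri), logits=real_output_discri))
    fake_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(
        labels=tf.zeros_like(fake_output_discri), logits=fake_output_discri))
    return real_loss + fake_loss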
