Tensorflow: gradients suddenly became NaNs during training

Tensorflow: gradients suddenly became NaNs during training - python

I am training simple variational autoencoder with negative binomial likelihood for decoder. I used python 3.7.1 and tensorflow 2.0.0.
The model was trained well without any problems for tens of epochs, but all weights, loss, and gradients suddenly became NaN during training. I modified the code to find which variables become NaN first (among weights, loss, and gradients) and found that gradients first became nan and this affected other variables.
I have googled similar issues but most of case nan appeared in the loss, which is different from this case. I tried: smaller learning rate, clipping loss... but nothing could resolve the problem.
Here is the model class for autoencoder model:
class Encoder(tf.keras.layers.Layer):
def __init__(self, hidden_dim, latent_dim):
super(Encoder, self).__init__()
self.encoder_fc_1 = tf.keras.layers.Dense(hidden_dim, activation=tf.nn.leaky_relu)
self.encoder_fc_2 = tf.keras.layers.Dense(hidden_dim, activation=tf.nn.leaky_relu)
self.encoder_latent = tf.keras.layers.Dense(latent_dim + latent_dim)
def call(self, input):
h = tf.math.l2_normalize(input, 1)
h = self.encoder_fc_1(h)
h = self.encoder_fc_2(h)
return tf.split(self.encoder_latent(h), num_or_size_splits=2, axis=1)
class Decoder(tf.keras.layers.Layer):
def __init__(self, hidden_dim, vocab_size):
super(Decoder, self).__init__()
self.decoder_fc_1 = tf.keras.layers.Dense(hidden_dim, activation=tf.nn.leaky_relu)
self.decoder_fc_2 = tf.keras.layers.Dense(hidden_dim, activation=tf.nn.leaky_relu)
self.decoder_fc_3 = tf.keras.layers.Dense(vocab_size + vocab_size)
def call(self, z):
h = self.decoder_fc_1(z)
h = self.decoder_fc_2(h)
return tf.split(self.decoder_fc_3(h), num_or_size_splits=2, axis=1)
class NBVAE(tf.keras.Model):
def __init__(self, config):
super(NBVAE, self).__init__()
self.optimizer = tf.keras.optimizers.Adam(config["learning_rate"])
self.encoder = Encoder(config["hidden_dim"], config["latent_dim"])
self.decoder = Decoder(config["hidden_dim"], config["vocab_size"])
def call(self, input):
mean, logvar = self.encoder(input)
z = reparameterize(mean, logvar)
h_r, h_p = self.decoder(z)
return mean, logvar, z, h_r, h_p
def reparameterize(mean, logvar):
eps = tf.random.normal(shape=mean.shape)
return tf.add(tf.multiply(eps, tf.math.exp( tf.math.divide(logvar, 2))), mean)
def log_normal_pdf(sample, mean, logvar, raxis=1):
log2pi = tf.math.log(2. * np.pi)
return tf.reduce_sum(-.5 * ((sample - mean) ** 2. * tf.exp(-logvar) + logvar + log2pi), axis=raxis)
def compute_logpx_z(input, h_r, h_p):
temp = tf.exp(-tf.multiply(tf.exp(h_r), tf.math.log(tf.exp(h_p) + 1)))
temp_cliped = tf.clip_by_value(temp, 1e-5, 1 - 1e-5)
ll = tf.multiply(input, tf.math.log(1 - temp_cliped)) + tf.multiply(1 - input, tf.math.log(temp_cliped))
#print("logpx_z: {}".format(tf.reduce_sum(ll, axis=-1)))
return tf.reduce_sum(ll, axis=-1), temp
def compute_loss(model, input):
mean, logvar, z, h_r, h_p = model(input)
logpx_z, temp = compute_logpx_z(input, h_r, h_p)
logpz = log_normal_pdf(z, 0., 0.)
logqz_x = log_normal_pdf(z, mean, logvar)
return tf.negative(tf.reduce_mean(logpx_z + logpz - logqz_x)), temp
and here is the code snippet for training the model.
I put some if statements in the middle of the code to check which variable become NaN first.
print("start training...")
num_batches = int(np.ceil(len(training_data) / batch_size))
epoch_loss = []
for epoch in range(epochs):
print("epoch: {}".format(epoch+1))
progbar = tf.keras.utils.Progbar(num_batches)
loss_record = []
for i in range(num_batches):
x_batch = training_data[i*batch_size:(i+1)*batch_size]
x_batch = one_hot(x_batch, depth=len(concept2id))
with tf.GradientTape() as tape:
loss, temp = compute_loss(nbvae, x_batch)
print("step{s} loss: {l}".format(s=i, l=loss.numpy()))
# checking the loss
if np.isnan(loss.numpy()):
print("nan loss is detected")
detect_nan = True
break
loss_record.append(loss.numpy())
gradients = tape.gradient(loss, nbvae.trainable_variables)
#gradients, global_norm = tf.clip_by_global_norm(tape.gradient(loss, nbvae.trainable_variables), 10)
print("checking gradients...")
gradient_nancount = 0
for _, grad in enumerate(gradients):
gradient_nancount += np.sum(np.isnan(grad))
if gradient_nancount != 0:
print("nan is detected in gradients")
print("saving the current gradients and weights...")
save_data(os.path.join(output_path, "error_gradients.pkl"), gradients)
save_data(os.path.join(output_path, "error_tvariables.pkl"), nbvae.trainable_variables)
detect_nan = True
break
nbvae.optimizer.apply_gradients(zip(gradients, nbvae.trainable_variables))
print("checking the updated weights...")
weight_nancount = 0
for _, weight in enumerate(nbvae.weights):
weight_nancount += np.sum(np.isnan(weight))
if weight_nancount != 0:
print("nan is detected in weights")
print("saving the current gradients and weights...")
save_data(os.path.join(output_path, "error_gradients.pkl"), gradients)
save_data(os.path.join(output_path, "error_tvariables.pkl"), nbvae.trainable_variables)
detect_nan = True
break
progbar.add(1)
if detect_nan:
epoch_loss.append(np.nan)
nbvae.save_weights(os.path.join(output_path, "nbvae_error{}".format(epoch+1)))
break
print("average epoch loss: {}".format(np.mean(loss_record)))
epoch_loss.append(np.mean(loss_record))
Anyone knows the way to resolve this problem or possible reasons? Thank you for your time in advance.

Related

AttributeError: 'KMeans' object has no attribute 'labels_' pytorch

first of all I thank , I tried to train model with pytorch but I got the following error: AttributeError: 'KMeans' object has no attribute 'labels_'.I am trying to model a extract features point cloud using deep learning in pytorch. I get the following error . Could anyone help on this? ************** *************** Thanks!
# Training loop
def training_loop(gpu, training_dataloader, model, loss_fn, optimizer):
losses = []
correct = 0
batch_results = dict()
conf_mat = np.zeros((10,10))
for batch_n, batch in enumerate(training_dataloader): #batch[batch, pos, ptr, y]
batch_size = int(batch.batch.size()[0] / sample_points)
if dimensionality == 3:
# Input dim [:,3] for your geometry x,y,z
X = batch.pos.cuda(non_blocking=True).view(batch_size, sample_points, -1) + torch.normal(
torch.zeros(batch_size, sample_points, dimensionality), torch.full((batch_size, sample_points,
dimensionality), fill_value=0.1)).cuda(gpu)
else:
# Input dim [:,6] for your geometry x,y,z and normals nx,ny,nz
X = torch.cat((batch.pos.cuda(non_blocking=True), batch.normal.cuda(non_blocking=True)), 1).view(batch_size, sample_points, -1) + torch.normal(
torch.zeros(batch_size, sample_points, dimensionality), torch.full((batch_size, sample_points,
dimensionality), fill_value=0.1)).cuda(gpu)
y = batch.y.cuda(non_blocking=True).flatten() #size (batch_size) --> torch.Size([8])
# Compute predictions
pred = model(None, X) #size (batch_size,classes) --> torch.Size([8, 10])
if overall_classes_loss:
# weighted CE Loss over all classes
loss = loss_fn(pred, y)
else:
# weighted batchwise Loss
sample_count = np.array([[x, batch.y.tolist().count(x)] for x in batch.y])[:,1]
batch_weights = 1. / sample_count
batch_weights = torch.from_numpy(batch_weights)
batch_weights = batch_weights.double()
loss = element_weighted_loss(pred, batch.y, batch_weights, gpu)
correct += (pred.argmax(1) == y).type(torch.float).sum().item()
print(f"Loss: {loss}")
tensor_list_y = [torch.ones_like(y) for _ in range(dist.get_world_size())]
tensor_list_pred = [torch.ones_like(y) for _ in range(dist.get_world_size())]
torch.distributed.all_gather(tensor_list_y, y, group=None, async_op=False)
torch.distributed.all_gather(tensor_list_pred, pred.argmax(1), group=None, async_op=False)
tensor_list_y = torch.cat(tensor_list_y)
tensor_list_pred = torch.cat(tensor_list_pred)
# Confusion Matrix
conf_mat += confusion_matrix(tensor_list_y.cpu().detach().numpy(), tensor_list_pred.cpu().detach().numpy(), labels=np.arange(0,10))
# Backpropagation
optimizer.zero_grad()
loss.backward()
optimizer.step()
losses.append(loss.item())
# Save batch predictions
batch_results[batch_n] = {'true':tensor_list_y, 'pred':tensor_list_pred}
if verbosity == True:
print(f"\n\nTRAIN on GPU:{gpu}: True Label {y} - Prediction {pred.argmax(1)} - Loss {loss}")
truevalue = '\t\t'.join(classes[items] for items in y.tolist())
predvalues = '\t\t'.join(classes[items] for items in pred.argmax(1).tolist())
print(f"INFO on GPU:{gpu}: TRAIN - True Value\t {truevalue}")
print(f"INFO on GPU:{gpu}: TRAIN - Predictions\t {predvalues}")
if batch_n % 25 == 0:
torch.distributed.reduce(loss, 0)
return torch.tensor(losses, device=f"cuda:{gpu}"), torch.tensor(correct, device=f"cuda:{gpu}"), batch_results, conf_mat
# Test loop
def test_loop(gpu, test_dataloader, model, loss_fn):
test_losses = []
correct = 0
batch_results = dict()
conf_mat = np.zeros((10,10))
with torch.no_grad():
for batch_n, batch in enumerate(test_dataloader):
batch_size = int(batch.batch.size()[0] / sample_points)
if dimensionality == 3:
# Input dim [:,3] for your geometry x,y,z
X = batch.pos.cuda(non_blocking=True).view(batch_size, sample_points, -1)
else:
# Input dim [:,6] for your geometry x,y,z and normals nx,ny,nz
X = torch.cat((batch.pos.cuda(non_blocking=True), batch.normal.cuda(non_blocking=True)), 1).view(batch_size, sample_points, -1)
y = batch.y.cuda(non_blocking=True).flatten()
pred = model(None, X) #size (batch,classes) per batch_n
if overall_classes_loss:
# weighted CE Loss over all classes
loss = loss_fn(pred, y)
else:
# weighted batchwise Loss
sample_count = np.array([[x, batch.y.tolist().count(x)] for x in batch.y])[:,1]
batch_weights = 1. / sample_count
batch_weights = torch.from_numpy(batch_weights)
batch_weights = batch_weights.double()
loss = element_weighted_loss(pred, batch.y, batch_weights, gpu)
test_losses.append(loss.item())
correct += (pred.argmax(1) == y).type(torch.float).sum().item()
print(f"Loss: {loss}")
tensor_list_y = [torch.ones_like(y) for _ in range(dist.get_world_size())]
tensor_list_pred = [torch.ones_like(y) for _ in range(dist.get_world_size())]
torch.distributed.all_gather(tensor_list_y, y, group=None, async_op=False)
torch.distributed.all_gather(tensor_list_pred, pred.argmax(1), group=None, async_op=False)
tensor_list_y = torch.cat(tensor_list_y)
tensor_list_pred = torch.cat(tensor_list_pred)
# Confusion Matrix
conf_mat += confusion_matrix(tensor_list_y.cpu().detach().numpy(), tensor_list_pred.cpu().detach().numpy(), labels=np.arange(0,10))
# Save batch predictions
batch_results[batch_n] = {'true':tensor_list_y, 'pred':tensor_list_pred}
if verbosity == True:
print(f"\n\nTEST on GPU:{gpu}: True Label {y} - Prediction {pred.argmax(1)} - Loss {loss}")
truevalue = '\t\t'.join(classes[items] for items in y.tolist())
predvalues = '\t\t'.join(classes[items] for items in pred.argmax(1).tolist())
print(f"INFO on GPU:{gpu}: TEST - True Value\t {truevalue}")
print(f"INFO on GPU:{gpu}: TEST - Predictions\t {predvalues}")
test_loss = statistics.mean(test_losses)
return torch.tensor(correct, device=f"cuda:{gpu}"), torch.tensor(test_loss, device=f"cuda:{gpu}"), batch_results, conf_mat
def train_optimisation(gpu, gpus, training_dataloader, test_dataloader, model, loss_fn, optimizer, scheduler, dir_path, initial_epoch):
epoch_losses = []
training_accuracies = []
test_losses = []
test_accuracies = []
learning_rates = []
counter = 0 #early stopping counter
batchwise_results = dict()
# Learning Rate Scheduler
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=20)
for i in range(initial_epoch, initial_epoch + epochs):
if gpu == 0:
if initial_epoch > 0:
print(f"\n\nEpoch {i}\n-------------------------------")
else:
print(f"\n\nEpoch {i + 1}\n-------------------------------")
# TRAIN
losses, training_accuracy, train_batch_result, train_conf_mat = training_loop(gpu, training_dataloader, model, loss_fn, optimizer)
average_loss = torch.mean(losses)
torch.distributed.reduce(average_loss, 0, torch.distributed.ReduceOp.SUM)
torch.distributed.reduce(training_accuracy, 0, torch.distributed.ReduceOp.SUM)
# TEST
test_accuracy, test_loss, test_batch_result, test_conf_mat = test_loop(gpu, test_dataloader, model, loss_fn)
torch.distributed.reduce(test_accuracy, 0, torch.distributed.ReduceOp.SUM)
torch.distributed.reduce(test_loss, 0, torch.distributed.ReduceOp.SUM)
# save results
batchwise_results[i] = {'train':train_batch_result, 'test':test_batch_result}
if gpu == 0: # the following operations are performed only by the process running in the first gpu
average_loss = average_loss / torch.tensor(gpus, dtype=torch.float) # average loss among all gpus
test_accuracy = test_accuracy / torch.tensor(len(test_dataloader.dataset),
dtype=torch.float) * torch.tensor(100.0)
training_accuracy = training_accuracy / torch.tensor(len(training_dataloader.dataset),
dtype=torch.float) * torch.tensor(100.0)
test_loss = test_loss / torch.tensor(gpus, dtype=torch.float)
epoch_losses.append(average_loss.item())
training_accuracies.append(training_accuracy.item())
test_losses.append(test_loss.item())
test_accuracies.append(test_accuracy.item())
learning_rates.append((optimizer.param_groups[0])["lr"])
print(f"\nBatch size: {batch_size * int(gpus)}")
print(f"average Training Loss: {average_loss.item():.6f}")
print(f"average Test Loss: {test_loss.item():.6f}")
print(f"\naverage Training Acc: {training_accuracy.item():.6f}")
print(f"average Test Acc: {test_accuracy.item():.6f}")
printLearningRate(optimizer)
scheduler.step(test_loss)
# saving model checkpoint
save_checkpoint(model, optimizer, scheduler, i, epoch_losses, training_accuracies, test_losses, test_accuracies, learning_rates,
os.path.join(dir_path, f"epoch{i}.pth"), {key: value for key, value in batchwise_results[i].items() if key == 'train'}, {key: value for key, value in batchwise_results[i].items() if key == 'test'}, train_conf_mat, test_conf_mat)
#TODO: implement ONNX Export
# early stopping scheduler
if early_stopping(test_losses) == True:
counter += 1
print(f"Early Stopping counter: {counter} of {patience}")
else:
counter += 0
if counter < patience:
pass
else:
print("\n\nEarly Stopping activated")
print(f"Training stopped at Epoch{i + 1}")
dist.destroy_process_group()
exit()
def train(gpu, gpus, world_size):
torch.manual_seed(0)
torch.cuda.set_device(gpu)
try:
dist.init_process_group(backend='nccl', world_size=world_size, rank=gpu) #for distributed GPU training
except RuntimeError:
print("\n\nINFO:RuntimeError is raised >> Used gloo backend instead of nccl!\n")
dist.init_process_group(backend='gloo', world_size=world_size, rank=gpu) #as a fallback option
dir_path = None
if gpu == 0:
dir_path = "stackgraphConvPool3DPnet"
createdir(dir_path)
training_number = next_training_number(dir_path)
dir_path = os.path.join(dir_path, f"train{training_number}")
createdir(dir_path)
#save hyper-parameters in txt protocol file
save_hyperparameters(dir_path, 'hyperparameters.txt')
print("\nINFO: Protocol File saved successfully . . .")
model = Classifier(shrinkingLayers, mlpClassifier)
torch.cuda.set_device(gpu)
model.cuda(gpu)
#setting up optimizer
if optimizer_str == "SGD":
optimizer = torch.optim.SGD(model.parameters(), learning_rate, momentum=momentum, weight_decay=weight_decay)
elif optimizer_str == "RMSprop":
optimizer = torch.optim.RMSprop(model.parameters(), learning_rate, weight_decay=weight_decay)
else:
optimizer = torch.optim.Adam(model.parameters(), learning_rate, weight_decay=weight_decay)
# single-program multiple-data training paradigm (Distributed Data-Parallel Training)
model = DDP(model, device_ids=[gpu])
if dimensionality == 3:
training_data = ModelNet("ModelNet10_train_data", transform=lambda x: NormalizeScale()(SamplePoints(num=sample_points)(x)))
else:
training_data = ModelNet("ModelNet10_train_data", transform=lambda x: NormalizeScale()(NormalizeRotation()(SamplePoints(num=sample_points, remove_faces=True, include_normals=True)(x))))
training_sampler = DistributedWeightedSampler(training_data, num_replicas=world_size) #weight unbalanced classes by 1/cls_count
training_dataloader = DataLoader(dataset=training_data, batch_size=batch_size, shuffle=data_shuffle, num_workers=0,
pin_memory=True, sampler=training_sampler)
if dimensionality == 3:
test_data = ModelNet("ModelNet10_test_data", train=False, transform=lambda x: NormalizeScale()(SamplePoints(num=sample_points)(x)))
else:
test_data = ModelNet("ModelNet10_test_data", train=False, transform=lambda x: NormalizeScale()(NormalizeRotation()(SamplePoints(num=sample_points, remove_faces=True, include_normals=True)(x))))
test_sampler = DistributedWeightedSampler(test_data, num_replicas=world_size) #weight unbalanced classes by 1/cls_count
test_dataloader = DataLoader(dataset=test_data, batch_size=batch_size, shuffle=data_shuffle, num_workers=0,
pin_memory=True, sampler=test_sampler)
# weighted CE Loss over all Classes C
class_sample_count = np.array([len(np.where(training_data.data.y == t)[0]) for t in np.unique(training_data.data.y)])
weight = 1. / class_sample_count
weight = torch.from_numpy(weight)
weight = weight.float()
loss_fn = nn.CrossEntropyLoss(weight=weight).cuda(gpu)
# continue training from certain checkpoint
continue_from_scratch = True if args.resume is None else False
if continue_from_scratch:
if gpu == 0:
print("\nINFO: Train from scratch has started . . .")
train_optimisation(gpu, gpus, training_dataloader, test_dataloader, model, loss_fn, optimizer, None, dir_path, 0)
else:
checkpoint_path = "stackgraphConvPool3DPnet/" + args.resume
if gpu == 0:
print(f"\nINFO: Train has started from certain checkpoint {checkpoint_path.split('/')[2].split('.')[0]} in {checkpoint_path.split('/')[1]} . . .")
model.load_state_dict(torch.load(checkpoint_path)['model_state_dict'], strict=False)
optimizer.load_state_dict(torch.load(checkpoint_path)['optimizer_state_dict'])
final_epoch = (torch.load("stackgraphConvPool3DPnet/" + args.resume)['epoch'])+1
train_optimisation(gpu, gpus, training_dataloader, test_dataloader, model, loss_fn, optimizer, None, dir_path, final_epoch)
code tools:
class KMeansInitMostDistantFromMean:
def __call__(self, *args, **kwargs):
X, k = args
mean = np.mean(X, axis=0)
arg_sorted = np.argsort(np.apply_along_axis(lambda y: euclidean(mean, y), 1, X))
output = X[np.flip(arg_sorted)[:k]]
return output
class KMeansInit:
def __call__(self, *args, **kwargs):
X, k = args
current_centroids = np.expand_dims(np.mean(X, axis=0), 0)
for i in range(k - 1):
X, current_centroids = self.next_centroid(X, current_centroids)
return current_centroids
def next_centroid(self, X, curr_centroids):
highest_dist = 0.0
next_centroid = None
next_centroid_index = None
for i, x in enumerate(X):
max_dist = np.amax(np.apply_along_axis(lambda y: euclidean(x, y), 1, curr_centroids))
if max_dist > highest_dist:
next_centroid = x
highest_dist = max_dist
next_centroid_index = i
return np.delete(X, next_centroid_index, 0), np.append(curr_centroids, np.expand_dims(next_centroid, 0), 0)
class Conv(gnn.MessagePassing):
def __init__(self, sigma: nn.Module, F: nn.Module, W: nn.Module, M: nn.Module, C: int, P: int):
super().__init__(aggr="mean")
self.sigma = sigma
self.F = F
self.W = W
self.M = M
self.C = C
self.P = P
self.B = torch.randn(C+P, requires_grad=True)
def forward(self, feature_matrix, edge_index):
return self.propagate(edge_index, feature_matrix=feature_matrix)
def message(self, feature_matrix_i, feature_matrix_j):
message = self.F(feature_matrix_j - feature_matrix_i)
message = message.view(-1, self.C + self.P, self.C)
feature_matrix_i_ = feature_matrix_i.unsqueeze(2)
output = torch.bmm(message, feature_matrix_i_).squeeze()
return output
def update(self, aggr_out, feature_matrix):
Weight = self.M(aggr_out)
aggr_out = aggr_out * Weight
transform = self.W(feature_matrix)
transform = transform.view(-1, self.C + self.P, self.C)
feature_matrix = feature_matrix.unsqueeze(2)
transformation = torch.bmm(transform, feature_matrix).squeeze()
aggr_out = aggr_out + transformation
output = aggr_out + self.B
output = self.sigma(output)
return output
class Aggregation(nn.Module):
def __init__(self, mlp1: nn.Module, mlp2: nn.Module):
super().__init__()
self.mlp1 = mlp1
self.mlp2 = mlp2
self.softmax = nn.Softmax(0)
def forward(self, feature_matrix_batch: torch.Tensor, conv_feature_matrix_batch: torch.Tensor):
N, I, D = feature_matrix_batch.size()
N_, I_, D_ = conv_feature_matrix_batch.size()
augmentation = D_ - D
if augmentation > 0:
feature_matrix_batch = F.pad(feature_matrix_batch, (0, augmentation))
S1 = torch.mean(feature_matrix_batch, 1)
S2 = torch.mean(conv_feature_matrix_batch, 1)
Z1 = self.mlp1(S1)
Z2 = self.mlp2(S2)
M = self.softmax(torch.stack((Z1, Z2), 0))
M1 = M[0]
M2 = M[1]
M1 = M1.unsqueeze(1).expand(-1, I, -1)
M2 = M2.unsqueeze(1).expand(-1, I, -1)
output = (M1 * feature_matrix_batch) + (M2 * conv_feature_matrix_batch)
return output
class MaxPool(nn.Module):
def __init__(self, k: int):
super().__init__()
self.k = k
def forward(self, feature_matrix_batch: torch.Tensor, cluster_index: torch.Tensor):
N, I, D = feature_matrix_batch.size()
feature_matrix_batch = feature_matrix_batch.view(-1, D)
output = scatter_max(feature_matrix_batch, cluster_index, dim=0)[0]
output = output.view(N, self.k, -1)
return output
class GraphConvPool3DPnet(nn.Module):
def __init__(self, shrinkingLayers: [ShrinkingUnit], mlp: nn.Module):
super().__init__()
self.neuralNet = nn.Sequential(*shrinkingLayers, mlp)
def forward(self, x: torch.Tensor, pos: torch.Tensor):
feature_matrix_batch = torch.cat((pos, x), 2) if x is not None else pos
return self.neuralNet(feature_matrix_batch)
class ShrinkingUnitStack(nn.Module):
def __init__(self, input_stack: int, stack_fork: int, mlp: nn.Module, learning_rate: int, k: int, kmeansInit, n_init, sigma: nn.Module, F: nn.Module, W: nn.Module,
M: nn.Module, C, P, mlp1: nn.Module, mlp2: nn.Module):
super().__init__()
self.stack_fork = stack_fork
stack_size = input_stack * stack_fork
self.selfCorrStack = SelfCorrelationStack(stack_size, mlp, learning_rate)
self.kmeansConvStack = KMeansConvStack(stack_size, k, kmeansInit, n_init, sigma, F, W, M, C, P)
self.localAdaptFeaAggreStack = AggregationStack(stack_size, mlp1, mlp2)
self.graphMaxPoolStack = MaxPoolStack(stack_size, k)
def forward(self, feature_matrix_batch):
feature_matrix_batch = torch.repeat_interleave(feature_matrix_batch, self.stack_fork, dim=0)
feature_matrix_batch = self.selfCorrStack(feature_matrix_batch)
feature_matrix_batch_, conv_feature_matrix_batch, cluster_index = self.kmeansConvStack(feature_matrix_batch)
feature_matrix_batch = self.localAdaptFeaAggreStack(feature_matrix_batch, conv_feature_matrix_batch)
output = self.graphMaxPoolStack(feature_matrix_batch, cluster_index)
return output
class SelfCorrelationStack(nn.Module):
def __init__(self, stack_size: int, mlp: nn.Module, learning_rate: int = 1.0):
super().__init__()
self.selfCorrelationStack = nn.ModuleList([SelfCorrelation(copy.deepcopy(mlp), learning_rate) for i in range(stack_size)])
self.apply(init_weights)
def forward(self, feature_matrix_batch: torch.Tensor):
# feature_matrix_batch size = (S,N,I,D) where S=stack_size, N=batch number, I=members, D=member dimensionality
output = selfCorrThreader(self.selfCorrelationStack, feature_matrix_batch)
# output size = (S,N,I,D) where where S=stack_size, N=batch number, I=members, D=member dimensionality
return output
class KMeansConvStack(nn.Module):
def __init__(self, stack_size: int, k: int, kmeansInit, n_init: int, sigma: nn.Module, F: nn.Module, W: nn.Module,
M: nn.Module, C: int, P: int):
super().__init__()
self.kmeansConvStack = nn.ModuleList([
KMeansConv(k, kmeansInit, n_init, copy.deepcopy(sigma), copy.deepcopy(F), copy.deepcopy(W),
copy.deepcopy(M), C, P) for i in range(stack_size)])
self.apply(init_weights)
def forward(self, feature_matrix_batch: torch.Tensor):
# feature_matrix_batch size = (S,N,I,D) where S=stack size, N=batch number, I=members, D=member dimensionality
feature_matrix_batch, conv_feature_matrix_batch, cluster_index = kmeansConvThreader(self.kmeansConvStack,
feature_matrix_batch)
return feature_matrix_batch, conv_feature_matrix_batch, cluster_index
class AggregationStack(nn.Module):
def __init__(self, stack_size: int, mlp1: nn.Module, mlp2: nn.Module):
super().__init__()
self.localAdaptFeatAggreStack = nn.ModuleList([Aggregation(copy.deepcopy(mlp1), copy.deepcopy(mlp2)) for i
in range(stack_size)])
self.apply(init_weights)
def forward(self, feature_matrix_batch: torch.Tensor, conv_feature_matrix_batch: torch.Tensor):
output = threader(self.localAdaptFeatAggreStack, feature_matrix_batch, conv_feature_matrix_batch)
return output
class MaxPoolStack(nn.Module):
def __init__(self, stack_size: int, k: int):
super().__init__()
self.graphMaxPoolStack = nn.ModuleList([MaxPool(k) for i in range(stack_size)])
self.apply(init_weights)
def forward(self, feature_matrix_batch: torch.Tensor, cluster_index: torch.Tensor):
output = threader(self.graphMaxPoolStack, feature_matrix_batch, cluster_index)
return output
def selfCorrThreader(modules, input_tensor):
list_append = []
threads = []
for i, t in enumerate(input_tensor):
threads.append(Thread(target=selfCorrAppender, args=(modules[i], t, list_append, i)))
[t.start() for t in threads]
[t.join() for t in threads]
list_append.sort()
list_append = list(map(lambda x: x[1], list_append))
return torch.stack(list_append)
def selfCorrAppender(module, tensor, list_append, index):
list_append.append((index, module(tensor)))
def kmeansConvThreader(modules, input_tensor):
list1_append = []
list2_append = []
list3_append = []
threads = []
for i, t in enumerate(input_tensor):
threads.append(
Thread(target=kmeansAppender, args=(modules[i], t, list1_append, list2_append, list3_append, i)))
[t.start() for t in threads]
[t.join() for t in threads]
list1_append.sort()
list2_append.sort()
list3_append.sort()
list1_append = list(map(lambda x: x[1], list1_append))
list2_append = list(map(lambda x: x[1], list2_append))
list3_append = list(map(lambda x: x[1], list3_append))
return torch.stack(list1_append), torch.stack(list2_append), torch.stack(list3_append)
def kmeansAppender(module, input, list1_append, list2_append, list3_append, index):
x, y, z = module(input)
list1_append.append((index, x))
list2_append.append((index, y))
list3_append.append((index, z))
def threader(modules, input_tensor1, input_tensor2):
list_append = []
threads = []
for i, t in enumerate(input_tensor1):
threads.append(Thread(target=threaderAppender, args=(modules[i], t, input_tensor2[i], list_append, i)))
[t.start() for t in threads]
[t.join() for t in threads]
list_append.sort()
list_append = list(map(lambda x: x[1], list_append))
return torch.stack(list_append)
def threaderAppender(module, t1, t2, list_append, index):
list_append.append((index, module(t1, t2)))
class Classifier(nn.Module):
def __init__(self, shrinkingLayersStack: [ShrinkingUnitStack], mlp: nn.Module):
super().__init__()
self.neuralNet = nn.Sequential(*shrinkingLayersStack)
self.mlp = mlp
def forward(self, x: torch.Tensor, pos: torch.Tensor):
feature_matrix_batch = pos.unsqueeze(0)
output = self.neuralNet(feature_matrix_batch)
output = torch.mean(output, dim=0)
return self.mlp(output)
Error:
thank you for your help

The attribute labels_ of a KMeans object is created once you actually compute the clusters by running .fit() (or .fit_predict(), or .fit_transform()).
Simple example:
>>> from sklearn.cluster import KMeans
>>> from numpy.random import random
>>> X = random((10,2))
>>> X
array([[0.2096706 , 0.69704806],
[0.31732618, 0.29607599],
[0.10372159, 0.56911046],
[0.30922255, 0.07952464],
[0.21190404, 0.46823665],
[0.67134948, 0.95702692],
[0.14781526, 0.24619197],
[0.89931979, 0.96301003],
[0.88256126, 0.07569739],
[0.70776912, 0.92997521]])
>>> clustering = KMeans(n_clusters=3)
>>> clustering.labels_
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
AttributeError: 'KMeans' object has no attribute 'labels_'
>>> clustering.fit(X)
KMeans(n_clusters=3)
>>> clustering.labels_
array([0, 0, 0, 0, 0, 1, 0, 1, 2, 1], dtype=int32)

It's a pytorch question. Backward() is executed only the first time or two and no further progress is made

enter image description here
So we've translated this image into a code.
class pre_h(nn.Module):
def __init__(self, vocab_size, hidden_size, window_size):
super().__init__()
self.window_size = window_size
self.in_layer = nn.Embedding(vocab_size, hidden_size)
self.weight_list = []
def forward(self, contexts):
for i in range(2 * self.window_size):
weight = self.in_layer(contexts[:,i])
self.weight_list.append(weight)
h = sum(self.weight_list)
h = h / len(self.weight_list)
return h
class UnigramSampler:
def __init__(self, corpus, power, sample_size):
super().__init__()
self.sample_size = sample_size
self.vocab_size = None
self.word_p = None
counts = collections.Counter()
for word_id in corpus:
counts[word_id] += 1
vocab_size = len(counts)
self.vocab_size = vocab_size
self.word_p = np.zeros(vocab_size)
for i in range(vocab_size):
self.word_p[i] = counts[i]
self.word_p = np.power(self.word_p, power)
self.word_p /= np.sum(self.word_p)
def get_negative_sample(self, target):
batch_size = target.shape[0]
negative_sample = np.random.choice(self.vocab_size, size=(batch_size, self.sample_size), replace=True, p=self.word_p)
negative_sample = torch.tensor(negative_sample)
return negative_sample
class NegativeSampling(nn.Module):
def __init__(self, vocab_size, hidden_size, corpus, power = 0.75, sample_size = 5):
super().__init__()
self.sample_size = sample_size
self.sampler = UnigramSampler(corpus, power, sample_size)
self.embedding = nn.Embedding(vocab_size, hidden_size)
self.sigmoid = nn.Sigmoid()
self.loss_layer = nn.CrossEntropyLoss()
self.hidden_size = hidden_size
def forward(self, h, target):
loss_data = 0
w = self.embedding(target)
w = torch.squeeze(w).reshape(self.hidden_size, -1)
loss = h # w
loss = self.sigmoid(loss)
correct_label = torch.ones_like(loss)
loss = self.loss_layer(loss, correct_label)
loss_data += loss
negative_target = self.sampler.get_negative_sample(target)
for i in range(self.sample_size):
w = self.embedding(negative_target[:,i])
w = torch.squeeze(w).reshape(self.hidden_size, -1)
loss = h # w
loss = self.sigmoid(loss)
negative_label = torch.zeros_like(loss)
loss = self.loss_layer(loss, negative_label)
return loss_data
class sample_cbow(nn.Module):
def __init__(self, vocab_size, hidden_size, corpus, window_size, power = 0.75, sample_size = 5):
super().__init__()
self.model_1 = pre_h(vocab_size, hidden_size, window_size)
self.model_2 = NegativeSampling(vocab_size, hidden_size, corpus, power, sample_size)
def forward(self, contexts, target):
out = self.model_1(contexts)
out = self.model_2(out, target)
return out
vocab_size = len(id_to_word)
model = sample_cbow(vocab_size, 100, corpus, 1)
learning_rate = 0.001
optimizer = torch.optim.Adam(model.parameters(), lr = learning_rate) # 아담으로 옵티마이저 설정
i = 1
for contexts, target in dataloader:
print('===================================')
print(i,'experiment')
contexts.to(device)
target.to(device)
model.to(device)
output = model(contexts, target)
optimizer.zero_grad()
output.backward()
optimizer.step()
Here is the dataset I used. I'm leaving this just in case.
class Corpus_Dataset(Dataset):
def __init__(self, corpus, window_size = 1):
contexts, target = create_contexts_target(corpus, window_size)
self.contexts = contexts
self.target = target
def __len__(self):
return len(self.contexts)
def __getitem__(self, idx):
contexts = torch.tensor(self.contexts[idx], dtype = torch.long)
target = torch.tensor(self.target[idx], dtype = torch.long)
return contexts, target
dataset = Corpus_Dataset(corpus, window_size = 2)
dataloader = DataLoader(dataset, batch_size = 100)
RuntimeError: Trying to backward through the graph a second time (or directly access saved tensors after they have already been freed). Saved intermediate values of the graph are freed when you call .backward() or autograd.grad(). Specify retain_graph=True if you need to backward through the graph a second time or if you need to access saved tensors after calling backward
We know that with backward(), the graph is no longer formed.
But I've only used one backward(), and I get this error even though I didn't recycle it again.
If the backward() of the model was not executed from the beginning, it would have been suspected as a problem with the model, but only the first or second time the backward() runs normally and does not proceed any further.
Clone() or detach() were used for more diverse tensors, but this error was not resolved.
I searched and thought about what the problem was, but I can't think of a solution anymore.
I'm sorry for my poor English.

Pytorch Deep Learning model for noisy function approximation

I have a dataset as shown in this picture: Dataset
I want to to approximate a function to fit the data and make predictions on my test dataset, I have tried to use a Neural Network as shown, but I get huge training loss, what should I change ? Is it just a problem of layers and neurons ?
class NeuralNet(nn.Module):
def __init__(self,
input_dimension,
output_dimension,
n_hidden_layers,
neurons,
regularization_param,
regularization_exp,
retrain_seed,
activation_name):
super(NeuralNet, self).__init__()
# Number of input dimensions n
self.input_dimension = input_dimension
# Number of output dimensions m
self.output_dimension = output_dimension
# Number of neurons per layer
self.neurons = neurons
# Number of hidden layers
self.n_hidden_layers = n_hidden_layers
# Activation function
self.activation_name = activation_name
self.activation = self.get_activation(activation_name)
# Regularization parameter
self.regularization_param = regularization_param
# Regularization exponent
self.regularization_exp = regularization_exp
# Random seed for weight initialization
self.retrain_seed = retrain_seed
if self.n_hidden_layers != 0:
self.input_layer = nn.Linear(self.input_dimension, self.neurons)
self.hidden_layers = nn.ModuleList([nn.Linear(self.neurons, self.neurons) for _ in range(n_hidden_layers - 1)])
self.output_layer = nn.Linear(self.neurons, self.output_dimension)
else:
print("Simple linear regression")
self.linear_regression_layer = nn.Linear(self.input_dimension, self.output_dimension)
self.init_xavier()
def init_xavier(self):
torch.manual_seed(self.retrain_seed)
def init_weights(m):
if type(m) == nn.Linear and m.weight.requires_grad and m.bias.requires_grad:
if self.activation_name in ['tanh', 'relu']:
gain = nn.init.calculate_gain(self.activation_name)
else:
gain = 1
torch.nn.init.xavier_uniform_(m.weight, gain=gain)
m.bias.data.fill_(0)
self.apply(init_weights)
def regularization(self):
reg_loss = 0
for name, param in self.named_parameters():
if 'weight' in name:
reg_loss = reg_loss + torch.norm(param, self.regularization_exp)
return reg_loss
def get_activation(self, activation_name):
if activation_name in ['tanh']:
return nn.Tanh()
elif activation_name in ['relu']:
return nn.ReLU(inplace=True)
elif activation_name in ['lrelu']:
return nn.LeakyReLU(inplace=True)
elif activation_name in ['sigmoid']:
return nn.Sigmoid()
elif activation_name in ['softplus']:
return nn.Softplus(beta=4)
elif activation_name in ['celu']:
return nn.CELU()
else:
raise ValueError('Unknown activation function')
def forward(self, x):
# The forward function performs the set of affine and non-linear transformations defining the network
# (see equation above)
if self.n_hidden_layers != 0:
x = self.activation(self.input_layer(x))
for k, l in enumerate(self.hidden_layers):
x = self.activation(l(x))
return self.output_layer(x)
else:
return self.linear_regression_layer(x)
my_network = NeuralNet(input_dimension=train1.shape[1], output_dimension=test1.shape[1], n_hidden_layers=8, neurons=500,regularization_param= 1e-4,
regularization_exp=2,
retrain_seed=134,
activation_name="tanh")
my_network.double()
optimizer_ = optim.Adam(my_network.parameters(), lr=0.0001, weight_decay=0)
def fit(model, training_set, x_validation_, y_validation_, num_epochs, optimizer, p, verbose=True):
history = [[], []]
regularization_param = model.regularization_param
# Loop over epochs
for epoch in range(num_epochs):
if verbose: print("################################ ", epoch, " ################################")
running_loss = list([0])
# Loop over batches
for j, (x_train_, u_train_) in enumerate(training_set):
def closure():
# zero the parameter gradients
optimizer.zero_grad()
# forward + backward + optimize
u_pred_ = model(x_train_)
loss_u = torch.mean((u_pred_ - u_train_) ** p)
loss_reg = model.regularization()
loss = loss_u + regularization_param * loss_reg
loss.backward()
# Compute average training loss over batches for the current epoch
running_loss[0] += loss.item() / len(training_set)
return loss
optimizer.step(closure=closure)
y_validation_pred_ = model(x_validation_)
validation_loss = torch.mean((y_validation_pred_.reshape(-1, ) - y_validation_.reshape(-1, )) ** p).item()
history[0].append(running_loss[0])
history[1].append(validation_loss)
if verbose:
print('Training Loss: ', np.round(running_loss[0], 8))
print('Validation Loss: ', np.round(validation_loss, 8))
print('Final Training Loss: ', np.round(history[0][-1], 8))
print('Final Validation Loss: ', np.round(history[1][-1], 8))
return history
n_epochs = 2000
history = fit(my_network, training_set1, x_val1, y_val1, n_epochs, optimizer_, p=2, verbose=False )
Sorry for the long code, I thought it was important to put it all to give you all better information on how to help me.

VAE: mat1 and mat2 shapes cannot be multiplied (90x12800 and 90x360)

I have CSI data with the following shape (5520, 90, 200) being samples, the stacked antennas and subcarriers and time frame. I am trying to build a variational autoencoder however I keep getting stuck on the training part as it says that the matrixes I am inputting cannot be multiplied.
class Encoder(nn.Module):
def __init__(self, z_dim):
super(Encoder, self).__init__()
x_dim = 90
# Layer 1
self.encoder = nn.Sequential()
self.encoder.add_module('fc1', nn.Linear(x_dim, 360))
self.encoder.add_module('Sigmoid1', nn.Sigmoid())
self.encoder.add_module('BN1', nn.BatchNorm1d(360))
self.encoder.add_module('DO1', nn.Dropout())
# Layer 2
self.encoder.add_module('fc2', nn.Linear(360, 50))
self.encoder.add_module('Sigmoid2', nn.Sigmoid())
self.encoder.add_module('BN2', nn.BatchNorm1d(50))
self.encoder.add_module('DO2', nn.Dropout())
# Latent
self.k_mu = nn.Linear(50, z_dim)
self.k_sigma = nn.Linear(50, z_dim)
def reparameterize(self, mean, logvar):
eps = np.random.normal(shape=mean.shape)
return eps * tf.exp(logvar * .5) + mean
def forward(self, x):
h = self.encoder(x)
k_mu = self.k_mu(h)
k_sigma = self.k_sigma(h)
z_dim = reparameterize(k_mu, log(k_sigma))
return z_dim
class Decoder(nn.Module):
def __init__(self, z_dim):
super(Decoder, self).__init__()
x_dim = 90
#1
self.decoder = nn.Sequential()
self.decoder.add_module('fc3', nn.Linear(z_dim, 50))
self.decoder.add_module('Sigmoid3', nn.Sigmoid())
self.decoder.add_module('BN3', nn.BatchNorm1d(50))
self.decoder.add_module('DO3', nn.Dropout())
#2
self.decoder.add_module('fc4', nn.Linear(50, 360))
self.decoder.add_module('Sigmoid4', nn.Sigmoid())
self.decoder.add_module('BN4', nn.BatchNorm1d(360))
self.decoder.add_module('DO4', nn.Dropout())
self.decoder.add_module('x_recon', nn.Linear(360, x_dim))
def forward(self, z):
x_ = self.decoder(z)
return x_
class VAE(nn.Module):
def __init__(self, z_dim):
super(VAE, self).__init__()
self.encoder = Encoder(z_dim)
self.decoder = Decoder(z_dim)
def sample(self, z_mu, z_log_var):
z_std = torch.exp(z_log_var)
eps = torch.randn_like(z_std)
z = z_mu + z_std*eps
return z
def forward(self, x):
# Encode
z_mu, z_log_var = self.encoder(x)
# Sample
if self.training:
# Sample if we are training
z = self.sample(z_mu, z_log_var)
else:
z = z_mu
# Decode
x_ = self.decoder(z)
return x_, z_mu, z_log_var
vae = VAE(z_dim=10).cuda()
optimizer = optim.Adam(vae.parameters())
epochs = 20 #Number of epochs
# Loop over epochs
for epoch in range(1, epochs + 1):
vae.train()
train_loss = 0
# Loop over batches. Note that the labels are ignored in this case
for batch_idx, (data) in enumerate(train_loader):
data = data.cuda()
optimizer.zero_grad()
data = data.view(data.size(1), -1)
x_recon, z_mu, z_log_var = vae(data)
# Calculate the loss
KL_loss, E_loss = loss_function(x_recon, data, z_mu, z_log_var)
loss = KL_loss + E_loss
loss.backward()
# Accumulate the loss to calculate the average
train_loss += loss.item()
# Step the optimizer
optimizer.step()
if batch_idx % 100 == 0:
print('Epoch: {} [{}/{} ({:.0f}%)]\tKL Loss: {:.6f}\tBC Loss: {:.6f}\tLoss: {:.6f}'.format(
epoch, batch_idx * len(data), len(train_loader.dataset),
100. * batch_idx / len(train_loader), KL_loss.item() / len(data), E_loss.item() / len(data), loss.item() / len(data)))
print('====> Epoch: {} Average loss: {:.4f}'.format(epoch, train_loss / len(train_loader.dataset)))
And I get the following error: mat1 and mat2 shapes cannot be multiplied (90x12800 and 90x360)
I cannot figure out why I am getting it.

Why gradient check gives high difference (almost 1)?

I'm trying to implement a Neural Net in python without the use of libraries like Keras or Tensorflow. I still have to test the net, right now I just tried to train it on Iris dataset and check afterwards the correctness of the backpropagation algorithm.
To do so, I wrote the gradient checking procedure, calculating the analytical gradients and comparing them with the gradients from backpropagation.
The point is that, even if the backpropagation algorithm seems correct to me, the difference between the gradients is always high (around 0.8, instead of the classic 1e-7).
Layer class
class Dense(Layer):
def __init__(self, input_shape, name=None, activation='relu', regularization='l2'):
self.name = name
self.is_output = False
self.weights = np.random.uniform(low=0.01, high=0.10, size=input_shape)
self.biases = np.ones((1,input_shape[1]))
if activation == 'sigmoid':
self.activation = Activation_Sigmoid()
else: #activation == 'relu':
self.activation = Activation_ReLU()
self.cost = Categorical_CrossEntropyLoss()
def set_as_output(self, is_output=True):
self.is_output = is_output
def forward(self, inputs, debug=False, epsilon=None):
self.net_input = inputs
if debug:
augmented_parameters = np.zeros(epsilon.shape)
weights_column_vector = np.reshape(self.weights,(-1,1))
biases_column_vector = np.reshape(self.biases,(-1,1))
concatenated_parameters = np.concatenate((weights_column_vector, biases_column_vector))
for i in range(concatenated_parameters.shape[0]):
augmented_parameters[i] = concatenated_parameters[i]
# make the augmented parameter long as theta in order to sum them
# this because epsilon is a standard basis vector
augmented_parameters += epsilon
# rebuild the weights matrix and biases vector to apply forward propagation
weights_end = self.weights.shape[0] * self.weights.shape[1]
biases_end = self.biases.shape[0] * self.biases.shape[1] + weights_end
weights = np.reshape(augmented_parameters[0:weights_end],self.weights.shape)
biases = np.reshape(augmented_parameters[weights_end:biases_end], self.biases.shape)
output = np.dot(inputs, weights) + biases
activated_output = self.activation.forward(output)
return activated_output
self.output = np.dot(inputs, self.weights) + self.biases
self.activated_output = self.activation.forward(self.output)
return self.activated_output
def backward(self, X, y, output, step, l2=0.5): #backpropagation
m = X.shape[0] # number of examples
if self.is_output:
error = self.cost.backward(output, y) #(a_k - y_hat_k)
delta_k = self.activation.backward(self.output)* error
# net input for neuron k is a_j^(l-1)
grad = np.dot(self.net_input.T, delta_k)
#update weights with l2 regularization
self.grad_w = grad + (l2 / m)*self.weights
self.grad_b = np.sum(delta_k * 1,axis=0)
self.weights -= step * self.grad_w
self.biases -= step * self.grad_b
return np.dot(delta_k ,self.weights.T)
else:
delta_j = self.activation.backward(self.output) * output
grad = np.dot(self.net_input.T, delta_j)
self.grad_w = grad + (l2 / m) * self.weights
self.grad_b = np.sum(delta_j * 1, axis=0)
self.weights -= step * self.grad_w
self.biases -= step * self.grad_b
return np.dot(delta_j, self.weights.T)
def get_parameters(self):
return self.weights, self.biases
def get_gradients(self):
return self.grad_w, self.grad_b
Neural Net class
class NeuralNet():
def __init__(self):
self.layers = []
self.layers_output = []
self.cost = None
self.regularization = L2_Regularization()
def add(self,layer):
self.layers.append(layer)
def forward(self, inputs, debug=False, epsilon=None):
input = np.copy(inputs)
for layer in self.layers:
output = layer.forward(input, debug=debug, epsilon=epsilon)
input = output
return input
def backward(self, X, y, output, step):
prev_delta = None
out = output
for layer in self.layers[::-1]:
prev_delta = layer.backward(X, y, out, step)
out = prev_delta
def fit(self, X, y, batch_size=1, epochs=10, step=0.05, shuffle=True):
self.layers[-1].set_as_output()
self.error = []
i = 0.005 * epochs
for epoch in range(epochs):
if shuffle:
X = np.random.permutation(X)
batches = int(np.ceil(X.shape[0]/batch_size))
batches_error = []
for t in range(batches):
batch_X = X[t*batch_size:np.min([X.shape[0],(t+1)*batch_size]),:]
batch_y = y[t*batch_size:np.min([y.shape[0],(t+1)*batch_size]),:]
output = self.forward(batch_X)
cost = self.cost.forward(output,batch_y)
cost += self.regularization.forward(X, self.layers)
batches_error.append(cost)
self.backward(batch_X, batch_y, output, step)
self.error.append(np.mean(batches_error))
if epoch % i == 0:
print('epoch:', epoch, 'error:', np.mean(self.error))
return self
def parameters_to_theta(self):
theta = []
for layer in self.layers:
w, b = layer.get_parameters()
#flatten parameter w
new_vector = np.reshape(w, (-1,1))
theta.append(new_vector)
#flatten parameter b
new_vector = np.reshape(b, (-1,1))
theta.append(new_vector)
return np.vstack(theta)
def gradients_to_theta(self):
theta = []
for layer in self.layers:
grad_w, grad_b = layer.get_gradients()
new_vector = np.reshape(grad_w, (-1,1))
theta.append(new_vector)
new_vector = np.reshape(grad_b, (-1,1))
theta.append(new_vector)
return np.vstack(theta)
def gradient_check(self, X, y, epsilon=1e-7):
theta = self.parameters_to_theta()
dtheta = self.gradients_to_theta()
num_parameters = theta.shape[0]
J_plus = np.zeros((num_parameters, 1))
J_minus = np.zeros((num_parameters, 1))
dtheta_approx = np.zeros((num_parameters, 1))
for i in range(num_parameters):
theta_plus = np.zeros((num_parameters,1))
theta_plus[i] = epsilon
J_plus[i] = self.cost.forward(self.forward(X, debug=True, epsilon=theta_plus),y)
theta_minus = np.zeros((num_parameters,1))
theta_minus[i] = - epsilon
J_minus[i] = self.cost.forward(self.forward(X, debug=True, epsilon=theta_minus),y)
dtheta_approx[i] = (J_plus[i] - J_minus[i])/ (2 * epsilon)
numerator = np.linalg.norm(dtheta - dtheta_approx)
denominator = np.linalg.norm(dtheta_approx) + np.linalg.norm(dtheta)
difference = numerator / denominator
return difference
I'm using ReLU and Sigmoid as activation functions, and Categorical Cross Entropy for the cost
import numpy as np
from scipy.special import expit as sigmoid
class Activation_ReLU:
def forward(self, inputs):
return np.maximum(0, inputs)
def backward(self, inputs):
return np.greater(inputs,0).astype(int)
class Activation_Sigmoid:
def forward(self, inputs):
return sigmoid(inputs)
def backward(self, inputs):
return sigmoid(inputs) * (1 - sigmoid(inputs))
class Categorical_CrossEntropyLoss():
def forward(self, y_pred, y_real):
predictions = np.copy(y_pred)
predictions = np.clip(predictions, 1e-12, 1 - 1e-12) # avoid zero values for log
n = y_real.shape[0]
return - (1 / n) * np.sum(y_real * np.log(y_pred))
def backward(self, y_pred, y_real):
return y_real - y_pred
These are the main classes that define the net. The model that I create to train on Iris dataset is a NN with 1 hidden layer.
# random seed is 1
X, y = load_iris(return_X_y=True)
X = (X - np.mean(X)) / np.std(X) # standardize data to improve network convergence
y = y.reshape((-1,1))
encoder = OneHotEncoder(sparse=False)
y = encoder.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X,y,train_size=0.8)
model = NeuralNet()
model.add(Dense((4,10),name='input_layer',activation='relu'))
model.add(Dense((10,10),name='hidden_layer',activation='relu'))
model.add(Dense((10,3),name='output_layer',activation='sigmoid'))
model.fit(X_train,y_train, batch_size=5, epochs=200, step=1e-3)
difference = model.gradient_check(X_train, y_train)
And then, the result of print(difference) is
0.7992920544491866
So there is something wrong with my implementation. What things I have to check to determine the causes of this high difference between gradients?

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Tensorflow: gradients suddenly became NaNs during training - python

Related

AttributeError: 'KMeans' object has no attribute 'labels_' pytorch

It's a pytorch question. Backward() is executed only the first time or two and no further progress is made

Pytorch Deep Learning model for noisy function approximation

VAE: mat1 and mat2 shapes cannot be multiplied (90x12800 and 90x360)

Why gradient check gives high difference (almost 1)?

Categories

Resources