Keras subclassing style model training huge performance difference - python

I trained DSSM_N and DSSM on a dataset with a batch size of 512 on a 2060 GPU.
DSSM_N takes ~35 ms per batch.
DSSM takes ~400 ms per batch.
What causes this huge performance difference? The profiler reports
that DSSM spends ~350 ms in "All Others Time". How can I fix the DSSM implementation?
Many thanks in advance.
Edited as suggested by Micheal:
The main difference is that DSSM performs a hash-table-like lookup inside the model (note tf.nn.embedding_lookup and IntegerLookup), which makes the dataset preprocessing a little simpler, whereas for DSSM_N this lookup is done in advance during dataset preprocessing. However, I don't believe such a simple hash-table-like lookup can make this big a difference. What am I doing wrong?
import pickle
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow_hub as hub
import tensorflow_text as text # required for BERT hub model
from keras.layers import Layer, Embedding, Dense, Concatenate, BatchNormalization, Dropout, Dot, Hashing, TextVectorization, GRU, IntegerLookup
from keras import Model
import random
from ..config import *
from ..util import *
def embedding_sequence_reduce_mean(x, mask):
    """Average the unmasked steps of an embedded sequence.

    float[B, L, E], bool[B, L] -> float[B, E]. Any NaN in the averaged
    result is replaced by 0.
    """
    ragged = tf.ragged.boolean_mask(x, mask)   # (B, Lr, E): masked steps removed
    averaged = tf.reduce_mean(ragged, axis=1)  # (B, E)
    nan_positions = tf.math.is_nan(averaged)
    return tf.where(nan_positions, 0.0, averaged)  # nan to 0
def embedding_masked_to_zero(x, mask):
    """Multiply ``x`` by its mask (cast to float32) so masked rows become zero."""
    keep = tf.cast(mask, dtype=tf.float32)
    keep = tf.expand_dims(keep, axis=1)  # B -> B 1, aligned for broadcasting
    return x * keep
# Module-level configuration.
# `latent_dim` is not defined in this file (presumably supplied by the star
# import from ..config — confirm).
LATENT_DIM = latent_dim
# Number of hash functions and number of buckets handed to HashEmbedding
# by DSSM_N and UserPreprocess.
N_HASH = 8
N_BIN = 1024
# Echo the hashing configuration when the module is imported.
print('N_HASH', N_HASH)
print('N_BIN', N_BIN)
class HashEmbedding(Layer):
    """Embed ids as the sum of ``n_hash`` hashed lookups into one shared table.

    Each input id is hashed ``n_hash`` times (one ``Hashing`` layer per salt)
    into ``n_bin`` buckets; the bucket embeddings are summed.

    Args:
        n_hash: number of hash functions.
        n_bin: number of buckets, i.e. rows of the embedding table.
        output_dim: embedding width.
        mask_zero: when True, input 0 is hashed to 0 and reported as masked.
        input_length: forwarded to the inner ``Embedding``.
        embeddings_initializer, embeddings_regularizer, activity_regularizer,
            embeddings_constraint: accepted but not used by this layer.
        **kwargs: forwarded to ``Layer.__init__`` (e.g. ``name``).

    NOTE(review): the source listing was truncated inside the ``Hashing`` and
    ``Embedding`` calls; ``num_bins`` and ``salt`` below are reconstructed
    from the surrounding attributes — confirm against the original.
    """
    # TODO: with_importance is not supported

    def __init__(
            self, n_hash, n_bin, output_dim,
            embeddings_initializer='uniform', embeddings_regularizer=None,
            activity_regularizer=None, embeddings_constraint=None,
            mask_zero=False, input_length=None, **kwargs):
        super(HashEmbedding, self).__init__(**kwargs)
        self.mask_zero = mask_zero
        self.n_hash = n_hash
        self.n_bin = n_bin
        # salts no duplication
        self.salts = random.sample(range(self.n_hash * 32), self.n_hash)
        self.hashs = [
            Hashing(
                self.n_bin,
                # if mask_zero then hash 0 to 0
                mask_value=(0 if self.mask_zero else None),
                salt=self.salts[i])
            for i in range(self.n_hash)]
        self.embedding = Embedding(
            self.n_bin, output_dim,
            mask_zero=mask_zero, input_length=input_length)

    def compute_mask(self, inputs, mask=None):
        """Return ``inputs != 0`` when ``mask_zero`` is set, else None."""
        if not self.mask_zero:
            return None
        return tf.not_equal(inputs, 0)

    def call(self, inputs):
        """Map ids [I] to summed bucket embeddings [I], emb_dim."""
        # The bucket ids are stacked on a new trailing axis and reduced there.
        hash_axis = len(inputs.shape)
        buckets = tf.stack(
            [hasher(inputs) for hasher in self.hashs],
            axis=hash_axis)                          # [I], n_hash
        vectors = self.embedding(buckets)            # [I], n_hash, emb_dim
        return tf.reduce_sum(vectors, axis=hash_axis)  # [I], emb_dim
class StringVectorization(Layer):
    """Encode strings character by character and run them through a GRU."""

    def __init__(self, vocab, embedding_dim=32, output_dim=16):
        super(StringVectorization, self).__init__()
        vectorizer = TextVectorization(vocabulary=vocab, split='character')
        self.text_vectorization = vectorizer
        # Table size follows the vectorizer's vocabulary; id 0 is masked.
        self.embedding = Embedding(
            vectorizer.vocabulary_size(), embedding_dim, mask_zero=True)
        self.gru = GRU(output_dim)

    def call(self, inputs):  # B, S
        token_ids = self.text_vectorization(inputs)
        embedded = self.embedding(token_ids)
        return self.gru(embedded)
class TfBertZh(Layer):  # 128 - 2 input length limit
    """Frozen TF-Hub preprocessor + encoder pair returning ``pooled_output``."""

    def __init__(self):  # output_dim 768
        super(TfBertZh, self).__init__()
        # Both hub layers are loaded with trainable=False.
        self.preprocess = hub.KerasLayer(
            zh_preprocessor_model_file, trainable=False)
        self.encoder = hub.KerasLayer(
            zh_encoder_model_file, trainable=False)

    def call(self, inputs):
        encoder_inputs = self.preprocess(inputs)
        encoder_outputs = self.encoder(encoder_inputs)
        return encoder_outputs['pooled_output']
class DNN(Layer):
    """Two-layer tanh tower applied to the concatenation of its inputs.

    concat -> Dense(64) -> BatchNormalization -> tanh -> Dropout(0.1)
           -> Dense(32) -> tanh

    NOTE(review): the source listing fused ``Dense(64)`` and
    ``BatchNormalization()`` into one statement and truncated the first-layer
    expression in ``call``; the attribute name ``bn`` and the
    dense1 -> bn -> tanh -> drop order are reconstructed — confirm.
    """

    def __init__(self):
        super(DNN, self).__init__()
        self.concat = Concatenate(axis=1)
        self.dense1 = Dense(64)
        self.bn = BatchNormalization()
        self.drop = Dropout(0.1)
        self.dense2 = Dense(32)

    def call(self, inputs: list):
        """Return the 32-wide latent for a list of feature tensors."""
        from keras.activations import tanh
        x = self.concat(inputs)
        x = self.drop(tanh(self.bn(self.dense1(x))))
        x = tanh(self.dense2(x))
        return x
# Load the dataset statistics; DSSM_N reads sinfo[<column>]['unique'] to size
# its embedding tables.
# NOTE(review): pickle.load can execute arbitrary code — only load trusted files.
with open(stats_file_pkl, 'rb') as f:
sinfo = pickle.load(f)
# Load the vocabulary pickle (not referenced by the definitions visible here).
with open(vocab_file_pkl, 'rb') as f:
vocab = pickle.load(f)
class DSSM_N(Model):
    """Two-tower model scoring positives against negatives.

    Expects ``inputs`` with keys ``user``, ``pos``, ``neg``, ``pos_genre``,
    ``pos_origin``, ``neg_genre`` and ``neg_origin`` and returns
    ``sum_pos / (sum_pos + sum_neg)`` where each sum is over
    ``exp(dot(user_latent, item_latent))``.

    NOTE(review): the source listing fused ``DNN()`` with the ``Dot`` layer,
    truncated ``signal`` and dropped the closing brackets of the item dicts;
    the attribute name ``dot`` is reconstructed — confirm.
    """

    def __init__(self):
        super(DSSM_N, self).__init__()
        self.user_id = HashEmbedding(
            N_HASH, N_BIN, USER_ID_DIM, mask_zero=True)
        self.item_id = Embedding(
            sinfo['media_id']['unique'], MEDIA_ID_DIM, mask_zero=True)
        self.genre = Embedding(
            sinfo['genre_id']['unique'], GENRE_DIM, mask_zero=True)
        self.origin = Embedding(
            sinfo['origin_id']['unique'], ORIGIN_DIM, mask_zero=True)
        self.user_dnn = DNN()
        self.item_dnn = DNN()
        self.dot = Dot(axes=1, normalize=False)

    def call(self, inputs):
        u = self.compute_user_latent({'id': inputs['user']})
        ui_pos = self._summed_signal(u, inputs, 'pos')
        ui_neg = self._summed_signal(u, inputs, 'neg')
        return tf.squeeze(ui_pos / (ui_pos + ui_neg))

    def _summed_signal(self, u, inputs, prefix):
        """Sum exp(u . item) over the item columns stored under ``prefix``."""
        n_items = inputs[prefix].shape[1]
        signals = []
        for j in range(n_items):
            i = self.compute_item_latent({
                'id': inputs[prefix][:, j],
                'genre': inputs[prefix + '_genre'][:, j, :],    # B N 4
                'origin': inputs[prefix + '_origin'][:, j, :],  # B N 2
            })
            signals.append(tf.exp(self.dot([u, i])))
        return tf.add_n(signals)

    def compute_user_latent(self, inputs):
        """Embed ``inputs['id']`` and pass it through the user tower."""
        user_vec = self.user_id(inputs['id'])
        return self.user_dnn([user_vec])

    def compute_item_latent(self, inputs):
        """Embed id, genre and origin and pass them through the item tower."""
        item_vec = self.item_id(inputs['id'])
        genre = self.genre(inputs['genre'])  # B 4 -> B 4 E
        genre = embedding_sequence_reduce_mean(genre, genre._keras_mask)
        origin = self.origin(inputs['origin'])  # B 2 -> B 2 E
        origin = embedding_sequence_reduce_mean(origin, origin._keras_mask)
        return self.item_dnn([item_vec, genre, origin])
# Preprocessed tables loaded at import time; MediaPreprocess and UserPreprocess
# read their 'id' columns, and media_df also provides 'genre' and 'origin'.
# NOTE(review): `pd` is not imported in the visible import block — presumably
# it arrives via one of the star imports; confirm.
user_df = pd.read_pickle(preprocessed_user_file_pkl)
media_df = pd.read_pickle(preprocessed_media_file_pkl)
genre_df = pd.read_pickle(clean_genre_file_pkl)
origin_df = pd.read_pickle(clean_origin_file_pkl)
class MediaPreprocess(Layer):
    """Turn raw media ids into id / genre / origin feature vectors.

    The forward logic lives in ``call`` rather than in an overridden
    ``__call__``: overriding ``__call__`` replaces the Keras ``Layer.__call__``
    wrapper, so the layer would run outside the framework's own call path.
    Callers still invoke the layer as ``layer(inputs)``.
    """

    def __init__(self):
        super(MediaPreprocess, self).__init__()
        self.lookup = IntegerLookup(vocabulary=list(media_df['id']))
        # An all-zero row is prepended ahead of the per-media rows; zeros are
        # masked by the embeddings below (mask_zero=True).
        self.genre_table = tf.Variable(
            [[0] * 4] + list(media_df['genre']), dtype=tf.int32, trainable=False)
        self.origin_table = tf.Variable(
            [[0] * 2] + list(media_df['origin']), dtype=tf.int32, trainable=False)
        self.id_embedding = Embedding(
            self.lookup.vocabulary_size() + 1, MEDIA_ID_DIM, mask_zero=True)
        self.genre_embedding =\
            Embedding(genre_df['id'].max() + 1, GENRE_DIM, mask_zero=True)
        self.origin_embedding =\
            Embedding(origin_df['id'].max() + 1, ORIGIN_DIM, mask_zero=True)

    def call(self, inputs):
        """Return a dict with the 'id', 'genre' and 'origin' vectors."""
        index = self.lookup(inputs)  # B -> B
        vector = self.id_embedding(index)  # B -> B E
        vector = embedding_masked_to_zero(vector, vector._keras_mask)
        genre = tf.nn.embedding_lookup(self.genre_table, index)
        genre = self.genre_embedding(genre)
        genre = embedding_sequence_reduce_mean(genre, genre._keras_mask)
        origin = tf.nn.embedding_lookup(self.origin_table, index)
        origin = self.origin_embedding(origin)
        origin = embedding_sequence_reduce_mean(origin, origin._keras_mask)
        return {
            'id': vector,
            'genre': genre,
            'origin': origin}
class UserPreprocess(Layer):
    """Turn raw user ids into a hash-embedded id vector.

    The forward logic lives in ``call`` rather than in an overridden
    ``__call__``, so the layer runs through the standard Keras
    ``Layer.__call__`` wrapper. Callers still invoke it as ``layer(inputs)``.
    """

    def __init__(self):
        super(UserPreprocess, self).__init__()
        # NOTE(review): this lookup is built but never applied in call();
        # kept so construction behaves as before — confirm whether it is needed.
        self.lookup = IntegerLookup(vocabulary=list(user_df['id']))
        self.embedding = HashEmbedding(
            N_HASH, N_BIN, USER_ID_DIM, mask_zero=True)

    def call(self, inputs):
        """Return ``{'id': vector}`` with masked rows zeroed."""
        vector = self.embedding(inputs)
        vector = embedding_masked_to_zero(vector, vector._keras_mask)
        return {'id': vector}
class DSSM(Model):
    """Two-tower model that does its id lookups inside the model.

    ``call`` returns ``sum_pos / (sum_neg + sum_pos)``, each sum being over
    ``exp(dot(user_latent, item_latent))`` for the item columns of
    ``inputs['pos']`` / ``inputs['neg']``.
    """

    def __init__(self, *args, **kwargs):
        super(DSSM, self).__init__()
        self.user_pp = UserPreprocess()
        self.item_pp = MediaPreprocess()
        self.user_nn = DNN()
        self.item_nn = DNN()
        dot = Dot(axes=1, normalize=False)
        self.signal = lambda u, i: tf.exp(dot([u, i]))

    def call(self, inputs):
        positives = inputs['pos']  # B N_POS=1
        negatives = inputs['neg']  # B N_NEG=7

        user_features = self.user_pp(inputs['user'])  # input: B
        user_latent = self.user_nn([user_features['id']])  # B L

        def summed_signal(item_ids):
            # One item column at a time: preprocess, encode, score.
            signals = []
            for column in range(item_ids.shape[1]):
                features = self.item_pp(item_ids[:, column])
                item_latent = self.item_nn(
                    [features['id'], features['genre'], features['origin']])
                signals.append(self.signal(user_latent, item_latent))
            return tf.add_n(signals)  # C B 1 -> B 1

        pos_total = summed_signal(positives)  # B 1
        neg_total = summed_signal(negatives)  # B 1
        return tf.squeeze(pos_total / (neg_total + pos_total))  # B


AttributeError: 'KMeans' object has no attribute 'labels_' pytorch

I am trying to train a model in PyTorch that extracts features from point clouds using deep learning, but I get the following error: AttributeError: 'KMeans' object has no attribute 'labels_'. Could anyone help with this? Thanks!
# Training loop
# Run one pass over `training_dataloader`.
# Returns (losses as a tensor, correct-prediction count as a tensor,
# batch_results dict keyed by batch number, 10x10 confusion matrix).
# Reads module-level names not defined in this block: sample_points,
# dimensionality, overall_classes_loss, verbosity, classes.
# NOTE(review): this listing has lost its indentation and several fragments;
# the notes below mark the gaps instead of guessing at the missing code.
def training_loop(gpu, training_dataloader, model, loss_fn, optimizer):
losses = []
correct = 0
batch_results = dict()
conf_mat = np.zeros((10,10))
for batch_n, batch in enumerate(training_dataloader): #batch[batch, pos, ptr, y]
# Number of samples in the batch, from the total point count.
batch_size = int(batch.batch.size()[0] / sample_points)
# Build the input X; Gaussian noise (mean 0, std 0.1) is added to it.
if dimensionality == 3:
# Input dim [:,3] for your geometry x,y,z
X = batch.pos.cuda(non_blocking=True).view(batch_size, sample_points, -1) + torch.normal(
torch.zeros(batch_size, sample_points, dimensionality), torch.full((batch_size, sample_points,
dimensionality), fill_value=0.1)).cuda(gpu)
# Input dim [:,6] for your geometry x,y,z and normals nx,ny,nz
# NOTE(review): the start of the next expression is missing (presumably an
# else-branch joining positions and normals) — restore before running.
X =, batch.normal.cuda(non_blocking=True)), 1).view(batch_size, sample_points, -1) + torch.normal(
torch.zeros(batch_size, sample_points, dimensionality), torch.full((batch_size, sample_points,
dimensionality), fill_value=0.1)).cuda(gpu)
y = batch.y.cuda(non_blocking=True).flatten() #size (batch_size) --> torch.Size([8])
# Compute predictions
pred = model(None, X) #size (batch_size,classes) --> torch.Size([8, 10])
# NOTE(review): no else separates the two loss variants below, so as listed
# the batch-weighted loss would overwrite the class-weighted one.
if overall_classes_loss:
# weighted CE Loss over all classes
loss = loss_fn(pred, y)
# weighted batchwise Loss
# Per-sample weight = 1 / (occurrences of that sample's label in this batch).
sample_count = np.array([[x, batch.y.tolist().count(x)] for x in batch.y])[:,1]
batch_weights = 1. / sample_count
batch_weights = torch.from_numpy(batch_weights)
batch_weights = batch_weights.double()
loss = element_weighted_loss(pred, batch.y, batch_weights, gpu)
correct += (pred.argmax(1) == y).type(torch.float).sum().item()
print(f"Loss: {loss}")
# Gather labels and predicted classes from every process in the group.
tensor_list_y = [torch.ones_like(y) for _ in range(dist.get_world_size())]
tensor_list_pred = [torch.ones_like(y) for _ in range(dist.get_world_size())]
torch.distributed.all_gather(tensor_list_y, y, group=None, async_op=False)
torch.distributed.all_gather(tensor_list_pred, pred.argmax(1), group=None, async_op=False)
# NOTE(review): both right-hand sides below are missing (presumably the
# gathered lists are merged into single tensors) — restore before running.
tensor_list_y =
tensor_list_pred =
# Confusion Matrix
conf_mat += confusion_matrix(tensor_list_y.cpu().detach().numpy(), tensor_list_pred.cpu().detach().numpy(), labels=np.arange(0,10))
# Backpropagation
# NOTE(review): no backward/optimizer statements follow this comment in the
# listing, and nothing ever appends to `losses` — confirm what was lost.
# Save batch predictions
batch_results[batch_n] = {'true':tensor_list_y, 'pred':tensor_list_pred}
if verbosity == True:
print(f"\n\nTRAIN on GPU:{gpu}: True Label {y} - Prediction {pred.argmax(1)} - Loss {loss}")
truevalue = '\t\t'.join(classes[items] for items in y.tolist())
predvalues = '\t\t'.join(classes[items] for items in pred.argmax(1).tolist())
print(f"INFO on GPU:{gpu}: TRAIN - True Value\t {truevalue}")
print(f"INFO on GPU:{gpu}: TRAIN - Predictions\t {predvalues}")
# Every 25th batch the loss is reduced onto rank 0.
if batch_n % 25 == 0:
torch.distributed.reduce(loss, 0)
return torch.tensor(losses, device=f"cuda:{gpu}"), torch.tensor(correct, device=f"cuda:{gpu}"), batch_results, conf_mat
# Test loop
# Evaluate `model` on `test_dataloader` under torch.no_grad().
# Returns (correct-prediction count as a tensor, mean test loss as a tensor,
# batch_results dict keyed by batch number, 10x10 confusion matrix).
# Reads module-level names not defined in this block: sample_points,
# dimensionality, overall_classes_loss, verbosity, classes.
# NOTE(review): this listing has lost its indentation and several fragments;
# the notes below mark the gaps instead of guessing at the missing code.
def test_loop(gpu, test_dataloader, model, loss_fn):
test_losses = []
correct = 0
batch_results = dict()
conf_mat = np.zeros((10,10))
with torch.no_grad():
for batch_n, batch in enumerate(test_dataloader):
batch_size = int(batch.batch.size()[0] / sample_points)
# Build the input X; unlike the training loop, no noise is added here.
if dimensionality == 3:
# Input dim [:,3] for your geometry x,y,z
X = batch.pos.cuda(non_blocking=True).view(batch_size, sample_points, -1)
# Input dim [:,6] for your geometry x,y,z and normals nx,ny,nz
# NOTE(review): the start of the next expression is missing (presumably an
# else-branch joining positions and normals) — restore before running.
X =, batch.normal.cuda(non_blocking=True)), 1).view(batch_size, sample_points, -1)
y = batch.y.cuda(non_blocking=True).flatten()
pred = model(None, X) #size (batch,classes) per batch_n
# NOTE(review): no else separates the two loss variants below, so as listed
# the batch-weighted loss would overwrite the class-weighted one.
if overall_classes_loss:
# weighted CE Loss over all classes
loss = loss_fn(pred, y)
# weighted batchwise Loss
# Per-sample weight = 1 / (occurrences of that sample's label in this batch).
sample_count = np.array([[x, batch.y.tolist().count(x)] for x in batch.y])[:,1]
batch_weights = 1. / sample_count
batch_weights = torch.from_numpy(batch_weights)
batch_weights = batch_weights.double()
loss = element_weighted_loss(pred, batch.y, batch_weights, gpu)
correct += (pred.argmax(1) == y).type(torch.float).sum().item()
print(f"Loss: {loss}")
# Gather labels and predicted classes from every process in the group.
tensor_list_y = [torch.ones_like(y) for _ in range(dist.get_world_size())]
tensor_list_pred = [torch.ones_like(y) for _ in range(dist.get_world_size())]
torch.distributed.all_gather(tensor_list_y, y, group=None, async_op=False)
torch.distributed.all_gather(tensor_list_pred, pred.argmax(1), group=None, async_op=False)
# NOTE(review): both right-hand sides below are missing (presumably the
# gathered lists are merged into single tensors) — restore before running.
tensor_list_y =
tensor_list_pred =
# Confusion Matrix
conf_mat += confusion_matrix(tensor_list_y.cpu().detach().numpy(), tensor_list_pred.cpu().detach().numpy(), labels=np.arange(0,10))
# Save batch predictions
batch_results[batch_n] = {'true':tensor_list_y, 'pred':tensor_list_pred}
if verbosity == True:
print(f"\n\nTEST on GPU:{gpu}: True Label {y} - Prediction {pred.argmax(1)} - Loss {loss}")
truevalue = '\t\t'.join(classes[items] for items in y.tolist())
predvalues = '\t\t'.join(classes[items] for items in pred.argmax(1).tolist())
print(f"INFO on GPU:{gpu}: TEST - True Value\t {truevalue}")
print(f"INFO on GPU:{gpu}: TEST - Predictions\t {predvalues}")
# NOTE(review): nothing in the listing appends to `test_losses`, so as shown
# this mean is taken over an empty list — confirm what was lost.
test_loss = statistics.mean(test_losses)
return torch.tensor(correct, device=f"cuda:{gpu}"), torch.tensor(test_loss, device=f"cuda:{gpu}"), batch_results, conf_mat
# Epoch driver: for each epoch runs training_loop and test_loop, reduces the
# metrics onto rank 0, prints them, saves a checkpoint and updates the
# early-stopping counter.
# Reads module-level names not defined in this block: epochs, batch_size,
# patience, early_stopping, save_checkpoint.
# NOTE(review): this listing has lost its indentation and several fragments;
# the notes below mark the gaps instead of guessing at the missing code.
def train_optimisation(gpu, gpus, training_dataloader, test_dataloader, model, loss_fn, optimizer, scheduler, dir_path, initial_epoch):
epoch_losses = []
training_accuracies = []
test_losses = []
test_accuracies = []
learning_rates = []
counter = 0 #early stopping counter
batchwise_results = dict()
# Learning Rate Scheduler
# The `scheduler` argument is overwritten here with a fresh ReduceLROnPlateau.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=20)
for i in range(initial_epoch, initial_epoch + epochs):
if gpu == 0:
# NOTE(review): presumably an else: between the two prints was lost; as
# listed both headers would be printed.
if initial_epoch > 0:
print(f"\n\nEpoch {i}\n-------------------------------")
print(f"\n\nEpoch {i + 1}\n-------------------------------")
losses, training_accuracy, train_batch_result, train_conf_mat = training_loop(gpu, training_dataloader, model, loss_fn, optimizer)
average_loss = torch.mean(losses)
# Sum the per-process metrics onto rank 0.
torch.distributed.reduce(average_loss, 0, torch.distributed.ReduceOp.SUM)
torch.distributed.reduce(training_accuracy, 0, torch.distributed.ReduceOp.SUM)
test_accuracy, test_loss, test_batch_result, test_conf_mat = test_loop(gpu, test_dataloader, model, loss_fn)
torch.distributed.reduce(test_accuracy, 0, torch.distributed.ReduceOp.SUM)
torch.distributed.reduce(test_loss, 0, torch.distributed.ReduceOp.SUM)
# save results
batchwise_results[i] = {'train':train_batch_result, 'test':test_batch_result}
if gpu == 0: # the following operations are performed only by the process running in the first gpu
average_loss = average_loss / torch.tensor(gpus, dtype=torch.float) # average loss among all gpus
# Accuracies are converted to percentages of the dataset size.
test_accuracy = test_accuracy / torch.tensor(len(test_dataloader.dataset),
dtype=torch.float) * torch.tensor(100.0)
training_accuracy = training_accuracy / torch.tensor(len(training_dataloader.dataset),
dtype=torch.float) * torch.tensor(100.0)
test_loss = test_loss / torch.tensor(gpus, dtype=torch.float)
print(f"\nBatch size: {batch_size * int(gpus)}")
print(f"average Training Loss: {average_loss.item():.6f}")
print(f"average Test Loss: {test_loss.item():.6f}")
print(f"\naverage Training Acc: {training_accuracy.item():.6f}")
print(f"average Test Acc: {test_accuracy.item():.6f}")
# NOTE(review): nothing in the listing appends to epoch_losses,
# training_accuracies, test_losses, test_accuracies or learning_rates, and
# scheduler.step is never called — confirm what was lost.
# saving model checkpoint
save_checkpoint(model, optimizer, scheduler, i, epoch_losses, training_accuracies, test_losses, test_accuracies, learning_rates,
os.path.join(dir_path, f"epoch{i}.pth"), {key: value for key, value in batchwise_results[i].items() if key == 'train'}, {key: value for key, value in batchwise_results[i].items() if key == 'test'}, train_conf_mat, test_conf_mat)
#TODO: implement ONNX Export
# early stopping scheduler
# NOTE(review): as listed, `counter += 0` is a no-op (presumably an else-branch
# resetting the counter), the `counter < patience` test looks inverted for a
# stop condition, and no break/return follows the messages — confirm.
if early_stopping(test_losses) == True:
counter += 1
print(f"Early Stopping counter: {counter} of {patience}")
counter += 0
if counter < patience:
print("\n\nEarly Stopping activated")
print(f"Training stopped at Epoch{i + 1}")
# Per-process entry point: initialises the process group, builds the model,
# optimizer, datasets, samplers and loss, then calls train_optimisation either
# from epoch 0 or from a checkpoint named by args.resume.
# Reads module-level names not defined in this block, e.g. args,
# optimizer_str, learning_rate, momentum, weight_decay, dimensionality,
# sample_points, batch_size, data_shuffle, shrinkingLayers, mlpClassifier.
# NOTE(review): this listing has lost its indentation and several fragments;
# the notes below mark the gaps instead of guessing at the missing code.
def train(gpu, gpus, world_size):
# NOTE(review): the `except` below has no visible `try:` — presumably it
# wrapped the nccl initialisation.
dist.init_process_group(backend='nccl', world_size=world_size, rank=gpu) #for distributed GPU training
except RuntimeError:
print("\n\nINFO:RuntimeError is raised >> Used gloo backend instead of nccl!\n")
dist.init_process_group(backend='gloo', world_size=world_size, rank=gpu) #as a fallback option
dir_path = None
# Only rank 0 creates the run directory and writes the hyper-parameter file.
if gpu == 0:
dir_path = "stackgraphConvPool3DPnet"
training_number = next_training_number(dir_path)
dir_path = os.path.join(dir_path, f"train{training_number}")
#save hyper-parameters in txt protocol file
save_hyperparameters(dir_path, 'hyperparameters.txt')
print("\nINFO: Protocol File saved successfully . . .")
model = Classifier(shrinkingLayers, mlpClassifier)
#setting up optimizer
# NOTE(review): the Adam assignment has no visible else:, so as listed it
# would replace the SGD/RMSprop choice.
if optimizer_str == "SGD":
optimizer = torch.optim.SGD(model.parameters(), learning_rate, momentum=momentum, weight_decay=weight_decay)
elif optimizer_str == "RMSprop":
optimizer = torch.optim.RMSprop(model.parameters(), learning_rate, weight_decay=weight_decay)
optimizer = torch.optim.Adam(model.parameters(), learning_rate, weight_decay=weight_decay)
# single-program multiple-data training paradigm (Distributed Data-Parallel Training)
model = DDP(model, device_ids=[gpu])
# NOTE(review): in both dataset sections below the second assignment has no
# visible else:, so as listed it would replace the 3-D variant.
if dimensionality == 3:
training_data = ModelNet("ModelNet10_train_data", transform=lambda x: NormalizeScale()(SamplePoints(num=sample_points)(x)))
training_data = ModelNet("ModelNet10_train_data", transform=lambda x: NormalizeScale()(NormalizeRotation()(SamplePoints(num=sample_points, remove_faces=True, include_normals=True)(x))))
training_sampler = DistributedWeightedSampler(training_data, num_replicas=world_size) #weight unbalanced classes by 1/cls_count
training_dataloader = DataLoader(dataset=training_data, batch_size=batch_size, shuffle=data_shuffle, num_workers=0,
pin_memory=True, sampler=training_sampler)
if dimensionality == 3:
test_data = ModelNet("ModelNet10_test_data", train=False, transform=lambda x: NormalizeScale()(SamplePoints(num=sample_points)(x)))
test_data = ModelNet("ModelNet10_test_data", train=False, transform=lambda x: NormalizeScale()(NormalizeRotation()(SamplePoints(num=sample_points, remove_faces=True, include_normals=True)(x))))
test_sampler = DistributedWeightedSampler(test_data, num_replicas=world_size) #weight unbalanced classes by 1/cls_count
test_dataloader = DataLoader(dataset=test_data, batch_size=batch_size, shuffle=data_shuffle, num_workers=0,
pin_memory=True, sampler=test_sampler)
# weighted CE Loss over all Classes C
# NOTE(review): the label array inside np.where(...) and np.unique(...) is
# missing from the next line — restore before running.
class_sample_count = np.array([len(np.where( == t)[0]) for t in np.unique(])
# Class weight = 1 / number of samples of that class.
weight = 1. / class_sample_count
weight = torch.from_numpy(weight)
weight = weight.float()
loss_fn = nn.CrossEntropyLoss(weight=weight).cuda(gpu)
# continue training from certain checkpoint
continue_from_scratch = True if args.resume is None else False
# NOTE(review): the resume path below has no visible else:, so as listed it
# would also run after training from scratch.
if continue_from_scratch:
if gpu == 0:
print("\nINFO: Train from scratch has started . . .")
train_optimisation(gpu, gpus, training_dataloader, test_dataloader, model, loss_fn, optimizer, None, dir_path, 0)
checkpoint_path = "stackgraphConvPool3DPnet/" + args.resume
if gpu == 0:
print(f"\nINFO: Train has started from certain checkpoint {checkpoint_path.split('/')[2].split('.')[0]} in {checkpoint_path.split('/')[1]} . . .")
# The same checkpoint file is loaded twice: once for the weights, once for
# the stored epoch number.
model.load_state_dict(torch.load(checkpoint_path)['model_state_dict'], strict=False)
final_epoch = (torch.load("stackgraphConvPool3DPnet/" + args.resume)['epoch'])+1
train_optimisation(gpu, gpus, training_dataloader, test_dataloader, model, loss_fn, optimizer, None, dir_path, final_epoch)
code tools:
class KMeansInitMostDistantFromMean:
    """Pick the ``k`` points farthest from the data mean as initial centroids."""

    def __call__(self, *args, **kwargs):
        """Return the ``k`` rows of ``X`` with the largest distance to the mean.

        Args:
            *args: exactly ``(X, k)``; ``X`` is an (n_points, n_dims) array.

        Returns:
            Array of shape (min(k, n_points), n_dims), farthest point first.
        """
        X, k = args
        mean = np.mean(X, axis=0)
        # One vectorised norm instead of a Python callback per row.
        distances = np.linalg.norm(X - mean, axis=1)
        farthest_first = np.flip(np.argsort(distances))
        return X[farthest_first[:k]]
class KMeansInit:
    """Greedy centroid initialisation starting from the data mean.

    The first centroid is the mean of ``X``; each further centroid is the
    remaining point whose largest distance to the current centroids is
    greatest. A chosen point is removed from the candidate set.
    """

    def __call__(self, *args, **kwargs):
        """Return ``k`` centroids for ``(X, k)`` as a (k, n_dims) array."""
        X, k = args
        current_centroids = np.expand_dims(np.mean(X, axis=0), 0)
        for _ in range(k - 1):
            X, current_centroids = self.next_centroid(X, current_centroids)
        return current_centroids

    def next_centroid(self, X, curr_centroids):
        """Move the best remaining point of ``X`` into ``curr_centroids``.

        Returns:
            ``(X without the chosen row, centroids with the chosen row added)``.

        Raises:
            ValueError: if ``X`` has no rows left to choose from.
        """
        if len(X) == 0:
            raise ValueError("no candidate points left to pick a centroid from")
        # (n_points, n_centroids) distances, then each point's largest one.
        deltas = X[:, np.newaxis, :] - curr_centroids[np.newaxis, :, :]
        max_dist = np.linalg.norm(deltas, axis=2).max(axis=1)
        # argmax keeps the first of several equally distant points and still
        # yields a valid index when every distance is 0 (all points coincide
        # with the centroids), where the loop version left the choice as None.
        next_centroid_index = int(np.argmax(max_dist))
        next_centroid = X[next_centroid_index]
        return (np.delete(X, next_centroid_index, 0),
                np.append(curr_centroids, np.expand_dims(next_centroid, 0), 0))
class Conv(gnn.MessagePassing):
    """Message-passing convolution with learned edge and node transforms.

    Args:
        sigma: activation applied to the final output.
        F: module mapping a feature difference to a flattened (C+P, C) matrix.
        W: module mapping a node feature to a flattened (C+P, C) matrix.
        M: module producing the weighting applied to the aggregated messages.
        C: input feature width.
        P: number of extra output features (output width is C + P).
    """

    def __init__(self, sigma: nn.Module, F: nn.Module, W: nn.Module, M: nn.Module, C: int, P: int):
        # Module.__init__ must run before sub-modules can be assigned.
        # NOTE(review): the aggregation scheme is left at the MessagePassing
        # default because the source shows no base-class call — confirm.
        super().__init__()
        self.sigma = sigma
        self.F = F
        self.W = W
        self.M = M
        self.C = C
        self.P = P
        # Registered as a parameter so it follows .to()/.cuda(), appears in
        # parameters() for the optimizer, and is saved in the state dict.
        self.B = nn.Parameter(torch.randn(C + P))

    def forward(self, feature_matrix, edge_index):
        return self.propagate(edge_index, feature_matrix=feature_matrix)

    def message(self, feature_matrix_i, feature_matrix_j):
        """Per-edge message: F(x_j - x_i), viewed as (C+P, C), times x_i."""
        message = self.F(feature_matrix_j - feature_matrix_i)
        message = message.view(-1, self.C + self.P, self.C)
        feature_matrix_i_ = feature_matrix_i.unsqueeze(2)
        # Drop only the trailing size-1 axis; a bare squeeze() would also
        # remove the edge axis when there is exactly one edge.
        return torch.bmm(message, feature_matrix_i_).squeeze(2)

    def update(self, aggr_out, feature_matrix):
        """Weight the aggregate, add the node transform and bias, apply sigma."""
        Weight = self.M(aggr_out)
        aggr_out = aggr_out * Weight
        transform = self.W(feature_matrix)
        transform = transform.view(-1, self.C + self.P, self.C)
        feature_matrix = feature_matrix.unsqueeze(2)
        transformation = torch.bmm(transform, feature_matrix).squeeze(2)
        aggr_out = aggr_out + transformation
        output = aggr_out + self.B
        return self.sigma(output)
class Aggregation(nn.Module):
    """Blend original and convolved features with learned softmax weights.

    Args:
        mlp1: scores the mean of the original features.
        mlp2: scores the mean of the convolved features.
    """

    def __init__(self, mlp1: nn.Module, mlp2: nn.Module):
        # Module.__init__ must run before sub-modules can be assigned.
        super().__init__()
        self.mlp1 = mlp1
        self.mlp2 = mlp2
        self.softmax = nn.Softmax(0)

    def forward(self, feature_matrix_batch: torch.Tensor, conv_feature_matrix_batch: torch.Tensor):
        """Return ``M1 * features + M2 * conv_features``.

        Both inputs are 3-D (N, I, D) / (N, I, D_). When D_ > D the original
        features are zero-padded on the last axis to width D_.
        """
        N, I, D = feature_matrix_batch.size()
        N_, I_, D_ = conv_feature_matrix_batch.size()
        augmentation = D_ - D
        if augmentation > 0:
            feature_matrix_batch = F.pad(feature_matrix_batch, (0, augmentation))
        S1 = torch.mean(feature_matrix_batch, 1)
        S2 = torch.mean(conv_feature_matrix_batch, 1)
        Z1 = self.mlp1(S1)
        Z2 = self.mlp2(S2)
        # Softmax across the two stacked sources: M[0] + M[1] == 1 elementwise.
        M = self.softmax(torch.stack((Z1, Z2), 0))
        M1 = M[0].unsqueeze(1).expand(-1, I, -1)
        M2 = M[1].unsqueeze(1).expand(-1, I, -1)
        return (M1 * feature_matrix_batch) + (M2 * conv_feature_matrix_batch)
class MaxPool(nn.Module):
    """Max-pool member features into ``k`` clusters per batch element."""

    def __init__(self, k: int):
        # Module.__init__ must run before the module is used.
        super().__init__()
        self.k = k

    def forward(self, feature_matrix_batch: torch.Tensor, cluster_index: torch.Tensor):
        """Scatter-max the flattened (N*I, D) features by ``cluster_index``.

        Returns a tensor viewed as (N, k, -1).
        """
        N, I, D = feature_matrix_batch.size()
        flat_features = feature_matrix_batch.view(-1, D)
        # scatter_max returns (values, argmax); only the values are kept.
        pooled = scatter_max(flat_features, cluster_index, dim=0)[0]
        return pooled.view(N, self.k, -1)
class GraphConvPool3DPnet(nn.Module):
    """Sequential stack of shrinking layers followed by an MLP head."""

    def __init__(self, shrinkingLayers: "list[ShrinkingUnit]", mlp: nn.Module):
        # Module.__init__ must run before sub-modules can be assigned.
        super().__init__()
        self.neuralNet = nn.Sequential(*shrinkingLayers, mlp)

    def forward(self, x: torch.Tensor, pos: torch.Tensor):
        """Run the network on ``pos``, joined with ``x`` when ``x`` is given.

        NOTE(review): the left part of the concatenation was missing in the
        source; ``torch.cat((pos, x), 2)`` is reconstructed from the surviving
        ``, x), 2)`` fragment — confirm the operand order.
        """
        feature_matrix_batch = torch.cat((pos, x), 2) if x is not None else pos
        return self.neuralNet(feature_matrix_batch)
class ShrinkingUnitStack(nn.Module):
    """One shrinking stage applied to a stack of parallel branches.

    The incoming stack is repeated ``stack_fork`` times along dim 0, then
    passed through self-correlation, k-means convolution, feature
    aggregation and max-pooling, each holding ``input_stack * stack_fork``
    independent copies.
    """

    def __init__(self, input_stack: int, stack_fork: int, mlp: nn.Module, learning_rate: int, k: int, kmeansInit, n_init, sigma: nn.Module, F: nn.Module, W: nn.Module,
                 M: nn.Module, C, P, mlp1: nn.Module, mlp2: nn.Module):
        # Module.__init__ must run before sub-modules can be assigned.
        super().__init__()
        self.stack_fork = stack_fork
        stack_size = input_stack * stack_fork
        self.selfCorrStack = SelfCorrelationStack(stack_size, mlp, learning_rate)
        self.kmeansConvStack = KMeansConvStack(stack_size, k, kmeansInit, n_init, sigma, F, W, M, C, P)
        self.localAdaptFeaAggreStack = AggregationStack(stack_size, mlp1, mlp2)
        self.graphMaxPoolStack = MaxPoolStack(stack_size, k)

    def forward(self, feature_matrix_batch):
        feature_matrix_batch = torch.repeat_interleave(feature_matrix_batch, self.stack_fork, dim=0)
        feature_matrix_batch = self.selfCorrStack(feature_matrix_batch)
        # The first return value of the k-means stage is not used afterwards.
        feature_matrix_batch_, conv_feature_matrix_batch, cluster_index = self.kmeansConvStack(feature_matrix_batch)
        feature_matrix_batch = self.localAdaptFeaAggreStack(feature_matrix_batch, conv_feature_matrix_batch)
        return self.graphMaxPoolStack(feature_matrix_batch, cluster_index)
class SelfCorrelationStack(nn.Module):
    """``stack_size`` independent SelfCorrelation modules, one per branch."""

    def __init__(self, stack_size: int, mlp: nn.Module, learning_rate: int = 1.0):
        # Module.__init__ must run before sub-modules can be assigned.
        super().__init__()
        # Each branch gets its own deep copy of the MLP.
        self.selfCorrelationStack = nn.ModuleList(
            [SelfCorrelation(copy.deepcopy(mlp), learning_rate) for _ in range(stack_size)])

    def forward(self, feature_matrix_batch: torch.Tensor):
        # feature_matrix_batch size = (S,N,I,D) where S=stack_size, N=batch number, I=members, D=member dimensionality
        output = selfCorrThreader(self.selfCorrelationStack, feature_matrix_batch)
        # output size = (S,N,I,D) where S=stack_size, N=batch number, I=members, D=member dimensionality
        return output
class KMeansConvStack(nn.Module):
    """``stack_size`` independent KMeansConv modules, one per branch."""

    def __init__(self, stack_size: int, k: int, kmeansInit, n_init: int, sigma: nn.Module, F: nn.Module, W: nn.Module,
                 M: nn.Module, C: int, P: int):
        # Module.__init__ must run before sub-modules can be assigned.
        super().__init__()
        # Each branch gets its own deep copies of the sub-modules.
        self.kmeansConvStack = nn.ModuleList([
            KMeansConv(k, kmeansInit, n_init, copy.deepcopy(sigma), copy.deepcopy(F), copy.deepcopy(W),
                       copy.deepcopy(M), C, P) for _ in range(stack_size)])

    def forward(self, feature_matrix_batch: torch.Tensor):
        # feature_matrix_batch size = (S,N,I,D) where S=stack size, N=batch number, I=members, D=member dimensionality
        # The call's second argument was cut off in the source; the only
        # tensor in scope is the input batch.
        feature_matrix_batch, conv_feature_matrix_batch, cluster_index = kmeansConvThreader(
            self.kmeansConvStack, feature_matrix_batch)
        return feature_matrix_batch, conv_feature_matrix_batch, cluster_index
class AggregationStack(nn.Module):
    """``stack_size`` independent Aggregation modules, one per branch."""

    def __init__(self, stack_size: int, mlp1: nn.Module, mlp2: nn.Module):
        # Module.__init__ must run before sub-modules can be assigned.
        super().__init__()
        # Each branch gets its own deep copies of both MLPs.
        self.localAdaptFeatAggreStack = nn.ModuleList(
            [Aggregation(copy.deepcopy(mlp1), copy.deepcopy(mlp2)) for _ in range(stack_size)])

    def forward(self, feature_matrix_batch: torch.Tensor, conv_feature_matrix_batch: torch.Tensor):
        """Apply branch ``i`` to the ``i``-th slice of both inputs and stack."""
        return threader(self.localAdaptFeatAggreStack, feature_matrix_batch, conv_feature_matrix_batch)
class MaxPoolStack(nn.Module):
    """``stack_size`` independent MaxPool modules, one per branch."""

    def __init__(self, stack_size: int, k: int):
        # Module.__init__ must run before sub-modules can be assigned.
        super().__init__()
        self.graphMaxPoolStack = nn.ModuleList([MaxPool(k) for _ in range(stack_size)])

    def forward(self, feature_matrix_batch: torch.Tensor, cluster_index: torch.Tensor):
        """Apply branch ``i`` to the ``i``-th slice of both inputs and stack."""
        return threader(self.graphMaxPoolStack, feature_matrix_batch, cluster_index)
def selfCorrThreader(modules, input_tensor):
    """Apply ``modules[i]`` to ``input_tensor[i]`` in parallel threads.

    Returns the per-slice results stacked in input order.

    Raises:
        RuntimeError: if a worker thread failed and left no result.
    """
    list_append = []
    threads = [
        Thread(target=selfCorrAppender, args=(modules[i], t, list_append, i))
        for i, t in enumerate(input_tensor)]
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()
    if len(list_append) != len(threads):
        raise RuntimeError("a worker thread failed before producing its result")
    # Workers append in completion order; restore the input order by index.
    list_append.sort(key=lambda pair: pair[0])
    return torch.stack([result for _, result in list_append])


def selfCorrAppender(module, tensor, list_append, index):
    """Thread target: store ``(index, module(tensor))`` in ``list_append``."""
    list_append.append((index, module(tensor)))
def kmeansConvThreader(modules, input_tensor):
    """Apply ``modules[i]`` to ``input_tensor[i]`` in parallel threads.

    Each module returns three tensors; the three result streams are stacked
    separately, each in input order.

    Raises:
        RuntimeError: if a worker thread failed and left no result.
    """
    list1_append = []
    list2_append = []
    list3_append = []
    # The source listing had lost the `threads.append(` around Thread(...),
    # so no thread was ever collected or started.
    threads = [
        Thread(target=kmeansAppender,
               args=(modules[i], t, list1_append, list2_append, list3_append, i))
        for i, t in enumerate(input_tensor)]
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()
    if len(list3_append) != len(threads):
        raise RuntimeError("a worker thread failed before producing its result")

    def in_input_order(pairs):
        # Workers append in completion order; restore the input order by index.
        return [value for _, value in sorted(pairs, key=lambda pair: pair[0])]

    return (torch.stack(in_input_order(list1_append)),
            torch.stack(in_input_order(list2_append)),
            torch.stack(in_input_order(list3_append)))


def kmeansAppender(module, input, list1_append, list2_append, list3_append, index):
    """Thread target: store the three outputs of ``module(input)`` with ``index``."""
    x, y, z = module(input)
    list1_append.append((index, x))
    list2_append.append((index, y))
    list3_append.append((index, z))
def threader(modules, input_tensor1, input_tensor2):
    """Apply ``modules[i]`` to ``(input_tensor1[i], input_tensor2[i])`` in threads.

    Returns the per-slice results stacked in input order.

    Raises:
        RuntimeError: if a worker thread failed and left no result.
    """
    list_append = []
    threads = [
        Thread(target=threaderAppender,
               args=(modules[i], t, input_tensor2[i], list_append, i))
        for i, t in enumerate(input_tensor1)]
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()
    if len(list_append) != len(threads):
        raise RuntimeError("a worker thread failed before producing its result")
    # Workers append in completion order; restore the input order by index.
    list_append.sort(key=lambda pair: pair[0])
    return torch.stack([result for _, result in list_append])


def threaderAppender(module, t1, t2, list_append, index):
    """Thread target: store ``(index, module(t1, t2))`` in ``list_append``."""
    list_append.append((index, module(t1, t2)))
class Classifier(nn.Module):
    """Stacked shrinking units, averaged over the stack, then an MLP head."""

    def __init__(self, shrinkingLayersStack: "list[ShrinkingUnitStack]", mlp: nn.Module):
        # Module.__init__ must run before sub-modules can be assigned.
        super().__init__()
        self.neuralNet = nn.Sequential(*shrinkingLayersStack)
        self.mlp = mlp

    def forward(self, x: torch.Tensor, pos: torch.Tensor):
        """Classify ``pos``; ``x`` is accepted but not used."""
        # Add a leading stack axis of size 1 for the stacked layers.
        feature_matrix_batch = pos.unsqueeze(0)
        output = self.neuralNet(feature_matrix_batch)
        # Average the branches of the stack away before the head.
        output = torch.mean(output, dim=0)
        return self.mlp(output)
thank you for your help
The attribute labels_ of a KMeans object is created once you actually compute the clusters by running .fit() (or .fit_predict(), or .fit_transform()).
Simple example:
>>> from sklearn.cluster import KMeans
>>> from numpy.random import random
>>> X = random((10,2))
>>> X
array([[0.2096706 , 0.69704806],
[0.31732618, 0.29607599],
[0.10372159, 0.56911046],
[0.30922255, 0.07952464],
[0.21190404, 0.46823665],
[0.67134948, 0.95702692],
[0.14781526, 0.24619197],
[0.89931979, 0.96301003],
[0.88256126, 0.07569739],
[0.70776912, 0.92997521]])
>>> clustering = KMeans(n_clusters=3)
>>> clustering.labels_
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
AttributeError: 'KMeans' object has no attribute 'labels_'
>>> clustering.fit(X)
KMeans(n_clusters=3)
>>> clustering.labels_
array([0, 0, 0, 0, 0, 1, 0, 1, 2, 1], dtype=int32)

How to use torch.nn.Transformer with PyTorch Lightning?

I am trying to use the vanilla transformer from PyTorch using Pytorch Lightning. I tried to test the model with a reverse number task. So given [1, 3, 5, 4, 13, 19] it returns [1, 13, 4, 5, 3, 19] with 1, 19 being start and end token respectively. The full code is below. The code can run without error but there seems to be a problem with the backpropagation. The training loss does go down at first but it doesn't go beyond 2.8 and the accuracy doesn't go beyond 11%.
It seems that part of the model is able to optimize, I am guessing it is because the weights located in Embeddings and Generator can backpropagate, but weights located in nn.Transformer cannot? I am really not sure.
import math
import torch.nn.functional as F
import numpy as np
import torch
import torch.nn as nn
from import DataLoader
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping
class Embeddings(pl.LightningModule):
    """Token embedding whose output is scaled by sqrt(d_model)."""

    def __init__(self, d_model, vocab):
        super(Embeddings, self).__init__()
        self.lut = nn.Embedding(vocab, d_model)
        self.d_model = d_model

    def forward(self, x):
        scale = math.sqrt(self.d_model)
        return self.lut(x) * scale
class PositionalEncoding(pl.LightningModule):
    """Add fixed sinusoidal position encodings, then apply dropout.

    Args:
        d_model: embedding width.
        dropout: dropout probability applied after the addition.
        max_len: number of positions precomputed.
    """

    def __init__(self, d_model, dropout, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) *
                             -(math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        # A buffer, not a parameter: it follows the module's device but is
        # not trained.
        self.register_buffer('pe', pe)

    def forward(self, x):
        # The operand before `[:, :x.size(1)]` was missing in the source; the
        # registered `pe` buffer is the only table this slice can index.
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)
class Generator(pl.LightningModule):
    """Linear projection to the vocabulary followed by log-softmax.

    Args:
        size: output (vocabulary) size.
        d_model: input feature width; defaults to 512, the value that was
            previously hard-coded.
    """

    def __init__(self, size, d_model=512):
        super(Generator, self).__init__()
        self.proj = nn.Linear(d_model, size)

    def forward(self, x):
        return F.log_softmax(self.proj(x), dim=-1)
# LightningModule wiring source/target embeddings, a transformer and a
# generator head, plus its train/validation steps, dataloaders and optimizer.
# Reads module-level names not defined in this block: V, Batch, data_gen,
# getAccuracy.
# NOTE(review): this listing has lost its indentation and several statements;
# the notes below mark the gaps instead of guessing at the missing code.
class Model(pl.LightningModule):
def __init__(self, src_embed, tgt_embed, transformer, generator):
super(Model, self).__init__()
self.src_embed = src_embed
self.tgt_embed = tgt_embed
self.transformer = transformer
self.generator = generator
# Running sums of validation loss / accuracy, reset at batch index 99.
self.valLoss = 0
self.valAcc = 0
# NOTE(review): with automatic optimization switched off, Lightning expects
# training_step to run backward and the optimizer step itself; no such calls
# appear in training_step as listed — confirm they were not lost.
self.automatic_optimization = False
self.optimizer = None
# NOTE(review): the body of the `if` below is missing (presumably a weight
# initialisation for parameters with more than one dimension).
for p in self.parameters():
if p.dim() > 1:
# Embed source and target, run the transformer, project with the generator.
def forward(self, x, y, tgt_mask=None):
x = self.src_embed(x)
y = self.tgt_embed(y)
return self.generator(self.transformer(x, y, tgt_mask=tgt_mask))
def training_step(self, batch, batch_idx):
if self.optimizer is None:
self.optimizer = self.optimizers()
batch = Batch(batch[0], batch[1])
tgt_mask = batch.trg_mask.squeeze(0)
# Invert the boolean mask before handing it to the transformer.
tgt_mask = (tgt_mask != True)
output = self(batch.src, batch.trg, tgt_mask)
criterion = LossCompute(V)
# Loss over flattened tokens, divided by the batch's token count.
loss = criterion.forward(output.contiguous().view(-1, output.size(-1)), batch.trg_y.contiguous().view(-1)) / batch.ntokens
self.log('train_loss', loss)
# NOTE(review): as listed, training_step ends here without returning the loss.
def validation_step(self, batch, batch_idx):
batch = Batch(batch[0], batch[1])
tgt_mask = batch.trg_mask.squeeze(0)
tgt_mask = (tgt_mask != True)
output = self(batch.src, batch.trg, tgt_mask)
criterion = LossCompute(V)
loss = criterion.forward(output.view(-1, output.size(-1)), batch.trg_y.contiguous().view(-1)) / batch.ntokens
self.log('val_loss', loss)
self.valLoss += loss
# NOTE(review): the body of the first `if` below is missing.
if batch_idx % 10 == 0:
if batch_idx == 99:
self.valLoss = 0
return {"x": output, "trg": batch.trg_y, "index": batch_idx}
# Receives the dict returned by validation_step and logs the batch accuracy.
def validation_step_end(self, batch):
output, trg, idx = batch["x"], batch["trg"], batch["index"]
accuracy = getAccuracy(output, trg)
self.log("accuracy", accuracy)
self.valAcc += accuracy
if idx == 99:
self.valAcc = 0
def train_dataloader(self):
data = data_gen(V, 0, 3000)
return DataLoader(data, batch_size=30, shuffle=False, num_workers=2, pin_memory=True)
def val_dataloader(self):
data = data_gen(V, 1, 1000)
return DataLoader(data, batch_size=10, shuffle=False, num_workers=2, pin_memory=True)
def configure_optimizers(self):
return torch.optim.Adam(self.parameters(), lr=1e-3, betas=(0.9, 0.98), eps=1e-9)
class LossCompute(pl.LightningModule):
    """Summed KL divergence between log-probabilities and a one-hot target.

    Args:
        size: Expected number of classes (width of axis 1 of the input).
    """

    def __init__(self, size):
        super(LossCompute, self).__init__()
        self.criterion = nn.KLDivLoss(reduction='sum')
        self.size = size
        self.true_dist = None

    def forward(self, x, target):
        """Return the summed KL loss.

        Args:
            x: Log-probabilities of size (batch_size x length, vocab_size).
            target: Class indices of size (batch_size x length,).

        Raises:
            ValueError: If axis 1 of ``x`` does not have ``size`` entries.
        """
        if x.size(1) != self.size:
            raise ValueError(
                f"expected {self.size} classes on axis 1, got {x.size(1)}"
            )
        # NOTE(review): the right-hand sides of these two statements were lost
        # in the listing; a zero-filled table with a 1 at each target index is
        # the reconstruction assumed here.
        true_dist = torch.zeros_like(x)
        true_dist.scatter_(1, target.unsqueeze(1), 1)
        self.true_dist = true_dist
        return self.criterion(x, true_dist)
# prepare data
class Batch:
    """Object for holding a batch of data with mask during training.

    Attributes set when ``trg`` is given:
        trg: Decoder input, ``trg`` without its last column.
        trg_y: Prediction target, ``trg`` without its first column.
        trg_mask: Causal mask produced by ``make_std_mask(self.trg)``.
        ntokens: Number of elements in ``trg_y``.
    """

    def __init__(self, src, trg=None):
        self.src = src
        if trg is not None:
            self.trg = trg[:, :-1]
            self.trg_y = trg[:, 1:]
            self.trg_mask = self.make_std_mask(self.trg)
            self.ntokens = self.trg_y.size(0) * self.trg_y.size(1)

    @staticmethod
    def make_std_mask(tgt):
        """Create a mask to hide future words.

        The result of ``subsequent_mask`` is cast to the dtype/device of
        ``tgt``. No padding is masked here.
        """
        # NOTE(review): the argument of type_as() was lost in the listing;
        # `tgt` is the reconstruction assumed here.
        tgt_mask = subsequent_mask(tgt.size(-1)).type_as(tgt)
        return tgt_mask
def subsequent_mask(size):
    """Mask out subsequent positions.

    Returns:
        A bool tensor of shape ``(1, size, size)`` that is ``True`` on and
        below the diagonal and ``False`` strictly above it.
    """
    attn_shape = (1, size, size)
    # Built directly in torch; the lower triangle (diagonal included) is kept.
    return torch.tril(torch.ones(attn_shape)) == 1
def data_gen(V, randomSeed, totalTrainingSample):
    """Generate ``(x, y)`` pairs for the sequence-reversal task.

    Every sequence has length 10. Position 0 is the start token ``1`` and the
    last position is the end token ``V - 1``; the eight inner tokens are drawn
    from ``[2, V - 2)`` and ``y`` holds them in reverse order.

    Args:
        V: Vocabulary size.
        randomSeed: Seed for the generator, so that different seeds give
            different, reproducible datasets.
        totalTrainingSample: Number of pairs to generate.

    Returns:
        A list of ``(x, y)`` tuples of 1-D integer tensors.
    """
    # The seed was previously accepted but never used.
    rng = np.random.RandomState(randomSeed)
    tokens = rng.randint(2, V - 2, size=(totalTrainingSample, 10)).astype(np.int64)
    x = torch.from_numpy(tokens)
    # Reverse along the sequence axis only (flip returns a copy).
    y = torch.flip(x, [1])
    x[:, 0] = 1
    y[:, 0] = 1
    x[:, -1] = V - 1
    y[:, -1] = V - 1
    return list(zip(x, y))
def getAccuracy(x, trg):
    """Return the fraction of positions whose arg-max class equals the target.

    Args:
        x: Scores of shape ``(..., n_classes)``.
        trg: Target class indices with one entry per score row.

    Returns:
        A float in ``[0, 1]``; ``0.0`` when there are no positions.
    """
    trg = trg.contiguous().view(-1)
    out = x.reshape(-1, x.size(-1))  # (batch_size * tgt_length, src_vocab)
    total = out.size(0)
    if total == 0:
        return 0.0
    index = out.argmax(dim=-1)  # index (batch_size * tgt_length)
    # Count matches in one tensor reduction, not a Python loop over elements.
    correct = (trg == index).sum().item()
    return correct / total
# Vocabulary size; read as a module-level global by Model and the data helpers.
V = 20
transformer = nn.Transformer(num_encoder_layers=2, num_decoder_layers=2, batch_first=True)
# One positional-encoding module is shared by the source and target pipelines.
PositionEnc = PositionalEncoding(512, 0.1)
src_emb = Embeddings(512, V)
tgt_emb = Embeddings(512, V)
gen = Generator(V)
if __name__ == '__main__':
    model = Model(nn.Sequential(src_emb, PositionEnc), nn.Sequential(tgt_emb, PositionEnc), transformer, gen)
    earlyStopping = EarlyStopping(monitor='val_loss', patience=3)
    trainer = pl.Trainer(max_epochs=10, callbacks=[earlyStopping])
    # NOTE(review): the listing stopped after building the trainer; starting
    # the run is assumed to be the missing final statement.
    trainer.fit(model)

Train a model to output weights of another model, and use the other model just as function evaluation

I have 2 models, A and B.
A(x1)=Weights of B
B(x2)=Final output
A is trainable
B is not trainable (I just want to load the outputs of A into B as its weights and run inference)
Problem I am facing: Output of A is torch.tensor. While setting the weights of B, I had to slice the output tensor of A. However, I am losing the gradient flow, from final loss to weights of A, hence there is no training happening. How do I implement the idea or correct my code?
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.parameter import Parameter
from torch.autograd import Variable
import numpy as np
class Hyper_Model(nn.Module):
    """Model A: maps a ``(batch, 1)`` input to a flat parameter vector for B.

    The output width is 177, the number of values ``Main_Model.forward``
    slices out of row 0 (16+8, 64+8, 64+8, 8+1).
    """

    def __init__(self):
        super(Hyper_Model, self).__init__()
        # NOTE(review): the layer list was cut off after the first Linear in
        # the listing. Only Linear(1, 32) and the final width of 177 are known;
        # the activation and a single output layer are assumptions.
        self.layers = nn.Sequential(
            nn.Linear(1, 32),
            nn.ReLU(),
            nn.Linear(32, 177),
        )

    def forward(self, param):
        """Return the flat parameter vector of shape ``(batch, 177)``."""
        return self.layers(param)
class Main_Model(nn.Module):
    """Model B: a fixed 2-8-8-8-1 ReLU MLP evaluated with externally supplied weights.

    The ``nn.Linear`` members only define the layer shapes. ``forward`` never
    trains them; it evaluates the network with weights sliced from ``param_``,
    so gradients flow back to whatever produced ``param_``.
    """

    def __init__(self):
        super(Main_Model, self).__init__()
        self.linear1 = nn.Linear(2, 8)
        self.linear2 = nn.Linear(8, 8)
        self.linear3 = nn.Linear(8, 8)
        self.out = nn.Linear(8, 1)
        # B itself is not trainable.
        for p in self.parameters():
            p.requires_grad = False

    def forward(self, param_, x):
        """Evaluate the MLP on ``x`` using the weights stored in ``param_[0]``.

        Layout of ``param_[0]`` (177 values), layer by layer, weight then bias:
        ``linear1`` 0:16 / 16:24, ``linear2`` 24:88 / 88:96,
        ``linear3`` 96:160 / 160:168, ``out`` 168:176 / 176:177.

        Args:
            param_: Tensor whose row 0 holds the flat parameter vector.
            x: Input of shape ``(n, 2)``.

        Returns:
            Tensor of shape ``(n, 1)``.

        Raises:
            ValueError: If row 0 of ``param_`` does not hold exactly 177 values.
        """
        flat = param_[0]
        layers = (self.linear1, self.linear2, self.linear3, self.out)
        expected = sum(l.weight.numel() + l.bias.numel() for l in layers)
        if flat.numel() != expected:
            raise ValueError(
                f"expected {expected} parameters in param_[0], got {flat.numel()}"
            )
        offset = 0
        last = len(layers) - 1
        for idx, layer in enumerate(layers):
            w_end = offset + layer.weight.numel()
            b_end = w_end + layer.bias.numel()
            # Slices are used directly. Wrapping them in nn.Parameter would
            # create new leaf tensors and cut the graph back to `param_`.
            weight = flat[offset:w_end].reshape(layer.weight.shape)
            bias = flat[w_end:b_end].reshape(layer.bias.shape)
            offset = b_end
            x = F.linear(x, weight, bias)
            if idx != last:
                x = F.relu(x)
        return x
# Two input features per sample, stacked column-wise into X with shape (3, 2).
x = torch.tensor([1.0,2.0,3.0],requires_grad=True).view(3,1)
t = torch.tensor([1.0,1.5,2.0],requires_grad=True).view(3,1)
# Input of the hyper-network A.
param = torch.tensor([-0.01]).view(1,1)
X = torch.cat([x,t],dim=1)
Y = torch.tensor([5.0,6.0,9.0]).view(3,1)
h = Hyper_Model()
m = Main_Model()
# Only A's parameters are optimized; B is just evaluated with A's output.
opt = torch.optim.Adam(list(h.parameters()), lr=0.001)
loss_func = nn.MSELoss()
for i in range(10):
    param_ = h(param)
    out = m(param_,X)
    loss = loss_func(out,Y)
    # NOTE(review): the listing ended at the loss; the usual zero_grad /
    # backward / step sequence is assumed to complete the iteration.
    opt.zero_grad()
    loss.backward()
    opt.step()

error X = X.reshape(1, X.shape[0]) IndexError: tuple index out of range. How to fix that?

This is a fragment of my code:
def train(self, features, targets):
for X, y in zip(features, targets):
X = X.reshape(1, X.shape[0])
outputs = self.feed_forward(X)
when I try to use the method with data:
train(np.array([gameDataList[n].ball_position, gameDataList[n].wall_position]), np.array(gameDataList[n].upOrDown))
where gameDataList[n].upOrDown is an array e.g. [0.1, 0.9], and gameDataList[n].ball_position and gameDataList[n].wall_position are floats, I get this error.
Full code:
#### Imports ####
import numpy as np
#### Neural Network Class ####
class MLP:
    """Fully connected network with sigmoid units, trained by backpropagation.

    Layer sizes are ``[n_input_nodes, *hidden_nodes, n_output_nodes]``. Every
    layer, including the output layer, applies the sigmoid activation.
    """

    ##### Constructor ####
    def __init__(self, n_input_nodes, hidden_nodes, n_output_nodes, lr):
        """Create randomly initialised weights and biases.

        Args:
            n_input_nodes: Number of input features.
            hidden_nodes: Sizes of the hidden layers, in order.
            n_output_nodes: Number of outputs.
            lr: Learning rate used by ``train``.
        """
        ## Network ##
        self.n_input_nodes = n_input_nodes
        self.n_output_nodes = n_output_nodes
        # A new list is built so the caller's `hidden_nodes` is left untouched.
        self.nodes = [n_input_nodes, *hidden_nodes, n_output_nodes]
        ## Weights and Biases##
        self.weights = []
        self.biases = []
        for n_in, n_out in zip(self.nodes, self.nodes[1:]):
            self.weights.append(np.random.uniform(-1.0, 1.0, (n_in, n_out)))
            self.biases.append(np.random.uniform(-1.0, 1.0, (1, n_out)))
        ## Learning Rate ##
        self.lr = lr

    ## Activation Functions ##
    # Each d_* function takes the activation's *output*, not its input.
    # Linear Activation
    @staticmethod
    def linear(x):
        return x

    @staticmethod
    def d_linear(x):
        return np.ones(x.shape)

    # Relu Activation
    @staticmethod
    def relu(x):
        # Returns a new array; the argument is not modified.
        return np.maximum(x, 0)

    @staticmethod
    def d_relu(out):
        return (np.asarray(out) > 0).astype(float)

    # Sigmoid Activation
    @staticmethod
    def sigmoid(x):
        return 1 / (1 + np.exp(-x))

    @staticmethod
    def d_sigmoid(out):
        # assumes out is sigmoid(x)
        return out * (1 - out)

    # Hyperbolic Tangent Activation
    @staticmethod
    def tanh(x):
        return np.tanh(x)

    @staticmethod
    def d_tanh(out):
        # assumes out is tanh(x)
        return 1 - out ** 2

    def getWeights(self):
        """Return a shallow copy of the list of weight matrices."""
        return self.weights.copy()

    def getBiases(self):
        """Return a shallow copy of the list of bias rows."""
        return self.biases.copy()

    def setWeights(self, weights):
        """Replace the weight list with a shallow copy of ``weights``."""
        self.weights = weights.copy()

    def setBiases(self, biases):
        """Replace the bias list with a shallow copy of ``biases``."""
        self.biases = biases.copy()

    #### Feed Forward ####
    def feed_forward(self, X):
        """Return ``[X, a_1, ..., a_L]``, the input followed by every layer's output."""
        outputs = [X]
        out = X
        for weight, bias in zip(self.weights, self.biases):
            out = self.sigmoid(np.dot(out, weight) + bias)
            outputs.append(out)
        return outputs

    #### Backpropagation ####
    def backpropagation(self, X, y, outputs):
        """Compute update directions for one sample.

        Args:
            X: The sample (unused; ``outputs[0]`` already holds it).
            y: Target row for the sample.
            outputs: Result of ``feed_forward`` for the sample.

        Returns:
            ``(weights_gradients, biases_gradients)``, both ordered from the
            LAST layer to the first. They carry the sign of ``y - output``, so
            ``train`` adds them to the parameters.
        """
        weights_gradients = []
        biases_gradients = []
        # Output layer.
        error = (y - outputs[-1]) * self.d_sigmoid(outputs[-1])
        weights_gradients.append(np.dot(outputs[-2].T, error))
        biases_gradients.append(error)
        # Hidden layers: weights[k] maps outputs[k] to outputs[k + 1].
        for k in range(len(self.weights) - 2, -1, -1):
            d = self.d_sigmoid(outputs[k + 1])
            error = np.dot(error, self.weights[k + 1].T) * d
            weights_gradients.append(np.dot(outputs[k].T, error))
            biases_gradients.append(error)
        return weights_gradients, biases_gradients

    #### Training ####
    def train(self, features, targets):
        """Apply one averaged update over the given samples.

        Args:
            features: ``(n_samples, n_input_nodes)``, or a single 1-D sample of
                length ``n_input_nodes``.
            targets: ``(n_samples, n_output_nodes)``, or a single 1-D target.

        Raises:
            ValueError: If the arrays cannot be arranged into rows of the
                configured widths, or the sample counts differ.
        """
        # A single 1-D sample is promoted to a one-row batch instead of being
        # iterated element by element (which produced 0-d arrays).
        features = np.asarray(features, dtype=float).reshape(-1, self.n_input_nodes)
        targets = np.asarray(targets, dtype=float).reshape(-1, self.n_output_nodes)
        if len(features) != len(targets):
            raise ValueError(
                f"{len(features)} feature rows but {len(targets)} target rows"
            )
        # Batch Size for weight update step
        batch_size = features.shape[0]
        if batch_size == 0:
            return
        # Delta Weights Variables
        delta_weights = [np.zeros(weight.shape) for weight in self.weights]
        delta_biases = [np.zeros(bias.shape) for bias in self.biases]
        # For every data point, forward pass, backpropogation, store weights change
        for X, y in zip(features, targets):
            # Forward pass
            X = X.reshape(1, -1)
            y = y.reshape(1, -1)
            outputs = self.feed_forward(X)
            # Back propogation
            weights_gradients, biases_gradients = self.backpropagation(X, y, outputs)
            # Gradients arrive last-layer-first, hence the reversed index.
            for i, (w_grad, b_grad) in enumerate(zip(weights_gradients, biases_gradients)):
                delta_weights[-(i + 1)] += w_grad
                delta_biases[-(i + 1)] += b_grad
        for i in range(len(delta_weights)):
            self.weights[i] += (self.lr * delta_weights[i]) / batch_size
            self.biases[i] += (self.lr * delta_biases[i]) / batch_size

    #### Testing Methods ####
    def predict(self, X):
        """Return the output-layer activations for ``X``."""
        # Gives prediction
        return self.feed_forward(X)[-1]

    def test(self, features, targets):
        """Return the fraction of rows whose arg-max prediction matches the target's."""
        if len(targets) == 0:
            return 0.0
        predictions = self.predict(features)
        n_correct = sum(
            1
            for prediction, target in zip(predictions, targets)
            if np.argmax(prediction) == np.argmax(target)
        )
        return n_correct / len(targets)
class GameData:
    """One recorded game sample.

    Args:
        ball_position: Position value of the ball for this sample.
        wall_position: Position value of the wall for this sample.
        upOrDown: Target for this sample (e.g. ``[0.1, 0.9]``).
    """

    def __init__(self, ball_position, wall_position, upOrDown):
        self.wall_position = wall_position
        self.ball_position = ball_position
        self.upOrDown = upOrDown

    def __repr__(self):
        return (
            f"{type(self).__name__}(ball_position={self.ball_position!r}, "
            f"wall_position={self.wall_position!r}, upOrDown={self.upOrDown!r})"
        )
I collect data, and train my network, in this way:
# Each recorded sample stores two position values and a two-element target.
gameDataList.append(GameData(ball.trt.ycor(), b.trt.ycor(), [0.1, 0.9]))
# Two inputs, two hidden layers of 32 units, two outputs, learning rate 1e-4.
mlp = MLP(2, [32, 32], 2, 0.0001)
# NOTE(review): `random` is not imported in the listing above, and
# `gameDataList`, `ball` and `b` come from the asker's game code.
n = random.randint(0, 999)
# NOTE(review): both arrays built here are 1-D. train() iterates over
# `features`, so each X is a 0-d scalar and X.shape[0] raises the IndexError.
mlp.train(np.array([gameDataList[n].ball_position, gameDataList[n].wall_position]), np.array(gameDataList[n].upOrDown))
Problem solved: the features had to be wrapped in two pairs of square brackets instead of one, so that the array passed to train() is 2-D (one row per sample).
wrong example:
np.array([gameDataList[n].ball_position, gameDataList[n].wall_position])
correct example:
np.array([[gameDataList[n].ball_position, gameDataList[n].wall_position]])

How to build a custom Keras layer with an unknown input shape

I'm trying to build a custom ConvLSTM layer in keras using the following code but it didn't work:
import tensorflow as tf
from tensorflow import keras
from keras.layers import InputSpec, Layer
class Padding2D(Layer):
    """Zero-pad axes 1 and 2 of a rank-4 tensor symmetrically.

    Args:
        padding: Pair ``(w_pad, h_pad)``. Axis 1 is padded by ``h_pad`` on each
            side and axis 2 by ``w_pad`` on each side.
        **kwargs: Forwarded to ``Layer.__init__``.
    """

    def __init__(self, padding = (1,1), **kwargs):
        # The base initialiser must run before attributes are set on the layer.
        super(Padding2D, self).__init__(**kwargs)
        self.padding = tuple(padding)
        self.input_spec = [InputSpec(ndim = 4)]

    def compute_output_shape(self, s):
        """Return the padded shape; unknown (``None``) dimensions stay unknown."""
        w_pad, h_pad = self.padding

        def grow(dim, pad):
            return None if dim is None else dim + 2 * pad

        # Same axis/padding pairing as call(): axis 1 <- h_pad, axis 2 <- w_pad.
        return (s[0], grow(s[1], h_pad), grow(s[2], w_pad), s[3])

    def call(self, x):
        """Pad ``x`` with zeros on axes 1 and 2."""
        w_pad, h_pad = self.padding
        return tf.pad(x, [[0,0], [h_pad,h_pad],[w_pad,w_pad],[0,0]])

    def get_config(self):
        """Include ``padding`` so the layer can be re-created from its config."""
        config = super(Padding2D, self).get_config()
        config['padding'] = self.padding
        return config
# Convolutional LSTM cell exactly as posted in the question. This is the
# version that produces the TypeError quoted below it; it is left unchanged.
class ConvLSTM(Layer):
# NOTE(review): `kernel_size` and `padding` are accepted here but never stored
# on `self`; call() later refers to bare `padding`, `out_channels` and
# `kernel_size`, which are not defined in that scope.
def __init__(self, out_channels, kernel_size=5, forget_bias=1.0, padding=0):
super(ConvLSTM, self).__init__()
self.out_channels = out_channels
self.forget_bias = forget_bias
# (cell state c, hidden state h); created lazily on the first call.
self.states = None
def call(self, inputs):
if self.states is None:
#inputs.shape : [Batch, Height, Width, Channel]
# NOTE(review): the static shape is read here. Per the traceback that follows
# this listing, inputs.shape[0] is None when the model is built, which is what
# tf.zeros rejects.
self.states = (tf.zeros([inputs.shape[0], inputs.shape[1], inputs.shape[2], self.out_channels]),
# NOTE(review): in this second call the list is closed before
# self.out_channels, so it is passed as the second positional argument of
# tf.zeros, not as the fourth dimension.
tf.zeros([inputs.shape[0], inputs.shape[1], inputs.shape[2]], self.out_channels))
c, h = self.states
if not (len(c.shape) == 4 and len(h.shape) == 4 and len(inputs.shape) == 4):
raise TypeError("Incorrect shapes")
# Concatenate the input and the previous hidden state along axis 3.
inputs_h = tf.concat((inputs, h), axis=3)
# NOTE(review): a new Padding2D and a new Conv2D are constructed on every call.
# Conv2D, sigmoid and tanh are not imported by the import lines of this listing.
padded_inputs_h = Padding2D(padding = (padding,padding))(inputs_h)
i_j_f_o = Conv2D( 4 * out_channels, kernel_size, strides=1)(padded_inputs_h)
# Split the 4 * out_channels feature maps into the four gate pre-activations.
i = i_j_f_o[:,:,:,: self.out_channels]
j = i_j_f_o[:,:,:,self.out_channels : 2*self.out_channels]
f= i_j_f_o[:,:,:, 2*self.out_channels : 3*self.out_channels]
o = i_j_f_o[:,:,:, 3*self.out_channels :]
# i, j, f, o = torch.split(i_j_f_o, self.out_channels, dim=3)
new_c = c * sigmoid(f + self.forget_bias) + sigmoid(i) * tanh(j)
new_h = tanh(new_c) * sigmoid(o)
# The new state replaces the stored one, so it carries over to the next call.
self.states = (new_c, new_h)
return new_h
# Minimal reproduction: a functional model with a 2x2, single-channel input.
input0 = tf.keras.Input(shape= (2,2,1))
# This is the line the traceback below points at.
x = ConvLSTM(out_channels= 1)(input0)
model = tf.keras.Model(input0,x)
Error output
----> x = ConvLSTM(out_channels= 1)(input0)
TypeError: in user code:
<ipython-input-1-2e11c0026581>:28 call *
self.states = (tf.zeros([inputs.shape[0], inputs.shape[1], inputs.shape[2], self.out_channels]),
TypeError: Expected int32, got None of type 'NoneType' instead.
I think the error occurs because the model doesn't know the batch-size dimension (inputs.shape[0]) in advance: it is None while the model is being built, before execution. I need the layer to work out the batch size by itself at execution time and ignore it at build time. Can anyone help, please?
By following the suggestion given by Marc above in the comments, this code solved the problem:
import tensorflow as tf
from tensorflow import keras
from keras.layers import InputSpec, Layer, Conv2D
from tensorflow.keras.activations import sigmoid, tanh
class Padding2D(Layer):
    """Zero-pad axes 1 and 2 of a rank-4 tensor symmetrically.

    Args:
        padding: Pair ``(w_pad, h_pad)``. Axis 1 is padded by ``h_pad`` on each
            side and axis 2 by ``w_pad`` on each side.
        **kwargs: Forwarded to ``Layer.__init__``.
    """

    def __init__(self, padding = (1,1), **kwargs):
        # The base initialiser must run before attributes are set on the layer.
        super(Padding2D, self).__init__(**kwargs)
        self.padding = tuple(padding)
        self.input_spec = [InputSpec(ndim = 4)]

    def compute_output_shape(self, s):
        """Return the padded shape; unknown (``None``) dimensions stay unknown."""
        w_pad, h_pad = self.padding

        def grow(dim, pad):
            return None if dim is None else dim + 2 * pad

        # Same axis/padding pairing as call(): axis 1 <- h_pad, axis 2 <- w_pad.
        return (s[0], grow(s[1], h_pad), grow(s[2], w_pad), s[3])

    def call(self, x):
        """Pad ``x`` with zeros on axes 1 and 2."""
        w_pad, h_pad = self.padding
        return tf.pad(x, [[0,0], [h_pad,h_pad],[w_pad,w_pad],[0,0]])

    def get_config(self):
        """Include ``padding`` so the layer can be re-created from its config."""
        config = super(Padding2D, self).get_config()
        config['padding'] = self.padding
        return config
class ConvLSTM(Layer):
    """Convolutional LSTM cell whose state is sized from the input at run time.

    Args:
        out_channels: Number of channels of the hidden and cell state.
        kernel_size: Kernel size of the gate convolution.
        forget_bias: Constant added to the forget-gate pre-activation.
        padding: Symmetric zero padding applied before the convolution. The
            convolution output must keep the input's height and width for the
            state update to broadcast (e.g. ``kernel_size=1, padding=0``).
    """

    def __init__(self, out_channels, kernel_size=1, forget_bias=1.0, padding=0):
        super(ConvLSTM, self).__init__()
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.forget_bias = forget_bias
        # Stored so call() and the sub-layers below can use it.
        self.padding = padding
        self.states = None
        # Sub-layers are created once so their weights are tracked by this
        # layer and reused on every call, not re-initialised each time.
        self.pad = Padding2D(padding = (padding, padding))
        self.conv = Conv2D(4 * out_channels, kernel_size, strides=1)

    def call(self, inputs):
        """Advance the cell by one step and return the new hidden state."""
        if self.states is None:
            #inputs.shape : [Batch, Height, Width, Channel]
            # The dynamic shape is used so an unknown batch size is resolved
            # at execution time.
            state_shape = tf.concat(
                [tf.shape(inputs)[:-1], [self.out_channels]], axis=0)
            zeros = tf.zeros(state_shape, dtype=inputs.dtype)
            self.states = (zeros, zeros)
        c, h = self.states
        if not (len(c.shape) == 4 and len(h.shape) == 4 and len(inputs.shape) == 4):
            raise TypeError("Incorrect shapes")
        inputs_h = tf.concat((inputs, h), axis=3)
        padded_inputs_h = self.pad(inputs_h)
        i_j_f_o = self.conv(padded_inputs_h)
        # Split the 4 * out_channels feature maps into the four gates.
        i = i_j_f_o[:,:,:,: self.out_channels]
        j = i_j_f_o[:,:,:,self.out_channels : 2*self.out_channels]
        f = i_j_f_o[:,:,:, 2*self.out_channels : 3*self.out_channels]
        o = i_j_f_o[:,:,:, 3*self.out_channels :]
        new_c = c * sigmoid(f + self.forget_bias) + sigmoid(i) * tanh(j)
        new_h = tanh(new_c) * sigmoid(o)
        # NOTE(review): the state is kept as a plain Python attribute, as in
        # the original; how it should persist between calls is left open.
        self.states = (new_c, new_h)
        return new_h
I also found another alternative to solve the problem by providing the batch size and input shape when initializing the layer.
The code is given below:
import tensorflow as tf
from tensorflow import keras
from keras.layers import InputSpec, Layer, Conv2D
from tensorflow.keras.activations import sigmoid, tanh
class Padding2D(Layer):
    """Zero-pad axes 1 and 2 of a rank-4 tensor symmetrically.

    Args:
        padding: Pair ``(w_pad, h_pad)``. Axis 1 is padded by ``h_pad`` on each
            side and axis 2 by ``w_pad`` on each side.
        **kwargs: Forwarded to ``Layer.__init__``.
    """

    def __init__(self, padding = (1,1), **kwargs):
        # The base initialiser must run before attributes are set on the layer.
        super(Padding2D, self).__init__(**kwargs)
        self.padding = tuple(padding)
        self.input_spec = [InputSpec(ndim = 4)]

    def compute_output_shape(self, s):
        """Return the padded shape; unknown (``None``) dimensions stay unknown."""
        w_pad, h_pad = self.padding

        def grow(dim, pad):
            return None if dim is None else dim + 2 * pad

        # Same axis/padding pairing as call(): axis 1 <- h_pad, axis 2 <- w_pad.
        return (s[0], grow(s[1], h_pad), grow(s[2], w_pad), s[3])

    def call(self, x):
        """Pad ``x`` with zeros on axes 1 and 2."""
        w_pad, h_pad = self.padding
        return tf.pad(x, [[0,0], [h_pad,h_pad],[w_pad,w_pad],[0,0]])

    def get_config(self):
        """Include ``padding`` so the layer can be re-created from its config."""
        config = super(Padding2D, self).get_config()
        config['padding'] = self.padding
        return config
class ConvLSTM(Layer):
    """Convolutional LSTM cell with a fixed, caller-supplied state shape.

    Args:
        batch_size: Batch size used to allocate the initial state.
        input_shape: ``[Height, Width, Channel]`` of one input sample; only the
            first two entries are used for the state.
        out_channels: Number of channels of the hidden and cell state.
        kernel_size: Kernel size of the gate convolution.
        forget_bias: Constant added to the forget-gate pre-activation.
        padding: Symmetric zero padding applied before the convolution.
    """

    def __init__(self,batch_size, input_shape, out_channels, kernel_size=1, forget_bias=1.0, padding=0):
        super(ConvLSTM, self).__init__()
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.forget_bias = forget_bias
        self.shape = input_shape
        # Stored so call() and the sub-layers below can use it.
        self.padding = padding
        self.states = None
        self.batch_size = batch_size
        # Sub-layers are created once so their weights are tracked by this
        # layer and reused on every call.
        self.pad = Padding2D(padding = (padding, padding))
        self.conv = Conv2D(4 * out_channels, kernel_size, strides=1)

    def build(self, input_shape):
        """Allocate the zero initial state ``(c, h)`` on first build."""
        if self.states is None:
            #input_shape : [Height, Width, Channel]
            # list(...) lets `input_shape` be given as a tuple or a list.
            state_shape = [self.batch_size] + list(self.shape[:-1]) + [self.out_channels]
            self.states = (tf.zeros(state_shape), tf.zeros(state_shape))
        super(ConvLSTM, self).build(input_shape)

    def call(self, inputs):
        """Advance the cell by one step and return the new hidden state."""
        c, h = self.states
        if not (len(c.shape) == 4 and len(h.shape) == 4 and len(inputs.shape) == 4):
            raise TypeError("Incorrect shapes")
        inputs_h = tf.concat((inputs, h), axis=3)
        padded_inputs_h = self.pad(inputs_h)
        i_j_f_o = self.conv(padded_inputs_h)
        i,j,f,o = tf.split(i_j_f_o, num_or_size_splits=4, axis=3)
        new_c = c * sigmoid(f + self.forget_bias) + sigmoid(i) * tanh(j)
        new_h = tanh(new_c) * sigmoid(o)
        # NOTE(review): the state is kept as a plain Python attribute, as in
        # the original; how it should persist between calls is left open.
        self.states = (new_c, new_h)
        return new_h
Although these implementations solve the question asked in this post, a problem remains in both of them: how the LSTM cell state is updated (the line self.states = (new_c, new_h), near the end of the ConvLSTM class). Since that is a different problem, I opened a separate post for it.
