I estimate ratings in a user-item matrix by decomposing the matrix into two matrices P and Q using PyTorch Matrix Factorization. I got my loss function L(X-PQ).
Let's say rows of X correspond to users, and x is new user's row, so that new X is X concatenated with x.
Now I want to minimize L(X' - P'Q) = L(X - PQ) + L(x - x_pQ). Since I have already trained P and Q.
I want to train x_p that is the new user's row, but leave Q fixed.
So my question would be, is there a way in PyTorch to train MatrixFactorization model for P with fixed Q?
Code I'm working with:
class MatrixFactorizationWithBiasXavier(nn.Module):
def __init__(self, num_people, num_partners, bias=(-0.01, 0.01), emb_size=100):
super(MatrixFactorizationWithBiasXavier, self).__init__()
self.person_emb = nn.Embedding(num_people, emb_size)
self.person_bias = nn.Embedding(num_people, 1)
self.partner_emb = nn.Embedding(num_partners, emb_size)
self.parnter_bias = nn.Embedding(num_partners, 1)
torch.nn.init.xavier_uniform_(self.person_emb.weight)
torch.nn.init.xavier_uniform_(self.partner_emb.weight)
self.person_bias.weight.data.uniform_(bias[0], bias[1])
self.parnter_bias.weight.data.uniform_(bias[0], bias[1])
def forward(self, u, v):
u = self.person_emb(u)
v = self.partner_emb(v)
bias_u = self.person_bias(u).squeeze()
bias_v = self.parnter_bias(v).squeeze()
# calculate dot product
# u*v is a element wise vector multiplication
return torch.sigmoid((u*v).sum(1) + bias_u + bias_v)
def test(model, df_test, verbose=False):
model.eval()
# .to(dev) puts code on either gpu or cpu.
people = torch.LongTensor(df_test.id.values).to(dev)
partners = torch.LongTensor(df_test.pid.values).to(dev)
decision = torch.FloatTensor(df_test.decision.values).to(dev)
y_hat = model(people, partners)
loss = F.mse_loss(y_hat, decision)
if verbose:
print('test loss %.3f ' % loss.item())
return loss.item()
def train(model, df_train, epochs=100, learning_rate=0.01, weight_decay=1e-5, verbose=False):
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
model.train()
for epoch in range(epochs):
# From numpy to PyTorch tensors.
# .to(dev) puts code on either gpu or cpu.
people = torch.LongTensor(df_train.id.values).to(dev)
partners = torch.LongTensor(df_train.pid.values).to(dev)
decision = torch.FloatTensor(df_train.decision.values).to(dev)
# calls forward method of the model
y_hat = model(people, partners)
# Using mean squared errors loss function
loss = F.mse_loss(y_hat, decision)
optimizer.zero_grad()
loss.backward()
optimizer.step()
if verbose and epoch % 100 == 0:
print(loss.item())
Found a solution.
Turns out I can register a hook on my embedding(partner emb, that is my Q) (that i want to stay fixed) so that it says it's gradient is zeroed.
mask = torch.zeros_like(mf_model.partner_emb.weight)
mf_model.partner_emb.weight.register_hook(lambda grad: grad*mask)
Related
Thank you for reading my post.
I’m currently developing the peak detection algorithm using CNN to determine the ideal convolution kernel which is representable as the ideal mother wavelet function that will maximize the peak detection accuracy.
To begin with, I created my own IoU loss function and the simple model and tried to run the learning. The execution itself worked without any errors, but somehow it failed.
The parameters of the model with custom loss function doesn't upgraded thorough its learning over epochs
My own loss function is described as below.
def IoU(inputs: torch.Tensor, labels: torch.Tensor,
smooth: float=0.1, threshold: float = 0.5, alpha: float = 1.0):
'''
- alpha: a parameter that sharpen the thresholding.
if alpha = 1 -> thresholded input is the same as raw input.
'''
thresholded_inputs = inputs**alpha / (inputs**alpha + (1 - inputs)**alpha)
inputs = torch.where(thresholded_inputs < threshold, 0, 1)
batch_size = inputs.shape[0]
intersect_tensor = (inputs * labels).view(batch_size, -1)
intersect = intersect_tensor.sum(-1)
union_tensor = torch.max(inputs, labels).view(batch_size, -1)
union = union_tensor.sum(-1)
iou = (intersect + smooth) / (union + smooth) # We smooth our devision to avoid 0/0
iou_score = iou.mean()
return 1- iou_score
and my training model is,
class MLP(nn.Module):
def __init__(self):
super().__init__()
self.net = nn.Sequential(
nn.Conv1d(1, 1, kernel_size=32, stride=1, padding=16),
nn.Linear(257, 256),
nn.LogSoftmax(1)
)
def forward(self, x):
return self.net(x)
model = MLP()
opt = optim.Adadelta(model.parameters())
# initialization of the kernel of Conv1d
def init_kernel(m):
if type(m) == nn.Conv1d:
nn.init.kaiming_normal_(m.weight)
print(m.weight)
plt.plot(m.weight[0][0].detach().numpy())
model.apply(init_kernel)
def step(x, y, is_train=True):
opt.zero_grad()
y_pred = model(x)
y_pred = y_pred.reshape(-1, 256)
loss = IoU(y_pred, y)
loss.requires_grad = True
loss.retain_grad = True
if is_train:
loss.backward()
opt.step()
return loss, y_pred
and lastly, the execution code is,
from torch.autograd.grad_mode import F
train_loss_arr, val_loss_arr = [], []
valbose = 10
epochs = 200
for e in range(epochs):
train_loss, val_loss, acc = 0., 0., 0.,
for x, y in train_set.as_numpy_iterator():
x = torch.from_numpy(x)
y = torch.from_numpy(y)
model.train()
loss, y_pred = step(x, y, is_train=True)
train_loss += loss.item()
train_loss /= len(train_set)
for x, y ,in val_set.as_numpy_iterator():
x = torch.from_numpy(x)
y = torch.from_numpy(y)
model.eval()
with torch.no_grad():
loss, y_pred = step(x, y, is_train=False)
val_loss += loss.item()
val_loss /= len(val_set)
train_loss_arr.append(train_loss)
val_loss_arr.append(val_loss)
# visualize current kernel to check whether the learning is on progress safely.
if e % valbose == 0:
print(f"Epoch[{e}]({(e*100/epochs):0.2f}%): train_loss: {train_loss:0.4f}, val_loss: {val_loss:0.4f}")
fig, axs = plt.subplots(1, 4, figsize=(12, 4))
print(y_pred[0], y_pred[0].shape)
axs[0].plot(x[0][0])
axs[0].set_title("spectra")
axs[1].plot(y_pred[0])
axs[1].set_title("y pred")
axs[2].plot(y[0])
axs[2].set_title("y true")
axs[3].plot(model.state_dict()["net.0.weight"][0][0].numpy())
axs[3].set_title("kernel1")
plt.show()
with these programs, I tried to evaluate this simple model, however, model parameters didn't change at all over epochs.
Visualization of the results at epoch 0 and 30.
epoch 0:
prediction and kernel at epoch0
epoch 30:
prediction and kernel at epoch30
As you can see, the kernel has not be modified through its learning over epochs.
I took a survey to figure out what causes this problem for hours but I'm still not sure how to fix my loss function and model into trainable ones.
Thank you.
Try printing the gradient after loss.backward() with:
y_pred.grad()
I suspect what you'll find is that after a backward pass, the gradient of y_pred is zero. This means that either a.) gradient is not enabled for one or more of the variables at which the computation graph has a node, or b.) (more likely) you are using an operation which is not differentiable.
In your case, at a minimum torch.where is non-differentiable, so you'll need to replace that. Thersholding operations are non-differentiable and are generally replaced with "soft" thresholding operations (see Softmax instead of max function for classification) so that gradient computation still works. Try replacing this with a soft threshold or no threshold at all.
For my model I'm using a roberta transformer model and the Trainer from the Huggingface transformer library.
I calculate two losses:
lloss is a Cross Entropy Loss and dloss calculates the loss inbetween hierarchy layers.
The total loss is the sum of lloss and dloss. (Based on this)
When calling total_loss.backwards() however, I get the error:
RuntimeError: Trying to backward through the graph a second time, but the buffers have already been freed
Any idea why that happens? Can I force it to only call backwards once? Here is the loss calculation part:
dloss = calculate_dloss(prediction, labels, 3)
lloss = calculate_lloss(predeiction, labels, 3)
total_loss = lloss + dloss
total_loss.backward()
def calculate_lloss(predictions, true_labels, total_level):
'''Calculates the layer loss.
'''
loss_fct = nn.CrossEntropyLoss()
lloss = 0
for l in range(total_level):
lloss += loss_fct(predictions[l], true_labels[l])
return self.alpha * lloss
def calculate_dloss(predictions, true_labels, total_level):
'''Calculate the dependence loss.
'''
dloss = 0
for l in range(1, total_level):
current_lvl_pred = torch.argmax(nn.Softmax(dim=1)(predictions[l]), dim=1)
prev_lvl_pred = torch.argmax(nn.Softmax(dim=1)(predictions[l-1]), dim=1)
D_l = self.check_hierarchy(current_lvl_pred, prev_lvl_pred, l) #just a boolean tensor
l_prev = torch.where(prev_lvl_pred == true_labels[l-1], torch.FloatTensor([0]).to(self.device), torch.FloatTensor([1]).to(self.device))
l_curr = torch.where(current_lvl_pred == true_labels[l], torch.FloatTensor([0]).to(self.device), torch.FloatTensor([1]).to(self.device))
dloss += torch.sum(torch.pow(self.p_loss, D_l*l_prev)*torch.pow(self.p_loss, D_l*l_curr) - 1)
return self.beta * dloss
There is nothing wrong with having a loss that is the sum of two individual losses, here is a small proof of principle adapted from the docs:
import torch
import numpy
from sklearn.datasets import make_blobs
class Feedforward(torch.nn.Module):
def __init__(self, input_size, hidden_size):
super(Feedforward, self).__init__()
self.input_size = input_size
self.hidden_size = hidden_size
self.fc1 = torch.nn.Linear(self.input_size, self.hidden_size)
self.relu = torch.nn.ReLU()
self.fc2 = torch.nn.Linear(self.hidden_size, 1)
self.sigmoid = torch.nn.Sigmoid()
def forward(self, x):
hidden = self.fc1(x)
relu = self.relu(hidden)
output = self.fc2(relu)
output = self.sigmoid(output)
return output
def blob_label(y, label, loc): # assign labels
target = numpy.copy(y)
for l in loc:
target[y == l] = label
return target
x_train, y_train = make_blobs(n_samples=40, n_features=2, cluster_std=1.5, shuffle=True)
x_train = torch.FloatTensor(x_train)
y_train = torch.FloatTensor(blob_label(y_train, 0, [0]))
y_train = torch.FloatTensor(blob_label(y_train, 1, [1,2,3]))
x_test, y_test = make_blobs(n_samples=10, n_features=2, cluster_std=1.5, shuffle=True)
x_test = torch.FloatTensor(x_test)
y_test = torch.FloatTensor(blob_label(y_test, 0, [0]))
y_test = torch.FloatTensor(blob_label(y_test, 1, [1,2,3]))
model = Feedforward(2, 10)
criterion = torch.nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr = 0.01)
model.eval()
y_pred = model(x_test)
before_train = criterion(y_pred.squeeze(), y_test)
print('Test loss before training' , before_train.item())
model.train()
epoch = 20
for epoch in range(epoch):
optimizer.zero_grad() # Forward pass
y_pred = model(x_train) # Compute Loss
lossCE= criterion(y_pred.squeeze(), y_train)
lossSQD = (y_pred.squeeze()-y_train).pow(2).mean()
loss=lossCE+lossSQD
print('Epoch {}: train loss: {}'.format(epoch, loss.item())) # Backward pass
loss.backward()
optimizer.step()
There must be a real second time that you call directly or indirectly backward on some varaible that then traverses through your graph. It is a bit too much to ask for the complete code here, only you can check this or at least reduce it to a minimal example (while doing so, you might already find the issue). Apart from that, I would start checking:
Does it already occur in the first iteration of training? If not: are you reusing any calculation results for the second iteration without a detach?
When you do backward on your losses individually lloss.backward() followed by dloss.backward() (this has the same effect as adding them together first as gradients are accumulated): what happens? This will let you track down for which of the two losses the error occurs.
After backward() your comp. graph is freed so for the second backward you need to create a new graph by providing inputs again. If you want to reiterate the same graph after backward (for some reason) you need to specify retain_graph flag in backward as True. see retain_graph here.
P.S. As the summation of Tensors is automatically differentiable, summing the losses would not cause any issue in the backward.
I am new to tensorflow, i am trying to use Linear regression technique to train my module, but the function results a tensor of Nans! Here is the code
That's how i read the dataset
train_x = np.asanyarray(df[['Fat']]).astype(np.float32)
train_y = np.asanyarray(df[['Calories']]).astype(np.float32)
the weights initialization
a = tf.Variable(20.0)
b = tf.Variable(10.0)
the linear regression function
#tf.function
def h(x):
y = a*x +b
return y
the cost function
#tf.function
def costFunc(y_predicted,train_y):
return tf.reduce_mean(tf.square(y_predicted-train_y))
the module training
learning_rate = 0.01
train_data = []
loss_values =[]
a_values = []
b_values = []
# steps of looping through all your data to update the parameters
training_epochs = 200
train model
for epoch in range(training_epochs):
with tf.GradientTape() as tape:
y_predicted = h(train_x)
loss_value = loss_object(train_y,y_predicted)
loss_values.append(loss_value)
get gradients
gradients = tape.gradient(loss_value, [b,a])
# compute and adjust weights
a_values.append(a.numpy())
b_values.append(b.numpy())
b.assign_sub(gradients[0]*learning_rate)
a.assign_sub(gradients[1]*learning_rate)
if epoch % 5 == 0:
train_data.append([a.numpy(), b.numpy()])
but when i print (a*train_x) the result is Nans tensor
UPDATE
I found that the problem is in the dataset, when i changed the dataset it gives tensor of numbers, but i still don't know what is the problem with the first dataset
I am sorry, the mistake is silly, i had to re-initialize the variables every time by running the cells each time i run the script, because the variables had a value which results in infinite values
Sorry if this sounds like a repeat. I have been through all of the related questions and found no suitable solutions to my problem's context.
I am trying to build a generative model that outputs probabilities for each tracked day of COVID to input into an SEIR-based epidemiology model.
The generation works. However, I cannot figure out how to train the model. I have to write a custom loss function that runs the day-by-day parameters through a step function for the epidemiology model with will populate a dataset of "confirmed" and "removed" for each day. I then compare that data to the recorded "confirmed" and "removed" from John Hopkin's COVID dataset on GitHub.
I use Mean Absolute Error to calculate a loss between the "confirmed" and "removed" based on the generated probabilities and the actual values from the JHU dataset. The issue I am running into is when I call the tf.gradient() function it returns a list of Nones. I am stuck here and any assistance would be greatly appreciated.
Here is the code I am using:
Training Step
# Define function to train the model based on one input
loss_fn = MeanAbsoluteError()
optimizer = Adam(learning_rate=0.005)
#tf.function
def train_step(x, y):
y_pred = np.zeros((3, latent_dim))
N = tf.constant(int(7_000_000_000), dtype=tf.float64)
E0 = tf.Variable(int(1000), trainable=False, dtype=tf.float64)
I0 = tf.Variable(covid_df.iloc[0]["Confirmed"], trainable=False, dtype=tf.float64)
R0 = tf.Variable(covid_df.iloc[0]["Removed"], trainable=False, dtype=tf.float64)
S0 = tf.Variable(N - E0 - I0 - R0, trainable=False, dtype=tf.float64)
u0 = tf.Variable(0, trainable=False, dtype=tf.float64)
SuEIRs = tf.stack([S0,u0,E0,I0,R0])
with tf.GradientTape() as tape:
logits = generator(tf.reshape(x, (batch_size, 4, latent_dim)), training=True)
betas = logits[0][0]
sigmas = logits[0][1]
mus = logits[0][2]
gammas = logits[0][3]
for t in range(latent_dim):
SuEIR_diffs = SuEIR_step(SuEIRs, t, N, betas, sigmas, mus, gammas)
SuEIRs = SuEIRs + SuEIR_diffs
confirmed = SuEIRs[3]
removed = SuEIRs[4]
# update y_pred
y_pred[0,t] = float(t+1)
y_pred[1,t] = confirmed.numpy()
y_pred[2,t] = removed.numpy()
# Convert predictions
y_pred = tf.convert_to_tensor(y_pred)
# Calculate loss
loss_value = loss_fn(y[1], y_pred[1]) + loss_fn(y[2], y_pred[2])
# Calculate the gradient
grads = tape.gradient(loss_value, generator.trainable_weights)
print(grads) ##==>> outputs [None, None, None, None]
# Apply gradients to model
optimizer.apply_gradients(zip(grads, generator.trainable_weights))
return loss_value
Training Loop
import time
epochs = 2
for epoch in range(epochs):
print("\nStart of epoch %d" % (epoch,))
start_time = time.time()
# Iterate over the batches of the dataset.
for step in range(sample_size):
loss_value = train_step(x_input[step], y_true)
# Log every 5 batches.
if step % 5 == 0:
print(
"Training loss (for one batch) at step %d: %.4f"
% (step, float(loss_value))
)
print("Time taken: %.2fs" % (time.time() - start_time))
Error output
ValueError: No gradients provided for any variable: ['dense/kernel:0', 'dense/bias:0', 'dense_1/kernel:0', 'dense_1/bias:0'].
loss_value and generator.trainable_weights are populated as expected.
EDIT: Updated code to reflect the suggestions of Myrl Marmarelis and the architecture of TensorFlow's custom training loop guide. Still having the same issue of gradients being a list of None's.
Try changing your calls to np.array(...) before calculating the loss (especially on y_pred) to tf.convert_to_tensor(...). You need to build a proper symbolic graph by keeping everything as tf.Tensors. In fact, make sure you are not converting anything to a non-Tensor anywhere along the chain of computation between the model parameters and the loss.
I would also suggest wrapping your training procedure in a #tf.function so that Tensorflow may compile it into a static graph.
Dear stackoverflow members,
I am currently trying to implement my own keras tuner training loop. In this loop I want to pass the input variable multiple times through the model in example:
Y = Startvalue
for i in range(x):
Y = model(Y)
I want to see if this method creates more stable simulations for my self feedback problem.
When I implement it I get an OOM error even when I do not loop. This error does not occur when I just do it normally.
My Class example (the OOM error occurs when i switch logits for logits2:
class MyTuner(kt.Tuner):
def run_trial(self, trial, train_ds, validation_data):
model = self.hypermodel.build(trial.hyperparameters)
optimizer = tf.keras.optimizers.Adam()
epoch_loss_metric = tf.keras.metrics.MeanSquaredError()
def microbatch(T_IN, A_IN, D_IN):
OUT_T = []
OUT_A = []
for i in range(len(T_IN)):
A_IN_R = tf.expand_dims(tf.squeeze(A_IN[i]), 0)
T_IN_R = tf.expand_dims(tf.squeeze(T_IN[i]), 0)
D_IN_R = tf.expand_dims(tf.squeeze(D_IN[i]), 0)
(OUT_T_R, OUT_A_R) = model((A_IN_R, T_IN_R, D_IN_R))
OUT_T.append(tf.squeeze(OUT_T_R))
OUT_A.append(tf.squeeze(OUT_A_R))
return(tf.squeeze(tf.stack(OUT_T)), tf.squeeze(tf.stack(OUT_A)))
def run_train_step(data):
T_IN = tf.dtypes.cast(data[0][0], 'float32')
A_IN = tf.dtypes.cast(data[0][1], 'float32')
D_IN = tf.dtypes.cast(data[0][2], 'float32')
A_Ta = tf.dtypes.cast(data[1][0], 'float32')
T_Ta = tf.dtypes.cast(data[1][1], 'float32')
mse = tf.keras.losses.MeanSquaredError()
with tf.GradientTape() as tape:
logits2 = microbatch(T_IN, A_IN, D_IN)
logits = model([A_IN, T_IN, D_IN])
loss = mse((T_Ta, A_Ta), logits2)
# Add any regularization losses.
if model.losses:
loss += tf.math.add_n(model.losses)
gradients = tape.gradient(loss, model.trainable_variables)
optimizer.apply_gradients(zip(gradients, model.trainable_variables))
epoch_loss_metric.update_state((T_Ta, A_Ta), logits2)
return loss
for epoch in range(1000):
print('Epoch: {}'.format(epoch))
self.on_epoch_begin(trial, model, epoch, logs={})
for batch, data in enumerate(train_ds):
self.on_batch_begin(trial, model, batch, logs={})
batch_loss = float(run_train_step(data))
self.on_batch_end(trial, model, batch, logs={'loss': batch_loss})
if batch % 100 == 0:
loss = epoch_loss_metric.result().numpy()
print('Batch: {}, Average Loss: {}'.format(batch, loss))
epoch_loss = epoch_loss_metric.result().numpy()
self.on_epoch_end(trial, model, epoch, logs={'loss': epoch_loss})
epoch_loss_metric.reset_states()
````
In my understanding, the micro-batch function is not implementing a self-feedback loop (though it does not affect the OOM)
I guess what's happening is that because you are computing the output of the network k times, the amount of memory consumption by the network is increasing by k times (because it needs to store intermediate tensors for backprop).
What you can do is, at each self-feedback instance, you backprop the gradients so that all the intermediate tensors do not increase beyond the limit.
lemme know if you have any doubt,