pytorch, How can I make model(x) and answer(x) the same size? - python

I'm trying to make a simple linear model to learn the parameters of the formula
y = 3*x1 + x2 - 2*x3
Unfortunately, there is a problem when I try to compute the loss.
def answer(x):
    return 3 * x[:,0] + x[:,1] - 2 * x[:,2]

def loss_f(x):
    y = answer(x)
    y_hat = model(x)
    loss = ((y - y_hat).pow(2)).sum() / x.size(0)
    return loss
When I set batch_size = 3, the two results have different sizes:
x = torch.randn(3,3)

answer(x)
tensor([ 2.0201, -3.8354,  2.0059])

model(x)
tensor([[ 0.2085],
        [-0.0670],
        [-1.3635]], grad_fn=<ThAddmmBackward>)

answer(x.data).size()
torch.Size([3])

model(x.data).size()
torch.Size([3, 1])
I think broadcasting is applied automatically in
loss = ((y - y_hat).pow(2)).sum() / x.size(0)
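For example, subtracting the (3, 1) model output from the (3,) target silently broadcasts to a (3, 3) matrix, so the loss is summed over nine terms instead of three:
(torch.randn(3) - torch.randn(3, 1)).shape  # torch.Size([3, 3])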
How can I make the two tensors the same size? Thanks.
This is my code:
import torch
import torch.nn as nn
import torch.optim as optim

class model(nn.Module):
    def __init__(self, input_size, output_size):
        super(model, self).__init__()
        self.linear = nn.Linear(input_size, output_size)

    def forward(self, x):
        y = self.linear(x)
        return y

model = model(3,1)
optimizer = optim.SGD(model.parameters(), lr = 0.001, momentum=0.1)

print('Parameters : ')
for p in model.parameters():
    print(p)
print('')
print('Optimizer : ')
print(optimizer)

def generate_data(batch_size):
    x = torch.randn(batch_size, 3)
    return x

def answer(x):
    return 3 * x[:,0] + x[:,1] - 2 * x[:,2]

def loss_f(x):
    y = answer(x)
    y_hat = model(x)
    loss = ((y - y_hat).pow(2)).sum() / x.size(0)
    return loss

x = torch.randn(3,3)
print(x)
x = torch.FloatTensor(x)

batch_size = 3
epoch_n = 1000
iter_n = 100

for epoch in range(epoch_n):
    avg_loss = 0
    for i in range(iter_n):
        x = torch.randn(batch_size, 3)
        optimizer.zero_grad()
        loss = loss_f(x.data)
        loss.backward()
        optimizer.step()
        avg_loss += loss
    avg_loss = avg_loss / iter_n

    x_valid = torch.FloatTensor([[1,2,3]])
    y_valid = answer(x_valid)
    model.eval()
    y_hat = model(x_valid)
    model.train()

    print(avg_loss, y_valid.data[0], y_hat.data[0])
    if avg_loss < 0.001:
        break

You can use Tensor.view
https://pytorch.org/docs/stable/tensors.html#torch.Tensor.view
So something like
answer(x.data).view(-1, 1)
should do the trick.
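For instance, a minimal sketch of loss_f with the target reshaped to match the model output (squeezing the prediction instead would work just as well):

def loss_f(x):
    y = answer(x).view(-1, 1)   # (batch,) -> (batch, 1), same shape as model(x)
    y_hat = model(x)            # (batch, 1)
    loss = ((y - y_hat).pow(2)).sum() / x.size(0)
    return loss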

Related

Multihead attention model - IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)

I'm trying to train a multiclass classification model (with 3 classes) using a multihead attention layer and two linear layers with some tabular data, and I'm getting this error:
IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)
I have copied my model/dataset classes and my training loop below; it seems like the error is associated with the data I am passing into my loss function (criterion), which looks like this:
y_pred: tensor([-115.7523, -113.5820, 37.0307], dtype=torch.float64, grad_fn=<SqueezeBackward0>)
and
y: tensor(0).
I am unable to resolve this error, so any help with this would be greatly appreciated.
Here are the dataset and model classes:
class GeneExpressionDataset(torch.utils.data.Dataset):
    def __init__(self, data):
        self.data = data
        self.features = self.data.iloc[:, 2:].values
        self.labels = self.data.iloc[:, 1].values

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        features = torch.tensor(self.features[idx], dtype=torch.double)
        labels = torch.tensor(self.labels[idx], dtype=torch.long)
        return features, labels

class MultiheadAttention(nn.Module):
    def __init__(self, input_dim, num_heads, dropout_rate):
        super(MultiheadAttention, self).__init__()
        self.input_dim = input_dim
        self.num_heads = num_heads
        self.dropout_rate = dropout_rate
        self.q_linear = nn.Linear(input_dim, input_dim)
        self.k_linear = nn.Linear(input_dim, input_dim)
        self.v_linear = nn.Linear(input_dim, input_dim)
        self.dropout = nn.Dropout(dropout_rate)
        self.out_linear = nn.Linear(input_dim, input_dim)

    def forward(self, query, key, value, mask=None):
        batch_size = query.size(0)
        # Apply linear transformations to obtain query, key, and value representations
        q = self.q_linear(query).view(batch_size, -1, self.num_heads)
        k = self.k_linear(key).view(batch_size, -1, self.num_heads)
        v = self.v_linear(value).view(batch_size, -1, self.num_heads)
        # Compute scaled dot-product attention scores
        scores = torch.matmul(q, k.transpose(1, 2)) / (self.input_dim ** 0.5)
        if mask is not None:
            mask = mask.unsqueeze(1)
            scores = scores.masked_fill(mask == 0, -1e9)
        # Apply softmax to obtain attention weights
        attn_weights = torch.softmax(scores, dim=-1)
        # Apply dropout to the attention weights
        attn_weights = self.dropout(attn_weights)
        # Compute the attention output
        attn_output = torch.matmul(attn_weights, v)
        # Concatenate the attention output from different heads
        attn_output = attn_output.transpose(1, 2).contiguous().view(batch_size, -1, self.num_heads * (self.input_dim // self.num_heads))
        # Apply linear transformation to obtain the final attention output
        out = self.out_linear(attn_output)
        return out

class geneGPT(nn.Module):
    def __init__(self, input_dim, hid_dim, output_dim, num_heads, dropout_rate):
        super().__init__()
        self.attention = MultiheadAttention(input_dim, num_heads, dropout_rate)
        self.fc1 = nn.Linear(num_heads * (input_dim//num_heads), hid_dim)
        self.relu = nn.ReLU()
        self.out = nn.Linear(hid_dim, output_dim)

    def forward(self, x, mask=None):
        x = self.attention(x, x, x, mask)
        x = self.relu(self.fc1(x))
        x = self.out(x)
        return x
and here is the training loop:
print('Training...')
model = geneGPT(INPUT_DIM, HID_DIM, OUTPUT_DIM, NUM_HEADS, DROPOUT_RATE).double().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

for epoch in range(NUM_EPOCHS):
    train_losses = 0.0
    valid_losses = 0.0
    train_accs = 0.0
    valid_accs = 0.0

    for i, (x, y) in enumerate(train_dl):
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        y_pred = model(x).squeeze()
        y = y.squeeze()
        print(y_pred, y)
        train_loss = criterion(y_pred, y)
        train_acc = multi_acc(y_pred, y)
        train_loss.backward()
        optimizer.step()
        train_losses += train_loss.item()
        train_accs += train_acc.item()

    for i, (x, y) in enumerate(val_dl):
        x, y = x.to(device), y.to(device)
        y_pred = model(x).squeeze()
        y = y.squeeze()
        valid_loss = criterion(y_pred, y)
        valid_acc = multi_acc(y_pred, y)
        valid_losses += valid_loss.item()
        valid_accs += valid_acc.item()

    print("Epoch {}/{} | Train Loss: {:.4f} | Valid Loss: {:.4f}".format(epoch + 1, NUM_EPOCHS, train_losses / len(train_dl), valid_losses / len(val_dl)))
    print("Training Accuracy: {:.4f} | Validation Accuracy: {:.4f}".format(train_accs / len(train_dl), valid_accs / len(val_dl)))

test_accs = 0.0
for i, (x, y) in enumerate(test_dl):
    x, y = x.to(device), y.to(device)
    y_pred = model(x).squeeze()
    y = y.squeeze()
    test_acc = multi_acc(y_pred, y)
    test_accs += test_acc.item()

print("Testing Accuracy: {:.4f}".format(test_accs / len(test_dl)))
torch.save(model.state_dict(), "model.pth")
In your training loop
y_pred = model(x).squeeze()
y = y.squeeze()
you changed the dimensions of both, and in a later step
train_loss = criterion(y_pred.unsqueeze(0), y)
you changed the y_pred dimension again while keeping the y dimension the same. So I assume the relative difference in the dimensions of y and y_pred is what produces the error "Expected input batch_size (1) to match target batch_size (0)".
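A generic sketch of the shape contract (an illustration, not the asker's model): nn.CrossEntropyLoss wants (batch, num_classes) logits and (batch,) integer targets, and squeezing a batch of size 1 collapses both:

import torch
import torch.nn as nn

criterion = nn.CrossEntropyLoss()

logits = torch.randn(1, 3)        # batch of 1, 3 classes
target = torch.tensor([0])        # shape (1,), not the 0-d tensor(0)
print(criterion(logits, target))  # works

# After .squeeze(), logits becomes (3,) and target becomes 0-d, which
# reproduces the dimension errors seen in the question.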

pytorch's augmented assignment and requires_grad

Why does
with torch.no_grad():
    w = w - lr*w.grad
print(w)
result in
tensor(0.9871)
and
with torch.no_grad():
    w -= lr*w.grad
print(w)
result in
tensor(0.9871, requires_grad=True)?
Aren't both operations the same?
Here is some test code:
def test_stack():
    np.random.seed(0)
    n = 50
    feat1 = np.random.randn(n, 1)
    feat2 = np.random.randn(n, 1)

    X = torch.tensor(feat1).view(-1, 1)
    Y = torch.tensor(feat2).view(-1, 1)

    w = torch.tensor(1.0, requires_grad=True)

    epochs = 1
    lr = 0.001

    for epoch in range(epochs):
        for i in range(len(X)):
            y_pred = w*X[i]
            loss = (y_pred - Y[i])**2
            loss.backward()

            with torch.no_grad():
                #w = w - lr*w.grad # DOESN'T WORK!!!!
                #print(w); return
                w -= lr*w.grad
                print(w); return
            w.grad.zero_()
Swap the comments and you'll see the requires_grad flag disappearing. Could this be a bug?
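The two statements do behave differently: under torch.no_grad(), w = w - lr*w.grad builds a brand-new tensor while gradient tracking is disabled and rebinds the name w to it, whereas w -= lr*w.grad mutates the original leaf tensor in place, so it keeps its flag. A small demonstration (a sketch, not the original code):

import torch

w = torch.tensor(1.0, requires_grad=True)
(w * 2).backward()                 # gives w a gradient to work with

with torch.no_grad():
    w_new = w - 0.001 * w.grad     # fresh tensor created under no_grad
    w.sub_(0.001 * w.grad)         # in-place update of the original leaf

print(w_new.requires_grad)  # False: created while tracking was off
print(w.requires_grad)      # True: the leaf tensor keeps its flag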

RBF-Neural net can't classify MNIST dataset

I have implemented an RBF neural network classifier.
I use my implementation to classify the MNIST dataset, but it is not learning and always just predicts a single class. I would be very grateful if someone could help me identify the problem with my implementation.
I have to note that the implementation is quite slow because it works example by example, but I don't know how to make it work batch by batch. (I am new to tensorflow and python in general.)
My implementation is as follows:
class RBF_NN:
    def __init__(self, M, K, L, lr):
        # Layer sizes
        self.M = M # input layer size - number of features
        self.K = K # RBF layer size
        self.L = L # output layer size - number of classes

        x = tf.placeholder(tf.float32, shape=[M])
        matrix = tf.reshape(tf.tile(x, multiples=[K]), shape=[K,M])

        prototypes_input = tf.placeholder(tf.float32, shape=[K,M])
        prototypes = tf.Variable(prototypes_input) # prototypes - representatives of the data

        r = tf.reduce_sum(tf.square(prototypes-matrix), 1)
        s = tf.Variable(tf.random.uniform(shape=[K], maxval=1)) # scaling factors
        h = tf.exp(-r/(2*tf.pow(s,2)))

        W = tf.Variable(tf.random.uniform(shape=[K,L], maxval=1))
        b = tf.Variable(tf.constant(0.1, shape=[L]))

        o = tf.matmul(tf.transpose(tf.expand_dims(h,1)), W) + b
        pred_class = tf.argmax(o, 1)

        y = tf.placeholder(shape=[L], dtype=tf.float32)

        loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=o, labels=y))
        optimizer = tf.train.AdamOptimizer(lr).minimize(loss)

        self.x = x
        self.prototypes_input = prototypes_input
        self.prototypes = prototypes
        self.r = r
        self.s = s
        self.h = h
        self.W = W
        self.b = b
        self.o = o
        self.y = y
        self.loss = loss
        self.optimizer = optimizer
        self.pred_class = pred_class

    def fit(self, X, y, prototypes, epoch_count, print_step, sess):
        for epoch in range(epoch_count):
            epoch_loss = 0
            for xi, yi in zip(X, y):
                iter_loss, _ = sess.run((self.loss, self.optimizer), feed_dict={self.x: xi, self.y: yi, self.prototypes_input: prototypes})
                epoch_loss = epoch_loss + iter_loss
            epoch_loss = epoch_loss/len(X)
            if epoch % print_step == 0:
                print("Epoch loss", (epoch+1), ":", epoch_loss)

    def predict(self, x, sess):
        return sess.run((self.pred_class), feed_dict={self.x: x})[0]

    def get_prototypes(self, sess):
        return sess.run((self.prototypes))
Usage:
mnist = tf.keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()

y_train = to_one_hot(y_train, 10)
y_test = to_one_hot(y_test, 10)

x_train = np.asarray([np.asarray(x).reshape(-1) for x in x_train])
x_test = np.asarray([np.asarray(x).reshape(-1) for x in x_test])

M = 784
K = 1000
L = 10
lr = 0.01
rbfnn = RBF_NN(M, K, L, lr)

# Selecting prototypes from the train set
idx = np.random.randint(len(x_train), size=K)
prototypes = x_train[idx, :]

init = tf.global_variables_initializer()
sess = tf.InteractiveSession()
sess.run(init, feed_dict={rbfnn.prototypes_input: prototypes})

rbfnn.fit(x_train, y_train, prototypes, epoch_count=1, print_step=1, sess=sess)

y_test_p = []
for xi, yi in zip(x_test, y_test):
    yp = rbfnn.predict(xi, sess=sess)
    y_test_p.append(yp)

y_test_t = [np.argmax(yi) for yi in y_test]

acc = accuracy_score(y_test_t, y_test_p)
precc = precision_score(y_test_t, y_test_p, average='macro')
recall = recall_score(y_test_t, y_test_p, average='macro')
f1 = f1_score(y_test_t, y_test_p, average='macro')

print("Accuracy:", acc)
print("Precision:", precc)
print("Recall:", recall)
print("F1 score:", f1)

sess.close()
The implementation is fine. However, it seems to be very sensitive to the data.
It will start learning just fine if the following lines are added:
x_train = (x_train - x_train.min()) / (x_train.max() - x_train.min())
x_test = (x_test - x_test.min()) / (x_test.max() - x_test.min())
This normalizes the data so that each feature lies in the interval [0, 1].
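One plausible reason for the sensitivity (my reading, not part of the original answer): with raw pixel values in [0, 255], the squared distance r between an image and a prototype is enormous, so the Gaussian activation exp(-r/(2*s^2)) underflows to exactly zero for every RBF unit and no signal reaches the output layer. A quick sanity check:

import numpy as np

# Worst-case squared distance between two 784-pixel images with values 0..255:
r = 784 * 255.0 ** 2              # about 5.1e7
s = 1.0                           # the scaling factors are drawn from [0, 1)
print(np.exp(-r / (2 * s ** 2)))  # prints 0.0: the RBF layer is dead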

Gradients are zero in manual weight update in PyTorch

I'm trying to implement a simple neural net for MNIST with manual weight updates using autograd, similar to the autograd example given here. This is my code:
import os
import sys
import torch
import torchvision

class Datasets:
    """Helper for extracting datasets."""

    def __init__(self, root='data/', batch_size=25):
        if not os.path.exists(root):
            os.mkdir(root)
        self.root = root
        self.batch_size = batch_size

    def get_mnist_loaders(self):
        train_data = torchvision.datasets.MNIST(
            root=self.root, train=True, download=True)
        test_data = torchvision.datasets.MNIST(
            root=self.root, train=False, download=True)
        train_loader = torch.utils.data.DataLoader(
            dataset=train_data, batch_size=self.batch_size, shuffle=True)
        test_loader = torch.utils.data.DataLoader(
            dataset=test_data, batch_size=self.batch_size, shuffle=False)
        return train_loader, test_loader

    def create_batches(self, data, labels, batch_size):
        return [(data[i:i+batch_size], labels[i:i+batch_size])
                for i in range(0, len(data), max(1, batch_size))]

def train1():
    dtype = torch.float
    n_inputs = 28*28
    n_hidden1 = 300
    n_hidden2 = 100
    n_outputs = 10
    batch_size = 200
    n_epochs = 25
    learning_rate = 0.01
    test_step = 100
    device = torch.device("cpu")

    datasets = Datasets(batch_size=batch_size)
    train_loader, test_loader = datasets.get_mnist_loaders()

    def feed_forward(X):
        x_shape = list(X.size())
        X = X.view(x_shape[0], x_shape[1]*x_shape[2])
        hidden1 = torch.mm(X, w1)
        hidden1 += b1
        hidden1 = hidden1.clamp(min=0)
        hidden2 = torch.mm(hidden1, w2) + b2
        hidden2 = hidden2.clamp(min=0)
        logits = torch.mm(hidden2, w3) + b3
        softmax = pytorch_softmax(logits)
        return softmax

    def accuracy(y_pred, y):
        if list(y_pred.size()) != list(y.size()):
            raise ValueError('Inputs have different shapes.')
        total_correct = 0
        total = 0
        for i, (y1, y2) in enumerate(zip(y_pred, y)):
            if y1 == y2:
                total_correct += 1
            total += 1
        return total_correct / total

    w1 = torch.randn(n_inputs, n_hidden1, device=device, dtype=dtype, requires_grad=True)
    b1 = torch.nn.Parameter(torch.zeros(n_hidden1), requires_grad=True)
    w2 = torch.randn(n_hidden1, n_hidden2, requires_grad=True)
    b2 = torch.nn.Parameter(torch.zeros(n_hidden2), requires_grad=True)
    w3 = torch.randn(n_hidden2, n_outputs, dtype=dtype, requires_grad=True)
    b3 = torch.nn.Parameter(torch.zeros(n_outputs), requires_grad=True)

    pytorch_softmax = torch.nn.Softmax(0)
    pytorch_cross_entropy = torch.nn.CrossEntropyLoss(reduction='elementwise_mean')

    step = 0
    for epoch in range(n_epochs):
        batches = datasets.create_batches(train_loader.dataset.train_data,
                                          train_loader.dataset.train_labels,
                                          batch_size)
        for x, y in batches:
            step += 1
            softmax = feed_forward(x.float())
            vals, y_pred = torch.max(softmax, 1)
            accuracy_ = accuracy(y_pred, y)
            cross_entropy = pytorch_cross_entropy(softmax, y)
            print(epoch, step, cross_entropy.item(), accuracy_)

            cross_entropy.backward()
            with torch.no_grad():
                w1 -= learning_rate * w1.grad
                w2 -= learning_rate * w2.grad
                w3 -= learning_rate * w3.grad
                b1 -= learning_rate * b1.grad
                b2 -= learning_rate * b2.grad
                b3 -= learning_rate * b3.grad

                w1.grad.zero_()
                w2.grad.zero_()
                w3.grad.zero_()
                b1.grad.zero_()
                b2.grad.zero_()
                b3.grad.zero_()

if __name__ == '__main__':
    train1()
However, the network doesn't seem to train. When I print parts of the gradients (e.g. w1.grad.data[:10, :10]) they consist of zeros. I've tried using weight.data and weight.grad.data to update the weights, and tried removing the w.grad.zero_() part (even though it is in the example), but it doesn't help. What is the problem here?
When you feed the network a plain Tensor, gradients are not calculated by default. To make it work, you may wrap your FloatTensor in torch.autograd.Variable or set the requires_grad property of the tensor. Here is an example.
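(The linked example isn't reproduced here; a minimal illustration of the requires_grad option, as an assumption about what it showed:)

import torch

x = torch.randn(3, 3)
x.requires_grad_(True)   # opt this tensor in to gradient tracking

y = (x * 2).sum()
y.backward()
print(x.grad)            # populated instead of None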
There are three issues here.
Firstly, the axis along which you're taking the softmax is wrong. It should be taken over the last axis:
pytorch_softmax = torch.nn.Softmax(-1)
Secondly, your logits consist of very large numbers. The resulting derivatives are very small, hence the zeros that you are seeing:
tensor([[ -95782.0859,  -30961.9023,   -3614.0188,  ..., -328240.6250,
          -40818.2227, -160598.5469],
        [-182128.5938,  -76499.2969,  143654.6250,  ..., -300924.1250,
          -74291.3125, -109025.0391],
        [-163018.4062,  -71817.1172, -134466.0156,  ...,  -49884.1211,
          -19183.3691,  116674.1406],
        ...,
        [ 225013.4219,  -37008.6484,  244807.2188,  ..., -466822.8750,
           63626.5625, -147146.0781],
        [ 122045.7031,  -90937.7344,   77259.1641,  ..., -397063.9375,
         -188736.9688,  -78475.5000],
        [  23139.7578,  -14914.8359, -205065.0625,  ...,  -65808.6562,
           31458.8906,  -11362.2344]], grad_fn=<AddBackward0>)
Several things you can do include normalising your data, adding BatchNorm, clamping, etc. I can see that your data X is a tensor with values ranging from 0 to 255.
Thirdly, you shouldn't need to wrap your tensors in nn.Parameter, as that class is only used in conjunction with nn.Module.
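Putting the first two points together, a sketch of the suggested changes, reusing the names from the question:

pytorch_softmax = torch.nn.Softmax(-1)  # softmax over the last (class) axis

for x, y in batches:
    x = x.float() / 255.0               # normalise pixels from 0..255 to 0..1
    softmax = feed_forward(x)
    ...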

Tensorflow copy of sklearn MLPRegressor produces other results

I am trying to reproduce a deep learning regression result in Tensorflow. If I train a neural network with the MLPRegressor class from sklearn, I get a very nice validation score of 98%.
The MLPRegressor:
http://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPRegressor.html#sklearn.neural_network.MLPRegressor
I am trying to reproduce the model in Tensorflow by copying the default values of the MLPRegressor class into a Tensorflow model. However, I cannot get the same result; I only get 75% most of the time.
My TF model:
tf.reset_default_graph()
graph = tf.Graph()

n_input = 3 # n variables
n_hidden_1 = 100
n_hidden_2 = 1
n_output = 1
beta = 0.001
learning_rate = 0.001

with graph.as_default():
    tf_train_feat = tf.placeholder(tf.float32, shape=(None, n_input))
    tf_train_label = tf.placeholder(tf.float32, shape=(None))
    tf_test_feat = tf.constant(test_feat, tf.float32)

    """
    Weights and biases. The weight matrix' columns will be the output vector.
    * ndarray([rows, columns])
    * ndarray([in, out])
    tf.placeholder(None) and tf.placeholder([None, 3]) mean that the row count is not set. In the second
    placeholder the columns are fixed at 3.
    """
    W = {
        "layer_1": tf.Variable(tf.truncated_normal([n_input, n_hidden_1])),
        "layer_2": tf.Variable(tf.truncated_normal([n_hidden_1, n_hidden_2])),
        "layer_3": tf.Variable(tf.truncated_normal([n_hidden_2, n_output])),
    }
    b = {
        "layer_1": tf.Variable(tf.zeros([n_hidden_1])),
        "layer_2": tf.Variable(tf.zeros([n_hidden_2])),
    }

    def computation(X):
        layer_1 = tf.nn.relu(tf.matmul(X, W["layer_1"]) + b["layer_1"])
        layer_2 = tf.nn.relu(tf.matmul(layer_1, W["layer_2"]) + b["layer_2"])
        return layer_2

    tf_prediction = computation(tf_train_feat)
    tf_test_prediction = computation(tf_test_feat)

    tf_loss = tf.reduce_mean(tf.pow(tf_train_label - tf_prediction, 2))
    tf_loss = tf.reduce_mean(tf_loss + beta * tf.nn.l2_loss(W["layer_2"]))
    tf_optimizer = tf.train.AdamOptimizer(learning_rate).minimize(tf_loss)
    #tf_optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(tf_loss)

    init = tf.global_variables_initializer()
My TF session:
def accuracy(y_pred, y):
    a = 0
    for i in range(y.shape[0]):
        a += abs(1 - y_pred[i][0] / y[i])
    return round((1 - a / y.shape[0]) * 100, 3)

def accuracy_tensor(y_pred, y):
    a = 0
    for i in range(y.shape[0]):
        a += abs(1 - y_pred[i][0] / y[i])
    return round((1 - a / y.shape[0]) * 100, 3)

# Shuffles two arrays.
def shuffle_in_unison(a, b):
    assert len(a) == len(b)
    shuffled_a = np.empty(a.shape, dtype=a.dtype)
    shuffled_b = np.empty(b.shape, dtype=b.dtype)
    permutation = np.random.permutation(len(a))
    for old_index, new_index in enumerate(permutation):
        shuffled_a[new_index] = a[old_index]
        shuffled_b[new_index] = b[old_index]
    return shuffled_a, shuffled_b

train_epoch = int(5e4)
batch = int(200)
n_batch = int(X.shape[0] // batch)
prev_acc = 0
stable_count = 0

session = tf.InteractiveSession(graph=graph)
session.run(init)
print("Initialized.\n No. of epochs: %d.\n No. of batches: %d." % (train_epoch, n_batch))

for epoch in range(train_epoch):
    offset = (epoch * n_batch) % (Y.shape[0] - n_batch)
    for i in range(n_batch):
        x = X[offset:(offset + n_batch)]
        y = Y[offset:(offset + n_batch)]
        x, y = shuffle_in_unison(x, y)
        feed_dict = {tf_train_feat: x, tf_train_label: y}
        _, l, pred, pred_label = session.run([tf_optimizer, tf_loss, tf_prediction, tf_train_label], feed_dict=feed_dict)

    if epoch % 1 == 0:
        print("Epoch: %d. Batch loss: %f" % (epoch, l))
        test_pred = tf_test_prediction.eval(session=session)
        acc_test = accuracy(test_pred, test_label)
        acc_train = accuracy_tensor(pred, pred_label)
        print("Accuracy train set %s%%" % acc_train)
        print("Accuracy test set: %s%%" % acc_test)
Am I missing something in the Tensorflow code? Thanks!
Unless you have a very good reason not to use them, regression should have linear output units. I ran into a similar problem a while back and ended up using linear outputs and linear hidden units, which seemed to mirror the MLPRegressor in my case.
There is a great section on activation functions in chapter 6 of Goodfellow's Deep Learning book, starting at page 181.
At the very least, try this for your output layer:
layer_2 = tf.matmul(layer_1, W["layer_2"]) + b["layer_2"]
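Applied to the computation() function from the question, that suggestion would look like:

def computation(X):
    layer_1 = tf.nn.relu(tf.matmul(X, W["layer_1"]) + b["layer_1"])
    layer_2 = tf.matmul(layer_1, W["layer_2"]) + b["layer_2"]  # linear output, no ReLU
    return layer_2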
