pytorch's augmented assignment and requires_grad

Why does:
with torch.no_grad():
    w = w - lr*w.grad
    print(w)
result in:
tensor(0.9871)
while
with torch.no_grad():
    w -= lr*w.grad
    print(w)
results in:
tensor(0.9871, requires_grad=True)
Aren't both operations the same?
Here is some test code:
def test_stack():
    np.random.seed(0)
    n = 50
    feat1 = np.random.randn(n, 1)
    feat2 = np.random.randn(n, 1)
    X = torch.tensor(feat1).view(-1, 1)
    Y = torch.tensor(feat2).view(-1, 1)
    w = torch.tensor(1.0, requires_grad=True)
    epochs = 1
    lr = 0.001
    for epoch in range(epochs):
        for i in range(len(X)):
            y_pred = w*X[i]
            loss = (y_pred - Y[i])**2
            loss.backward()
            with torch.no_grad():
                #w = w - lr*w.grad # DOESN'T WORK!!!!
                #print(w); return
                w -= lr*w.grad
                print(w); return
            w.grad.zero_()
Swap in the commented-out lines and you'll see requires_grad disappear. Could this be a bug?
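For reference, a minimal standalone sketch (mine, not from the question) of the same behaviour as I understand it: under torch.no_grad(), w - lr*w.grad builds a fresh tensor that does not require grad, while the in-place w -= lr*w.grad only mutates the existing leaf tensor, which keeps requires_grad=True.

import torch

w = torch.tensor(1.0, requires_grad=True)
g = torch.tensor(0.5)   # stand-in for w.grad, just for this sketch
lr = 0.001

with torch.no_grad():
    w_new = w - lr * g  # out-of-place: a brand-new tensor created under no_grad
    w -= lr * g         # in-place: the original leaf tensor is only mutated

print(w_new.requires_grad)  # False
print(w.requires_grad)      # True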

Related

Issue Implementing Custom Gradient Descent Function

I am implementing my own/custom gradient descent algorithm in Python, but the weights and biases returned by my algorithm have 10 values each (shape=(10,)). My input data has only 1 column, so I am expecting it to return 1 weight and 1 bias.
Code:
import numpy as np
import matplotlib.pyplot as plt

def SGD(X, y, learning_rate=0.01, max_iter=1000):
    w = np.random.randn(X.shape[1])
    b = np.random.randn(1,)
    print(w, b)
    n = len(X)
    loss_list = []
    for i in range(max_iter):
        y_pred = w*X + b
        Lw = -(2/n)*sum(X*(y - y_pred))
        Lb = -(2/n)*sum(y - y_pred)
        w = w - learning_rate*Lw
        b = b - learning_rate*Lb
        loss = np.square(np.subtract(y, y_pred)).mean()
        loss_list.append(loss)
        print(f"Epoch: {i}, loss: {loss}")
    return w, b

x = list(range(1, 11))
y = []
for i in x:
    y.append(i**2)
x, y = np.array(x).reshape(-1, 1), np.array(y)
w, b = SGD(x, y)
print("\n\n\n\n")
print(w)
print(b)
Loss of last iteration:
Epoch: 999, loss: 0.11521764208740602
Returned weight and bias respectively,
w: [0.00149535 0.00777379 0.01823786 0.03288755 0.05172286 0.07474381
0.10195038 0.13334257 0.1689204 0.20868384] # giving 10 values
b: [ 0.98958964 3.94588026 8.87303129 15.77104274 24.63991461 35.47964689
48.29023958 63.07169269 79.82400621 98.54718014] # giving 10 values
I don't understand the cause. How is this happening?
Thanks!
I think this is because your y is a 1-D array of shape (n,), while y_pred is an n×1 column array, so subtracting them broadcasts to an n×n matrix, which you don't want. The fix is to just reshape y before you call your function, like so:
import numpy as np
import matplotlib.pyplot as plt

def SGD(X, y, learning_rate=0.01, max_iter=1000):
    w = np.random.randn(X.shape[1])
    b = np.random.randn(1,)
    print(w, b)
    n = len(X)
    loss_list = []
    for i in range(max_iter):
        y_pred = w*X + b
        Lw = -(2/n)*sum(X*(y - y_pred))
        Lb = -(2/n)*sum(y - y_pred)
        w = w - learning_rate*Lw
        b = b - learning_rate*Lb
        loss = np.square(np.subtract(y, y_pred)).mean()
        loss_list.append(loss)
        print(f"Epoch: {i}, loss: {loss}")
    return w, b

x = list(range(1, 11))
y = []
for i in x:
    y.append(i**2)
x, y = np.array(x).reshape(-1, 1), np.array(y).reshape((-1, 1)) # Change is here
w, b = SGD(x, y)
print("\n\n\n\n")
print(w)
print(b)
and then w, b are:
[10.94655101]
[-21.6278976]
respectively
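To see the broadcasting issue from the explanation above in isolation, here is a small shape check of my own (not part of the original answer):

import numpy as np

y = np.arange(10)                      # shape (10,): 1-D target, as in the question
y_pred = np.arange(10).reshape(-1, 1)  # shape (10, 1): column, like w*X + b

print((y - y_pred).shape)                 # (10, 10) -- broadcast, not elementwise
print((y.reshape(-1, 1) - y_pred).shape)  # (10, 1)  -- shapes match after the fix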

XOR Neural Network python

I am a total noob and this is the first thing in ML I'm trying to do. I just want to run the code.
I know the feedforward pass is correct and my errors should be correct, but I get incorrect results.
Please help.
import numpy as np

inputs = np.array([
    [[0],[0]],
    [[1],[0]],
    [[0],[1]],
    [[1],[1]]
])
expected_output = np.array([
    [[0]],
    [[1]],
    [[1]],
    [[0]]
])
epochs = 1000
lr = 0.01
hidden_weights = np.array([
    [0.2, 0.3],
    [0.4, 0.5]
])
hidden_bias = np.array([[0.3], [0.6]])
output_weights = np.array([[0.6, 0.7]])
output_bias = np.array([[0.5]])

def sigmoid(z):
    return 1/(1+np.exp(-z))

def sigmoid_derivative(z):
    return np.multiply(sigmoid(z), sigmoid(1.0-z))

for _ in range(epochs):
    for index, input in enumerate(inputs):
        hidden_layer_activation = np.dot(hidden_weights, input)
        hidden_layer_activation += hidden_bias
        hidden_layer_output = sigmoid(hidden_layer_activation)
        output_layer_activation = np.dot(output_weights, hidden_layer_output)
        output_layer_activation += output_bias
        predicted_output = sigmoid(output_layer_activation)
        #Backpropagation
        output_errors = expected_output[index] - predicted_output
        hidden_errors = output_weights.T.dot(output_errors)
        d_predicted_output = output_errors * sigmoid_derivative(predicted_output)
        d_hidden_layer = hidden_errors * sigmoid_derivative(hidden_layer_output)
        # I am almost certain the problem is in the next 2 lines
        output_weights += d_predicted_output.dot(hidden_layer_output.T) * lr
        hidden_weights += d_hidden_layer.dot(input.T) * lr
        output_bias += np.sum(d_predicted_output,axis=0,keepdims=True) * lr
        hidden_bias += np.sum(d_hidden_layer,axis=0,keepdims=True) * lr

# NOW THE TESTING, I pass 2 input neurons: one with value 0 and one with value 1
test = np.array([
    [[0], [1]]
])
hidden_layer_activation = np.dot(hidden_weights, test[0])
hidden_layer_activation += hidden_bias
hidden_layer_output = sigmoid(hidden_layer_activation)
output_layer_activation = np.dot(output_weights, hidden_layer_output)
output_layer_activation += output_bias
predicted_output = sigmoid(output_layer_activation)
print(predicted_output) # I usually get somewhere around [[0.5]], and the ideal answer should be [[1]] since it is a XOR gate
Result: [[0.5]] for inputs 0 and 1
Wanted: [[1]] for inputs 0 and 1
That's all the code... thank you in advance.
I am guessing the problem is somewhere in the weight and bias updates. I worked through the forward propagation by hand and got correct results.
The problem must be the transposing and dot products in the backpropagation step.
My code on XOR:
import numpy as np

def sigmoid(z):
    return 1/(1+np.exp(-z))

def sigmoid_derivative(z):
    return np.multiply(sigmoid(z), sigmoid(1.0-z))

def init_w(epsilon):
    # Input nodes
    theta1 = 2*np.random.random([2,3])*epsilon - epsilon
    # Output nodes
    theta2 = 2*np.random.random([1,3])*epsilon - epsilon
    theta1, theta2 = np.mat(theta1), np.mat(theta2)
    return theta1, theta2

def fit(X, Y, theta1, theta2, predict=False, x=None):
    grad1, grad2 = np.mat(np.zeros(np.shape(theta1))), np.mat(np.zeros(np.shape(theta2)))
    for i in range(len(X)):
        x = x if predict else X[i]
        y = Y[0,i]
        # forward propagate
        a = x
        a1 = np.mat(np.append(1, a)).T
        z2 = theta1*a1
        a2 = sigmoid(z2)
        a2 = np.mat(np.append(1, a2)).T
        z3 = theta2*a2
        a3 = sigmoid(z3)
        if predict: return a3
        # back propagate
        delta3 = a3 - y.T
        grad2 += delta3 * a2.T
        delta2 = np.multiply(theta2.T*delta3, sigmoid_derivative(a2))
        grad1 += (delta2[1:] * a1.T)
    return grad1, grad2

def predict(x):
    return fit(X, Y, theta1, theta2, True, x)

X = np.mat([[0,0],
            [0,1],
            [1,0],
            [1,1]])
Y = np.mat([0,1,1,0])
epochs = 10000
alpha = 0.85
epsilon = 1

theta1, theta2 = init_w(epsilon)
for i in range(epochs):
    g1, g2 = fit(X, Y, theta1, theta2)
    theta1 -= alpha * g1
    theta2 -= alpha * g2

for i in range(len(X)):
    x = X[i]
    guess = predict(x)
    print(x, ":", guess)
Output:
[[0 0]] : [[ 0.00233143]]
[[0 1]] : [[ 0.99775431]]
[[1 0]] : [[ 0.9977526]]
[[1 1]] : [[ 0.00233134]]
Edit:
Your array format is too complex, so I suggest you write down the shapes after each step so that you can debug easily.
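As a concrete illustration of that advice (this snippet is mine, not part of the original answer), printing the shape after each step of the forward pass makes it easy to see where the dimensions stop matching your expectations:

import numpy as np

hidden_weights = np.array([[0.2, 0.3],
                           [0.4, 0.5]])
hidden_bias = np.array([[0.3], [0.6]])
output_weights = np.array([[0.6, 0.7]])

x = np.array([[0], [1]])                 # one sample as a 2x1 column vector
print("input:", x.shape)                 # (2, 1)

h = np.dot(hidden_weights, x) + hidden_bias
print("hidden activation:", h.shape)     # (2, 1)

o = np.dot(output_weights, h)
print("output activation:", o.shape)     # (1, 1)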
Update:
import numpy as np

#np.random.seed(0)

def sigmoid(x):
    return 1/(1 + np.exp(-x))

def sigmoid_derivative(x):
    return x * (1 - x)

#Input datasets
inputs = np.array([[0,0],[0,1],[1,0],[1,1]])
expected_output = np.array([[0],[1],[1],[0]])

epochs = 10000
lr = 0.1
inputLayerNeurons, hiddenLayerNeurons, outputLayerNeurons = 2,2,1

#Random weights and bias initialization
#hidden_weights = np.random.uniform(size=(inputLayerNeurons,hiddenLayerNeurons))
#hidden_bias = np.random.uniform(size=(1,hiddenLayerNeurons))
#output_weights = np.random.uniform(size=(hiddenLayerNeurons,outputLayerNeurons))
#output_bias = np.random.uniform(size=(1,outputLayerNeurons))
hidden_weights = np.array([
    [0.2, 0.3],
    [0.4, 0.5]
])
hidden_bias = np.array([[0.3, 0.6]])
output_weights = np.array([[0.6], [0.7]])
output_bias = np.array([[0.5]])

print("Initial hidden weights: ",end='')
print(*hidden_weights)
print("Initial hidden biases: ",end='')
print(*hidden_bias)
print("Initial output weights: ",end='')
print(*output_weights)
print("Initial output biases: ",end='')
print(*output_bias)

#Training algorithm
for _ in range(epochs):
    #Forward Propagation
    hidden_layer_activation = np.dot(inputs,hidden_weights)
    hidden_layer_activation += hidden_bias
    hidden_layer_output = sigmoid(hidden_layer_activation)
    output_layer_activation = np.dot(hidden_layer_output,output_weights)
    output_layer_activation += output_bias
    predicted_output = sigmoid(output_layer_activation)
    #Backpropagation
    error = expected_output - predicted_output
    d_predicted_output = error * sigmoid_derivative(predicted_output)
    error_hidden_layer = d_predicted_output.dot(output_weights.T)
    d_hidden_layer = error_hidden_layer * sigmoid_derivative(hidden_layer_output)
    #Updating Weights and Biases
    output_weights += hidden_layer_output.T.dot(d_predicted_output) * lr
    output_bias += np.sum(d_predicted_output,axis=0,keepdims=True) * lr
    hidden_weights += inputs.T.dot(d_hidden_layer) * lr
    hidden_bias += np.sum(d_hidden_layer,axis=0,keepdims=True) * lr

print("Final hidden weights: ",end='')
print(*hidden_weights)
print("Final hidden bias: ",end='')
print(*hidden_bias)
print("Final output weights: ",end='')
print(*output_weights)
print("Final output bias: ",end='')
print(*output_bias)

print("\nOutput from neural network after 10,000 epochs: ",end='')
print(*predicted_output)

test = np.array([
    [0, 1]
])
hidden_layer_activation = np.dot(test, hidden_weights)
hidden_layer_activation += hidden_bias
hidden_layer_output = sigmoid(hidden_layer_activation)
output_layer_activation = np.dot(hidden_layer_output, output_weights)
output_layer_activation += output_bias
predicted_output = sigmoid(output_layer_activation)
print(predicted_output)
Final hidden weights: [3.59882402 5.68799788] [3.60260363 5.70714658]
Final hidden bias: [-5.50709978 -2.3415549 ]
Final output weights: [-7.85976304] [7.26409199]
Final output bias: [-3.26766959]
Output from neural network after 10,000 epochs: [0.06525552] [0.93906737] [0.93899963] [0.06635071]
[[0.93907536]]
Here is the result for the test input [0, 1]: [[0.93907536]]
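Since the update also changes how the network is fed, here is a quick shape comparison (mine, for illustration only) of the original per-sample column format versus the batched row format used above:

import numpy as np

# Original format: 4 samples, each a 2x1 column vector, fed one at a time
inputs_cols = np.array([[[0],[0]], [[1],[0]], [[0],[1]], [[1],[1]]])
print(inputs_cols.shape)      # (4, 2, 1)
print(inputs_cols[0].shape)   # (2, 1) -- one sample per forward pass

# Updated format: the whole batch as a single 4x2 matrix
inputs_rows = np.array([[0,0], [0,1], [1,0], [1,1]])
print(inputs_rows.shape)      # (4, 2) -- all samples in one matrix product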

RBF-Neural net can't classify MNIST dataset

I have implemented an RBF neural network classifier.
I use my implementation to classify the MNIST dataset, but it is not learning and always just predicts a single class. I would be very grateful if someone could help me identify the problem with my implementation.
I have to note that the implementation is quite slow because it works example by example; I don't know how to make it work batch by batch. (I am new to TensorFlow and Python in general.)
My implementation is as follows:
class RBF_NN:
    def __init__(self, M, K, L, lr):
        #Layer sizes
        self.M = M #input layer size - number of features
        self.K = K #RBF layer size
        self.L = L #output layer size - number of classes
        #
        x = tf.placeholder(tf.float32,shape=[M])
        matrix = tf.reshape(tf.tile(x,multiples=[K]),shape=[K,M])
        prototypes_input = tf.placeholder(tf.float32,shape=[K,M])
        prototypes = tf.Variable(prototypes_input) # prototypes - representatives of the data
        r = tf.reduce_sum(tf.square(prototypes-matrix),1)
        s = tf.Variable(tf.random.uniform(shape=[K],maxval=1)) #scaling factors
        h = tf.exp(-r/(2*tf.pow(s,2)))
        W = tf.Variable(tf.random.uniform(shape=[K,L],maxval=1))
        b = tf.Variable(tf.constant(0.1, shape=[L]))
        o = tf.matmul(tf.transpose(tf.expand_dims(h,1)),W) + b
        pred_class = tf.argmax(o,1)
        y = tf.placeholder(shape=[L], dtype=tf.float32)
        loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=o, labels=y))
        optimizer = tf.train.AdamOptimizer(lr).minimize(loss)
        self.x = x
        self.prototypes_input = prototypes_input
        self.prototypes = prototypes
        self.r = r
        self.s = s
        self.h = h
        self.W = W
        self.b = b
        self.o = o
        self.y = y
        self.loss = loss
        self.optimizer = optimizer
        self.pred_class = pred_class

    def fit(self,X,y,prototypes,epoch_count,print_step,sess):
        for epoch in range(epoch_count):
            epoch_loss = 0
            for xi,yi in zip(X,y):
                iter_loss, _ = sess.run((self.loss,self.optimizer),feed_dict={self.x: xi, self.y: yi, self.prototypes_input:prototypes})
                epoch_loss = epoch_loss + iter_loss
            epoch_loss = epoch_loss/len(X)
            if epoch%print_step == 0:
                print("Epoch loss",(epoch+1),":",epoch_loss)

    def predict(self,x,sess):
        return sess.run((self.pred_class),feed_dict={self.x:x})[0]

    def get_prototypes(self,sess):
        return sess.run((self.prototypes))
Usage:
mnist = tf.keras.datasets.mnist
(x_train, y_train),(x_test, y_test) = mnist.load_data()
y_train = to_one_hot(y_train,10)
y_test = to_one_hot(y_test,10)
x_train = np.asarray([np.asarray(x).reshape(-1) for x in x_train])
x_test = np.asarray([np.asarray(x).reshape(-1) for x in x_test])
M = 784
K = 1000
L = 10
lr = 0.01
rbfnn = RBF_NN(M,K,L,lr)
#Selecting prototypes from the train set
idx = np.random.randint(len(x_train), size=K)
prototypes = x_train[idx,:]
init = tf.global_variables_initializer()
sess = tf.InteractiveSession()
sess.run(init,feed_dict={rbfnn.prototypes_input:prototypes})
rbfnn.fit(x_train,y_train,prototypes,epoch_count=1, print_step=1,sess=sess)
y_test_p = []
for xi,yi in zip(x_test,y_test):
    yp = rbfnn.predict(xi,sess=sess)
    y_test_p.append(yp)
y_test_t = [np.argmax(yi) for yi in y_test]
acc = accuracy_score(y_test_t,y_test_p,)
precc = precision_score(y_test_t,y_test_p, average='macro')
recall = recall_score(y_test_t,y_test_p, average = 'macro')
f1 = f1_score(y_test_t,y_test_p,average='macro')
print("Accuracy:",acc)
print("Precision:",precc)
print("Recall:",recall)
print("F1 score:",f1)
sess.close()
The implementation is fine. However, it seems to be very sensitive to the data.
It will start learning just fine if the following lines are added:
x_train = (x_train-x_train.min())/(x_train.max()-x_train.min())
x_test = (x_test-x_test.min())/(x_test.max()-x_test.min())
In this way the data is normalized so that the interval of each feature is from 0 to 1.
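A plausible reason for that sensitivity (my own note, not claimed by the answer): with raw pixel values in [0, 255], the squared distance r between a 784-dimensional image and a prototype is on the order of 1e7, so the RBF activations exp(-r/(2*s^2)) underflow to zero and the output layer never receives a useful signal. A quick check:

import numpy as np

# Two random unnormalized MNIST-scale vectors
a = np.random.randint(0, 256, size=784).astype(float)
b = np.random.randint(0, 256, size=784).astype(float)

r = np.sum((a - b) ** 2)
s = 0.5                              # the scale factors are drawn from U(0, 1)
print(r)                             # roughly 1e7
print(np.exp(-r / (2 * s ** 2)))     # 0.0 -- the RBF layer goes silent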

pytorch, how can I make tensors model(x) and answer(x) the same size?

I'm trying to make a simple linear model to predict the parameters of the formula
y = 3*x1 + x2 - 2*x3
Unfortunately, there is a problem when I try to compute the loss.
def answer(x):
    return 3 * x[:,0] + x[:,1] - 2 * x[:,2]

def loss_f(x):
    y = answer(x)
    y_hat = model(x)
    loss = ((y - y_hat).pow(2)).sum() / x.size(0)
    return loss
When I set batch_size = 3, the sizes of the two results are different:
x = torch.randn(3,3)
answer(x)
tensor([ 2.0201, -3.8354, 2.0059])
model(x)
tensor([[ 0.2085],
        [-0.0670],
        [-1.3635]], grad_fn=<ThAddmmBackward>)
answer(x.data).size()
torch.Size([3])
model(x.data).size()
torch.Size([3, 1])
I think broadcasting is applied automatically in
loss = ((y - y_hat).pow(2)).sum() / x.size(0)
How can I make the two tensors the same size? Thanks
This is my code
import torch
import torch.nn as nn
import torch.optim as optim

class model(nn.Module):
    def __init__(self, input_size, output_size):
        super(model, self).__init__()
        self.linear = nn.Linear(input_size, output_size)

    def forward(self, x):
        y = self.linear(x)
        return y

model = model(3,1)
optimizer = optim.SGD(model.parameters(), lr = 0.001, momentum=0.1)

print('Parameters : ')
for p in model.parameters():
    print(p)
print('')
print('Optimizer : ')
print(optimizer)

def generate_data(batch_size):
    x = torch.randn(batch_size, 3)
    return x

def answer(x):
    return 3 * x[:,0] + x[:,1] - 2 * x[:,2]

def loss_f(x):
    y = answer(x)
    y_hat = model(x)
    loss = ((y - y_hat).pow(2)).sum() / x.size(0)
    return loss

x = torch.randn(3,3)
print(x)
x = torch.FloatTensor(x)

batch_size = 3
epoch_n = 1000
iter_n = 100

for epoch in range(epoch_n):
    avg_loss = 0
    for i in range(iter_n):
        x = torch.randn(batch_size, 3)
        optimizer.zero_grad()
        loss = loss_f(x.data)
        loss.backward()
        optimizer.step()
        avg_loss += loss
    avg_loss = avg_loss / iter_n

    x_valid = torch.FloatTensor([[1,2,3]])
    y_valid = answer(x_valid)

    model.eval()
    y_hat = model(x_valid)
    model.train()

    print(avg_loss, y_valid.data[0], y_hat.data[0])

    if avg_loss < 0.001:
        break
You can use Tensor.view
https://pytorch.org/docs/stable/tensors.html#torch.Tensor.view
So something like
answer(x.data).view(-1, 1)
should do the trick.
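To make the mismatch and the fix concrete, here is a small check of my own (not from the answer); reshaping either side works, e.g. viewing the target as a column or flattening the model output:

import torch
import torch.nn as nn

model = nn.Linear(3, 1)
x = torch.randn(3, 3)

y = 3 * x[:, 0] + x[:, 1] - 2 * x[:, 2]   # shape [3]
y_hat = model(x)                          # shape [3, 1]

print((y - y_hat).shape)                  # [3, 3]: silent broadcasting, wrong loss
print((y.view(-1, 1) - y_hat).shape)      # [3, 1]: target viewed as a column
print((y - y_hat.view(-1)).shape)         # [3]:   model output flattened instead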

Tensorflow copy of sklearn MLPRegressor produces different results

I am trying to reproduce a deep learning regression result in Tensorflow. If I train a neural network with the MLPRegressor class from sklearn I get very nice results of 98% validation.
The MLPRegressor:
http://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPRegressor.html#sklearn.neural_network.MLPRegressor
I am trying to reproduce the model in Tensorflow by copying the default values of the MLPRegressor class into a Tensorflow model. However, I cannot get the same result; I only get 75% most of the time.
My TF model:
tf.reset_default_graph()
graph = tf.Graph()

n_input = 3 # n variables
n_hidden_1 = 100
n_hidden_2 = 1
n_output = 1
beta = 0.001
learning_rate = 0.001

with graph.as_default():
    tf_train_feat = tf.placeholder(tf.float32, shape=(None, n_input))
    tf_train_label = tf.placeholder(tf.float32, shape=(None))
    tf_test_feat = tf.constant(test_feat, tf.float32)
    """
    Weights and biases. The weights matrix' columns will be the output vector.
    * ndarray([rows, columns])
    * ndarray([in, out])
    tf.placeholder(None) and tf.placeholder([None, 3]) means that the row's size is not set. In the second
    placeholder the columns are prefixed at 3.
    """
    W = {
        "layer_1": tf.Variable(tf.truncated_normal([n_input, n_hidden_1])),
        "layer_2": tf.Variable(tf.truncated_normal([n_hidden_1, n_hidden_2])),
        "layer_3": tf.Variable(tf.truncated_normal([n_hidden_2, n_output])),
    }
    b = {
        "layer_1": tf.Variable(tf.zeros([n_hidden_1])),
        "layer_2": tf.Variable(tf.zeros([n_hidden_2])),
    }

    def computation(X):
        layer_1 = tf.nn.relu(tf.matmul(X, W["layer_1"]) + b["layer_1"])
        layer_2 = tf.nn.relu(tf.matmul(layer_1, W["layer_2"]) + b["layer_2"])
        return layer_2

    tf_prediction = computation(tf_train_feat)
    tf_test_prediction = computation(tf_test_feat)

    tf_loss = tf.reduce_mean(tf.pow(tf_train_label - tf_prediction, 2))
    tf_loss = tf.reduce_mean(tf_loss + beta * tf.nn.l2_loss(W["layer_2"]))
    tf_optimizer = tf.train.AdamOptimizer(learning_rate).minimize(tf_loss)
    #tf_optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(tf_loss)

    init = tf.global_variables_initializer()
My TF session:
def accuracy(y_pred, y):
    a = 0
    for i in range(y.shape[0]):
        a += abs(1 - y_pred[i][0] / y[i])
    return round((1 - a / y.shape[0]) * 100, 3)

def accuracy_tensor(y_pred, y):
    a = 0
    for i in range(y.shape[0]):
        a += abs(1 - y_pred[i][0] / y[i])
    return round((1 - a / y.shape[0]) * 100, 3)

# Shuffles two arrays.
def shuffle_in_unison(a, b):
    assert len(a) == len(b)
    shuffled_a = np.empty(a.shape, dtype=a.dtype)
    shuffled_b = np.empty(b.shape, dtype=b.dtype)
    permutation = np.random.permutation(len(a))
    for old_index, new_index in enumerate(permutation):
        shuffled_a[new_index] = a[old_index]
        shuffled_b[new_index] = b[old_index]
    return shuffled_a, shuffled_b

train_epoch = int(5e4)
batch = int(200)
n_batch = int(X.shape[0] // batch)
prev_acc = 0
stable_count = 0

session = tf.InteractiveSession(graph=graph)
session.run(init)
print("Initialized.\n No. of epochs: %d.\n No. of batches: %d." % (train_epoch, n_batch))

for epoch in range(train_epoch):
    offset = (epoch * n_batch) % (Y.shape[0] - n_batch)
    for i in range(n_batch):
        x = X[offset:(offset + n_batch)]
        y = Y[offset:(offset + n_batch)]
        x, y = shuffle_in_unison(x, y)
        feed_dict = {tf_train_feat: x, tf_train_label: y}
        _, l, pred, pred_label = session.run([tf_optimizer, tf_loss, tf_prediction, tf_train_label], feed_dict=feed_dict)
    if epoch % 1 == 0:
        print("Epoch: %d. Batch's loss: %f" % (epoch, l))
        test_pred = tf_test_prediction.eval(session=session)
        acc_test = accuracy(test_pred, test_label)
        acc_train = accuracy_tensor(pred, pred_label)
        print("Accuracy train set %s%%" % acc_train)
        print("Accuracy test set: %s%%" % acc_test)
Am I missing something in the Tensorflow code? Thanks!
Unless you have a very good reason not to, regression should use linear output units. I ran into a similar problem a while back and ended up using linear outputs and linear hidden units, which seemed to mirror the MLPRegressor in my case.
There is a great section in Goodfellow's Deep Learning Book in chapter 6, starting at page 181, that goes over the activation functions.
At the very least, try this for your output layer:
layer_2 = tf.matmul(layer_1, W["layer_2"]) + b["layer_2"]
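Applied to the computation function above, that suggestion would look roughly like this (my sketch of the answer's idea, keeping the ReLU hidden layer and only making the output linear):

def computation(X):
    # hidden layer keeps its ReLU nonlinearity
    layer_1 = tf.nn.relu(tf.matmul(X, W["layer_1"]) + b["layer_1"])
    # output layer is linear (identity), as is usual for regression
    layer_2 = tf.matmul(layer_1, W["layer_2"]) + b["layer_2"]
    return layer_2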
