I wrote a logistic regression algorithm for data with 9 attributes and one vector of labels, but it is not training.
I think I have to transpose some of the inputs when updating the weights, but I'm not sure; I tried a bit of trial and error, with no luck.
If anyone can help, thanks.
class logistic_regression(neural_network):
    """Single-neuron logistic regression trained by per-sample gradient descent.

    The CSV file named by ``data`` is read with pandas. Every column except
    the last is used as an input attribute and the last column as the label.
    """

    def __init__(self, data):
        """Load the data set and draw random initial weights and bias.

        :param data: path (or buffer) accepted by ``pandas.read_csv``
        """
        self.data = data  # location of the data file
        # Call the loader through the instance; it is a method of this class.
        self.data1 = self.load_data(self.data)
        # One weight per attribute; the last column holds the labels.
        n_features = self.data1.shape[1] - 1
        self.weights = np.random.normal(0, 1, n_features)
        self.bias = np.random.randn(1)
        self.x = np.array(self.data1.iloc[:, :n_features])
        self.y = np.array(self.data1.iloc[:, n_features:n_features + 1])
        print(self.weights)
        print(np.dot(self.x[0].T, self.weights))

    def load_data(self, file):
        """Return the contents of ``file`` as a pandas DataFrame."""
        return pd.read_csv(file)

    def sigmoid(self, x):
        """Logistic activation, squashing ``x`` into the interval (0, 1)."""
        return 1 / (1 + np.exp(-x))

    def sigmoid_prime(self, x):
        """Derivative of the sigmoid with respect to its pre-activation ``x``."""
        activated = self.sigmoid(x)
        return activated * (1 - activated)

    def train(self):
        """Run 100 passes of per-sample gradient descent on the squared error."""
        error = 0  # last squared error, kept for the progress print-out
        learning_rate = 0.01
        for _epoch in range(100):
            for i in range(len(self.x)):
                # Forward pass: weighted sum, then sigmoid.
                pred = np.dot(self.x[i].T, self.weights) + self.bias
                pred1 = self.sigmoid(pred)
                error = (pred1 - self.y[i]) ** 2
                # Chain rule: d(error)/d(pred) is proportional to
                # (pred1 - y) * sigmoid'(pred). The derivative is taken at the
                # pre-activation ``pred``, and the residual is parenthesised
                # so the learning rate scales the whole gradient.
                delta = (pred1 - self.y[i]) * self.sigmoid_prime(pred)
                self.bias -= learning_rate * delta
                self.weights -= learning_rate * delta * self.x[i]
                print(str(pred1) + "pred")
                print(str(error) + "error")  # print the result
                print(pred1[0] - self.y[i][0])
def test(self):
You cannot train any machine learning model using only one label. The resulting model will only have one response, no matter what test data is being used - the label provided while training.
Broken derivatives
You've got a bug in the self.bias adjustment, missing parenthesis around pred1-self.y[i].
Also, you're calculating the derivative from the wrong variable - it seems that instead of self.sigmoid_prime(pred1) you'd need self.sigmoid_prime(pred).
Test on a toy example
For any such code, I'd suggest that you first test it on a very simple function, one where it's trivial to print out all the intermediate values and verify them on paper. For example, the boolean AND and OR functions. That will show you whether you've got the update formulas correct, isolating the learning code from the peculiarities of your actual learning task.
Related
My neural network is stuck at 11.35 percent accuracy and I am unable to trace the error.
low accuracy at 11.35 percent
I am following this code, https://github.com/MLForNerds/DL_Projects/blob/main/mnist_ann.ipynb, which I found in a YouTube video.
Here is my code for the neural network (I have defined Xavier weight initialization in a module called nn):
"""1. 784 neurons in input layer
2. 128 neurons in hidden layer 1
3. 64 neurons in hidden layer 2
4. 10 neurons in output layer"""
def softmax(input):
    """Return the softmax of ``input``, normalised along axis 0.

    The global maximum of ``input`` is subtracted before exponentiating.
    """
    shifted = input - input.max()
    exponentials = np.exp(shifted)
    return exponentials / np.sum(exponentials, axis=0)
def softmax_grad(x):
    """Return ``s * (1 - s)`` element-wise, where ``s`` is the axis-0 softmax of ``x``."""
    exps = np.exp(x - x.max())
    probabilities = exps / np.sum(exps, axis=0)
    return probabilities * (1 - probabilities)
def sigmoid(input):
    """Element-wise logistic function ``1 / (1 + exp(-input))``."""
    denominator = 1 + np.exp(-input)
    return 1 / denominator
def sigmoid_grad(input):
    """Return ``input * (1 - input)``.

    This is the sigmoid derivative when ``input`` is already a sigmoid output.
    """
    complement = 1 - input
    return input * complement
class DenseNN:
    """Three-layer dense network (sigmoid, sigmoid, softmax) trained sample by sample.

    The weight matrices and the activations cached by the most recent forward
    pass live in the same ``params`` dictionary. Training and evaluation read
    the module-level ``x_train``/``y_train`` and ``x_test``/``y_test`` arrays.
    """

    def __init__(self, d0, d1, d2, d3):
        """Create the three weight matrices for layer sizes ``d0`` to ``d3``."""
        self.params = {'w1': nn.Xavier.initialize(d0, d1),
                       'w2': nn.Xavier.initialize(d1, d2),
                       'w3': nn.Xavier.initialize(d2, d3)}

    def forward(self, a0):
        """Propagate the column vector ``a0`` and return the softmax output.

        Every pre-activation (``z*``) and activation (``a*``) is cached in
        ``self.params`` for use by ``backprop``.
        """
        params = self.params
        params['a0'] = a0
        params['z1'] = np.dot(params['w1'], params['a0'])
        params['a1'] = sigmoid(params['z1'])
        params['z2'] = np.dot(params['w2'], params['a1'])
        params['a2'] = sigmoid(params['z2'])
        params['z3'] = np.dot(params['w3'], params['a2'])
        params['a3'] = softmax(params['z3'])
        return params['a3']

    def backprop(self, y_true, y_pred):
        """Return a dict of weight gradients for the last forward pass.

        :param y_true: one-hot target column vector
        :param y_pred: network output returned by ``forward``
        """
        params = self.params
        w_change = {}
        error = softmax_grad(params['z3']) * ((y_pred - y_true) / y_true.shape[0])
        w_change['w3'] = np.outer(error, params['a2'])
        # sigmoid_grad takes the cached activations, not the pre-activations.
        error = np.dot(params['w3'].T, error) * sigmoid_grad(params['a2'])
        w_change['w2'] = np.outer(error, params['a1'])
        error = np.dot(params['w2'].T, error) * sigmoid_grad(params['a1'])
        w_change['w1'] = np.outer(error, params['a0'])
        return w_change

    def update_weights(self, learning_rate, w_change):
        """Apply one gradient-descent step to all three weight matrices."""
        for key in ('w1', 'w2', 'w3'):
            self.params[key] -= learning_rate * w_change[key]

    def train(self, epochs, lr):
        """Run ``epochs`` passes of per-sample SGD and print test accuracy after each."""
        for epoch in range(epochs):
            # Iterate over however many samples were loaded rather than a
            # hard-coded 60000, so smaller or larger data sets also work.
            for features, label in zip(x_train, y_train):
                a0 = np.array([features]).T
                o = np.array([label]).T
                y_pred = self.forward(a0)
                w_change = self.backprop(o, y_pred)
                self.update_weights(lr, w_change)
            print((self.compute_accuracy()) * 100)

    def compute_accuracy(self):
        '''
        This function does a forward pass of x, then checks if the indices
        of the maximum value in the output equals the indices in the label
        y. Then it sums over each prediction and calculates the accuracy.
        '''
        predictions = []
        # Use the actual size of the test set instead of a hard-coded 10000.
        for features, label in zip(x_test, y_test):
            a0 = np.array([features]).T
            o = np.array([label]).T
            output = self.forward(a0)
            pred = np.argmax(output)
            predictions.append(pred == np.argmax(o))
        return np.mean(predictions)
Here is the code for loading the data:
# Load the MNIST CSV files (one 'label' column plus one column per pixel).
train_data = pd.read_csv('../Datasets/MNIST/mnist_train.csv')
test_data = pd.read_csv('../Datasets/MNIST/mnist_test.csv')

# Training data: pixel matrix and one-hot encoded labels.
x_train = train_data.drop('label', axis=1).to_numpy()
y_train = pd.get_dummies(train_data['label']).values

# Test data, prepared the same way.
x_test = test_data.drop('label', axis=1).to_numpy()
y_test = pd.get_dummies(test_data['label']).values

# Rescale pixel values from [0, 255] to [0.01, 1.0].
# np.asfarray was removed in NumPy 2.0; np.asarray with an explicit float
# dtype performs the same conversion on every NumPy version.
fac = 0.99 / 255
x_train = np.asarray(x_train, dtype=np.float64) * fac + 0.01
x_test = np.asarray(x_test, dtype=np.float64) * fac + 0.01

# Print the dimensions.
print(np.shape(x_train))  # (60000,784)
print(np.shape(y_train))  # (60000,10)
print(np.shape(x_test))  # (10000,784)
print(np.shape(y_test))  # (10000,10)
print((x_train))
Kindly help
I am a newbie in machine learning, so any help would be appreciated. I am unable to figure out where I am going wrong. Most of the code is almost identical to https://github.com/MLForNerds/DL_Projects/blob/main/mnist_ann.ipynb, but it manages to get 60 percent accuracy.
EDIT
I found the mistake :
Thanks to Bartosz Mikulski.
The problem was with how the weights were initialized in my Xavier weights initialization algorithm.
I changed the code for weights initialization to this:
self.params = {
    # Weight matrices are shaped (layer size, previous layer size): standard
    # normal samples scaled by sqrt(1 / layer size).
    'w1':np.random.randn(d1, d0) * np.sqrt(1. / d1),
    'w2':np.random.randn(d2, d1) * np.sqrt(1. / d2),
    'w3':np.random.randn(d3, d2) * np.sqrt(1. / d3),
    # Bias column vectors, one entry per unit of the layer, drawn and scaled
    # the same way as that layer's weights.
    'b1':np.random.randn(d1, 1) * np.sqrt(1. / d1),
    'b2':np.random.randn(d2, 1) * np.sqrt(1. / d2),
    'b3':np.random.randn(d3, 1) * np.sqrt(1. / d3),
}
then i got the output:
After changing weights initialization
after adding the bias parameters i got the output:
After changing weights initialization and adding bias
3: After changing weights initialization and adding bias
The one problem that I can see is that you are using only weights but no biases. They are very important because they allow your model to change the position of the decision plane (boundary) in the solution space. If you only have weights you can only angle the solution.
I guess that basically, this is the best fit you can get without biases. The dense layer is basically a linear function: w*x + b and you are missing the b. See the PyTorch documentation for the example: https://pytorch.org/docs/stable/generated/torch.nn.Linear.html#linear.
Also, can you show your Xavier initialization? In your case, even the simple normal distributed values would be enough as initialization, no need to rush into more advanced topics.
I would also suggest you start from the smaller problem (for example Iris dataset) and no hidden layers (just a simple linear regression that learns by using gradient descent). Then you can expand it by adding hidden layers, and then by trying harder problems with the code you already have.
I have created the following neural network:
def init_weights(m, n=1):
    """
    initialize a matrix/vector of weights with xavier (Glorot) uniform
    initialization: samples are drawn from U(-limit, limit) with
    limit = sqrt(6 / (fan_in + fan_out))
    :param m: out dim
    :param n: in dim
    :return: matrix of shape (m, n), or a vector of shape (m,) when n == 1
    """
    # Xavier uses the SUM of fan-in and fan-out. Dividing by the product makes
    # the weights of wide layers far too small, so the signal shrinks at every
    # additional hidden layer.
    limit = (6.0 / (n + m)) ** 0.5
    weights = np.random.uniform(-limit, limit, size=(m, n))
    if n == 1:
        weights = weights.reshape((-1,))
    return weights
def softmax(v):
    """Row-wise softmax of a 2-D array ``v`` of shape (batch, classes).

    Each row's maximum is subtracted before exponentiating. This leaves the
    result mathematically unchanged but stops ``np.exp`` from overflowing to
    ``inf`` (and the output from becoming NaN) when logits are large.
    """
    shifted = v - v.max(axis=1, keepdims=True)
    exp = np.exp(shifted)
    return exp / exp.sum(axis=1, keepdims=True)
def relu(x):
    """Rectified linear unit: element-wise maximum of ``x`` and zero."""
    floor = 0
    return np.maximum(x, floor)
def sign(x):
    """Return 1 where ``x`` is strictly positive and 0 elsewhere, as integers."""
    is_positive = np.greater(x, 0)
    return is_positive.astype(int)
class Model:
    """
    Fully connected network with ReLU hidden layers and a softmax output,
    trained by plain gradient descent.
    """

    def __init__(self, sizes, lr):
        """
        :param sizes: layer widths, input first and output last
        :param lr: learning rate used by ``backward``
        """
        self.lr = lr
        self.weights = []
        self.biases = []
        self.memory = []
        # One (weights, biases) pair per pair of consecutive layer widths.
        for layer in range(len(sizes) - 1):
            fan_in, fan_out = sizes[layer], sizes[layer + 1]
            self.weights.append(init_weights(fan_out, fan_in))
            self.biases.append(init_weights(fan_out))

    def forward(self, X):
        """Return the class probabilities for the batch ``X`` (one sample per row).

        The input of every layer is stored in ``self.memory`` for ``backward``.
        """
        self.memory = [X]
        signal = np.dot(self.weights[0], X.T).T + self.biases[0]
        for layer_w, layer_b in zip(self.weights[1:], self.biases[1:]):
            signal = relu(signal)
            self.memory.append(signal)
            signal = np.dot(layer_w, signal.T).T + layer_b
        return softmax(signal)

    def backward(self, y, y_pred):
        """Back-propagate and update all weights and biases in place.

        :param y: integer class labels of the batch
        :param y_pred: probabilities returned by ``forward`` for the same batch
        """
        # One-hot encode the labels; the output-layer error is prediction minus target.
        n_classes = y_pred.shape[1]
        targets = np.eye(n_classes)[y]
        errors = [y_pred - targets]
        # Walk from the last layer to the second, prepending each layer's error.
        layer = len(self.weights) - 1
        while layer > 0:
            upstream = np.dot(errors[0], self.weights[layer])
            errors.insert(0, sign(self.memory[layer]) * upstream)
            layer -= 1
        # Gradient step for every layer.
        for layer, (layer_input, layer_err) in enumerate(zip(self.memory, errors)):
            self.weights[layer] -= self.lr * \
                np.dot(layer_input.T, layer_err).T
            self.biases[layer] -= self.lr * layer_err.sum(0)
The data has 10 classes. When using a single hidden layer, the accuracy is almost 40%. When using 2 or 3 hidden layers, the accuracy is around 9-10% from the first epoch and remains that way. The accuracy on the training set is also in that range. Is there a problem with my implementation that could cause such a thing?
You asked about improving the accuracy of a machine learning model, which is a very broad and ambiguous problem in ML, because it varies between model types and data types.
In your case the model is a neural network, whose accuracy depends on several factors. You are trying to optimize the accuracy only through the activation functions, the weights, or the number of hidden layers, which is not enough on its own. To increase the accuracy you have to consider other factors too; for example, your basic checklist can be the following:
Increase Hidden Layers
Change Activation Functions
Experiment with initial weight initialization
Normalize Training Data
Scale Training Data
Check for Class Imbalance
Right now you are trying to achieve state-of-the-art accuracy on the basis of very few factors. I don't know about your dataset, as you haven't shown the preprocessing code, but I recommend that you double-check it: correctly normalizing the dataset may increase accuracy. Also check whether your dataset can be scaled and, most importantly, whether one of the classes in your dataset is over-represented (has far more samples than the others), because that will also lead to poor accuracy metrics.
For more details check this it contains the mathematical proof and explanation how these things affect your ML model accuracy
I have a homework question that tells me to do this:
For the run with 20000 iterations, use the weights (of the four middle neurons, and the single output neuron), to hand-calculate the classification. In other words, when 0,0,1 is the input, use the weights to process this input, and show that the output would be 0.00765428, to use my output above as an example. Remember to pass the summed weights through the sigmoid (non-linear) function [look at the code, for the sigmoid formula that involves exponentiation]! Do this for all the four input triples. The point of this is to make you see this: once the weights have been learned, it's a straightforward process to handle incoming data to generate outputs [it is not complex/mysterious!]. In this exercise, we're simply using the training data and the learned weights, to hand calculate the outputs; in much more complex situations, the network will be fed new (so far unseen) data, which it will process using learned weights.
I have no idea where to even start processing by hand
I know I have to go through the functions and do the math myself, but I'm not sure what to input for self.weight or how far back in the process I'm supposed to go to start calculating the output. I'm just generally confused
I'm supposed to do calculations for each of the four input triples ([0,0,1], [0,1,1], etc.), but I don't know how to make each calculation specific to each input, since all the triples are in the same array, which is passed in as x to the nn class.
This is the code:
def sigmoid(x):
    """Element-wise logistic function ``1 / (1 + exp(-x))``."""
    denominator = 1 + np.exp(-x)
    return 1.0 / denominator
def sigmoid_derivative(x):
    """Return ``x * (1 - x)``, the sigmoid slope when ``x`` is a sigmoid output."""
    complement = 1.0 - x
    return x * complement
class NeuralNetwork:
    """Two-layer sigmoid network (4 hidden units, 1 output) trained on the full batch."""

    def __init__(self, x, y):
        """Store the data and draw both weight matrices uniformly from [0, 1)."""
        self.input = x
        n_inputs = self.input.shape[1]
        self.weights1 = np.random.rand(n_inputs, 4)
        self.weights2 = np.random.rand(4, 1)
        self.y = y
        self.output = np.zeros(self.y.shape)

    def feedforward(self):
        """Compute the hidden activations and the output for all stored inputs."""
        hidden_sum = np.dot(self.input, self.weights1)
        self.layer1 = sigmoid(hidden_sum)
        output_sum = np.dot(self.layer1, self.weights2)
        self.output = sigmoid(output_sum)

    def backprop(self):
        """Apply the chain rule to the squared error and update both weight matrices."""
        # Error signal at the output, shared by both gradients.
        output_delta = 2*(self.y - self.output) * sigmoid_derivative(self.output)
        d_weights2 = np.dot(self.layer1.T, output_delta)
        # Push the output signal back through weights2 to the hidden layer.
        hidden_delta = np.dot(output_delta, self.weights2.T) * sigmoid_derivative(self.layer1)
        d_weights1 = np.dot(self.input.T, hidden_delta)

        # The deltas already carry the descent direction (y - output), so they are added.
        self.weights1 += d_weights1
        self.weights2 += d_weights2
if __name__ == "__main__":
    # Four training triples; the third component is always 1.
    X = np.array([[0,0,1],
                  [0,1,1],
                  [1,0,1],
                  [1,1,1]])
    # Targets: XOR of the first two components of each triple.
    y = np.array([[0],[1],[1],[0]])
    nn = NeuralNetwork(X,y)

    for i in range(10000):
        nn.feedforward()
        nn.backprop()

    # Learned weights, followed by the network output for the four triples.
    # (A stray markdown fence after the last print was removed; it was a
    # syntax error.)
    print(nn.weights1)
    print(nn.weights2)
    print(nn.output)
I have this neural network that I've trained, shown below. It works, or at least appears to work, but the problem is with the training. I'm trying to train it to act as an OR gate, but it never seems to get there; the output tends to look like this:
prior to training:
[[0.50181624]
[0.50183743]
[0.50180414]
[0.50182533]]
post training:
[[0.69641759]
[0.754652 ]
[0.75447178]
[0.79431198]]
expected output:
[[0]
[1]
[1]
[1]]
I have this loss graph:
It's strange: it appears to be training, but at the same time it is not quite getting to the expected output. I know that it would never really reach exactly 0 and 1, but I expect it to get something a little bit closer to the expected output.
I had some issues trying to figure out how to backpropagate the error, as I wanted this network to support any number of hidden layers, so I stored the local gradient in each layer, alongside the weights, and sent the error back from the end.
The main functions I suspect are the culprits are NeuralNetwork.train and both forward methods.
import sys
import math
import numpy as np
import matplotlib.pyplot as plt
from itertools import product
class NeuralNetwork:
    """Feed-forward network made of sigmoid layers, trained by ``train``.

    ``args`` is a dict with the keys ``inputDimensions``, ``outputDimensions``
    and ``hiddenDimensions``; ``__constructLayers`` shows how each is read.
    """

    class __Layer:
        """One fully connected sigmoid layer: weights, biases and a cached gradient."""

        def __init__(self,args):
            self.__epsilon = 1e-6  # step size applied in adjustWeights
            self.localGrad = 0  # overwritten by every forward() call
            # Weights have shape (previousLayerHeight, height): standard normal
            # samples scaled by 0.01.
            self.__weights = np.random.randn(
                args["previousLayerHeight"],
                args["height"]
            )*0.01
            # Biases are a zero column of shape (biasHeight, 1).
            # NOTE(review): nothing in this class ever updates the biases.
            self.__biases = np.zeros(
                (args["biasHeight"],1)
            )

        def __str__(self):
            """Return the string form of the weight matrix."""
            return str(self.__weights)

        def forward(self,X):
            """Return sigmoid(X . W + b) and cache ``localGrad`` for training."""
            a = np.dot(X, self.__weights) + self.__biases
            # Cached as X^T . sigmoid'(a); train() reads it as this layer's gradient.
            self.localGrad = np.dot(X.T,self.__sigmoidPrime(a))
            return self.__sigmoid(a)

        def adjustWeights(self, err):
            """Subtract ``err`` scaled by the step size from the weights."""
            self.__weights -= (err * self.__epsilon)

        def __sigmoid(self, z):
            """Logistic function of ``z``."""
            return 1/(1 + np.exp(-z))

        def __sigmoidPrime(self, a):
            """Derivative of the logistic function, evaluated at pre-activation ``a``."""
            return self.__sigmoid(a)*(1 - self.__sigmoid(a))

    def __init__(self,args):
        """Store the dimension settings from ``args`` and build the layers."""
        self.__inputDimensions = args["inputDimensions"]
        self.__outputDimensions = args["outputDimensions"]
        self.__hiddenDimensions = args["hiddenDimensions"]
        self.__layers = []
        self.__constructLayers()

    def __constructLayers(self):
        """Append the first layer, then one layer per entry of hiddenDimensions."""
        # First layer: maps the input width to the first hidden height, or
        # directly to the output height when there are no hidden layers.
        # NOTE(review): biasHeight is inputDimensions[0], which main() sets to
        # the number of input rows (4), so this bias broadcasts per sample row
        # rather than per neuron -- confirm this is intended.
        self.__layers.append(
            self.__Layer(
                {
                    "biasHeight": self.__inputDimensions[0],
                    "previousLayerHeight": self.__inputDimensions[1],
                    "height": self.__hiddenDimensions[0][0]
                    if len(self.__hiddenDimensions) > 0
                    else self.__outputDimensions[0]
                }
            )
        )
        # Each further layer maps hidden layer i to hidden layer i + 1, or to
        # the output height for the last one.
        # NOTE(review): with an empty hiddenDimensions list this loop adds
        # nothing, while train() still indexes hiddenDimensions[-1].
        for i in range(len(self.__hiddenDimensions)):
            self.__layers.append(
                self.__Layer(
                    {
                        "biasHeight": self.__hiddenDimensions[i + 1][0]
                        if i + 1 < len(self.__hiddenDimensions)
                        else self.__outputDimensions[0],
                        "previousLayerHeight": self.__hiddenDimensions[i][0],
                        "height": self.__hiddenDimensions[i + 1][0]
                        if i + 1 < len(self.__hiddenDimensions)
                        else self.__outputDimensions[0]
                    }
                )
            )

    def forward(self,X):
        """Pass ``X`` through every layer in order and return the final output."""
        out = self.__layers[0].forward(X)
        for i in range(len(self.__layers) - 1):
            out = self.__layers[i+1].forward(out)
        return out

    def train(self,X,Y,loss,epoch=5000000):
        """Run ``epoch`` full-batch updates, appending one entry to ``loss`` per epoch.

        :param X: input batch handed to ``forward``
        :param Y: target outputs
        :param loss: list that receives ``sum(Y - YHat)`` once per epoch
        :param epoch: number of iterations
        """
        for i in range(epoch):
            YHat = self.forward(X)
            delta = -(Y-YHat)
            # Records the summed signed residual, not a squared error.
            loss.append(sum(Y-YHat))
            # Last layer: combine its cached gradient with the output residual,
            # then reshape to a column with one row per last-hidden-layer unit.
            err = np.sum(np.dot(self.__layers[-1].localGrad,delta.T), axis=1)
            err.shape = (self.__hiddenDimensions[-1][0],1)
            self.__layers[-1].adjustWeights(err)
            # Rebinding i here does not affect the range() iteration above.
            i=0
            # Earlier layers, last to first: multiply the running err by each
            # layer's cached gradient.
            # NOTE(review): the next layer's weights are never applied to err
            # here -- compare with the chain rule for hidden layers.
            for l in reversed(self.__layers[:-1]):
                err = np.dot(l.localGrad, err)
                l.adjustWeights(err)
                i += 1

    def printLayers(self):
        """Print the weight matrix of every layer."""
        print("Layers:\n")
        for l in self.__layers:
            print(l)
            print("\n")
def main(args):
    """Train the network on the two-input OR truth table and plot the recorded loss.

    :param args: command-line arguments; accepted for the entry point but unused
    """
    # All four combinations of two binary inputs, with OR as the target.
    X = np.array([[x,y] for x,y in product([0,1],repeat=2)])
    Y = np.array([[0],[1],[1],[1]])
    nn = NeuralNetwork(
        {
            #(height,width)
            "inputDimensions": (4,2),
            "outputDimensions": (1,1),
            "hiddenDimensions":[
                (6,1)
            ]
        }
    )
    print("input:\n\n",X,"\n")
    print("expected output:\n\n",Y,"\n")
    nn.printLayers()
    print("prior to training:\n\n",nn.forward(X), "\n")
    loss = []
    nn.train(X,Y,loss)
    print("post training:\n\n",nn.forward(X), "\n")
    nn.printLayers()
    fig,ax = plt.subplots()
    loss = np.array(loss)
    # One x value per recorded loss entry. Deriving the axis from the data
    # keeps the plot valid whatever epoch count train() ran with, and avoids
    # building a five-million-element Python list.
    x = np.arange(len(loss))
    ax.plot(x,loss)
    ax.set(xlabel="epoch",ylabel="loss",title="logic gate training")
    plt.show()


if(__name__=="__main__"):
    main(sys.argv[1:])
Could someone please point out what I'm doing wrong here? I strongly suspect it has to do with the way I'm dealing with matrices, but at the same time I don't have the slightest idea what's going on.
Thanks for taking the time to read my question, and taking the time to respond (if relevant).
edit:
Actually, quite a lot is wrong with this, but I'm still a bit confused about how to fix it. Although the loss graph looks like it's training, and it kind of is, the math I've done above is wrong.
Look at the training function.
def train(self,X,Y,loss,epoch=5000000):
    # Quoted from the class above: runs `epoch` full-batch updates.
    for i in range(epoch):
        YHat = self.forward(X)
        delta = -(Y-YHat)  # residual of the output, prediction minus target
        loss.append(sum(Y-YHat))
        # Last layer: its cached localGrad combined with the residual.
        err = np.sum(np.dot(self.__layers[-1].localGrad,delta.T), axis=1)
        err.shape = (self.__hiddenDimensions[-1][0],1)
        self.__layers[-1].adjustWeights(err)
        i=0
        # Remaining layers, last to first: err is multiplied by each layer's
        # localGrad only.
        for l in reversed(self.__layers[:-1]):
            err = np.dot(l.localGrad, err)
            l.adjustWeights(err)
            i += 1
Note how I get delta = -(Y-Yhat) and then dot product it with the "local gradient" of the last layer. The "local gradient" is the local W gradient.
def forward(self,X):
    # Layer forward pass: affine transform followed by the sigmoid.
    a = np.dot(X, self.__weights) + self.__biases
    # The "local W gradient" discussed in the text: X^T . sigmoid'(a).
    self.localGrad = np.dot(X.T,self.__sigmoidPrime(a))
    return self.__sigmoid(a)
I'm skipping a step in the chain rule. I should really be multiplying by W* sigprime(XW + b) first as that's the local gradient of X, then by the local W gradient. I tried that, but I'm still getting issues, here is the new forward method (note the __init__ for layers needs to be initialised for the new vars, and I changed the activation function to tanh)
def forward(self, X):
    # Revised layer forward pass using tanh; caches three gradient terms.
    a = np.dot(X, self.__weights) + self.__biases
    self.localPartialGrad = self.__tanhPrime(a)  # activation slope at a
    self.localWGrad = np.dot(X.T, self.localPartialGrad)  # X^T . tanh'(a)
    self.localXGrad = np.dot(self.localPartialGrad,self.__weights.T)  # tanh'(a) . W^T
    return self.__tanh(a)
and updated the training method to look something like this:
def train(self, X, Y, loss, epoch=5000):
    # Revised training loop: err starts as the output residual and is handed
    # to every layer from last to first.
    for e in range(epoch):
        Yhat = self.forward(X)
        err = -(Y-Yhat)
        loss.append(sum(err))
        print("loss:\n",sum(err))
        for l in self.__layers[::-1]:
            l.adjustWeights(err)
            # For every layer except the first, scale err element-wise by the
            # layer's cached gradients before passing it on.
            # NOTE(review): np.multiply is element-wise, so err and the cached
            # arrays must be broadcast-compatible -- confirm the shapes.
            if(l != self.__layers[0]):
                err = np.multiply(err,l.localPartialGrad)
                err = np.multiply(err,l.localXGrad)
The new graphs I'm getting are all over the place, I have no idea what's going on. Here is the final bit of code I changed:
def adjustWeights(self, err):
    # Scale the incoming error by the cached activation slope.
    perr = np.multiply(err, self.localPartialGrad)
    # Multiply by the weights and sum over axis 1, leaving one value per weight row.
    werr = np.sum(np.dot(self.__weights,perr.T),axis=1)
    werr = werr * self.__epsilon
    # Reshape to a column so it broadcasts across the weight columns.
    werr.shape = (self.__weights.shape[0],1)
    self.__weights = self.__weights - werr
Your network is learning, as can be seen from the loss chart, so backprop implementation is correct (congrats!). The main problem with this particular architecture is the choice of the activation function: sigmoid. I have replaced sigmoid with tanh and it works much better instantly.
From this discussion on CV.SE:
There are two reasons for that choice (assuming you have normalized
your data, and this is very important):
Having stronger gradients: since data is centered around 0, the
derivatives are higher. To see this, calculate the derivative of the
tanh function and notice that input values are in the range [0,1]. The
range of the tanh function is [-1,1] and that of the sigmoid function
is [0,1]
Avoiding bias in the gradients. This is explained very well in the
paper, and it is worth reading it to understand these issues.
Though I'm sure sigmoid-based NN can be trained as well, looks like it's much more sensitive to input values (note that they are not zero-centered), because the activation itself is not zero-centered. tanh is better than sigmoid by all means, so a simpler approach is just use that activation function.
The key change is this:
def __tanh(self, z):
    """Hyperbolic tangent activation of ``z``."""
    activated = np.tanh(z)
    return activated
def __tanhPrime(self, a):
    """Derivative of tanh at pre-activation ``a``: ``1 - tanh(a)**2``."""
    squared = self.__tanh(a) ** 2
    return 1 - squared
... instead of __sigmoid and __sigmoidPrime.
I have also tuned hyperparameters a little bit, so that the network now learns in 100k epochs, instead of 5m:
prior to training:
[[ 0. ]
[-0.00056925]
[-0.00044885]
[-0.00101794]]
post training:
[[0. ]
[0.97335842]
[0.97340917]
[0.98332273]]
A complete code is in this gist.
Well, I'm an idiot. I was right about being wrong, but I was wrong about how wrong I was. Let me explain.
Within the backward training method I got the last layer trained correctly, but all the layers before it weren't trained correctly. That is why the above network was coming up with a result: it was indeed training, but only one layer.
So what did I do wrong? Well, I was only multiplying by the local gradient of the weights with respect to the output, and thus the chain rule was only partially correct.
Lets say the loss function was this:
t = Y-X2
loss = 1/2*(t)^2
a2 = X1W2 + b
X2 = activation(a2)
a1 = X0W1 + b
X1 = activation(a1)
We know that the derivative of the loss with respect to W2 would be -(Y-X2)*X1. This was done in the first part of my training function:
def train(self,X,Y,loss,epoch=5000000):
    # Original training loop, quoted again with its two parts marked.
    for i in range(epoch):
        #First part
        # Output residual combined with the last layer's cached localGrad.
        YHat = self.forward(X)
        delta = -(Y-YHat)
        loss.append(sum(Y-YHat))
        err = np.sum(np.dot(self.__layers[-1].localGrad,delta.T), axis=1)
        err.shape = (self.__hiddenDimensions[-1][0],1)
        self.__layers[-1].adjustWeights(err)
        i=0
        #Second part
        # Earlier layers: err is multiplied by each layer's localGrad only.
        for l in reversed(self.__layers[:-1]):
            err = np.dot(l.localGrad, err)
            l.adjustWeights(err)
            i += 1
However the second part is where I screwed up. In order to calculate the loss with respect to W1, I must multiply the original error -(Y-X2) by W2 as W2 is the local X Gradient of the last layer, and due to the chain rule this must be done first. Then I could multiply by the local W gradient (X1) to get the loss with respect to W1. I failed to do the multiplication of the local X gradient first, so the last layer was indeed training, but all layers after that had an error that magnified as the layer increased.
To solve this I updated the train method:
def train(self,X,Y,loss,epoch=10000):
    # Updated training loop: keeps the running error (err) separate from the
    # per-layer weight error (werr).
    for i in range(epoch):
        YHat = self.forward(X)
        err = -(Y-YHat)
        loss.append(sum(Y-YHat))
        # Last layer: its local W gradient combined with the output residual.
        werr = np.sum(np.dot(self.__layers[-1].localWGrad,err.T), axis=1)
        werr.shape = (self.__hiddenDimensions[-1][0],1)
        self.__layers[-1].adjustWeights(werr)
        # Earlier layers: first scale err by the layer's local X gradient,
        # then derive that layer's weight error from it.
        # NOTE(review): this reads l.weights, a public attribute not shown in
        # the layer code above -- confirm the layer exposes it.
        for l in reversed(self.__layers[:-1]):
            err = np.multiply(err, l.localXGrad)
            werr = np.sum(np.dot(l.weights,err.T),axis=1)
            l.adjustWeights(werr)
Now the loss graph I got looks like this:
I am playing with vanilla Rnn's, training with gradient descent (non-batch version), and I am having an issue with the gradient computation for the (scalar) cost; here's the relevant portion of my code:
class Rnn(object):
# ............ [skipping the trivial initialization]
def recurrence(x_t, h_tm_prev):
h_t = T.tanh(T.dot(x_t, self.W_xh) +
T.dot(h_tm_prev, self.W_hh) + self.b_h)
return h_t
h, _ = theano.scan(
recurrence,
sequences=self.input,
outputs_info=self.h0
)
y_t = T.dot(h[-1], self.W_hy) + self.b_y
self.p_y_given_x = T.nnet.softmax(y_t)
self.y_pred = T.argmax(self.p_y_given_x, axis=1)
def negative_log_likelihood(self, y):
    """Return the negated mean log-probability in column ``y`` of ``p_y_given_x``."""
    log_probabilities = T.log(self.p_y_given_x)
    selected = log_probabilities[:, y]
    return -T.mean(selected)
def testRnn(dataset, vocabulary, learning_rate=0.01, n_epochs=50):
    """Train the Rnn one example at a time, with one weight update per epoch.

    Each call to ``train_model`` returns the cost of a single example and adds
    that example's gradients to per-parameter accumulators. ``sgd_step`` then
    applies the accumulated gradients and resets the accumulators.
    """
    # ............ [skipping the trivial initialization]
    index = T.lscalar('index')
    x = T.fmatrix('x')
    y = T.iscalar('y')
    rnn = Rnn(x, n_x=27, n_h=12, n_y=27)
    nll = rnn.negative_log_likelihood(y)

    # Differentiate the symbolic cost that is actually built from the
    # parameters. A fresh T.lscalar('cost') has no connection to them, which
    # is what raised DisconnectedInputError.
    gparams = [T.grad(nll, param) for param in rnn.params]

    # The gradient of the summed cost equals the sum of the per-example
    # gradients, so accumulate them in shared variables shaped like the
    # parameters (assumes rnn.params are Theano shared variables, as the
    # parameter updates already require).
    grad_sums = [theano.shared(param.get_value() * 0.)
                 for param in rnn.params]
    accumulate = [(grad_sum, grad_sum + gparam)
                  for grad_sum, gparam in zip(grad_sums, gparams)
                  ]
    # Updates are all computed from the old values, so the parameters see the
    # full accumulated gradient before the accumulators are zeroed.
    updates = [(param, param - learning_rate * grad_sum)
               for param, grad_sum in zip(rnn.params, grad_sums)
               ] + [(grad_sum, T.zeros_like(grad_sum))
                    for grad_sum in grad_sums
                    ]
    train_model = theano.function(
        inputs=[index],
        outputs=nll,
        updates=accumulate,
        givens={
            x: train_set_x[index],
            y: train_set_y[index]
        },
    )
    sgd_step = theano.function(
        inputs=[],
        outputs=[],
        updates=updates
    )
    done_looping = False
    while(epoch < n_epochs) and (not done_looping):
        epoch += 1
        tr_cost = 0.
        for idx in xrange(n_train_examples):
            tr_cost += train_model(idx)
        # perform sgd step after going through the complete training set
        sgd_step()
For some reasons I don't want to pass the complete (training) data to train_model(..); instead, I want to pass individual examples one at a time. The problem is that each call to train_model(..) returns the cost (negative log-likelihood) of that particular example, so I have to aggregate the costs over the complete (training) data set, then take the derivative and perform the relevant update to the weight parameters in sgd_step(..). For obvious reasons, with my current implementation I am getting this error: theano.gradient.DisconnectedInputError: grad method was asked to compute the gradient with respect to a variable that is not part of the computational graph of the cost, or is used only by a non-differentiable operator: W_xh. I don't understand how to make 'cost' a part of the computational graph (as in my case, where I have to wait for it to be aggregated). Or is there a better/more elegant way to achieve the same thing?
Thanks.
It turns out that one cannot bring a symbolic variable into a Theano graph if it is not part of the computational graph. Therefore, I had to change the way the data is passed to train_model(..); passing the complete training data instead of an individual example fixes the issue.