Understanding matrix obtained from indexing with arrays - python

In the logistic regression code listed below, I saw the following snippet. What throws me off is the expression:
probs[range(num_examples), y].
Can someone tell me what shape this has? My guess is that it's an N*K by N*K matrix, but I am not sure. Thanks.
import numpy as np
import matplotlib.pyplot as plt

np.random.seed(0)
N = 100  # number of points per class
D = 2    # dimensionality
K = 3    # number of classes
X = np.zeros((N*K, D))
y = np.zeros(N*K, dtype='uint8')
for j in range(K):
    ix = range(N*j, N*(j+1))
    r = np.linspace(0.0, 1, N)  # radius
    t = np.linspace(j*4, (j+1)*4, N) + np.random.randn(N)*0.2  # theta
    X[ix] = np.c_[r*np.sin(t), r*np.cos(t)]
    y[ix] = j

# Train a linear classifier

# initialize parameters randomly
W = 0.01 * np.random.randn(D, K)
b = np.zeros((1, K))

# some hyperparameters
step_size = 1e-0
reg = 1e-3  # regularization strength

# gradient descent loop
num_examples = X.shape[0]
for i in range(200):
    # evaluate class scores, [N x K]
    scores = np.dot(X, W) + b
    # compute the class probabilities
    exp_scores = np.exp(scores)
    probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)  # [N x K]
    # compute the loss: average cross-entropy loss and regularization
    correct_logprobs = -np.log(probs[range(num_examples), y])
    data_loss = np.sum(correct_logprobs)/num_examples
    reg_loss = 0.5*reg*np.sum(W*W)
    loss = data_loss + reg_loss
    if i % 10 == 0:
        print("iteration %d: loss %f" % (i, loss))
probs[range(num_examples), y] is a 1D array, not a matrix: indexing with two integer arrays of equal length pairs them up element-wise, where:
range(num_examples) indexes the rows, one entry per sample
y is a 1D vector of class labels, length N*K
The result therefore has shape (N*K,): its i-th entry is probs[i, y[i]], the predicted probability of sample i's correct class.
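A minimal sketch with made-up numbers makes the shape easy to see:

import numpy as np

probs = np.array([[0.7, 0.2, 0.1],
                  [0.1, 0.8, 0.1],
                  [0.3, 0.3, 0.4]])  # 3 samples x 3 classes, rows sum to 1
y = np.array([0, 1, 2])              # correct class for each row

picked = probs[range(3), y]          # pairs row i with column y[i]
print(picked)                        # [0.7 0.8 0.4]
print(picked.shape)                  # (3,) -- a 1D array, not a 3x3 matrix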

Related

Implementation of backpropagation in Python

I'm following the Andrew Ng course on Machine Learning and I'm currently doing the week 5 exercise.
I've found myself stuck on the implementation of the backpropagation algorithm: the relative difference compared to the numerical gradient is very high (on the order of 1e-1), but I can't find any error in my implementation, so I'm gently asking if someone could take a look at it and explain what I did wrong.
Forward propagation:
def forward_propagation(thetas, X, history=False):
    activation_arr = []
    a = X  # X is the array of the first activation values
    for k in range(0, len(thetas)):
        a = add_intercept(a)  # add the bias unit
        a = sigmoid(a @ thetas[k].T)
        if history:
            activation_arr.append(a)
    return activation_arr if history else a
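Note that add_intercept isn't shown in the post; presumably it prepends the bias column, along the lines of this sketch:

import numpy as np

def add_intercept(a):
    # assumed helper: prepend a column of ones (the bias unit) to each example
    return np.hstack((np.ones((a.shape[0], 1)), a))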
Backpropagation:
def gradient_nn(thetas, X, y, num_labels, reg_lambda=None):
    n_examples = X.shape[0]
    Y = np.zeros((    # creates an n_examples x num_labels matrix
        n_examples,   # number of examples
        num_labels
    ))
    for i in range(n_examples):
        Y[i, y[i, 0]] = 1  # the index corresponding to the correct label for each row has value 1
    # add intercepted X to the activation array
    activation_arr = [add_intercept(X)] + forward_propagation(thetas, X, history=True)
    sigma = [activation_arr[-1] - Y]  # sigma^L = a^L - y
    delta = [sigma[-1].T @ activation_arr[-2]]  # find delta for the first row
    thetas_grad = []
    # Calculate sigma and delta
    for idx in range(1, len(thetas)):  # skip last iteration
        sigma = [
            (sigma[0] @ thetas[-idx][:, 1:]) * partial_derivative(activation_arr[-1-idx])
        ] + sigma
        delta = [
            sigma[0].T @ activation_arr[-2-idx]
        ] + delta
    return [np.sum(d) / n_examples for d in thetas_grad]
Partial derivative:
def partial_derivative(a):
    return a * (1 - a)  # element-wise multiplication
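This works because the sigmoid's derivative can be written in terms of its output, sigma'(z) = sigma(z) * (1 - sigma(z)), which is why partial_derivative is applied to activations rather than pre-activations. A quick numerical check:

import numpy as np

sigmoid = lambda z: 1 / (1 + np.exp(-z))

z = 0.7
a = sigmoid(z)
analytic = a * (1 - a)                                    # what partial_derivative computes
numeric = (sigmoid(z + 1e-6) - sigmoid(z - 1e-6)) / 2e-6  # central difference
print(analytic, numeric)                                  # both ~0.2217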
Numerical gradient:
def compute_numerical_gradient(cost_function, thetas):
    # Unroll parameters
    nn_params = unroll_thetas(thetas)
    num_grad = np.zeros(nn_params.shape)
    perturb = np.zeros(nn_params.shape)
    shapes = [theta.shape for theta in thetas]
    epsilon = 1e-4  # not the one used for random initialization
    for p in range(nn_params.shape[0]):
        # Set the perturbation vector
        perturb[p] = epsilon
        minus_theta = nn_params - perturb
        plus_theta = nn_params + perturb
        # --- Roll params back in order to use the cost function ---
        minus_theta = roll_thetas(minus_theta, shapes)
        plus_theta = roll_thetas(plus_theta, shapes)
        # calculate the loss of the cost function
        minus_loss = cost_function(minus_theta)
        plus_loss = cost_function(plus_theta)
        # Compute the numerical gradient
        num_grad[p] = (plus_loss - minus_loss) / (2 * epsilon)
        perturb[p] = 0
    num_grad = roll_thetas(num_grad, shapes)
    return [np.sum(num_g) for num_g in num_grad]
Cost function:
def J_nn(num_labels, reg_lambda=None):
    def non_reg_func(thetas, X, y):
        n_examples = X.shape[0]
        Y = np.zeros((    # creates an n_examples x num_labels matrix
            n_examples,   # number of examples
            num_labels
        ))
        for i in range(n_examples):
            Y[i, y[i, 0]] = 1  # the index corresponding to the correct label for each row has value 1
        prediction = forward_propagation(thetas, X)
        return np.sum(np.sum(-Y * np.log(prediction) - (1 - Y) * np.log(1 - prediction))) / n_examples

    if reg_lambda is None:
        func = non_reg_func
    else:  # regularization
        def func(thetas, X, y):
            cost = non_reg_func(thetas, X, y)
            for theta in thetas:  # regularize every layer
                theta = theta[1:]  # remove bias unit
                cost = cost + (reg_lambda / (2 * y.shape[0])) * np.sum(np.sum(theta[:, ] ** 2))
            return cost
    return func
Checking backpropagation with numerical gradient:
def check_nn_gradients(reg_lambda=None):
    """
    Creates a small neural network (max 8 x 8 x 7 x 8) and checks that
    the implementation of the backpropagation algorithm is good
    """
    # n_examples, sizes = random.randint(5, 10), [random.randint(2, 8), random.randint(2, 8), random.randint(1, 8)]
    n_examples, sizes = 5, [8, 8, 5, 4]
    n_labels = sizes[-1]  # last size is equal to the number of labels
    init_epsilon = 0.0001
    thetas = random_init_thetas(sizes, init_epsilon)
    X = np.array(
        random_init_thetas([sizes[0]-1, n_examples], init_epsilon)
    ).squeeze()  # squeeze because random_init_thetas returns a 3D array, but we want X to be 2D
    y = np.array([random.randint(0, n_labels-1) for _ in X])
    y = y[:, np.newaxis]
    inner_cost = lambda _thetas: J_nn(n_labels, reg_lambda)(_thetas, X, y)
    gradients = gradient_nn(thetas, X, y, n_labels, 0)
    unrolled_gradients = unroll_thetas(gradients)
    print(unrolled_gradients)
    # finite difference method
    grad_checking_epsilon = 1e-4
    num_grad = compute_numerical_gradient(inner_cost, thetas)
    unrolled_num_grad = unroll_thetas(num_grad)
    print(unrolled_num_grad)
    return np.linalg.norm(unrolled_num_grad - unrolled_gradients) / np.linalg.norm(unrolled_num_grad + unrolled_gradients)
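For reference, compute_numerical_gradient implements the standard central-difference check; here is a self-contained sketch of the same idea on a toy cost (the quadratic is a made-up placeholder):

import numpy as np

def numerical_grad(f, theta, eps=1e-4):
    # central-difference approximation of the gradient of f at theta
    grad = np.zeros_like(theta)
    for p in range(theta.size):
        e = np.zeros_like(theta)
        e[p] = eps
        grad[p] = (f(theta + e) - f(theta - e)) / (2 * eps)
    return grad

f = lambda t: np.sum(t ** 2)        # toy cost; the analytic gradient is 2*t
theta = np.array([1.0, -2.0, 0.5])
print(numerical_grad(f, theta))     # ~[ 2. -4.  1.]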

Network Cost Function code Python Implementation

I was implementing Andrew Ng's ML course in Python, and in week 5 exercise 4 I was referring to some code. What I didn't understand was the need for np.trace() in the final output; I'm having a problem visualising the matrices.
import numpy as np
from scipy.optimize import minimize
import scipy.io
import matplotlib.pyplot as plt

# sigmoid isn't defined in the original snippet; a standard definition:
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

data_dict = scipy.io.loadmat('ex4_orig_octave/ex4data1.mat')
X = data_dict['X']
y = data_dict['y'].ravel()
M = X.shape[0]
N = X.shape[1]
L = 26  # number of nodes in the hidden layer (including bias node)
K = len(np.unique(y))
X = np.hstack((np.ones((M, 1)), X))
Y = np.zeros((M, K), dtype='uint8')
for i, row in enumerate(Y):
    Y[i, y[i] - 1] = 1
weights_dict = scipy.io.loadmat('ex4_orig_octave/ex4weights.mat')
theta_1 = weights_dict['Theta1']
theta_2 = weights_dict['Theta2']
nn_params_saved = np.concatenate((theta_1.flatten(), theta_2.flatten()))

def nn_cost_function(nn_params, X, Y, M, N, L, K):
    """Python version of nnCostFunction.m after completing 'Part 1'."""
    # Unroll the parameter vector.
    theta_1 = nn_params[:(L - 1) * (N + 1)].reshape(L - 1, N + 1)
    theta_2 = nn_params[(L - 1) * (N + 1):].reshape(K, L)
    # Calculate activations in the second layer.
    a_2 = sigmoid(theta_1.dot(X.T))
    # Add the second layer's bias node.
    a_2_p = np.vstack((np.ones(M), a_2))
    # Calculate the activation of the third layer.
    a_3 = sigmoid(theta_2.dot(a_2_p))
    # Calculate the cost function.
    cost = 1 / M * np.trace(- Y.dot(np.log(a_3)) - (1 - Y).dot(np.log(1 - a_3)))
    return cost

cost_saved = nn_cost_function(nn_params_saved, X, Y, M, N, L, K)
print('Cost at parameters (loaded from ex4weights): %.6f' % cost_saved)
print('(this value should be about 0.287629)')
The operation 1/M * np.trace(...) computes the average cost over a batch of size M. Y.dot(np.log(a_3)) is an M x M matrix whose diagonal entry (i, i) is sum_k Y[i, k] * log(a_3)[k, i], i.e. example i's own cost contribution; the trace adds up exactly these diagonal terms and ignores the off-diagonal cross terms between different examples.
A bit less readable, but significantly faster, is
np.sum(np.sum(np.multiply(Y, np.log(a_3.T)), axis=1), axis=0)
if Y.shape == (M, K) and a_3.shape == (K, M):
Y = lambda : np.random.uniform(size=(5000,10)) # (M,K)
a3 = lambda : np.random.uniform(size=(10,5000)) # (K,M)
timeit.timeit('import numpy as np; np.trace(Y().dot(a3()))', number=10, globals=globals())
# 0.5633535870001651
timeit.timeit('import numpy as np; np.sum(np.sum(np.multiply(Y(),a3().T),axis=1),axis=0)', number=10, globals=globals())
# 0.013223066000136896
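To confirm the two forms agree numerically (a quick check with random stand-ins for Y and a_3):

import numpy as np

M, K = 5000, 10
Y = np.random.uniform(size=(M, K))             # stand-in for the one-hot label matrix
a3 = np.random.uniform(0.1, 1.0, size=(K, M))  # stand-in for the output activations

lhs = np.trace(Y.dot(np.log(a3)))
rhs = np.sum(np.multiply(Y, np.log(a3).T))
print(np.allclose(lhs, rhs))  # True -- the trace keeps only the diagonal terms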

Regularized Logistic Regression in Python (Andrew ng Course)

I'm starting the ML journey and I'm having trouble with this coding exercise.
Here is my code:
import numpy as np
import pandas as pd
import scipy.optimize as op

# Read the data and give it labels
data = pd.read_csv('ex2data2.txt', header=None, names=['Test1', 'Test2', 'Accepted'])

# Separate the features to make it fit into the mapFeature function
X1 = data['Test1'].values.T
X2 = data['Test2'].values.T

# This function makes more features (degree)
def mapFeature(x1, x2):
    degree = 6
    out = np.ones((x1.shape[0], sum(range(degree + 2))))
    curr_column = 1
    for i in range(1, degree + 1):
        for j in range(i+1):
            out[:, curr_column] = np.power(x1, i-j) * np.power(x2, j)
            curr_column += 1
    return out

# Separate the data into training and target, also initialize theta
X = mapFeature(X1, X2)
y = np.matrix(data['Accepted'].values).T
m, n = X.shape
cols = X.shape[1]
theta = np.matrix(np.zeros(cols))

# Initialize the learningRate (sigma)
learningRate = 1

# Define the sigmoid function (output between 0 and 1)
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def cost(theta, X, y, learningRate):
    # This is required to make the optimize function work
    theta = theta.reshape(-1, 1)
    error = sigmoid(X @ theta)
    first = np.multiply(-y, np.log(error))
    second = np.multiply(1 - y, np.log(1 - error))
    j = np.sum((first - second)) / m + (learningRate * np.sum(np.power(theta, 2)) / 2 * m)
    return j

# Define the gradient of the cost function
def gradient(theta, X, y, learningRate):
    # This is required to make the optimize function work
    theta = theta.reshape(-1, 1)
    error = sigmoid(X @ theta)
    grad = (X.T @ (error - y)) / m + ((learningRate * theta) / m)
    grad_no = (X.T @ (error - y)) / m
    grad[0] = grad_no[0]  # don't regularize the bias term
    return grad

Result = op.minimize(fun=cost, x0=theta, args=(X, y, learningRate), method='TNC', jac=gradient)
opt_theta = np.matrix(Result.x)

def predict(theta, X):
    sigValue = sigmoid(X @ theta.T)
    p = sigValue >= 0.5
    return p

p = predict(opt_theta, X)
print('Train Accuracy: {:f}'.format(np.mean(p == y) * 100))
So, when learningRate = 1 the accuracy should be around 83.05% but I'm getting 80.5%, and when learningRate = 0 the accuracy should be 91.52% but I'm getting 87.28%.
So the question is: what am I doing wrong? Why is my accuracy below the exercise's expected answer?
Hope someone can guide me in the right direction. Thanks!
P.S.: Here is the dataset, maybe it can help:
https://raw.githubusercontent.com/TheGirlWhiteWithBandages/Machine-Learning-Algorithms/master/Logistic%20Regression/ex2data2.txt
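One thing worth double-checking in the cost function above: Python's operator precedence makes ... / 2 * m equal to (... / 2) * m, not ... / (2 * m), so the regularization term ends up multiplied by m instead of divided by it. A quick illustration (the numbers are made up):

m = 118             # number of rows in ex2data2
s = 10.0            # stand-in for np.sum(np.power(theta, 2))
print(s / 2 * m)    # 590.0 -- scales the penalty up by m
print(s / (2 * m))  # ~0.0424 -- the intended regularization scaling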
Hey guys, I found a way to make it even better!
Here is the code:
import numpy as np
import pandas as pd
import scipy.optimize as op
from sklearn.preprocessing import PolynomialFeatures

# Read the data and give it labels
data = pd.read_csv('ex2data2.txt', header=None, names=['Test1', 'Test2', 'Accepted'])

# Separate the data into training and target
X = (data.iloc[:, 0:2]).values
y = (data.iloc[:, 2:3]).values

# Modify the features to a certain degree (polynomial)
poly = PolynomialFeatures(6)
m = y.size
XX = poly.fit_transform(data.iloc[:, 0:2].values)

# Initialize theta
theta = np.zeros(XX.shape[1])

# Define the sigmoid function (output between 0 and 1)
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

# Define the regularized cost function
def costFunctionReg(theta, reg, *args):
    # This is required to make the optimize function work
    h = sigmoid(XX @ theta)
    first = np.log(h).T @ -y
    second = np.log(1 - h).T @ (1 - y)
    J = (1 / m) * (first - second) + (reg / (2 * m)) * np.sum(np.square(theta[1:]))
    return J

# Define the regularized gradient function
def gradientReg(theta, reg, *args):
    theta = theta.reshape(-1, 1)
    h = sigmoid(XX @ theta)
    grad = (1 / m) * (XX.T @ (h - y)) + (reg / m) * np.r_[[[0]], theta[1:]]
    return grad.flatten()

# Define the predict function
def predict(theta, X):
    sigValue = sigmoid(X @ theta.T)
    p = sigValue >= 0.5
    return p

# A loop to test different values for sigma (the regularization parameter)
for i, Sigma in enumerate([0, 1, 100]):
    # Optimize costFunctionReg
    res2 = op.minimize(costFunctionReg, theta, args=(Sigma, XX, y), method=None, jac=gradientReg)
    # Get the accuracy of the model
    accuracy = 100 * sum(predict(res2.x, XX) == y.ravel()) / y.size
    # Get the error between different weights
    error1 = costFunctionReg(res2.x, Sigma, XX, y)
    # Print the accuracy and error
    print('Train accuracy {}% with Lambda = {}'.format(np.round(accuracy, decimals=4), Sigma))
    print(error1)
Thanks for all your help!
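As a side check, PolynomialFeatures(6) on two inputs produces the same 28 columns as the hand-rolled mapFeature (sum(range(6 + 2)) = 28), i.e. every monomial x1^i * x2^j with i + j <= 6; a quick sketch with made-up values:

import numpy as np
from sklearn.preprocessing import PolynomialFeatures

x1 = np.array([0.5, -1.0])
x2 = np.array([2.0, 0.3])

XX = PolynomialFeatures(6).fit_transform(np.c_[x1, x2])
print(XX.shape)  # (2, 28)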
Try this out:
# import libraries
import pandas as pd
import numpy as np

dataset = pd.read_csv('ex2data2.csv', names=['Test #1', 'Test #2', 'Accepted'])

# splitting into x and y variables for features and target variable
x = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values
print('x[0] ={}, y[0] ={}'.format(x[0], y[0]))
m, n = x.shape
print('#{} Number of training samples, #{} features per sample'.format(m, n))

# import library for feature mapping
from sklearn.preprocessing import PolynomialFeatures
# We also add one column of ones to represent theta_0 (x to the power of 0 = 1)
# by setting include_bias to True
pf = PolynomialFeatures(degree=6, include_bias=True)
x_poly = pf.fit_transform(x)
pd.DataFrame(x_poly).head(5)
m, n = x_poly.shape

# define theta as zero
theta = np.zeros(n)
# define hyperparameter λ
lambda_ = 1
# reshape (-1,1) because we just have one feature in the y column
y = y.reshape(-1, 1)

def sigmoid(z):
    return 1/(1+np.exp(-z))

def lr_hypothesis(x, theta):
    return np.dot(x, theta)

def compute_cost(theta, x, y, lambda_):
    theta = theta.reshape(n, 1)
    infunc1 = -y*(np.log(sigmoid(lr_hypothesis(x, theta)))) - ((1-y)*(np.log(1 - sigmoid(lr_hypothesis(x, theta)))))
    infunc2 = (lambda_*np.sum(theta[1:]**2))/(2*m)
    j = np.sum(infunc1)/m + infunc2
    return j

# gradient[0] corresponds to the gradient for theta(0)
# gradient[1:] corresponds to the gradient for theta(j), j > 0
def compute_gradient(theta, x, y, lambda_):
    gradient = np.zeros(n).reshape(n,)
    theta = theta.reshape(n, 1)
    infunc1 = sigmoid(lr_hypothesis(x, theta)) - y
    gradient_in = np.dot(x.transpose(), infunc1)/m
    gradient[0] = gradient_in[0, 0]  # theta(0)
    gradient[1:] = gradient_in[1:, 0] + (lambda_*theta[1:, ]/m).reshape(n-1,)  # theta(j); j > 0
    gradient = gradient.flatten()
    return gradient
You can now test your cost and gradient without optimization. The code below will optimize the model:
# hyperparameters
m, n = x_poly.shape
# define theta as zero
theta = np.zeros(n)
# define hyperparameter λ
lambda_array = [0, 1, 10, 100]

import scipy.optimize as opt
for i in range(0, len(lambda_array)):
    # Train
    print('======================================== Iteration {} ===================================='.format(i))
    optimized = opt.minimize(fun=compute_cost, x0=theta, args=(x_poly, y, lambda_array[i]),
                             method='TNC', jac=compute_gradient)
    new_theta = optimized.x
    # Prediction
    y_pred_train = predictor(x_poly, new_theta)
    cm_train = confusion_matrix(y, y_pred_train)
    t_train, f_train, acc_train = acc(cm_train)
    print('With lambda = {}, {} correct, {} wrong ==========> accuracy = {}%'
          .format(lambda_array[i], t_train, f_train, acc_train*100))
Now you should see output like this:
=== Iteration 0 === With lambda = 0, 104 correct, 14 wrong ==========> accuracy = 88.13559322033898%
=== Iteration 1 === With lambda = 1, 98 correct, 20 wrong ==========> accuracy = 83.05084745762711%
=== Iteration 2 === With lambda = 10, 88 correct, 30 wrong ==========> accuracy = 74.57627118644068%
=== Iteration 3 === With lambda = 100, 72 correct, 46 wrong ==========> accuracy = 61.016949152542374%
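Note that predictor, confusion_matrix, and acc are used above without being defined; confusion_matrix is presumably sklearn's, and the other two could look something like this sketch (hypothetical helpers, not from the original answer):

import numpy as np
from sklearn.metrics import confusion_matrix

def predictor(x, theta):
    # hypothetical helper: predict class 1 where the sigmoid output reaches 0.5
    return (1 / (1 + np.exp(-(x @ theta))) >= 0.5).astype(int)

def acc(cm):
    # hypothetical helper: (correct, wrong, accuracy) from a confusion matrix
    correct = np.trace(cm)
    wrong = np.sum(cm) - correct
    return correct, wrong, correct / np.sum(cm)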

How to calculate cost for softmax regression with pytorch

I would like to calculate the cost for the softmax regression.
The cost function to calculate is given at the bottom of the post.
For numpy I can get the cost as follows:
"""
X.shape = 2,300 # floats
y.shape = 300, # integers
W.shape = 2,3
b.shape = 3,1
"""
import numpy as np
np.random.seed(100)
# Data and labels
X = np.random.randn(300,2)
y = np.ones(300)
y[0:100] = 0
y[200:300] = 2
y = y.astype(np.int)
# weights and bias
W = np.random.randn(2,3)
b = np.random.randn(3)
N = X.shape[0]
scores = np.dot(X, W) + b
hyp = np.exp(scores-np.max(scores, axis=0, keepdims=True))
probs = hyp / np.sum(hyp, axis = 0)
logprobs = np.log(probs[range(N),y])
cost_data = -1/N * np.sum(logprobs)
print("hyp.shape = {}".format(hyp.shape)) # hyp.shape = (300, 3)
print(cost_data)
But when I tried torch, I could not get this.
So far I have got this:
"""
X.shape = 2,300 # floats
y.shape = 300, # integers
W.shape = 2,3
b.shape = 3,1
"""
import numpy as np
import torch
from torch.autograd import Variable
np.random.seed(100)
# Data and labels
X = np.random.randn(300,2)
y = np.ones(300)
y[0:100] = 0
y[200:300] = 2
y = y.astype(np.int)
X = Variable(torch.from_numpy(X),requires_grad=True).type(torch.FloatTensor)
y = Variable(torch.from_numpy(y),requires_grad=True).type(torch.LongTensor)
# weights and bias
W = Variable(torch.randn(2,3),requires_grad=True)
b = Variable(torch.randn(3),requires_grad=True)
N = X.shape[0]
scores = torch.mm(X, W) + b
hyp = torch.exp(scores - torch.max(scores))
probs = hyp / torch.sum(hyp)
correct_probs = probs[range(N),y] # got problem HERE
# logprobs = np.log(correct_probs)
# cost_data = -1/N * torch.sum(logprobs)
# print(cost_data)
I have a problem calculating the correct probabilities for the classes.
How can we solve this problem and get the correct cost value?
The cost function to calculate is the average cross-entropy over the correct classes, cost = -1/N * sum_i log(probs[i, y_i]).
Your problem is that range(N) isn't accepted as a tensor index here; use an index tensor such as torch.arange(N) to pick each row's correct class instead:
hyp = torch.exp(scores - torch.max(scores))
probs = hyp / torch.sum(hyp)
correct_probs = probs[torch.arange(N), y]  # problem solved
logprobs = torch.log(correct_probs)
cost_data = -1/N * torch.sum(logprobs)
Another point is that your labels y do not require gradients; you would better have:
y = Variable(torch.from_numpy(y), requires_grad=False).type(torch.LongTensor)
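For completeness, current PyTorch versions make the whole computation much shorter; a sketch of the same cost using the built-in cross-entropy (which applies a numerically stable log-softmax internally):

import torch
import torch.nn.functional as F

torch.manual_seed(0)
scores = torch.randn(300, 3)     # raw class scores, [N x K]
y = torch.randint(0, 3, (300,))  # integer class labels

# manual version: row-wise softmax, then pick each row's correct class
probs = torch.softmax(scores, dim=1)
manual = -torch.log(probs[torch.arange(300), y]).mean()

built_in = F.cross_entropy(scores, y)    # same quantity, computed stably
print(torch.allclose(manual, built_in))  # True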

Simple Neural-Network that gives the summation of the input variables as output in Python?

Can someone please make a simple neural network that gives the summation of the input variables as the output?
For example, if the input variables are X1, X2, X3, then the output is Y = X1 + X2 + X3.
A simple Python program using matrix multiplication would be helpful.
Thank you.
Here is the code I am trying; it is just a modified version of "iamtrask"'s code, but it is not giving me the right answer and tends to saturate at [1.] when I increase the number of test cases (set_size).
import numpy as np

outputs = []
# initializing hyperparameters
set_size = 20
iterations = 10000
input_variables = 3

# sigmoid function
def nonlin(x, deriv=False):
    if deriv == True:
        return 1 * (1 - x)
    return 1 / (1 + np.exp(-x))

# inverse of the sigmoid, the logit function
def logit(x):
    return np.log(x/(1-x))

# initializing inputs with random values
inputs = 2 * np.random.random((set_size, input_variables)) - 1
X = np.array(inputs)

# getting the desired output using mathematical operations
for h in range(set_size):
    outputs.append(nonlin((X[h][0]) + (X[h][1]) + (X[h][2])))

# output dataset
y = np.array([outputs]).T  # converting the list into an array and taking the transpose

# seed random numbers to make the calculation
# deterministic (just a good practice)
np.random.seed(1)

# initialize weights randomly with mean 0
syn0 = 2 * np.random.random((input_variables, set_size)) - 1
syn1 = 2 * np.random.random((set_size, 1)) - 1
print(y)

for iter in range(0, iterations):
    # forward propagation
    l0 = X
    l1 = nonlin(np.dot(l0, syn0))
    l2 = nonlin(np.dot(l1, syn1))
    # how much did we miss?
    #l1_error = y - l1
    l2_error = y - l2
    #print(l1_error)
    l2_delta = l2_error * nonlin(l2, deriv=True)
    l1_error = l2_delta.dot(syn1.T)
    # multiply how much we missed by the
    # slope of the sigmoid at the values in l1
    l1_delta = l1_error * nonlin(l1, deriv=True)
    # update weights
    syn1 += l1.T.dot(l2_delta)
    syn0 += l0.T.dot(l1_delta)

print("Output After Training:")
#out = logit(l2)
print(l2)

# testing the trained network with new values
X1 = input("Enter the new inputs:")
mynums = [float(i) for i in X1.split()]
#mynums = map(float, X1.split())
print(mynums)
l0 = mynums
l1 = nonlin(np.dot(l0, syn0))
l2 = nonlin(np.dot(l1, syn1))
print(l2)
"A bare bones neural network implementation to describe the inner workings of backpropagation." 11 lines of code!
X = np.array([ [0,0,1],[0,1,1],[1,0,1],[1,1,1] ])
y = np.array([[0,1,1,0]]).T
syn0 = 2*np.random.random((3,4)) - 1
syn1 = 2*np.random.random((4,1)) - 1
for j in xrange(60000):
l1 = 1/(1+np.exp(-(np.dot(X,syn0))))
l2 = 1/(1+np.exp(-(np.dot(l1,syn1))))
l2_delta = (y - l2)*(l2*(1-l2))
l1_delta = l2_delta.dot(syn1.T) * (l1 * (1-l1))
syn1 += l1.T.dot(l2_delta)
syn0 += X.T.dot(l1_delta)
http://iamtrask.github.io/2015/07/12/basic-python-network/
I have modified Trask's code this way:
import numpy as np

# sigmoid function
def nonlin(x, deriv=False):
    if deriv == True:
        return x*(1-x)
    return 1/(1+np.exp(-x))

# input dataset of 100 triples of X1, X2 & X3 numbers lying between 1 and 3
X = np.random.randint(1, 3, size=(100, 3)).astype(int)
# rescaling (normalizing) them
X = X/(3*3)  # (max_element * no_of_inputs)

# output dataset
y = np.sum(X, axis=1, keepdims=True)
# normalizing
y = y/(3*3)

# initializing weights
np.random.seed(1)
# randomly initialize our weights with mean 0
syn0 = 2*np.random.random((3, 4)) - 1
syn1 = 2*np.random.random((4, 1)) - 1

# training
for iter in range(30000):
    # forward propagation
    l0 = X
    l1 = nonlin(np.dot(l0, syn0))
    l2 = nonlin(np.dot(l1, syn1))
    # how much did we miss?
    l2_error = y - l2
    #if (iter % 100) == 0:
    #    print("Error:" + str(np.mean(np.abs(l2_error))))
    l2_delta = l2_error*nonlin(l2, deriv=True)
    l1_error = l2_delta.dot(syn1.T)
    # multiply how much we missed by the
    # slope of the sigmoid at the values in l1
    l1_delta = l1_error * nonlin(l1, True)
    # update weights
    syn1 += l1.T.dot(l2_delta)
    syn0 += l0.T.dot(l1_delta)

# predict
l0 = [[3, 1, 0]]  # should give 4 as the answer
l1 = nonlin(np.dot(l0, syn0))
l2 = nonlin(np.dot(l1, syn1))
print(l2*(3*3))
# (3*3) will invert the effect of normalization
The result was 4.157 (quite accurate).
I think the issue was with normalization.
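It's also worth noting that summation is a linear function, so a single linear neuron (no sigmoid at all) learns it exactly, with no normalization tricks needed; a minimal sketch:

import numpy as np

np.random.seed(0)
X = np.random.uniform(1, 3, size=(100, 3))  # inputs
y = np.sum(X, axis=1, keepdims=True)        # target: X1 + X2 + X3

w = np.random.randn(3, 1) * 0.1  # one linear layer, no activation
lr = 0.01
for _ in range(2000):
    pred = X @ w
    grad = X.T @ (pred - y) / len(X)  # gradient of the mean squared error
    w -= lr * grad

print(w.ravel())                  # converges to ~[1. 1. 1.]
print(np.array([[3, 1, 0]]) @ w)  # ~[[4.]]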
