In code listed in logistic regression code, I saw the following code snippet. What throws me off is the expression:
probs[range(num_examples),y].
Can someone tell me what dimension this matrix has? My guess is that it's a N*K by N*K matrix, but I am not sure. Thanks.
import numpy as np
import matplotlib.pyplot as plt
np.random.seed(0)
N = 100 # number of points per class
D = 2 # dimensionality
K = 3 # number of classes
X = np.zeros((N*K,D))
y = np.zeros(N*K, dtype='uint8')
for j in xrange(K):
ix = range(N*j,N*(j+1))
r = np.linspace(0.0,1,N) # radius
t = np.linspace(j*4,(j+1)*4,N) + np.random.randn(N)*0.2 # theta
X[ix] = np.c_[r*np.sin(t), r*np.cos(t)]
y[ix] = j
#Train a Linear Classifier
# initialize parameters randomly
W = 0.01 * np.random.randn(D,K)
b = np.zeros((1,K))
# some hyperparameters
step_size = 1e-0
reg = 1e-3 # regularization strength
# gradient descent loop
num_examples = X.shape[0]
for i in xrange(200):
# evaluate class scores, [N x K]
scores = np.dot(X, W) + b
# compute the class probabilities
exp_scores = np.exp(scores)
probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True) # [N x K]
# compute the loss: average cross-entropy loss and regularization
corect_logprobs = -np.log(probs[range(num_examples),y])
data_loss = np.sum(corect_logprobs)/num_examples
reg_loss = 0.5*reg*np.sum(W*W)
loss = data_loss + reg_loss
if i % 10 == 0:
probs[range(num_examples), y] seems to be a 1D slice, where:
range(num_examples) is a vector spanning the length of your samples
y is a 1D vector, length N*K
Related
I'm following the Andrew-Ng course on Machine Learning and I'm currently doing the week 5 exercise.
I've found myself stuck on the implementation of the backpropagation algorithm, due to the fact that the relative difference, compared to numerical gradient, is very high (order of 1e-1), but I can't find any error within my implementation, so I'm gently asking if someone could take a look at it and explain what I did wrong.
Forward propagation:
def forward_propagation(thetas, X, history=False):
activation_arr = []
a = X # X is the array of the first activation values
for k in range(0, len(thetas)):
a = add_intercept(a) # add the bias unit
a = sigmoid(a # thetas[k].T)
if history:
activation_arr.append(a)
return activation_arr if history else a
Backpropagation:
def gradient_nn(thetas, X, y, num_labels, reg_lambda=None):
n_examples = X.shape[0]
Y = np.zeros(( # creates a n_examples X num_labels matrix
n_examples, # n of examples
num_labels
))
for i in range(n_examples):
Y[i, y[i, 0]] = 1 # the index corresponding to the correct label for each row has value = 1
# add intercepted X to the activation array
activation_arr = [add_intercept(X)] + forward_propagation(thetas, X, history=True)
sigma = [activation_arr[-1] - Y] # sigma^L = a^L - y
delta = [sigma[-1].T # activation_arr[-2]] # find delta for the first row
thetas_grad = []
# Calculate sigma and delta
for idx in range(1, len(thetas)): # skip last iteration
sigma = [
(sigma[0] # thetas[-idx][:, 1:]) * partial_derivative(activation_arr[-1-idx])
] + sigma
delta = [
sigma[0].T # activation_arr[-2-idx]
] + delta
return [np.sum(d) / n_examples for d in thetas_grad]
Partial derivative:
def partial_derivative(a):
return a * (1 - a) # element wise multiplication
Numerical gradient:
def compute_numerical_gradient(cost_function, thetas):
# Unroll parameters
nn_params = unroll_thetas(thetas)
num_grad = np.zeros(nn_params.shape)
perturb = np.zeros(nn_params.shape)
shapes = [theta.shape for theta in thetas]
epsilon = 1e-4 # not the one of random initialization
for p in range(nn_params.shape[0]):
# Set perturbation vector
perturb[p] = epsilon
minus_theta = nn_params - perturb
plus_theta = nn_params + perturb
# --- Roll params back in order to use the cost function ---
minus_theta = roll_thetas(minus_theta, shapes)
plus_theta = roll_thetas(plus_theta, shapes)
# calculate the loss of the cost function
minus_loss = cost_function(minus_theta)
plus_loss = cost_function(plus_theta)
# Compute Numerical Gradient
num_grad[p] = (plus_loss - minus_loss) / (2 * epsilon)
perturb[p] = 0
num_grad = roll_thetas(num_grad, shapes)
return [np.sum(num_g) for num_g in num_grad]
Cost function:
def J_nn(num_labels, reg_lambda=None):
def non_reg_func(thetas, X, y):
n_examples = X.shape[0]
Y = np.zeros(( # creates a n_examples X num_labels matrix
n_examples, # n of examples
num_labels
))
for i in range(n_examples):
Y[i, y[i, 0]] = 1 # the index corresponding to the correct label for each row has value = 1
prediction = forward_propagation(thetas, X)
return np.sum(np.sum(-Y * np.log(prediction) - (1 - Y) * np.log(1 - prediction))) / n_examples
if reg_lambda is None:
func = non_reg_func
else: # regularization
def func(thetas, X, y):
cost = non_reg_func(thetas, X, y)
for theta in thetas: # regularize for every layer
theta = theta[1:] # remove bias unit
cost = cost + (reg_lambda / (2 * y.shape[0])) * np.sum(np.sum(theta[:, ] ** 2))
return cost
return func
Checking backpropagation with numerical gradient:
def check_nn_gradients(reg_lambda=None):
"""
Creates a small neural network (max 8 x 8 x 7 x 8) and checks that
the implementation of the backpropagation algorithm is good
"""
#n_examples, sizes = random.randint(5, 10), [random.randint(2, 8), random.randint(2, 8), random.randint(1, 8)]
n_examples, sizes = 5, [8, 8, 5, 4]
n_labels = sizes[-1] # Last size is equal to the number of labels
init_epsilon = 0.0001
thetas = random_init_thetas(sizes, init_epsilon)
X = np.array(
random_init_thetas([sizes[0]-1, n_examples], init_epsilon)
).squeeze() # We squeeze it because random_init_thetas returns a 3D array, but we want X to be 2D
y = np.array([random.randint(0, n_labels-1) for _ in X])
y = y[:, np.newaxis]
inner_cost = lambda _thetas: J_nn(n_labels, reg_lambda)(_thetas, X, y)
gradients = gradient_nn(thetas, X, y, n_labels, 0)
unrolled_gradients = unroll_thetas(gradients)
print(unrolled_gradients)
# finite difference method
grad_checking_epsilon = 1e-4
num_grad = compute_numerical_gradient(inner_cost, thetas)
unrolled_num_grad = unroll_thetas(num_grad)
print(unrolled_num_grad)
return diff = np.linalg.norm(unrolled_num_grad - unrolled_gradients) / np.linalg.norm(unrolled_num_grad + unrolled_gradients)
I was implementing Andrew NG’s ML course in Python and in week 5 exercise 4 I was referring to a code. What I didn’t understand was the need to use np.trace() in the final output. Having a problem visualising the matrices
import numpy as np
from scipy.optimize import minimize
import scipy.io
import matplotlib.pyplot as plt
data_dict = scipy.io.loadmat('ex4_orig_octave/ex4data1.mat')
X = data_dict['X']
y = data_dict['y'].ravel()
M = X.shape[0]
N = X.shape[1]
L = 26 # = number of nodes in the hidden layer (including bias node)
K = len(np.unique(y))
X = np.hstack((np.ones((M, 1)), X))
Y = np.zeros((M, K), dtype='uint8')
for i, row in enumerate(Y):
Y[i, y[i] - 1] = 1
weights_dict = scipy.io.loadmat('ex4_orig_octave/ex4weights.mat')
theta_1 = weights_dict['Theta1']
theta_2 = weights_dict['Theta2']
nn_params_saved = np.concatenate((theta_1.flatten(), theta_2.flatten()))
def nn_cost_function(nn_params, X, Y, M, N, L, K):
"""Python version of nnCostFunction.m after completing 'Part 1'."""
# Unroll the parameter vector.
theta_1 = nn_params[:(L - 1) * (N + 1)].reshape(L - 1, N + 1)
theta_2 = nn_params[(L - 1) * (N + 1):].reshape(K, L)
# Calculate activations in the second layer.
a_2 = sigmoid(theta_1.dot(X.T))
# Add the second layer's bias node.
a_2_p = np.vstack((np.ones(M), a_2))
# Calculate the activation of the third layer.
a_3 = sigmoid(theta_2.dot(a_2_p))
# Calculate the cost function.
cost = 1 / M * np.trace(- Y.dot(np.log(a_3)) - (1 - Y).dot(np.log(1 - a_3)))
return cost
cost_saved = nn_cost_function(nn_params_saved, X, Y, M, N, L, K)
print 'Cost at parameters (loaded from ex4weights): %.6f' % cost_saved
print '(this value should be about 0.287629)'
The operation 1/M * np.trace() is calculating the average cost over a batch of size M:
A bit less readable, but significantly faster should be:
np.sum(np.sum(Y.multiply(np.log(a_3.T)),axis=1),axis=0)
, if Y.shape==(M,K) and a_3.shape==(K,M):
Y = lambda : np.random.uniform(size=(5000,10)) # (M,K)
a3 = lambda : np.random.uniform(size=(10,5000)) # (K,M)
timeit.timeit('import numpy as np; np.trace(Y().dot(a3()))', number=10, globals=globals())
# 0.5633535870001651
timeit.timeit('import numpy as np; np.sum(np.sum(np.multiply(Y(),a3().T),axis=1),axis=0)', number=10, globals=globals())
# 0.013223066000136896
I'm starting the ML journey and I'm having troubles with this coding exercise
here is my code
import numpy as np
import pandas as pd
import scipy.optimize as op
# Read the data and give it labels
data = pd.read_csv('ex2data2.txt', header=None, name['Test1', 'Test2', 'Accepted'])
# Separate the features to make it fit into the mapFeature function
X1 = data['Test1'].values.T
X2 = data['Test2'].values.T
# This function makes more features (degree)
def mapFeature(x1, x2):
degree = 6
out = np.ones((x1.shape[0], sum(range(degree + 2))))
curr_column = 1
for i in range(1, degree + 1):
for j in range(i+1):
out[:,curr_column] = np.power(x1, i-j) * np.power(x2, j)
curr_column += 1
return out
# Separate the data into training and target, also initialize theta
X = mapFeature(X1, X2)
y = np.matrix(data['Accepted'].values).T
m, n = X.shape
cols = X.shape[1]
theta = np.matrix(np.zeros(cols))
#Initialize the learningRate(sigma)
learningRate = 1
# Define the Sigmoid Function (Output between 0 and 1)
def sigmoid(z):
return 1 / (1 + np.exp(-z))
def cost(theta, X, y, learningRate):
# This is require to make the optimize function work
theta = theta.reshape(-1, 1)
error = sigmoid(X # theta)
first = np.multiply(-y, np.log(error))
second = np.multiply(1 - y, np.log(1 - error))
j = np.sum((first - second)) / m + (learningRate * np.sum(np.power(theta, 2)) / 2 * m)
return j
# Define the gradient of the cost function
def gradient(theta, X, y, learningRate):
# This is require to make the optimize function work
theta = theta.reshape(-1, 1)
error = sigmoid(X # theta)
grad = (X.T # (error - y)) / m + ((learningRate * theta) / m)
grad_no = (X.T # (error - y)) / m
grad[0] = grad_no[0]
return grad
Result = op.minimize(fun=cost, x0=theta, args=(X, y, learningRate), method='TNC', jac=gradient)
opt_theta = np.matrix(Result.x)
def predict(theta, X):
sigValue = sigmoid(X # theta.T)
p = sigValue >= 0.5
return p
p = predict(opt_theta, X)
print('Train Accuracy: {:f}'.format(np.mean(p == y) * 100))
So, when the learningRate = 1, the accuracy should be around 83,05% but I'm getting 80.5% and when the learningRate = 0, the accuracy should be 91.52% but I'm getting 87.28%
So the question is What am I doing wrong? Why my accuracy is below the problem default answer?
Hope someone can guide me in the right direction. Thanks!
P.D: Here is the dataset, maybe it can help
https://raw.githubusercontent.com/TheGirlWhiteWithBandages/Machine-Learning-Algorithms/master/Logistic%20Regression/ex2data2.txt
Hey guys I found a way to make it even better!
Here is the code
import numpy as np
import pandas as pd
import scipy.optimize as op
from sklearn.preprocessing import PolynomialFeatures
# Read the data and give it labels
data = pd.read_csv('ex2data2.txt', header=None, names=['Test1', 'Test2', 'Accepted'])
# Separate the data into training and target
X = (data.iloc[:, 0:2]).values
y = (data.iloc[:, 2:3]).values
# Modify the features to a certain degree (Polynomial)
poly = PolynomialFeatures(6)
m = y.size
XX = poly.fit_transform(data.iloc[:, 0:2].values)
# Initialize Theta
theta = np.zeros(XX.shape[1])
# Define the Sigmoid Function (Output between 0 and 1)
def sigmoid(z):
return(1 / (1 + np.exp(-z)))
# Define the Regularized cost function
def costFunctionReg(theta, reg, *args):
# This is require to make the optimize function work
h = sigmoid(XX # theta)
first = np.log(h).T # - y
second = np.log(1 - h).T # (1 - y)
J = (1 / m) * (first - second) + (reg / (2 * m)) * np.sum(np.square(theta[1:]))
return J
# Define the Regularized gradient function
def gradientReg(theta, reg, *args):
theta = theta.reshape(-1, 1)
h = sigmoid(XX # theta)
grad = (1 / m) * (XX.T # (h - y)) + (reg / m) * np.r_[[[0]], theta[1:]]
return grad.flatten()
# Define the predict Function
def predict(theta, X):
sigValue = sigmoid(X # theta.T)
p = sigValue >= 0.5
return p
# A loop to test between different values for sigma (reg parameter)
for i, Sigma in enumerate([0, 1, 100]):
# Optimize costFunctionReg
res2 = op.minimize(costFunctionReg, theta, args=(Sigma, XX, y), method=None, jac=gradientReg)
# Get the accuracy of the model
accuracy = 100 * sum(predict(res2.x, XX) == y.ravel()) / y.size
# Get the Error between different weights
error1 = costFunctionReg(res2.x, Sigma, XX, y)
# print the accuracy and error
print('Train accuracy {}% with Lambda = {}'.format(np.round(accuracy, decimals=4), Sigma))
print(error1)
Thanks for all your help!
try out this:
# import library
import pandas as pd
import numpy as np
dataset = pd.read_csv('ex2data2.csv',names = ['Test #1','Test #2','Accepted'])
# splitting to x and y variables for features and target variable
x = dataset.iloc[:,:-1].values
y = dataset.iloc[:,-1].values
print('x[0] ={}, y[0] ={}'.format(x[0],y[0]))
m, n = x.shape
print('#{} Number of training samples, #{} features per sample'.format(m,n))
# import library FeatureMapping
from sklearn.preprocessing import PolynomialFeatures
# We also add one column of ones to interpret theta 0 (x with power of 0 = 1) by
include_bias as True
pf = PolynomialFeatures(degree = 6, include_bias = True)
x_poly = pf.fit_transform(x)
pd.DataFrame(x_poly).head(5)
m,n = x_poly.shape
# define theta as zero
theta = np.zeros(n)
# define hyperparameter λ
lambda_ = 1
# reshape (-1,1) because we just have one feature in y column
y = y.reshape(-1,1)
def sigmoid(z):
return 1/(1+np.exp(-z))
def lr_hypothesis(x,theta):
return np.dot(x,theta)
def compute_cost(theta,x,y,lambda_):
theta = theta.reshape(n,1)
infunc1 = -y*(np.log(sigmoid(lr_hypothesis(x,theta)))) - ((1-y)*(np.log(1 - sigmoid(lr_hypothesis(x,theta)))))
infunc2 = (lambda_*np.sum(theta[1:]**2))/(2*m)
j = np.sum(infunc1)/m+ infunc2
return j
# gradient[0] correspond to gradient for theta(0)
# gradient[1:] correspond to gradient for theta(j) j>0
def compute_gradient(theta,x,y,lambda_):
gradient = np.zeros(n).reshape(n,)
theta = theta.reshape(n,1)
infunc1 = sigmoid(lr_hypothesis(x,theta))-y
gradient_in = np.dot(x.transpose(),infunc1)/m
gradient[0] = gradient_in[0,0] # theta(0)
gradient[1:] = gradient_in[1:,0]+(lambda_*theta[1:,]/m).reshape(n-1,) # theta(j) ; j>0
gradient = gradient.flatten()
return gradient
You can now test your cost and gradient without optimization. Th below code will optimize the model:
# hyperparameters
m,n = x_poly.shape
# define theta as zero
theta = np.zeros(n)
# define hyperparameter λ
lambda_array = [0, 1, 10, 100]
import scipy.optimize as opt
for i in range(0,len(lambda_array)):
# Train
print('======================================== Iteration {} ===================================='.format(i))
optimized = opt.minimize(fun = compute_cost, x0 = theta, args = (x_poly, y,lambda_array[i]),
method = 'TNC', jac = compute_gradient)
new_theta = optimized.x
# Prediction
y_pred_train = predictor(x_poly,new_theta)
cm_train = confusion_matrix(y,y_pred_train)
t_train,f_train,acc_train = acc(cm_train)
print('With lambda = {}, {} correct, {} wrong ==========> accuracy = {}%'
.format(lambda_array[i],t_train,f_train,acc_train*100))
Now you should see output like this :
=== Iteration 0 === With lambda = 0, 104 correct, 14 wrong ==========> accuracy = 88.13559322033898%
=== Iteration 1 === With lambda = 1, 98 correct, 20 wrong ==========> accuracy = 83.05084745762711%
=== Iteration 2 === With lambda = 10, 88 correct, 30 wrong ==========> accuracy = 74.57627118644068%
=== Iteration 3 === With lambda = 100, 72 correct, 46 wrong ==========> accuracy = 61.016949152542374%
I would to calculate the cost for the softmax regression.
The cost function to calculate is given at the bottom of the page.
For numpy I can get the cost as follows:
"""
X.shape = 2,300 # floats
y.shape = 300, # integers
W.shape = 2,3
b.shape = 3,1
"""
import numpy as np
np.random.seed(100)
# Data and labels
X = np.random.randn(300,2)
y = np.ones(300)
y[0:100] = 0
y[200:300] = 2
y = y.astype(np.int)
# weights and bias
W = np.random.randn(2,3)
b = np.random.randn(3)
N = X.shape[0]
scores = np.dot(X, W) + b
hyp = np.exp(scores-np.max(scores, axis=0, keepdims=True))
probs = hyp / np.sum(hyp, axis = 0)
logprobs = np.log(probs[range(N),y])
cost_data = -1/N * np.sum(logprobs)
print("hyp.shape = {}".format(hyp.shape)) # hyp.shape = (300, 3)
print(cost_data)
But, when I tried torch I could not get this.
So far I have got this:
"""
X.shape = 2,300 # floats
y.shape = 300, # integers
W.shape = 2,3
b.shape = 3,1
"""
import numpy as np
import torch
from torch.autograd import Variable
np.random.seed(100)
# Data and labels
X = np.random.randn(300,2)
y = np.ones(300)
y[0:100] = 0
y[200:300] = 2
y = y.astype(np.int)
X = Variable(torch.from_numpy(X),requires_grad=True).type(torch.FloatTensor)
y = Variable(torch.from_numpy(y),requires_grad=True).type(torch.LongTensor)
# weights and bias
W = Variable(torch.randn(2,3),requires_grad=True)
b = Variable(torch.randn(3),requires_grad=True)
N = X.shape[0]
scores = torch.mm(X, W) + b
hyp = torch.exp(scores - torch.max(scores))
probs = hyp / torch.sum(hyp)
correct_probs = probs[range(N),y] # got problem HERE
# logprobs = np.log(correct_probs)
# cost_data = -1/N * torch.sum(logprobs)
# print(cost_data)
I got problem calculating the correct probabilities for the classes.
How can we solve this problem and get the correct cost value.
The cost function to calculate is given below:
Your problem is that you cannot use range(N) with pytorch, use the slice 0:N instead:
hyp = torch.exp(scores - torch.max(scores))
probs = hyp / torch.sum(hyp)
correct_probs = probs[0:N,y] # problem solved
logprobs = torch.log(correct_probs)
cost_data = -1/N * torch.sum(logprobs)
Another point is that your labels y do not require gradients, you would better have:
y = Variable(torch.from_numpy(y),requires_grad=False).type(torch.LongTensor)
Can some please make a Simple Neural Network that give the summation of the input variables as the output.
Example if input variables are X1, X2, X3 then the output is Y= X1 + X2 + X3.
Simple Python Program, using matrix multiplication would be helpful.
Thank You.
Here is the code I am trying to apply it is just a modified version of "iamtrask" code, but it is not giving me right answer and tends to saturate at [1.] when I increase the test cases(set_size).
import numpy as np
outputs=[]
#initializinf hyper parameters
set_size=20
iterations=10000
input_variables=3
# sigmoid function
def nonlin(x, deriv=False):
if (deriv == True):
return 1 * (1 - x)
return 1 / (1 + np.exp(-x))
#inverse of sigmoid, Logit function
def logit(x):
return np.log(x/(1-x))
#initializing inputs with random values
inputs = 2 * np.random.random((set_size, input_variables)) - 1
X=np.array(inputs)
#Getting desired output using mathematical operations
for h in range(set_size):
outputs.append(nonlin((X[h][0]) + (X[h][1]) + (X[h][2])))
# output dataset
y = np.array([outputs]).T # converting list into array and taking transpose
# seed random numbers to make calculation
# deterministic (just a good practice)
np.random.seed(1)
# initialize weights randomly with mean 0
syn0 = 2 * np.random.random((input_variables, set_size)) - 1
syn1 = 2 * np.random.random((set_size, 1)) - 1
print(y)
for iter in range(0,10000):
# forward propagation
l0 = X
l1 = nonlin(np.dot(l0, syn0))
l2 = nonlin(np.dot(l1, syn1))
# how much did we miss?
#l1_error = y - l1
l2_error = y - l2
#print(l1_error)
l2_delta = l2_error * nonlin(l2, deriv=True)
l1_error = l2_delta.dot(syn1.T)
l1_delta = l1_error * nonlin(l1, deriv=True)
# multiply how much we missed by the
# slope of the sigmoid at the values in l1
#l1_delta = l1_error * nonlin(l1, True)
# update weights
syn1 += l1.T.dot(l2_delta)
syn0 += l0.T.dot(l1_delta)
print("Output After Training:")
#out=logit(l2)
print(l2)
#testing the trained network with new values
X1=input("Enter the new inputs:")
mynums = [float(i) for i in X1.split()]
#mynums = map(float, X1.split())
print(mynums)
l0 = mynums
l1 = nonlin(np.dot(l0, syn0))
l2 = nonlin(np.dot(l1, syn1))
print(l2)
"A bare bones neural network implementation to describe the inner workings of backpropagation." 11 lines of code!
X = np.array([ [0,0,1],[0,1,1],[1,0,1],[1,1,1] ])
y = np.array([[0,1,1,0]]).T
syn0 = 2*np.random.random((3,4)) - 1
syn1 = 2*np.random.random((4,1)) - 1
for j in xrange(60000):
l1 = 1/(1+np.exp(-(np.dot(X,syn0))))
l2 = 1/(1+np.exp(-(np.dot(l1,syn1))))
l2_delta = (y - l2)*(l2*(1-l2))
l1_delta = l2_delta.dot(syn1.T) * (l1 * (1-l1))
syn1 += l1.T.dot(l2_delta)
syn0 += X.T.dot(l1_delta)
http://iamtrask.github.io/2015/07/12/basic-python-network/
I have modified Trask's code this way:
import numpy as np
# sigmoid function
def nonlin(x,deriv=False):
if(deriv==True):
return x*(1-x)
return 1/(1+np.exp(-x))
# input dataset of 100 pairs of X1, X2 & X3 numbers lying between 1 and 3
X = np.random.randint(1,3,size=(100,3)).astype(int)
#Rescaling them or Normalizing them
X = X/(3*3) #(Max_element, No_of_imputs)
# output dataset
y = np.sum(X, axis = 1, keepdims=True)
#Normalizing
y=y/(3*3)
#Initializing weights
np.random.seed(1)
# randomly initialize our weights with mean 0
syn0 = 2*np.random.random((3,4)) - 1
syn1 = 2*np.random.random((4,1)) - 1
#Training
for iter in range(30000):
# forward propagation
l0 = X
l1 = nonlinn(np.dot(l0,syn0))
l2 = nonlinn(np.dot(l1,syn1))
# how much did we miss?
l2_error = y-l2
#if (iter% 100) == 0:
# print ("Error:" + str(np.mean(np.abs(l2_error))))
l2_delta = l2_error*nonlinn(l2, deriv=True)
l1_error = l2_delta.dot(syn1.T)
# multiply how much we missed by the
# slope of the sigmoid at the values in l1
l1_delta = l1_error * nonlinn(l1,True)
# update weights
syn1 += l1.T.dot(l2_delta)
syn0 += l0.T.dot(l1_delta)
#Predict
l0 = [[3,1,0]] #Should give 4 as answer
l1 = nonlinn(np.dot(l0,syn0))
l2 = nonlinn(np.dot(l1,syn1))
print (l2*(3*3))
#(3*3) will invert the effect of normalization
The result was 4.157 (quite accurate).
I think the issue was with normalization.