Why doesn't my logistic regression algorithm work? - python

I'm trying to compute my gradient for a multiclass classification model with logistic regression and it seems not to be working properly.
This is the data that I am using for this model.
import pandas as pd
from sklearn.preprocessing import normalize
# Create x and y datasets
path = '/kaggle/input/digit-recognizer'
# Set train and test sets
data = pd.read_csv(path + '/train.csv', nrows=6000)
x_train, y_train = data.iloc[:4800, 1:].values, data.iloc[:4800, 0].values
x_test, y_test = data.iloc[4800:, 1:].values, data.iloc[4800:, 0].values
# Normalize and expand dims
x_train, x_test = x_train / 255, x_test / 255
y_train, y_test = np.expand_dims(y_train, 1), np.expand_dims(y_test, 1)
assert len(x_train) == len(y_train)
assert len(x_test) == len(y_test)
x_train.shape, y_train.shape
((4800, 784), (4800, 1))
Here is the following code where I try to implement gradient descent:
def Sigmoid(z):
from math import e
return 1 / (1 + e**-z)
def CostFunction(h, y, m):
j = -(1/m) * (y # np.log(h) + (1-y) # np.log(1-h))
return j
def GradientDescent(X, y, theta, n_classes):
import numpy as np
# Useful variables
m = len(y)
theta0, theta1 = [x.copy() for x in theta]
grad0, grad1 = [np.zeros(x.shape) for x in [theta0, theta1]]
y_vec = np.zeros((m, n_classes))
j = 0
for i in range(m):
y_vec[i, y[i]] = 1
### Forward propagation
a0 = np.concatenate(([1], X[i]))
a1 = np.concatenate(([1], Sigmoid(theta0 # a0)))
a2 = Sigmoid(theta1 # a1)
h = a2
j += CostFunction(h, y_vec[i], m)
### Backpropagation
delta2 = a2 - y_vec[i]
delta1 = theta1.T # delta2 * (a1 * (1 - a1))
grad0 += np.expand_dims(delta1[1:], 1) # np.expand_dims(a0, 0)
grad1 += np.expand_dims(delta2, 1) # np.expand_dims(a1, 0)
grad0 = grad0 / m
grad1 = grad1 / m
return j, [grad0, grad1]
Now comes the training process.
### Create theta parameters
n_layers = 3
n_classes = 10
# Weigth matrix dims(i, j) = (number of nodes, input shape + bias)
theta0 = np.random.uniform(0, 0.01, (24, x_train.shape[1] + 1))
theta1 = np.random.uniform(0, 0.01, (n_classes, len(theta0) + 1))
theta_params = [theta0, theta1]
### Train parameters
%%time
epochs = 200
alpha = 0.001
j, t = np.zeros(epochs), theta_params.copy()
for i in range(epochs):
print("Iterarion: {}/{}".format(i + 1, epochs))
j[i], g = GradientDescent(x_train, y_train, t, n_classes)
print(j[i])
t[0] = t[0] - a * g[0]
t[1] = t[1] - a * g[1]
The cost starts from J=7.2583 and goes down to approximately J=3.5223, where it gets stuck.
Then, whenever I try to predict any of the samples from the training or test sets it outputs the same approximate probability for all classes.
def Predict(X, theta):
import numpy as np
# Useful variables
m = len(X)
theta0, theta1 = [x for x in theta]
h = np.zeros(m)
for i in range(m):
### Forward propagation
a0 = np.concatenate(([1], X[i]))
a1 = np.concatenate(([1], Sigmoid(theta0 # a0)))
a2 = Sigmoid(theta1 # a1)
print(a2)
h[i] = np.argmax(a2)
return h
Predict(x_train[:1], t)
[0.20078521 0.19842413 0.20535222 0.1953332 0.19425315 0.19302124
0.20107485 0.19589331 0.19688894 0.19526526]
array([2.])
Notice that I'm am printing the results of the hypothesis probability for each node in the last layer during the Predict function.
Anyone could point me the direction by sharing some tips?

Related

Logistic regression from scratch: error keeps increasing

I have implemented logistic regression from scratch, however when I run the script the algorithm always predict the wrong label.
I've tried changing the training output and test_output by switching all 1 to 0 and vice versa but it always predict the wrong label.
I also noticed that changing the "-" sign to "+", when updating the weigths and the bias, the script correctly predicts the label.
What am I doing wrong?
This is the code I've written:
# IMPORTS
import numpy as np
# HYPERPARAMETERS
EPOCHS = 1000
LEARNING_RATE = 0.1
# FUNCTIONS
def sigmoid(z):
return 1 / (1 + np.exp(-z))
def cost(y_pred, training_outputs, m):
j = - np.sum(training_outputs * np.log(y_pred) + (1 - training_outputs) * np.log(1 - y_pred)) / m
return j
# ENTRY
if __name__ == "__main__":
# Training input and output
x = np.array([[1, 1, 1], [0, 0, 0], [1, 0, 1]])
training_outputs = np.array([1, 0, 1])
# Test input and output
test_input = np.array([[0, 1, 1]])
test_output = np.array([0])
# Weigths
w = np.array([0.3, 0.3, 0.3])
# Biases
b = 0
m = 3
# Training
for iteration in range(EPOCHS):
print("Iteration n.", iteration, end= "\r")
# Compute log odds
z = np.dot(x, w) + b
# Compute predicted probability
y_pred = sigmoid(z)
# Back propagation
dz = y_pred - training_outputs
dw = np.dot(x, dz) / m
db = np.sum(dz) / m
# Update weights and bias according to the gradient descent algorithm
w = w - LEARNING_RATE * dw
b = b - LEARNING_RATE * db
print("Model trained. Proceeding with model evaluation...")
# Test
# Compute log odds
z = np.dot(test_input, w) + b
# Compute predicted probability
y_pred = sigmoid(z)
print(y_pred)
# Compute cost
cost = cost(y_pred, test_output, m)
print(cost)
There was an incorrect assumption pointed out by #J_H:
>>> from sklearn.linear_model import LogisticRegression
>>> import numpy as np
>>> x = np.array([[1, 1, 1], [0, 0, 0], [1, 0, 1]])
>>> y = np.array([1, 0, 1])
>>> clf = LogisticRegression().fit(x, y)
>>> clf.predict([[0, 1, 1]])
array([1])
scikit-learn at appears to believe that test_output should be a 1 rather than a 0.
A few more recommendations:
m should be fine to remove (it's a constant, so it could be included in the LEARNING_RATE)
w should be initialized proportional to the number of columns in x (i.e., x.shape[1])
dw = np.dot(x, dz) should be np.dot(dz, x)
Prediction in logistic regression depends on a threshold, usually 0.5
Taking this into account would look something like the following.
# Initialize weights and bias
w, b = np.zeros(X.shape[1]), 0
for _ in range(EPOCHS):
# Compute log odds
z = np.dot(x, w) + b
# Compute predicted probability
y_pred = sigmoid(z)
# Back propagation
dz = y_pred - training_outputs
dw = np.dot(dz, x)
db = np.sum(dz)
# Update
w = w - LEARNING_RATE * dw
b = b - LEARNING_RATE * db
# Test
z = np.dot(test_input, w) + b
test_pred = sigmoid(z) >= 0.5
print(test_pred)
And a complete example on random train/test sets created with sklearn.datasets.make_classification could look like this—which usually gets within a few decimals of the scikit-learn implementation as well:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
EPOCHS = 100
LEARNING_RATE = 0.01
def sigmoid(z):
return 1 / (1 + np.exp(-z))
if __name__ == "__main__":
X, y = make_classification(n_samples=1000, n_features=5)
X_train, X_test, y_train, y_test = train_test_split(X, y)
# Initialize `w` and `b`
w, b = np.zeros(X.shape[1]), 0
for _ in range(EPOCHS):
z = np.dot(X_train, w) + b
y_pred = sigmoid(z)
dz = y_pred - y_train
dw = np.dot(dz, X_train)
db = np.sum(dz)
w = w - LEARNING_RATE * dw
b = b - LEARNING_RATE * db
# Test
z = np.dot(X_test, w) + b
test_pred = sigmoid(z) >= 0.5
print(accuracy_score(y_test, test_pred))

dividing by zero cost function in neural network with one hidden layer

I have a neural network with one hidden layer. When I compute the cost through the entropy function with this code
def neural_network_model(X, Y, hidden_unit, num_iterations = 1000):
np.random.seed(3)
input_unit = define_structure(X, Y)[0]
output_unit = define_structure(X, Y)[2]
parameters = parameters_initialization(input_unit, hidden_unit, output_unit)
W1 = parameters['W1']
b1 = parameters['b1']
W2 = parameters['W2']
b2 = parameters['b2']
for i in range(0, num_iterations):
A2, cache = forward_propagation(X, parameters)
cost = cross_entropy_cost(A2, Y, parameters)
grads = backward_propagation(parameters, cache, X, Y)
parameters = gradient_descent(parameters, grads)
if i % 5 == 0:
print ("Cost after iteration %i: %f" %(i, cost))
return parameters
parameters = neural_network_model(X_train, y_train, 4, num_iterations=1000)
I obtain NaN as result and the warning
divide by zero encountered in log
logprobs = np.multiply(np.log(A2), Y) + np.multiply((1-Y), np.log(1 - A2))
the functions cross_entropy_cost and backward_propagation are the following:
def cross_entropy_cost(A2, Y, parameters):
# number of training example
m = Y.shape[1]
# Compute the cross-entropy cost
logprobs = np.multiply(np.log(A2), Y) + np.multiply((1-Y), np.log(1 - A2))
cost = - np.sum(logprobs) / m
cost = float(np.squeeze(cost))
return cost
def backward_propagation(parameters, cache, X, Y):
#number of training example
m = X.shape[1]
W1 = parameters['W1']
W2 = parameters['W2']
A1 = cache['A1']
A2 = cache['A2']
dZ2 = A2-Y
dW2 = (1/m) * np.dot(dZ2, A1.T)
db2 = (1/m) * np.sum(dZ2, axis=1, keepdims=True)
dZ1 = np.multiply(np.dot(W2.T, dZ2), 1 - np.power(A1, 2))
dW1 = (1/m) * np.dot(dZ1, X.T)
db1 = (1/m)*np.sum(dZ1, axis=1, keepdims=True)
grads = {"dW1": dW1, "db1": db1, "dW2": dW2,"db2": db2}
return grads
How can I solve the problem of the division by zero?

Problem reproducing the predicted covariance of a gaussian process using gpytorch with same hyperparameters

I need to build a function that gives the a posteriori covariance of a Gaussian Process. The idea is to train a GP using GPytorch, then take the learned hyperparameters, and pass them into my kernel function. (for several reason I can't use the GPyTorch directly).
Now the problem is that I can't reproduce the prediction. Here the code I wrote. I have been working on it the whole day but I can't find the problem. Do you know what I am doing wrong?
from gpytorch.mlls import ExactMarginalLogLikelihood
import numpy as np
import gpytorch
import torch
train_x1 = torch.linspace(0, 0.95, 50) + 0.05 * torch.rand(50)
train_y1 = torch.sin(train_x1 * (2 * np.pi)) + 0.2 * torch.randn_like(train_x1)
n_datapoints = train_x1.shape[0]
def kernel_rbf(x1, x2, c, l):
# my RBF function
if x1.shape is ():
x1 = np.atleast_2d(x1)
if x2.shape is ():
x2 = np.atleast_2d(x2)
return c * np.exp(- np.matmul((x1 - x2).T, (x1 - x2)) / (2 * l ** 2))
class ExactGPModel(gpytorch.models.ExactGP):
def __init__(self, train_x, train_y, likelihood):
super().__init__(train_x, train_y, likelihood)
lengthscale_prior = gpytorch.priors.GammaPrior(3.0, 6.0)
outputscale_prior = gpytorch.priors.GammaPrior(2.0, 0.15)
self.mean_module = gpytorch.means.ConstantMean()
self.covar_module = gpytorch.kernels.ScaleKernel(
gpytorch.kernels.RBFKernel(lengthscale_prior=lengthscale_prior),
outputscale_prior=outputscale_prior)
def forward(self, x):
mean_x = self.mean_module(x)
covar_x = self.covar_module(x)
return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)
likelihood = gpytorch.likelihoods.GaussianLikelihood()
model = ExactGPModel(train_x1, train_y1, likelihood)
# Find optimal model hyperparameters
model.train()
likelihood.train()
mll = ExactMarginalLogLikelihood(likelihood, model)
# Use the Adam optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.1) # Includes GaussianLikelihood parameters
training_iterations = 50
for i in range(training_iterations):
optimizer.zero_grad()
output = model(*model.train_inputs)
loss = -mll(output, model.train_targets)
loss.backward()
print('Iter %d/%d - Loss: %.3f' % (i + 1, training_iterations, loss.item()))
optimizer.step()
# Get the learned hyperparameters
outputscale = model.covar_module.outputscale.item()
lengthscale = model.covar_module.base_kernel.lengthscale.item()
noise = likelihood.noise_covar.noise.item()
train_x1 = train_x1.numpy()
train_y1 = train_y1.numpy()
# Get covariance train points
K = np.zeros((n_datapoints, n_datapoints))
for i in range(n_datapoints):
for j in range(n_datapoints):
K[i, j] = kernel_rbf(train_x1[i], train_x1[j], outputscale, lengthscale)
# Add noise
K += noise ** 2 * np.eye(n_datapoints)
# Get covariance train-test points
x_test = torch.rand(1, 1)
Ks = np.zeros((n_datapoints, 1))
for i in range(n_datapoints):
Ks[i] = kernel_rbf(train_x1[i], x_test.numpy(), outputscale, lengthscale)
# Get variance test points
Kss = kernel_rbf(x_test.numpy(), x_test.numpy(), outputscale, lengthscale)
L = np.linalg.cholesky(K)
v = np.linalg.solve(L, Ks)
var = Kss - np.matmul(v.T, v)
model.eval()
likelihood.eval()
with gpytorch.settings.fast_pred_var():
y_preds = likelihood(model(x_test))
print(f"Predicted variance with gpytorch:{y_preds.variance.item()}")
print(f"Predicted variance with my kernel:{var}")
I found the errors:
The noise is not squared so it is K += noise * np.eye(n_datapoints) and not K += noise**2 * np.eye(n_datapoints)
I forgot to add the noise term in the $$ K** $$, i.e. Kss += noise

How can i complete gradient descent algorithm code?

I am freshman & beginner.
I am studying machine learning with open tutorials.
I have a trouble with making gradient descent algorithm
I have to complete "for _ in range(max_iter):" but, I don't know about numpy... so I don't know what code should i add
Could you please help me fill the blank?
I know this type of question is so rude... sorry but I need your help :(
Thank you in advance.
from sklearn import datasets
import numpy as np
from sklearn.metrics import accuracy_score
X, y = datasets.make_classification(
n_samples = 200, n_features = 2, random_state = 333,
n_informative =2, n_redundant = 0 , n_clusters_per_class= 1)
def sigmoid(s):
return 1 / (1 + np.exp(-s))
def loss(y, h):
return (-y * np.log(h) - (1 - y) * np.log(1 - h)).mean()
def gradient(X, y, w):
return -(y * X) / (1 + np.exp(-y * np.dot(X, w)))
X_bias = np.append(np.ones((X.shape[0], 1)), X, axis=1)
y = np.array([[1] if label == 0 else [0] for label in y])
w = np.array([[random.uniform(-1, 1)] for _ in range(X.shape[1]+1)])
max_iter = 100
learning_rate = 0.1
threshold = 0.5
for _ in range(max_iter):
#fill in the blank
what code should i add ????
probabilities = sigmoid(np.dot(X_bias, w))
predictions = [[1] if p > threshold else [0] for p in probabilities]
print("loss: %.2f, accuracy: %.2f" %
(loss(y, probabilities), accuracy_score(y, predictions)))
Inside the for loop, we have to first compute the probabilities. Then find the gradients and then update the weights.
For computing probabilities, you can use the code below
probs=sigmoid(np.dot(X_bias,w))
np.dot is numpy command for matrix multiplication. Then we will calculate the loss and its gradients.
J=loss(y,probs)
dJ=gradient(X_bias,y,w)
Now we will update the weights.
w=w-learning_rate*dJ
So the final code will be
from sklearn import datasets
import numpy as np
from sklearn.metrics import accuracy_score
X, y = datasets.make_classification(
n_samples = 200, n_features = 2, random_state = 333,
n_informative =2, n_redundant = 0 , n_clusters_per_class= 1)
def sigmoid(s):
return 1 / (1 + np.exp(-s))
def loss(y, h):
return (-y * np.log(h) - (1 - y) * np.log(1 - h)).mean()
def gradient(X, y, w):
return -(y * X) / (1 + np.exp(-y * np.dot(X, w)))
X_bias = np.append(np.ones((X.shape[0], 1)), X, axis=1)
y = np.array([[1] if label == 0 else [0] for label in y])
w = np.array([[np.random.uniform(-1, 1)] for _ in range(X.shape[1]+1)])
max_iter = 100
learning_rate = 0.1
threshold = 0.5
for _ in range(max_iter):
probs=sigmoid(np.dot(X_bias,w))
J=loss(y,probs)
dJ=gradient(X_bias,y,w)
w=w-learning_rate*dJ
probabilities = sigmoid(np.dot(X_bias, w))
predictions = [[1] if p > threshold else [0] for p in probabilities]
print("loss: %.2f, accuracy: %.2f" %
(loss(y, probabilities), accuracy_score(y, predictions)))
Note: In the for loop, there is no need to compute probs and loss, As we only need gradients to update the weights. I did that because it will be easy to understand.

Neural network backpropagation algorithm not working in Python

I am writing a neural network in Python, following the example here. It seems that the backpropagation algorithm isn't working, given that the neural network fails to produce the right value (within a margin of error) after being trained 10 thousand times. Specifically, I am training it to compute the sine function in the following example:
import numpy as np
class Neuralnet:
def __init__(self, neurons):
self.weights = []
self.inputs = []
self.outputs = []
self.errors = []
self.rate = .1
for layer in range(len(neurons)):
self.inputs.append(np.empty(neurons[layer]))
self.outputs.append(np.empty(neurons[layer]))
self.errors.append(np.empty(neurons[layer]))
for layer in range(len(neurons)-1):
self.weights.append(
np.random.normal(
scale=1/np.sqrt(neurons[layer]),
size=[neurons[layer], neurons[layer + 1]]
)
)
def feedforward(self, inputs):
self.inputs[0] = inputs
for layer in range(len(self.weights)):
self.outputs[layer] = np.tanh(self.inputs[layer])
self.inputs[layer + 1] = np.dot(self.weights[layer].T, self.outputs[layer])
self.outputs[-1] = np.tanh(self.inputs[-1])
def backpropagate(self, targets):
gradient = 1 - self.outputs[-1] * self.outputs[-1]
self.errors[-1] = gradient * (self.outputs[-1] - targets)
for layer in reversed(range(len(self.errors) - 1)):
gradient = 1 - self.outputs[layer] * self.outputs[layer]
self.errors[layer] = gradient * np.dot(self.weights[layer], self.errors[layer + 1])
for layer in range(len(self.weights)):
self.weights[layer] -= self.rate * np.outer(self.outputs[layer], self.errors[layer + 1])
def xor_example():
net = Neuralnet([2, 2, 1])
for step in range(100000):
net.feedforward([0, 0])
net.backpropagate([-1])
net.feedforward([0, 1])
net.backpropagate([1])
net.feedforward([1, 0])
net.backpropagate([1])
net.feedforward([1, 1])
net.backpropagate([-1])
net.feedforward([1, 1])
print(net.outputs[-1])
def identity_example():
net = Neuralnet([1, 3, 1])
for step in range(100000):
x = np.random.normal()
net.feedforward([x])
net.backpropagate([np.tanh(x)])
net.feedforward([-2])
print(net.outputs[-1])
def sine_example():
net = Neuralnet([1, 6, 1])
for step in range(100000):
x = np.random.normal()
net.feedforward([x])
net.backpropagate([np.tanh(np.sin(x))])
net.feedforward([3])
print(net.outputs[-1])
sine_example()
The output fails to be close to tanh(sin(3)) = 0.140190616. I suspected a mistake involving wrong indices or alignment, but Numpy isn't raising any errors like these. Any tips on where I went wrong?
EDIT: I forgot to add the bias neurons. Here is the updated code:
import numpy as np
class Neuralnet:
def __init__(self, neurons):
self.weights = []
self.outputs = []
self.inputs = []
self.errors = []
self.offsets = []
self.rate = .01
for layer in range(len(neurons)-1):
self.weights.append(
np.random.normal(
scale=1/np.sqrt(neurons[layer]),
size=[neurons[layer], neurons[layer + 1]]
)
)
self.outputs.append(np.empty(neurons[layer]))
self.inputs.append(np.empty(neurons[layer]))
self.errors.append(np.empty(neurons[layer]))
self.offsets.append(np.random.normal(scale=1/np.sqrt(neurons[layer]), size=neurons[layer + 1]))
self.inputs.append(np.empty(neurons[-1]))
self.errors.append(np.empty(neurons[-1]))
def feedforward(self, inputs):
self.inputs[0] = inputs
for layer in range(len(self.weights)):
self.outputs[layer] = np.tanh(self.inputs[layer])
self.inputs[layer + 1] = self.offsets[layer] + np.dot(self.weights[layer].T, self.outputs[layer])
def backpropagate(self, targets):
self.errors[-1] = self.inputs[-1] - targets
for layer in reversed(range(len(self.errors) - 1)):
gradient = 1 - self.outputs[layer] * self.outputs[layer]
self.errors[layer] = gradient * np.dot(self.weights[layer], self.errors[layer + 1])
for layer in range(len(self.weights)):
self.weights[layer] -= self.rate * np.outer(self.outputs[layer], self.errors[layer + 1])
self.offsets[layer] -= self.rate * self.errors[layer + 1]
def sine_example():
net = Neuralnet([1, 5, 1])
for step in range(10000):
x = np.random.uniform(-5, 5)
net.feedforward([x])
net.backpropagate([np.sin(x)])
net.feedforward([np.pi])
print(net.inputs[-1])
def xor_example():
net = Neuralnet([2, 2, 1])
for step in range(10000):
net.feedforward([0, 0])
net.backpropagate([-1])
net.feedforward([0, 1])
net.backpropagate([1])
net.feedforward([1, 0])
net.backpropagate([1])
net.feedforward([1, 1])
net.backpropagate([-1])
net.feedforward([1, 1])
print(net.outputs[-1])
def identity_example():
net = Neuralnet([1, 3, 1])
for step in range(10000):
x = np.random.normal()
net.feedforward([x])
net.backpropagate([x])
net.feedforward([-2])
print(net.outputs[-1])
identity_example()
I think you train the NN in the wrong way. You have a loop over 10000 iterations and feed a new sample in each cycle. The NN will never get trained in this case.
(the statement is wrong! See the update! )
What you need to do is to generate a large array of true samples Y = sin(X), give it to your network ONCE and iterate over the training set forwards and backwards, in order to minimize the cost function. To check the algorithm you may need to plot the cost function depending on the iteration number and make sure the cost goes down.
Another important point is the initialization of the weights. Your numbers are pretty large and the network will take a lot of time to converge, especially when using low rates. It's a good practice to generate the initial weights in some small range [-eps .. eps] uniformly.
In my code I implemented two different activation functions: sigmoid() and tanh(). You need to scale your inputs depending on the selected function: [0 .. 1] and [-1 .. 1] respectively.
Here are some images which show the cost function and the resulting predictions for sigmoid() and tanh() activation functions:
As you can see the sigmoid() activation gives a little bit better results, than the tanh().
Also I got much better predictions when using a network [1, 6, 1], compared to a bigger network with 4 layers [1, 6, 4, 1]. So the size of the NN is not always the crucial factor. Here is the prediction for the mentioned network with 4 layers:
Here is my code with some comments. I tried to use your notations where it was possible.
import numpy as np
import math
import matplotlib.pyplot as plt
class Neuralnet:
def __init__(self, neurons, activation):
self.weights = []
self.inputs = []
self.outputs = []
self.errors = []
self.rate = 0.5
self.activation = activation #sigmoid or tanh
self.neurons = neurons
self.L = len(self.neurons) #number of layers
eps = 0.12; # range for uniform distribution -eps..+eps
for layer in range(len(neurons)-1):
self.weights.append(np.random.uniform(-eps,eps,size=(neurons[layer+1], neurons[layer]+1)))
###################################################################################################
def train(self, X, Y, iter_count):
m = X.shape[0];
for layer in range(self.L):
self.inputs.append(np.empty([m, self.neurons[layer]]))
self.errors.append(np.empty([m, self.neurons[layer]]))
if (layer < self.L -1):
self.outputs.append(np.empty([m, self.neurons[layer]+1]))
else:
self.outputs.append(np.empty([m, self.neurons[layer]]))
#accumulate the cost function
J_history = np.zeros([iter_count, 1])
for i in range(iter_count):
self.feedforward(X)
J = self.cost(Y, self.outputs[self.L-1])
J_history[i, 0] = J
self.backpropagate(Y)
#plot the cost function to check the descent
plt.plot(J_history)
plt.show()
###################################################################################################
def cost(self, Y, H):
J = np.sum(np.sum(np.power((Y - H), 2), axis=0))/(2*m)
return J
###################################################################################################
def feedforward(self, X):
m = X.shape[0];
self.outputs[0] = np.concatenate( (np.ones([m, 1]), X), axis=1)
for i in range(1, self.L):
self.inputs[i] = np.dot( self.outputs[i-1], self.weights[i-1].T )
if (self.activation == 'sigmoid'):
output_temp = self.sigmoid(self.inputs[i])
elif (self.activation == 'tanh'):
output_temp = np.tanh(self.inputs[i])
if (i < self.L - 1):
self.outputs[i] = np.concatenate( (np.ones([m, 1]), output_temp), axis=1)
else:
self.outputs[i] = output_temp
###################################################################################################
def backpropagate(self, Y):
self.errors[self.L-1] = self.outputs[self.L-1] - Y
for i in range(self.L - 2, 0, -1):
if (self.activation == 'sigmoid'):
self.errors[i] = np.dot( self.errors[i+1], self.weights[i][:, 1:] ) * self.sigmoid_prime(self.inputs[i])
elif (self.activation == 'tanh'):
self.errors[i] = np.dot( self.errors[i+1], self.weights[i][:, 1:] ) * (1 - self.outputs[i][:, 1:]*self.outputs[i][:, 1:])
for i in range(0, self.L-1):
grad = np.dot(self.errors[i+1].T, self.outputs[i]) / m
self.weights[i] = self.weights[i] - self.rate*grad
###################################################################################################
def sigmoid(self, z):
s = 1.0/(1.0 + np.exp(-z))
return s
###################################################################################################
def sigmoid_prime(self, z):
s = self.sigmoid(z)*(1 - self.sigmoid(z))
return s
###################################################################################################
def predict(self, X, weights):
m = X.shape[0];
self.inputs = []
self.outputs = []
self.weights = weights
for layer in range(self.L):
self.inputs.append(np.empty([m, self.neurons[layer]]))
if (layer < self.L -1):
self.outputs.append(np.empty([m, self.neurons[layer]+1]))
else:
self.outputs.append(np.empty([m, self.neurons[layer]]))
self.feedforward(X)
return self.outputs[self.L-1]
###################################################################################################
# MAIN PART
activation1 = 'sigmoid' # the input should be scaled into [ 0..1]
activation2 = 'tanh' # the input should be scaled into [-1..1]
activation = activation1
net = Neuralnet([1, 6, 1], activation) # structure of the NN and its activation function
##########################################################################################
# TRAINING
m = 1000 #size of the training set
X = np.linspace(0, 4*math.pi, num = m).reshape(m, 1); # input training set
Y = np.sin(X) # target
kx = 0.1 # noise parameter
noise = (2.0*np.random.uniform(0, kx, m) - kx).reshape(m, 1)
Y = Y + noise # noisy target
# scaling of the target depending on the activation function
if (activation == 'sigmoid'):
Y_scaled = (Y/(1+kx) + 1)/2.0
elif (activation == 'tanh'):
Y_scaled = Y/(1+kx)
# number of the iteration for the training stage
iter_count = 20000
net.train(X, Y_scaled, iter_count) #training
# gained weights
trained_weights = net.weights
##########################################################################################
# PREDICTION
m_new = 40 #size of the prediction set
X_new = np.linspace(0, 4*math.pi, num = m_new).reshape(m_new, 1);
Y_new = net.predict(X_new, trained_weights) # prediction
#rescaling of the result
if (activation == 'sigmoid'):
Y_new = (2.0*Y_new - 1.0) * (1+kx)
elif (activation == 'tanh'):
Y_new = Y_new * (1+kx)
# visualization
plt.plot(X, Y)
plt.plot(X_new, Y_new, 'ro')
plt.show()
raw_input('press any key to exit')
UPDATE
I would like to take back the statement regarding the training method used in your code. The network can be indeed trained using only one sample per iteration. I got interesting results in online-training using both sigmoid and tanh activation functions:
Online-training using Sigmoid (cost function and prediction)
Online-training using Tanh (cost function and prediction)
As can be seen the choice of Sigmoid as activation function gives better performance. The cost function looks not that good as during the offline-training, but at least it tends to go down.
I plotted the cost function in your implementation, it looks pretty jerky as well:
May be it is a good idea to try your code with the sigmoid or even the ReLU function.
Here is the updated source code. To switch between online and offline training modes just change the method variable.
import numpy as np
import math
import matplotlib.pyplot as plt
class Neuralnet:
def __init__(self, neurons, activation):
self.weights = []
self.inputs = []
self.outputs = []
self.errors = []
self.rate = 0.2
self.activation = activation #sigmoid or tanh
self.neurons = neurons
self.L = len(self.neurons) #number of layers
eps = 0.12; #range for uniform distribution -eps..+eps
for layer in range(len(neurons)-1):
self.weights.append(np.random.uniform(-eps,eps,size=(neurons[layer+1], neurons[layer]+1)))
###################################################################################################
def train(self, X, Y, iter_count):
m = X.shape[0];
for layer in range(self.L):
self.inputs.append(np.empty([m, self.neurons[layer]]))
self.errors.append(np.empty([m, self.neurons[layer]]))
if (layer < self.L -1):
self.outputs.append(np.empty([m, self.neurons[layer]+1]))
else:
self.outputs.append(np.empty([m, self.neurons[layer]]))
#accumulate the cost function
J_history = np.zeros([iter_count, 1])
for i in range(iter_count):
self.feedforward(X)
J = self.cost(Y, self.outputs[self.L-1])
J_history[i, 0] = J
self.backpropagate(Y)
#plot the cost function to check the descent
#plt.plot(J_history)
#plt.show()
###################################################################################################
def cost(self, Y, H):
J = np.sum(np.sum(np.power((Y - H), 2), axis=0))/(2*m)
return J
###################################################################################################
def cost_online(self, min_x, max_x, iter_number):
h_arr = np.zeros([iter_number, 1])
y_arr = np.zeros([iter_number, 1])
for step in range(iter_number):
x = np.random.uniform(min_x, max_x, 1).reshape(1, 1)
self.feedforward(x)
h_arr[step, 0] = self.outputs[-1]
y_arr[step, 0] = np.sin(x)
J = np.sum(np.sum(np.power((y_arr - h_arr), 2), axis=0))/(2*iter_number)
return J
###################################################################################################
def feedforward(self, X):
m = X.shape[0];
self.outputs[0] = np.concatenate( (np.ones([m, 1]), X), axis=1)
for i in range(1, self.L):
self.inputs[i] = np.dot( self.outputs[i-1], self.weights[i-1].T )
if (self.activation == 'sigmoid'):
output_temp = self.sigmoid(self.inputs[i])
elif (self.activation == 'tanh'):
output_temp = np.tanh(self.inputs[i])
if (i < self.L - 1):
self.outputs[i] = np.concatenate( (np.ones([m, 1]), output_temp), axis=1)
else:
self.outputs[i] = output_temp
###################################################################################################
def backpropagate(self, Y):
self.errors[self.L-1] = self.outputs[self.L-1] - Y
for i in range(self.L - 2, 0, -1):
if (self.activation == 'sigmoid'):
self.errors[i] = np.dot( self.errors[i+1], self.weights[i][:, 1:] ) * self.sigmoid_prime(self.inputs[i])
elif (self.activation == 'tanh'):
self.errors[i] = np.dot( self.errors[i+1], self.weights[i][:, 1:] ) * (1 - self.outputs[i][:, 1:]*self.outputs[i][:, 1:])
for i in range(0, self.L-1):
grad = np.dot(self.errors[i+1].T, self.outputs[i]) / m
self.weights[i] = self.weights[i] - self.rate*grad
###################################################################################################
def sigmoid(self, z):
s = 1.0/(1.0 + np.exp(-z))
return s
###################################################################################################
def sigmoid_prime(self, z):
s = self.sigmoid(z)*(1 - self.sigmoid(z))
return s
###################################################################################################
def predict(self, X, weights):
m = X.shape[0];
self.inputs = []
self.outputs = []
self.weights = weights
for layer in range(self.L):
self.inputs.append(np.empty([m, self.neurons[layer]]))
if (layer < self.L -1):
self.outputs.append(np.empty([m, self.neurons[layer]+1]))
else:
self.outputs.append(np.empty([m, self.neurons[layer]]))
self.feedforward(X)
return self.outputs[self.L-1]
###################################################################################################
# MAIN PART
activation1 = 'sigmoid' #the input should be scaled into [0..1]
activation2 = 'tanh' #the input should be scaled into [-1..1]
activation = activation1
net = Neuralnet([1, 6, 1], activation) # structure of the NN and its activation function
method1 = 'online'
method2 = 'offline'
method = method1
kx = 0.1 #noise parameter
###################################################################################################
# TRAINING
if (method == 'offline'):
m = 1000 #size of the training set
X = np.linspace(0, 4*math.pi, num = m).reshape(m, 1); #input training set
Y = np.sin(X) #target
noise = (2.0*np.random.uniform(0, kx, m) - kx).reshape(m, 1)
Y = Y + noise #noisy target
#scaling of the target depending on the activation function
if (activation == 'sigmoid'):
Y_scaled = (Y/(1+kx) + 1)/2.0
elif (activation == 'tanh'):
Y_scaled = Y/(1+kx)
#number of the iteration for the training stage
iter_count = 20000
net.train(X, Y_scaled, iter_count) #training
elif (method == 'online'):
sampling_count = 100000 # number of samplings during the training stage
m = 1 #batch size
iter_count = sampling_count/m
for layer in range(net.L):
net.inputs.append(np.empty([m, net.neurons[layer]]))
net.errors.append(np.empty([m, net.neurons[layer]]))
if (layer < net.L -1):
net.outputs.append(np.empty([m, net.neurons[layer]+1]))
else:
net.outputs.append(np.empty([m, net.neurons[layer]]))
J_history = []
step_history = []
for i in range(iter_count):
X = np.random.uniform(0, 4*math.pi, m).reshape(m, 1)
Y = np.sin(X) #target
noise = (2.0*np.random.uniform(0, kx, m) - kx).reshape(m, 1)
Y = Y + noise #noisy target
#scaling of the target depending on the activation function
if (activation == 'sigmoid'):
Y_scaled = (Y/(1+kx) + 1)/2.0
elif (activation == 'tanh'):
Y_scaled = Y/(1+kx)
net.feedforward(X)
net.backpropagate(Y_scaled)
if (np.remainder(i, 1000) == 0):
J = net.cost_online(0, 4*math.pi, 1000)
J_history.append(J)
step_history.append(i)
plt.plot(step_history, J_history)
plt.title('Batch size ' + str(m) + ', rate ' + str(net.rate) + ', samples ' + str(sampling_count))
#plt.ylim([0, 0.1])
plt.show()
#gained weights
trained_weights = net.weights
##########################################################################################
# PREDICTION
m_new = 40 #size of the prediction set
X_new = np.linspace(0, 4*math.pi, num = m_new).reshape(m_new, 1);
Y_new = net.predict(X_new, trained_weights) #prediction
#rescaling of the result
if (activation == 'sigmoid'):
Y_new = (2.0*Y_new - 1.0) * (1+kx)
elif (activation == 'tanh'):
Y_new = Y_new * (1+kx)
#visualization
#fake sine curve to show the ideal signal
if (method == 'online'):
X = np.linspace(0, 4*math.pi, num = 100)
Y = np.sin(X)
plt.plot(X, Y)
plt.plot(X_new, Y_new, 'ro')
if (method == 'online'):
plt.title('Batch size ' + str(m) + ', rate ' + str(net.rate) + ', samples ' + str(sampling_count))
plt.ylim([-1.5, 1.5])
plt.show()
raw_input('press any key to exit')
Now I have some remarks to your current code:
Your sine function looks like this:
def sine_example():
net = Neuralnet([1, 6, 1])
for step in range(100000):
x = np.random.normal()
net.feedforward([x])
net.backpropagate([np.tanh(np.sin(x))])
net.feedforward([3])
print(net.outputs[-1])
I don't know why you use tanh in your target input. If you really want to use tanh of sine as target, you need to scale it to [-1..1], because tanh(sin(x)) returns values in range [-0.76..0.76].
The next thing is the range of your training set. You use x = np.random.normal() to generate the samples. Here is the distribution of such an input:
After it you want your network to predict the sine of 3, but the network has almost never seen this number during the training stage. I would use the uniform distribution in a wider range for sample generation instead.

Categories