Training TensorFlow to predict a sum - python

The examples provided with TensorFlow are a little complicated for getting started, so I am trying to get TensorFlow to train a neural network to predict the sum of three binary digits. The network gets two of them as inputs; the third one is unknown. So an "optimal" network would guess that the sum will be the sum of the two known bits, plus 1/2 for the unknown bit. Let's say that the "loss" function is the square of the difference between the value predicted by the network and the actual value.
I have written code to generate the trials:
import tensorflow as tf
import numpy as np
from random import randint
# Command-line flags for this script, declared through tf.app.flags.
flags = tf.app.flags
FLAGS = flags.FLAGS
# Number of trials generated per batch (read by fill_feed_dict and run_training).
flags.DEFINE_integer('batch_size', 5, 'Batch size. ')
# NOTE(review): learning_rate, dim1 and training_epochs are declared here but
# are not read anywhere in this script yet.
flags.DEFINE_float('learning_rate', 0.01, 'Initial learning rate.')
flags.DEFINE_integer('dim1', 3, 'layer size')
flags.DEFINE_integer('training_epochs', 10, 'Number of passes through the main training loop')
def ezString(list):
    """Debugging helper: render a sequence as text.

    Every element is converted with ``str`` and prefixed by one space, so
    ``[1, 0]`` becomes ``' 1 0'`` and an empty sequence becomes ``''``.
    """
    return ''.join(' ' + str(item) for item in list)
def generateTrial():
    """Generate one training example and print it for debugging.

    Two random bits are the visible inputs; a third random bit is drawn but
    kept hidden from the network.  The label is the sum of all three bits.

    Returns:
        (inputTensor, sum): the two visible bits packed with ``tf.pack``, and
        the integer sum of the visible bits plus the hidden bit.
    """
    # ``np.int`` was only an alias of the builtin ``int`` and has since been
    # removed from NumPy; the builtin yields the same dtype on every version.
    inputs = np.zeros(2, dtype=int)
    for i in range(2):
        inputs[i] = randint(0, 1)
    unknownInput = randint(0, 1)
    total = inputs.sum() + unknownInput
    # NOTE(review): tf.pack appears to have been renamed tf.stack in later
    # TensorFlow releases - confirm against the installed version.
    inputTensor = tf.pack(inputs)
    # Single-argument print calls produce the same output under Python 2 and
    # Python 3, matching the print(...) calls used elsewhere in this script.
    print('inputs' + ezString(inputs))
    print('unknown ' + str(unknownInput))
    print('sum ' + str(total))
    print('')
    return inputTensor, total
def printTensor(tensor):
    """Print the shape reported by ``tensor.get_shape()``."""
    print(tensor.get_shape())
def placeholder_inputs(size):
    """Create the int32 placeholders for one batch of ``size`` trials.

    Returns:
        (input_placeholder, output_placeholder): a placeholder declared with
        shape ``(size, 2)`` for the visible bits and one declared with shape
        ``(size)`` for the sums.
    """
    # The sums placeholder is created first, as in the original, so the
    # automatically generated op names come out in the same order.
    sums_pl = tf.placeholder(tf.int32, shape=(size))
    bits_pl = tf.placeholder(tf.int32, shape=(size, 2))
    return bits_pl, sums_pl
def fill_feed_dict(inputs_pl, output_pl):
    """Generate ``FLAGS.batch_size`` trials and pack them into two tensors.

    NOTE(review): despite the name, no feed dict is built or returned.  The
    ``inputs_pl`` / ``output_pl`` arguments are never read; fresh placeholders
    are created, the packed batch is bound to local names only, and the
    function returns ``None``.
    """
    print('Filling feed dict')
    inputs_placeholder, output_placeholder = placeholder_inputs(FLAGS.batch_size)
    trial_inputs = []
    trial_sums = []
    for _ in range(FLAGS.batch_size):
        trial_input, trial_sum = generateTrial()
        # The original packs each trial input again and never uses the result;
        # the call is kept so the same ops are created.
        tf.pack(trial_input)
        trial_inputs.append(trial_input)
        trial_sums.append(trial_sum)
    inputs_placeholder = tf.pack(trial_inputs)
    outputs_placeholder = tf.pack(trial_sums)
def run_training():
    """Create the batch placeholders, generate one batch, print both shapes."""
    placeholders = placeholder_inputs(FLAGS.batch_size)
    fill_feed_dict(*placeholders)
    # Input placeholder first, then output placeholder.
    for placeholder in placeholders:
        printTensor(placeholder)


run_training()
The output suggests that this much is working:
Filling feed dict
inputs 1 0
unknown 0
sum 1
inputs 1 0
unknown 1
sum 2
inputs 0 1
unknown 1
sum 2
inputs 0 1
unknown 0
sum 1
inputs 0 0
unknown 0
sum 0
(5, 2)
(5,)
But I'm unclear on how I would finish it up. In particular, I need to define a loss function, and I also need to hook things up so that the outputs from my network get used to generate guesses for further training steps. Can anyone help?

I'm not sure whether this code is what you were looking for, but I hope you find it useful anyway. The mean squared error does decrease over the iterations, though I haven't tested it for making predictions, so that part is up to you!
import tensorflow as tf
import numpy as np
from random import randint
# Command-line flags for this script, declared through tf.app.flags.
flags = tf.app.flags
FLAGS = flags.FLAGS
flags.DEFINE_integer('batch_size', 50, 'Batch size.')
flags.DEFINE_float('learning_rate', 0.01, 'Initial learning rate.')
flags.DEFINE_integer('dim1', 3, 'layer size')
flags.DEFINE_integer('training_epochs', 10, 'Number of passes through the main training loop')
# Was ``flag.DEFINE_integer``: ``flag`` is not defined anywhere, so the script
# stopped here with a NameError before any function could run.
flags.DEFINE_integer('num_iters', 100, 'Number of iterations')
def ezString(list):
    """Debugging helper: join the ``str`` of every element, each preceded by a space.

    ``[1, 0]`` gives ``' 1 0'``; an empty sequence gives ``''``.
    """
    pieces = [' ' + str(entry) for entry in list]
    return ''.join(pieces)
def generateTrial():
    """Generate one training example.

    Two random bits are the visible inputs; a third random bit is drawn but
    not exposed.  The label is the sum of all three bits.

    Returns:
        (inputTensor, sum): a float array of shape ``(2,)`` holding the visible
        bits, and the sum of those bits plus the hidden bit.
    """
    # ``np.float`` was an alias of the builtin ``float`` and has since been
    # removed from NumPy; the builtin gives the same dtype.
    inputs = np.zeros(2, dtype=float)
    for i in range(2):
        inputs[i] = randint(0, 1)
    unknownInput = randint(0, 1)
    # The original initialised ``um = 0`` by mistake, so ``sum = sum + ...``
    # raised UnboundLocalError on the first addition.
    total = inputs.sum() + unknownInput
    return np.asarray(inputs), total
def printTensor(tensor):
    """Debugging helper: print whatever ``tensor.get_shape()`` returns."""
    static_shape = tensor.get_shape()
    print(static_shape)
def placeholder_inputs(size):
    """Create the float32 placeholders for one batch of ``size`` trials.

    Returns:
        (input_placeholder, output_placeholder): a placeholder declared with
        shape ``(size, 2)`` for the visible bits and one declared with shape
        ``(size)`` for the target sums.
    """
    # Targets first, inputs second: same creation order as the original.
    targets_pl = tf.placeholder(tf.float32, shape=(size))
    bits_pl = tf.placeholder(tf.float32, shape=(size, 2))
    return bits_pl, targets_pl
def fill_feed_dict(inputs_pl, output_pl):
    """Build a feed dict holding one freshly generated batch.

    ``FLAGS.batch_size`` trials are drawn from ``generateTrial``; the list of
    visible-bit arrays is mapped to ``inputs_pl`` and the list of sums to
    ``output_pl``.
    """
    trials = [generateTrial() for _ in range(FLAGS.batch_size)]
    return {
        inputs_pl: [bits for bits, _ in trials],
        output_pl: [total for _, total in trials],
    }
def loss(y, pred):
    """Mean squared error between targets ``y`` and predictions ``pred``.

    Both tensors are flattened to rank 1 before subtracting.  In this script
    the target placeholder is declared with shape ``(size)`` while ``NN``
    produces its prediction from a matmul with a ``[dim_hidden, 1]`` matrix,
    i.e. one column per sample.  Subtracting those directly broadcasts to a
    ``(batch, batch)`` matrix, so every prediction would be compared with
    every target instead of only its own.

    Args:
        y: target values.
        pred: predicted values, with the same number of elements as ``y``.

    Returns:
        Scalar tensor: the mean of the squared element-wise differences.
    """
    residual = tf.reshape(y, [-1]) - tf.reshape(pred, [-1])
    return tf.reduce_mean(tf.pow(residual, 2))
def NN(x, y, W1, b1, W2, b2):
    """One-hidden-layer ReLU network together with its loss.

    Computes ``relu(x W1 + b1) W2 + b2``.

    Returns:
        (output, loss): the prediction tensor and ``loss(y, output)``.
    """
    hidden = tf.nn.relu(tf.add(tf.matmul(x, W1), b1))
    prediction = tf.add(tf.matmul(hidden, W2), b2)
    return prediction, loss(y, prediction)
def get_params(dim_hidden):
    """Create the variables of the one-hidden-layer network.

    Args:
        dim_hidden: number of hidden units.

    Returns:
        (W1, b1, W2, b2): ``W1`` is ``[2, dim_hidden]`` and ``W2`` is
        ``[dim_hidden, 1]``, both drawn from a truncated normal with
        stddev 0.05; ``b1`` is ``[dim_hidden]`` and ``b2`` is ``[1]``, both
        zero-initialised.
    """
    with tf.variable_scope('nn_params'):
        W1 = tf.Variable(tf.truncated_normal([2, dim_hidden], stddev=0.05))
        # The second positional parameter of tf.Variable is ``trainable``, not
        # a shape, so the original ``tf.Variable(0.0, (dim_hidden))`` created a
        # single scalar bias shared by all hidden units.  The shape is given
        # through the initial value instead.
        b1 = tf.Variable(tf.zeros([dim_hidden]))
        W2 = tf.Variable(tf.truncated_normal([dim_hidden, 1], stddev=0.05))
        b2 = tf.Variable(tf.zeros([1]))
        return W1, b1, W2, b2
def run_training():
    """Build the network and run ``FLAGS.num_iters`` Adam steps.

    Each step draws a new random batch through ``fill_feed_dict`` and prints
    the batch loss followed by the 1-based iteration number.
    """
    input_placeholder, output_placeholder = placeholder_inputs(FLAGS.batch_size)
    W1, b1, W2, b2 = get_params(FLAGS.dim1)
    # Named ``loss_op`` so the module-level ``loss`` function is not shadowed.
    pred, loss_op = NN(input_placeholder, output_placeholder, W1, b1, W2, b2)
    optm = tf.train.AdamOptimizer(FLAGS.learning_rate).minimize(loss_op)
    init = tf.initialize_all_variables()
    # The context manager closes the session even when a step raises; the
    # original never closed it.
    with tf.Session() as sess:
        sess.run(init)
        for iters in range(FLAGS.num_iters):
            l, _ = sess.run(
                [loss_op, optm],
                feed_dict=fill_feed_dict(input_placeholder, output_placeholder))
            # Same text as the Python 2 statement ``print l, iters + 1``, but
            # valid under Python 3 as well.
            print('%s %s' % (l, iters + 1))

Related

Scipy fails to minimize cost function

Currently I'm learning from Andrew Ng course on Coursera called "Machine Learning". In exercise 5, we built a model that can predict digits, trained by the MNIST dataset. This task was completed successfully in Matlab by me, but I wanted to migrate that code to Python, just to see how different things are and maybe continue to play around with the model.
I managed to implement the cost function and the back propagation algorithm correctly. I know that because I compared the metrics with my working model in Matlab and it emits the same numbers.
Now, because in the course we train the model using fmincg, I tried to do the same using Scipy fmin_cg
function.
My problem is, the cost function takes extra small steps and fails to converge.
Here is my code for the network:
import numpy as np
import utils
import scipy.optimize as op
class Network:
    """Fully connected sigmoid network trained with SciPy's conjugate gradient.

    ``layers`` lists the number of units per layer, input layer first.
    ``weights`` holds one matrix per pair of consecutive layers, shaped
    ``(next_units, current_units + 1)``; column 0 multiplies the bias unit.
    """

    def __init__(self, layers):
        self.layers = layers
        self.weights = self.generate_params()

    def generate_params(self):
        """Return freshly initialised weight matrices, uniform in [-0.12, 0.12)."""
        epsilon = 0.12
        theta = []
        for current_layer_units, next_layer_units in zip(self.layers[:-1], self.layers[1:]):
            shape = (next_layer_units, current_layer_units + 1)
            # rand * 2*eps - eps.  The original multiplied by
            # ``2 * epsilon - epsilon`` (== epsilon), which produced only
            # non-negative weights in [0, eps).
            theta.append(np.random.rand(*shape) * 2 * epsilon - epsilon)
        return theta

    def append_bias(self, X, d):
        """Return a copy of ``X`` with a leading column or row of ones.

        A 1-D ``X`` of length m is treated as an ``(m, 1)`` column.

        Args:
            X: array to extend.
            d: ``'column'`` to prepend a column of ones, ``'row'`` to prepend
                a row of ones.

        Raises:
            ValueError: if ``d`` is neither ``'column'`` nor ``'row'``.
        """
        m = X.shape[0]
        n = 1 if len(X.shape) == 1 else X.shape[1]
        if d == 'column':
            ones = np.ones((m, n + 1))
            ones[:, 1:] = X.reshape((m, n))
        elif d == 'row':
            ones = np.ones((m + 1, n))
            ones[1:, :] = X.reshape((m, n))
        else:
            # Previously fell through to ``return ones`` with ``ones`` unbound.
            raise ValueError("d must be 'column' or 'row', got %r" % (d,))
        return ones

    def back_prop(self, y, feed, theta):
        """Compute the unregularised gradient for one training example.

        Args:
            y: target vector for the example (reshaped to a column here).
            feed: dict returned by ``feed_forward`` for the same example.
            theta: list of weight matrices the gradient is taken with
                respect to.

        Returns:
            List with one gradient array per matrix in ``theta``.
        """
        activations = feed["activations"]
        weighted_layers = feed["weighted_layers"]
        current_delta = activations[-1] - y.reshape(len(y), 1)
        gradients = [np.zeros(theta_i.shape) for theta_i in theta]
        # Walk from the output layer back towards the input layer.
        for i in reversed(range(len(theta))):
            # Column delta times row of activations broadcasts to their
            # outer product.
            gradients[i] = current_delta * np.transpose(activations[i])
            if i > 0:
                i_weighted_inputs = self.append_bias(weighted_layers[i - 1], 'row')
                delta_i = np.multiply(
                    np.dot(np.transpose(theta[i]), current_delta),
                    utils.sigmoidGradient(i_weighted_inputs))
                # Drop the bias row before moving on to the previous layer.
                current_delta = delta_i[1:]
        return gradients

    def compute_cost(self, theta, X, y, r12n = 0):
        """Return the regularised cross-entropy cost and its gradients.

        Args:
            theta: list of weight matrices to evaluate.
            X: training inputs, one example per row.
            y: label per example, passed to ``utils.create_output_vector``.
            r12n: regularisation strength.

        Returns:
            (total_cost, gradients): the average cost over the examples plus
            the regularisation term, and one gradient array per matrix in
            ``theta``.
        """
        m = len(X)
        num_labels = self.layers[-1]
        costs = np.zeros(m)
        gradients = [np.zeros(theta_i.shape) for theta_i in theta]
        for i in range(m):
            observed = utils.create_output_vector(y[i], num_labels)
            # The forward pass must use the ``theta`` being evaluated.  The
            # original called ``feed_forward(inputs)``, which always read
            # ``self.weights``; during optimisation the predictions therefore
            # never changed and only the regularisation term moved the cost.
            feed = self.feed_forward(X[i], theta)
            predicted = feed["activations"][-1]
            total_cost = 0
            for k, o in enumerate(observed):
                if o == 1:
                    total_cost += np.log(predicted[k])
                else:
                    total_cost += np.log(1 - predicted[k])
            costs[i] = -1 * total_cost
            for layer, gradient in enumerate(self.back_prop(observed, feed, theta)):
                gradients[layer] += gradient
        # Bias columns (column 0) are excluded from regularisation.
        sum_of_theta = 0
        for theta_i in theta:
            sum_of_theta += np.sum(np.power(theta_i[:, 1:], 2))
        r12n_avg = r12n * sum_of_theta / (2 * m)
        total_cost = np.sum(costs) / m + r12n_avg
        for i, theta_i in enumerate(theta):
            lambda_i = np.copy(theta_i)
            lambda_i[:, 0] = 0
            lambda_i = np.multiply((r12n / m), lambda_i)
            gradients[i] = gradients[i] / m + lambda_i
        return total_cost, gradients

    def train_cg(self, X, y, r12n = 0, iterations = 50):
        """Train with ``scipy.optimize.fmin_cg`` and store the result in ``self.weights``.

        The cost is printed every time the optimiser evaluates it.
        """
        def Cost(theta, X, y):
            rolled = utils.roll_theta(theta, self.layers)
            cost, _ = self.compute_cost(rolled, X, y, r12n)
            print(cost)
            return cost

        def Gradient(theta, X, y):
            rolled = utils.roll_theta(theta, self.layers)
            _, gradient = self.compute_cost(rolled, X, y, r12n)
            return utils.unroll_theta(gradient)

        unrolled_theta = utils.unroll_theta(self.weights)
        result = op.fmin_cg(f = Cost,
                            x0 = unrolled_theta,
                            args=(X, y),
                            fprime=Gradient,
                            maxiter = iterations)
        self.weights = utils.roll_theta(result, self.layers)

    def feed_forward(self, X, weights=None):
        """Propagate one example through the network.

        Args:
            X: input vector for a single example.
            weights: optional list of weight matrices to use instead of
                ``self.weights`` (used while the optimiser evaluates
                candidate parameters).

        Returns:
            Dict with ``"activations"`` (per-layer activations; every layer
            except the output has a bias unit prepended) and
            ``"weighted_layers"`` (pre-activation values of each non-input
            layer).
        """
        if weights is None:
            weights = self.weights
        activations = []
        weighted_layers = []
        currentActivations = self.append_bias(X, 'row')
        activations.append(currentActivations)
        last_hidden = len(self.layers) - 2
        for i in range(len(self.layers) - 1):
            weighted_inputs = np.dot(weights[i], currentActivations)
            weighted_layers.append(weighted_inputs)
            activation_nodes = utils.sigmoid(weighted_inputs)
            # Hidden layers get a bias unit; the output layer does not.
            if i < last_hidden:
                activation_nodes = self.append_bias(activation_nodes, 'row')
            activations.append(activation_nodes)
            currentActivations = activation_nodes
        return {
            "activations": activations,
            "weighted_layers": weighted_layers
        }

    def predict(self, X):
        """Return the index of the largest output activation for input ``X``."""
        data = self.feed_forward(X)
        output = data["activations"][-1]
        return np.argmax(output, axis=0)
Here is the invocation of the code:
import numpy as np
from network import Network
# %% Load data
# Inputs and labels are read from comma-separated files; labels are cast to int.
X = np.genfromtxt('data/mnist_data.csv', delimiter=',')
y = np.genfromtxt('data/mnist_outputs.csv', delimiter=',').astype(int)
# %% Create network
# Layer sizes: 400 input units, 25 hidden units, one output unit per label.
num_labels = 10
input_layer = 400
hidden_layer = 25
output_layer = num_labels
layers = [input_layer, hidden_layer, output_layer]
# Create a new neural network
network = Network(layers)
# %% Train the network and save the weights
# NOTE(review): train_cg stores the optimised weights on the network object;
# nothing in this script writes them to disk.
network.train_cg(X, y, r12n = 1, iterations = 20)
This is what the code emits after each iteration:
15.441233231650283
15.441116436313076
15.441192262452514
15.44122384651483
15.441231216030646
15.441232804294314
15.441233141284435
15.44123321255294
15.441233227614855
As you can see, the changes to the cost are very small.
I checked for the shapes of the vectors and gradient and they both seem fine, just like in my Matlab implementation. I'm not sure what I do wrong here.
If you guys could help me, that'd be great :)

For the deep network, assuming that each batch consists of two inputs (S and I), how to calculate whether each sample Si and all I match?

For the deep network, assuming that each batch consists of two inputs (S and I), how can I calculate whether each single sample Si matches each of the I (the batch examples)? I wrote the following program, which uses a TensorArray to calculate the attention of each Si over all I; S and I are related through a tan function. Ideally, each Si would pay the most attention to its corresponding I. But the end result after convergence is that Si pays the same attention to every I. Any advice?
import tensorflow as tf
from tensorflow.python.ops import tensor_array_ops
import tensorflow.contrib.layers as layers
import numpy as np
# NOTE(review): `batch_szie` looks like a typo for `batch_size`; it, word_len,
# word_emb_dim and feature_dim are not read anywhere in the visible script.
batch_szie = 64
word_len = 16
word_emb_dim = 512
feature_dim = 512
# Two float32 inputs with 1024 features per row, named "S" and "I" in the graph.
x = tf.placeholder(dtype=tf.float32,shape =[None,1024],name="S")
sentence = tf.placeholder(dtype=tf.float32,shape =[None,1024],name="I")
# target = tf.placeholder(dtype=tf.float32,shape=[None,64],name = "target")
# Batch size is read from the fed `sentence` tensor at run time; `labels` is an
# identity matrix of that size, and row i is used as the target for sample i.
batch_size = tf.shape(sentence)[0]
labels = tf.eye(batch_size)
# Fixed-size arrays collecting one loss and one attention value per sample.
# NOTE(review): the size is hard-coded to 64, matching the while_loop bound,
# rather than derived from `batch_size`.
loss_array = tensor_array_ops.TensorArray(dtype=tf.float32, size=64,dynamic_size=False, infer_shape=True)
attention_array = tensor_array_ops.TensorArray(dtype=tf.float32, size=64,dynamic_size=False, infer_shape=True)
# Separate fully connected projections (scopes "pre" and "s_pre") for each input.
# NOTE(review): with activation_fn commented out the library default applies -
# confirm what that default is.
x_pre = layers.fully_connected(
x,
num_outputs=1024,
# activation_fn=tf.nn.relu,
scope="pre",
reuse=tf.AUTO_REUSE)
sentence_tp = layers.fully_connected(
sentence,
num_outputs=1024,
# activation_fn=tf.nn.relu,
scope="s_pre",
reuse=tf.AUTO_REUSE)
def body(i, loss_array, attention_array):
    """Loop body for ``tf.while_loop``: score sample ``i`` against every row of ``sentence_tp``.

    The projected sample ``x_pre[i]`` is tiled across the batch and multiplied
    with each row of ``sentence_tp``, giving one logit per batch row.  The
    softmax cross-entropy against ``labels[i]`` and the softmax weight on the
    row selected by ``labels[i]`` are written at index ``i`` of the arrays.

    Returns:
        (i + 1, loss_array, attention_array) for the next iteration.
    """
    query = tf.expand_dims(tf.expand_dims(x_pre[i], 1), 0)
    # Comma restored between the tensor and ``multiples``.  Without it the
    # list was parsed as a subscript on the tensor and tf.tile was called
    # with a single argument.
    res = tf.tile(query, [batch_size, 1, 1])
    res = tf.matmul(tf.expand_dims(sentence_tp, 1), res)
    res = tf.reshape(res, [batch_size])
    attention = tf.reduce_sum(labels[i] * tf.nn.softmax(res, 0), 0)
    tp_loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits_v2(labels=labels[i], logits=res))
    loss_array = loss_array.write(i, tp_loss)
    attention_array = attention_array.write(i, attention)
    return i + 1, loss_array, attention_array
# Run `body` for indices 0..63, threading both TensorArrays through the loop.
_, loss_res, attention_res = tf.while_loop(
    cond=lambda i, _1, _2: i < 64,
    body=body,
    loop_vars=[tf.constant(0), loss_array, attention_array])
# Overall loss is the mean of the per-sample losses.
loss = tf.reduce_mean(loss_res.stack())
attention_all = attention_res.stack()

# Compute gradients explicitly so each (variable, gradient) pair can be printed
# before the update op is built.
vars = tf.trainable_variables()
dis_optimizer = tf.train.AdamOptimizer(learning_rate=0.001, beta1=0.5, beta2=0.9)
dis_grads = tf.gradients(loss, vars)
dis_grads_and_vars = list(zip(dis_grads, vars))
for grad, var in dis_grads_and_vars:
    print("var:", var, " ", grad)
dis_train_op = dis_optimizer.apply_gradients(grads_and_vars=dis_grads_and_vars)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for i in range(1000000):
        # Both inputs come from the same random matrix: sin feeds x, cos feeds sentence.
        org = np.random.uniform(low=0.0, high=100, size=(64, 1024))
        image = np.sin(org)
        s = np.cos(org)
        feed_dict = {x: image, sentence: s}
        a, _, lr = sess.run([attention_all, dis_train_op, loss], feed_dict)
        # Despite its name, `lr` holds the loss value fetched above.
        print(lr)
        for j in range(1):
            print("attention:", a)
It converges to a state where the attention on each example is 1/batch_size.

Why does cost function for MLP flatten?

I am very new to machine learning and am trying to implement an MLP however the cost function seems to be reaching a local minimum before reaching the global minimum. I plotted the cost as a function of iteration (including a 0 value as to not be fooled by where the y-axis starts). Here is the code that I am using at my attempt:
import numpy as np
class NNet(object):
    """Multilayer perceptron with one hidden layer and sigmoid activations.

    ``W1`` is ``(n_in, n_hidden)`` and ``W2`` is ``(n_hidden, n_out)``; the
    biases ``b1`` and ``b2`` are 1-D.  All four are drawn from a standard
    normal distribution.
    """

    def __init__(self, n_in, n_hidden, n_out):
        self.n_in, self.n_hidden, self.n_out = n_in, n_hidden, n_out
        # Draw order (W1, W2, b1, b2) is kept so seeded runs are reproducible.
        self.W1 = np.random.randn(n_in, n_hidden)
        self.W2 = np.random.randn(n_hidden, n_out)
        self.b1 = np.random.randn(n_hidden,)
        self.b2 = np.random.randn(n_out,)

    def sigmoid(self, z):
        """Logistic function 1 / (1 + e^-z)."""
        return 1 / (1 + np.exp(-z))

    def sig_prime(self, z):
        """Derivative of the logistic function, e^-z / (1 + e^-z)^2."""
        decay = np.exp(-z)
        return decay / ((1 + decay) ** 2)

    def propagate_forward(self, X):
        """Run one sample through the network, caching z1, a1, z2 and a2 on ``self``."""
        self.z1 = np.dot(self.W1.T, X) + self.b1
        self.a1 = self.sigmoid(self.z1)
        self.z2 = np.dot(self.W2.T, self.a1) + self.b2
        self.a2 = self.sigmoid(self.z2)
        return self.a2

    def cost(self, y, y_hat):
        """Half of the mean, over the rows of ``y``, of the summed squared error."""
        per_sample = []
        for i in range(y.shape[0]):
            per_sample.append(np.sum((y[i] - y_hat[i]) ** 2))
        return np.mean(per_sample) / 2

    def cost_grad(self, X, y):
        """Gradients for one sample.

        Returns:
            ``[gJ_W1, d1, gJ_W2, d2]``: the weight gradients are ``np.matrix``
            objects; ``d1`` and ``d2`` are the layer deltas, which the caller
            uses as the bias gradients.
        """
        y_hat = self.propagate_forward(X)
        delta_out = np.multiply(self.sig_prime(self.z2), -(y - y_hat))
        grad_W2 = np.matrix(np.multiply(self.a1.T, delta_out))
        delta_hidden = np.dot(self.W2, delta_out) * self.sig_prime(self.z1)
        grad_W1 = np.dot(np.matrix(X).T, np.matrix(delta_hidden))
        return [grad_W1, delta_hidden, grad_W2, delta_out]
import random
import math

# Toy dataset: m points along a noisy spiral, labelled 0 / 1 / 2 by position
# in the sequence.
m = 1000
# Each sample is a 2-D point.  The original ``n = 1`` made X one column wide,
# so every ``X[i] = [.., ..]`` assignment raised ValueError.
n = 2
X = np.zeros((m, n))
y = np.zeros((m, 1))
radii = np.linspace(0, 5, num=m)
angles = np.linspace(0, 8 * math.pi, num=m)
for i, (r, theta) in enumerate(zip(radii, angles)):
    # Jitter the radius by a uniform value in [0, 1).
    r += random.random()
    X[i] = [r * math.cos(theta), r * math.sin(theta)]
    if i < 333:
        y[i] = 0
    elif i < 666:
        y[i] = 1
    else:
        y[i] = 2
# Full-batch gradient descent on the toy dataset.
nnet = NNet(n, 5, 1)
learning_rate = 0.2
improvement_threshold = 0.995  # NOTE(review): defined but never read below
cost = np.inf
xs = []
ys = []
iter = 0
while cost > 0.2:
    # The comprehension was cut off after ``for x_train`` in the original,
    # which is a syntax error; it iterates over the rows of X.
    cost = nnet.cost(y, [nnet.propagate_forward(x_train) for x_train in X])
    # Record and report the cost every 100 iterations.
    if iter % 100 == 0:
        xs.append(iter)
        ys.append(cost)
        print("Cost", cost)
    if iter >= 1000:
        print("Gradient descent is taking too long, giving up.")
        break
    # Per-sample gradients, averaged over the dataset before each update.
    cost_grads = [nnet.cost_grad(x_train, y_train) for x_train, y_train in zip(X, y)]
    gW1 = [grad[0] for grad in cost_grads]
    gb1 = [grad[1] for grad in cost_grads]
    gW2 = [grad[2] for grad in cost_grads]
    gb2 = [grad[3] for grad in cost_grads]
    nnet.W1 -= np.mean(gW1, axis=0)/2 * learning_rate
    nnet.b1 -= np.mean(gb1, axis=0)/2 * learning_rate
    # gW2 entries are row matrices, hence the transpose to match W2.
    nnet.W2 -= np.mean(gW2, axis=0).T/2 * learning_rate
    nnet.b2 -= np.mean(gb2, axis=0)/2 * learning_rate
    iter += 1
Why is the cost not improving after a certain point? Also any other tips are highly appreciated.
The generated toy dataset looks like this
Your goal seems to be to predict which class {0, 1, 2} each data point belongs to.
The output of your net is a sigmoid (sigm(x) in [0,1]) and you're training using mean squared error (MSE), so it's impossible for the model to predict a value above 1. It is therefore always wrong when the class to predict is 2.
The cost probably flattens because your sigmoid units saturate (when trying to predict 2), and the gradient of a saturated sigmoid is approximately 0.
For classification, neural nets normally end with a softmax layer and are trained using cross-entropy.
If you want to keep using MSE and sigmoid units for classification, you should consider predicting only two classes at a time, in a One-vs-(One/All) fashion.
Anyway, if you only do two-class classification by rounding the output to 0 or 1, it seems to work. The cost is decreasing and the accuracy rising (quickly modified code):

Why is my LSTM in tensorflow learning so slowly and badly?

This program reads a text file RNNtext.txt, creates one-hot vector representation for all the data, trains the LSTM with the data and displays a bunch of sampled characters every now and then. However, even looking at the cost vs iterations graph shows that it's learning very very inefficiently. Honestly, the raw code (numpy) for the LSTM I have does a MUCH better job. It's not only faster but it produces mostly meaningful words. This produces gibberish only. Where is my mistake? I really am out of ideas and I can't seem to find where it is logically wrong.
import numpy as np
import random
import tensorflow as tf
import os
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
# Reading RNNtext.txt file, which is expected next to this script.
direc = os.path.dirname(os.path.realpath(__file__))
# ``with`` closes the handle; the original ``open(...).read()`` left it open.
with open(direc + "/RNNtext.txt", "r") as text_file:
    data = text_file.read()
# Array of unique characters
chars = list(set(data))
num_hidden = 80
iterations = 1000
display_iteration = 100  # Sample when iteration % display_iteration == 0
sample_size = 250
batch_size = 120  # batch size or the number of time steps to unroll RNN
alpha = 0.01  # Learning rate
# Vocabulary and text file sizes
vocab_size = len(chars)
data_size = len(data)
# Bijection from a unique character to an index
char_to_ix = {ch: j for j, ch in enumerate(chars)}
# Bijection from an index to a unique character
ix_to_char = dict(enumerate(chars))
# Transforming all characters to indices
data_ix = [char_to_ix[ch] for ch in data]
# One (vocab_size, 1) one-hot column vector per character of the text.
train_data = []
for ix in data_ix:
    hot1 = np.zeros((vocab_size, 1))
    hot1[ix] = 1
    train_data.append(hot1)
# Graph definition: an LSTM followed by a linear projection to vocabulary logits.
# NOTE(review): each fed example has shape (vocab_size, 1); confirm which axis
# the RNN is meant to treat as the time axis.
X = tf.placeholder(tf.float32, [None, vocab_size, 1]) #Number of examples, number of input, dimension of each input
target = tf.placeholder(tf.float32, [None, vocab_size])
cell = tf.contrib.rnn.LSTMCell(num_hidden,state_is_tuple=True)
# The second value returned by dynamic_rnn is discarded and no initial_state is
# passed in, so no state is handed from one sess.run call to the next.
output, _ = tf.nn.dynamic_rnn(cell, X, dtype = tf.float32)
# Swap the first two axes so that output[-1] picks the last index of what was axis 1.
output = tf.transpose(output, [1, 0, 2])
weight = tf.Variable(tf.random_normal([num_hidden, vocab_size]))
bias = tf.Variable(tf.constant(0.0, shape=[vocab_size]))
# Unnormalised scores (logits) over the vocabulary.
prediction = tf.matmul(output[-1], weight) + bias
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=target))
optimizer = tf.train.ProximalGradientDescentOptimizer(alpha)
minimize = optimizer.minimize(cost)
init = tf.global_variables_initializer()
# The session stays open for the training loop that follows.
sess = tf.Session()
sess.run(init)
# Index list handed to np.random.choice when drawing a character by probability.
ARR = list(range(vocab_size))
ITER = []
COST = []
p = 0  # start of the current window; advanced by batch_size each iteration
for i in range(iterations):
    # Wrap around once the window (plus its shifted targets) would run past the data.
    if p + batch_size >= data_size:
        p = 0
    # Targets are the inputs shifted forward by one character.
    inp = train_data[p:p + batch_size]
    out = np.reshape(train_data[p + 1:p + batch_size + 1], [-1, vocab_size])
    feed = {X: inp, target: out}
    # Cost is evaluated before the update step and kept for the plot.
    c = sess.run(cost, feed)
    COST.append(c)
    ITER.append(i)
    sess.run(minimize, feed)
    # Periodically sample sample_size characters from a random first
    # character; this does not affect training.
    if i % display_iteration == 0:
        seed = np.random.randint(0, vocab_size)
        CHARS = []
        for j in range(sample_size):
            x = np.zeros((vocab_size, 1))
            x[seed] = 1
            logits = sess.run(prediction, {X: [x]})[0]
            # Softmax over the logits, flattened for np.random.choice.
            pred = (np.exp(logits) / np.sum(np.exp(logits))).ravel()
            seed = np.random.choice(ARR, 1, p=pred)[0]
            CHARS.append(ix_to_char[seed])
        TXT = ''.join(CHARS)
        print("-------------------------------------------------")
        print(TXT)
        print("Iteration: ", str(i))
    p += batch_size
sess.close()
plt.plot(ITER, COST)
plt.show()
EDIT: Added numpy code for comparison
import numpy as np
import matplotlib.pyplot as plt
import os
plt.style.use('fivethirtyeight')
direc = os.path.dirname(os.path.realpath(__file__))
# os.path.join replaces the literal "\RNNtext.txt": "\R" is an invalid escape
# sequence and a hard-coded backslash only works as a separator on Windows.
with open(os.path.join(direc, "RNNtext.txt"), 'r') as readFile:
    data = readFile.read()
chars = list(set(data))
data_size, vocab_size = len(data), len(chars)
print(chars)
print("Vocabulary size: " + str(vocab_size))
# Two-way mapping between characters and integer indices.
char_to_ix = {ch: j for j, ch in enumerate(chars)}
ix_to_char = dict(enumerate(chars))
hidden_size = 80
batch_size = 120
alpha = 0.1
sample_size = 250
iterations = 1000
display_iteration = 100
Wxh = np.random.randn(hidden_size, vocab_size)*0.01  # input to hidden
Whh = np.random.randn(hidden_size, hidden_size)*0.01  # hidden to hidden
Why = np.random.randn(vocab_size, hidden_size)*0.01  # hidden to output
bh = np.zeros((hidden_size, 1))  # hidden bias
by = np.zeros((vocab_size, 1))  # output bias
def sample(hid, seed, weights, sample_size):
    """Sample ``sample_size`` characters from the RNN and return them as text.

    Args:
        hid: hidden state to start from.
        seed: index of the first input character.
        weights: accepted but never read; the module-level ``Wxh``, ``Whh``,
            ``Why``, ``bh`` and ``by`` are used instead.
        sample_size: number of characters to draw.
    """
    one_hot = np.zeros((vocab_size, 1))
    one_hot[seed] = 1
    picked = []
    indices = list(range(vocab_size))
    for _ in range(sample_size):
        hid = np.tanh(np.dot(Wxh, one_hot) + np.dot(Whh, hid) + bh)
        scores = np.dot(Why, hid) + by
        # Softmax over the scores, flattened for np.random.choice.
        prob = (np.exp(scores) / np.sum(np.exp(scores))).ravel()
        ix = np.random.choice(indices, 1, p=prob)[0]
        picked.append(ix_to_char[ix])
        # The drawn character becomes the next input.
        one_hot = np.zeros((vocab_size, 1))
        one_hot[ix] = 1
    return ''.join(picked)
LOSS = []
ITER = []
p = 0
mWxh, mWhh, mWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
mbh, mby = np.zeros_like(bh), np.zeros_like(by)  # memory variables for Adagrad
smooth_loss = -np.log(1.0/vocab_size)*batch_size  # loss at iteration 0
hprev = np.zeros((hidden_size, 1))
# Bound before the loop so the final sample() call cannot hit a NameError.
# The arrays are updated in place, so this list always sees current values.
weights = [Wxh, Whh, Why, bh, by]
for i in range(iterations):
    dWxh = np.zeros_like(Wxh)
    dWhh = np.zeros_like(Whh)
    dWhy = np.zeros_like(Why)
    dbh = np.zeros_like(bh)
    dby = np.zeros_like(by)
    # Restart from the beginning of the text with a fresh hidden state.
    if p + batch_size >= len(data) or i == 0:
        hprev = np.zeros((hidden_size, 1))
        p = 0
    inputs = [char_to_ix[ch] for ch in data[p:p+batch_size]]
    targets = [char_to_ix[ch] for ch in data[p+1:p+batch_size+1]]
    # Per-time-step caches, keyed by t.  Key -1 holds the state the window
    # starts from.
    HID = {}
    X = {}
    Y = {}
    P = {}
    HID[-1] = np.copy(hprev)
    loss = 0
    ##======FORWARD======##
    for t in range(len(inputs)):
        X[t] = np.zeros((vocab_size, 1))
        X[t][inputs[t]] = 1
        HID[t] = np.tanh(np.dot(Wxh, X[t]) + np.dot(Whh, HID[t-1]) + bh)
        Y[t] = np.dot(Why, HID[t]) + by
        P[t] = np.exp(Y[t]) / np.sum(np.exp(Y[t]))
        loss += -np.log(P[t][targets[t]][0])
    dhnext = np.zeros_like(HID[0])
    ##======BACKPROP======##
    for t in reversed(range(len(inputs))):
        dy = np.copy(P[t])
        dy[targets[t]] -= 1
        dh = (np.dot(Why.T, dy) + dhnext)*(1-HID[t]*HID[t])
        dWhy += np.dot(dy, HID[t].T)
        dWhh += np.dot(dh, HID[t-1].T)
        dWxh += np.dot(dh, X[t].T)
        dby += dy
        dbh += dh
        dhnext = np.dot(Whh.T, dh)
    ##=====================##
    # HID is a dict, so HID[-1] is the *initial* state stored above, not the
    # last entry.  The original ``hprev = HID[-1]`` therefore never carried
    # the hidden state into the next window.
    hprev = HID[len(inputs) - 1]
    smooth_loss = smooth_loss * 0.999 + loss * 0.001
    for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
        np.clip(dparam, -1, 1, out=dparam)  # clip to mitigate exploding gradients
    for param, dparam, mem in zip([Wxh, Whh, Why, bh, by],
                                  [dWxh, dWhh, dWhy, dbh, dby],
                                  [mWxh, mWhh, mWhy, mbh, mby]):
        mem += dparam * dparam
        param += -alpha * dparam / np.sqrt(mem + 1e-8)  # Adagrad
    if i % display_iteration == 0:
        print(str(i))
        seed = inputs[np.random.randint(0, len(inputs))]
        # Sample from the most recent hidden state (see the note on HID[-1]).
        TXT = sample(hprev, seed, weights, sample_size)
        print("-----------------------------------------------")
        print(TXT)
        print("-----------------------------------------------")
        # os.path.join replaces the Windows-only "\RNNout.txt" literal.
        with open(os.path.join(direc, "RNNout.txt"), 'w') as writeFile:
            writeFile.write(TXT)
    ITER.append(i)
    LOSS.append(loss)
    p += batch_size
best_text = sample(hprev, inputs[0], weights, sample_size)
plt.plot(ITER, LOSS, linewidth = 1)
plt.show()
# No trailing writeFile.close(): the ``with`` block already closed the file.
Well, doh... looks like you are not re-using the state! How is LSTM (state machine) supposed to work properly if you are not maintaining the state?
To me this looks like a red flag:
output, _ = tf.nn.dynamic_rnn(cell, X, dtype = tf.float32)
the second output from tf.nn.dynamic_rnn is the latest state after the given sequence has been processed. Looks like you are explicitly ignoring it and not re-feeding it into each following iteration of training in sess.run(...) (and hence your dynamic_rnn doesn't have the initial_state parameter).
I would highly recommend changing that part of your code before looking any further.
Also, I don't know what your data looks like, but your feeding and batching strategy needs to be such as to make sense out of this whole state-passing exercise. Otherwise, once again, it will just produce gibberish.
With the information provided, I would suggest these two initial steps to try to improve the model.
Increase the number of iterations. Recurrent neural networks work differently from other deep architectures and may need an additional order of magnitude more iterations to settle.
Play with the seeds: in my experience, getting meaningful sequences can depend on the quality of the seeds used.

Neural network backpropagation algorithm not working in Python

I am writing a neural network in Python, following the example here. It seems that the backpropagation algorithm isn't working, given that the neural network fails to produce the right value (within a margin of error) after being trained 10 thousand times. Specifically, I am training it to compute the sine function in the following example:
import numpy as np
class Neuralnet:
    """Bias-free feed-forward network with tanh applied on every layer.

    ``neurons`` lists the layer sizes.  ``weights[k]`` has shape
    ``[neurons[k], neurons[k + 1]]``.  Training is plain per-sample gradient
    descent with step size ``rate``.
    """

    def __init__(self, neurons):
        self.rate = .1
        # Per-layer buffers; their contents are overwritten by
        # feedforward/backpropagate.
        self.inputs = [np.empty(size) for size in neurons]
        self.outputs = [np.empty(size) for size in neurons]
        self.errors = [np.empty(size) for size in neurons]
        self.weights = [
            np.random.normal(scale=1/np.sqrt(fan_in), size=[fan_in, fan_out])
            for fan_in, fan_out in zip(neurons[:-1], neurons[1:])
        ]

    def feedforward(self, inputs):
        """Propagate ``inputs`` through the network; the result ends up in ``outputs[-1]``."""
        self.inputs[0] = inputs
        for layer, weight in enumerate(self.weights):
            self.outputs[layer] = np.tanh(self.inputs[layer])
            self.inputs[layer + 1] = np.dot(weight.T, self.outputs[layer])
        self.outputs[-1] = np.tanh(self.inputs[-1])

    def backpropagate(self, targets):
        """Take one gradient step towards ``targets`` for the last ``feedforward`` call."""
        result = self.outputs[-1]
        self.errors[-1] = (1 - result * result) * (result - targets)
        # Propagate the error back to every earlier layer before touching the weights.
        for layer in range(len(self.errors) - 2, -1, -1):
            slope = 1 - self.outputs[layer] * self.outputs[layer]
            self.errors[layer] = slope * np.dot(self.weights[layer], self.errors[layer + 1])
        for layer in range(len(self.weights)):
            step = np.outer(self.outputs[layer], self.errors[layer + 1])
            self.weights[layer] -= self.rate * step
def xor_example():
    """Train a 2-2-1 network on XOR with targets in {-1, 1}, then print its output for [1, 1]."""
    net = Neuralnet([2, 2, 1])
    # The four XOR cases, presented in the same fixed order on every pass.
    samples = (
        ([0, 0], [-1]),
        ([0, 1], [1]),
        ([1, 0], [1]),
        ([1, 1], [-1]),
    )
    for _ in range(100000):
        for bits, target in samples:
            net.feedforward(bits)
            net.backpropagate(target)
    net.feedforward([1, 1])
    print(net.outputs[-1])
def identity_example():
    """Train a 1-3-1 network to map x to tanh(x), then print its output for -2."""
    net = Neuralnet([1, 3, 1])
    for _ in range(100000):
        # One standard-normal sample per step.
        sample_x = np.random.normal()
        net.feedforward([sample_x])
        net.backpropagate([np.tanh(sample_x)])
    net.feedforward([-2])
    print(net.outputs[-1])
def sine_example():
    """Train a 1-6-1 network to map x to tanh(sin(x)), then print its output for 3."""
    net = Neuralnet([1, 6, 1])
    for _ in range(100000):
        # One standard-normal sample per step.
        sample_x = np.random.normal()
        target = np.tanh(np.sin(sample_x))
        net.feedforward([sample_x])
        net.backpropagate([target])
    net.feedforward([3])
    print(net.outputs[-1])


sine_example()
The output fails to be close to tanh(sin(3)) = 0.140190616. I suspected a mistake involving wrong indices or alignment, but Numpy isn't raising any errors like these. Any tips on where I went wrong?
EDIT: I forgot to add the bias neurons. Here is the updated code:
import numpy as np
class Neuralnet:
    """Feed-forward network with tanh hidden units, bias terms and a linear output.

    ``neurons`` lists the layer sizes.  ``weights[k]`` has shape
    ``[neurons[k], neurons[k + 1]]`` and ``offsets[k]`` holds the bias added
    to layer ``k + 1``.  The network's prediction is ``inputs[-1]``;
    ``outputs`` has one entry fewer than ``inputs`` because no activation is
    applied to the last layer.
    """

    def __init__(self, neurons):
        self.rate = .01
        self.weights, self.offsets = [], []
        self.outputs, self.inputs, self.errors = [], [], []
        for fan_in, fan_out in zip(neurons[:-1], neurons[1:]):
            spread = 1/np.sqrt(fan_in)
            # Weights are drawn before offsets for each layer, so seeded runs
            # reproduce the same values.
            self.weights.append(np.random.normal(scale=spread, size=[fan_in, fan_out]))
            self.offsets.append(np.random.normal(scale=spread, size=fan_out))
            self.outputs.append(np.empty(fan_in))
            self.inputs.append(np.empty(fan_in))
            self.errors.append(np.empty(fan_in))
        # The final layer has inputs and errors but no activation output.
        self.inputs.append(np.empty(neurons[-1]))
        self.errors.append(np.empty(neurons[-1]))

    def feedforward(self, inputs):
        """Propagate ``inputs`` through the network; the prediction ends up in ``inputs[-1]``."""
        self.inputs[0] = inputs
        for layer, (weight, offset) in enumerate(zip(self.weights, self.offsets)):
            self.outputs[layer] = np.tanh(self.inputs[layer])
            self.inputs[layer + 1] = offset + np.dot(weight.T, self.outputs[layer])

    def backpropagate(self, targets):
        """Take one gradient step towards ``targets`` for the last ``feedforward`` call."""
        self.errors[-1] = self.inputs[-1] - targets
        # Propagate the error back to every earlier layer before updating parameters.
        for layer in range(len(self.errors) - 2, -1, -1):
            slope = 1 - self.outputs[layer] * self.outputs[layer]
            self.errors[layer] = slope * np.dot(self.weights[layer], self.errors[layer + 1])
        for layer in range(len(self.weights)):
            downstream = self.errors[layer + 1]
            self.weights[layer] -= self.rate * np.outer(self.outputs[layer], downstream)
            self.offsets[layer] -= self.rate * downstream
def sine_example():
    """Train a 1-5-1 network to map x to sin(x) on [-5, 5), then print its prediction for pi."""
    net = Neuralnet([1, 5, 1])
    for _ in range(10000):
        sample_x = np.random.uniform(-5, 5)
        target = np.sin(sample_x)
        net.feedforward([sample_x])
        net.backpropagate([target])
    net.feedforward([np.pi])
    # The output layer is linear, so the prediction is the last layer's input.
    print(net.inputs[-1])
def xor_example():
    """Train a 2-2-1 network on XOR with targets in {-1, 1}, then print its prediction for [1, 1]."""
    net = Neuralnet([2, 2, 1])
    # The four XOR cases, presented in the same fixed order on every pass.
    samples = (
        ([0, 0], [-1]),
        ([0, 1], [1]),
        ([1, 0], [1]),
        ([1, 1], [-1]),
    )
    for _ in range(10000):
        for bits, target in samples:
            net.feedforward(bits)
            net.backpropagate(target)
    net.feedforward([1, 1])
    # In this version of Neuralnet the output layer is linear and the
    # prediction lives in ``inputs[-1]`` (as sine_example prints).  The
    # original printed ``outputs[-1]``, which holds the activations of the
    # last hidden layer.
    print(net.inputs[-1])
def identity_example():
    """Train the network to reproduce its input (x drawn from a standard
    normal distribution) and print its prediction for ``-2``.
    """
    net = Neuralnet([1, 3, 1])
    for _ in range(10000):
        x = np.random.normal()
        net.feedforward([x])
        net.backpropagate([x])
    net.feedforward([-2])
    # The prediction is the last layer's value in ``inputs[-1]`` (the same
    # quantity backpropagate compares with the target).  ``outputs[-1]`` only
    # holds the hidden-layer activations, because Neuralnet keeps no
    # ``outputs`` entry for the final layer.
    print(net.inputs[-1])


identity_example()
I think you train the NN in the wrong way. You have a loop over 10000 iterations and feed a new sample in each cycle. The NN will never get trained in this case.
(the statement is wrong! See the update! )
What you need to do is to generate a large array of true samples Y = sin(X), give it to your network ONCE and iterate over the training set forwards and backwards, in order to minimize the cost function. To check the algorithm you may need to plot the cost function depending on the iteration number and make sure the cost goes down.
Another important point is the initialization of the weights. Your numbers are pretty large and the network will take a lot of time to converge, especially when using low rates. It's a good practice to generate the initial weights in some small range [-eps .. eps] uniformly.
In my code I implemented two different activation functions: sigmoid() and tanh(). Because the output neuron uses the same activation, you need to scale your targets into the range of the selected function: [0 .. 1] and [-1 .. 1] respectively.
Here are some images which show the cost function and the resulting predictions for sigmoid() and tanh() activation functions:
As you can see, the sigmoid() activation gives slightly better results than tanh().
Also I got much better predictions when using a network [1, 6, 1], compared to a bigger network with 4 layers [1, 6, 4, 1]. So the size of the NN is not always the crucial factor. Here is the prediction for the mentioned network with 4 layers:
Here is my code with some comments. I tried to use your notations where it was possible.
import numpy as np
import math
import matplotlib.pyplot as plt
class Neuralnet:
    """Fully connected network with one bias unit per layer, trained by
    batch gradient descent.

    ``weights[i]`` has shape ``(neurons[i + 1], neurons[i] + 1)``; column 0
    holds the bias weights.  ``outputs[i]`` stores the activations of layer
    ``i`` with a leading column of ones (except for the last layer), and
    ``inputs[i]`` stores the pre-activations.  The same activation
    (``'sigmoid'`` or ``'tanh'``) is used on every layer, including the
    output layer.
    """

    def __init__(self, neurons, activation):
        """Create the network.

        Args:
            neurons: layer sizes, input layer first (e.g. ``[1, 6, 1]``).
            activation: ``'sigmoid'`` or ``'tanh'``.
        """
        self.weights = []
        self.inputs = []
        self.outputs = []
        self.errors = []
        self.rate = 0.5
        self.activation = activation  # 'sigmoid' or 'tanh'
        self.neurons = neurons
        self.L = len(self.neurons)  # number of layers
        eps = 0.12  # weights are drawn uniformly from [-eps, eps]
        for layer in range(self.L - 1):
            self.weights.append(
                np.random.uniform(
                    -eps, eps, size=(neurons[layer + 1], neurons[layer] + 1)
                )
            )

    def _pad_buffers(self):
        """Make sure every per-layer list has at least ``L`` slots."""
        for buf in (self.inputs, self.outputs, self.errors):
            buf.extend([None] * (self.L - len(buf)))

    def _activate(self, z):
        """Apply the configured activation to the pre-activations ``z``."""
        if self.activation == 'sigmoid':
            return self.sigmoid(z)
        if self.activation == 'tanh':
            return np.tanh(z)
        raise ValueError("unknown activation: %r" % (self.activation,))

    def _hidden_slope(self, i):
        """Return the activation derivative of hidden layer ``i``."""
        if self.activation == 'sigmoid':
            return self.sigmoid_prime(self.inputs[i])
        if self.activation == 'tanh':
            hidden = self.outputs[i][:, 1:]  # drop the bias column
            return 1 - hidden * hidden
        raise ValueError("unknown activation: %r" % (self.activation,))

    def train(self, X, Y, iter_count):
        """Run ``iter_count`` full-batch gradient steps and plot the cost.

        Args:
            X: array of shape ``(m, neurons[0])``.
            Y: targets of shape ``(m, neurons[-1])``.
            iter_count: number of gradient steps.

        Returns:
            Array of shape ``(iter_count, 1)`` with the cost measured before
            each step.
        """
        # Start from fresh buffers so repeated calls do not grow the lists.
        self.inputs = [None] * self.L
        self.outputs = [None] * self.L
        self.errors = [None] * self.L
        # accumulate the cost function
        J_history = np.zeros([iter_count, 1])
        for i in range(iter_count):
            self.feedforward(X)
            J_history[i, 0] = self.cost(Y, self.outputs[self.L - 1])
            self.backpropagate(Y)
        # plot the cost function to check the descent
        plt.plot(J_history)
        plt.show()
        return J_history

    def cost(self, Y, H):
        """Return the halved mean squared error between ``Y`` and ``H``.

        The sample count is taken from ``Y`` itself rather than from a
        module-level variable.
        """
        m = Y.shape[0]
        return np.sum(np.power(Y - H, 2)) / (2 * m)

    def feedforward(self, X):
        """Propagate the batch ``X``; the result is ``outputs[L - 1]``."""
        self._pad_buffers()
        m = X.shape[0]
        bias = np.ones([m, 1])
        self.outputs[0] = np.concatenate((bias, X), axis=1)
        for i in range(1, self.L):
            self.inputs[i] = np.dot(self.outputs[i - 1], self.weights[i - 1].T)
            activated = self._activate(self.inputs[i])
            if i < self.L - 1:
                self.outputs[i] = np.concatenate((bias, activated), axis=1)
            else:
                self.outputs[i] = activated

    def backpropagate(self, Y):
        """Take one gradient step for the batch last passed to feedforward.

        The output-layer error is ``outputs - Y`` with no activation
        derivative applied; hidden layers use the derivative of the
        configured activation.  All errors are computed before any weight
        is changed.
        """
        self._pad_buffers()
        m = Y.shape[0]
        last = self.L - 1
        self.errors[last] = self.outputs[last] - Y
        for i in range(self.L - 2, 0, -1):
            # Column 0 of the weights belongs to the bias unit, which has no
            # incoming error.
            back = np.dot(self.errors[i + 1], self.weights[i][:, 1:])
            self.errors[i] = back * self._hidden_slope(i)
        for i in range(last):
            grad = np.dot(self.errors[i + 1].T, self.outputs[i]) / m
            self.weights[i] = self.weights[i] - self.rate * grad

    def sigmoid(self, z):
        """Logistic function ``1 / (1 + exp(-z))``."""
        return 1.0 / (1.0 + np.exp(-z))

    def sigmoid_prime(self, z):
        """Derivative of the logistic function at ``z``."""
        s = self.sigmoid(z)
        return s * (1 - s)

    def predict(self, X, weights):
        """Install ``weights`` and return the network output for ``X``."""
        self.inputs = [None] * self.L
        self.outputs = [None] * self.L
        self.weights = weights
        self.feedforward(X)
        return self.outputs[self.L - 1]
###################################################################################################
# MAIN PART
# Fit a noisy sine with full-batch training, then plot the predictions.
activation1 = 'sigmoid'  # targets are scaled into [0..1]
activation2 = 'tanh'  # targets are scaled into [-1..1]
activation = activation1
# structure of the NN and its activation function
net = Neuralnet([1, 6, 1], activation)

# TRAINING
# NOTE: Neuralnet.cost and Neuralnet.backpropagate read this module-level
# name, so it must keep the name ``m``.
m = 1000  # size of the training set
X = np.linspace(0, 4 * math.pi, num=m).reshape(m, 1)  # input training set
kx = 0.1  # noise parameter
uniform_draw = np.random.uniform(0, kx, m)
noise = (2.0 * uniform_draw - kx).reshape(m, 1)
Y = np.sin(X)  # target
Y = Y + noise  # noisy target
# Bring the target into the output range of the activation function.
if activation == 'tanh':
    Y_scaled = Y / (1 + kx)
elif activation == 'sigmoid':
    Y_scaled = (Y / (1 + kx) + 1) / 2.0
iter_count = 20000  # number of training iterations
net.train(X, Y_scaled, iter_count)
trained_weights = net.weights  # gained weights

# PREDICTION
m_new = 40  # size of the prediction set
X_new = np.linspace(0, 4 * math.pi, num=m_new).reshape(m_new, 1)
Y_new = net.predict(X_new, trained_weights)
# Undo the target scaling.
if activation == 'tanh':
    Y_new = Y_new * (1 + kx)
elif activation == 'sigmoid':
    Y_new = (2.0 * Y_new - 1.0) * (1 + kx)

# visualization
plt.plot(X, Y)
plt.plot(X_new, Y_new, 'ro')
plt.show()
raw_input('press any key to exit')
UPDATE
I would like to take back my statement about the training method used in your code. The network can indeed be trained using only one sample per iteration. I got interesting results with online training using both the sigmoid and tanh activation functions:
Online-training using Sigmoid (cost function and prediction)
Online-training using Tanh (cost function and prediction)
As can be seen, choosing sigmoid as the activation function gives better performance. The cost function does not look as good as it did during offline training, but at least it tends to go down.
I plotted the cost function in your implementation, it looks pretty jerky as well:
Maybe it is a good idea to try your code with the sigmoid or even the ReLU function.
Here is the updated source code. To switch between online and offline training modes just change the method variable.
import numpy as np
import math
import matplotlib.pyplot as plt
class Neuralnet:
    """Fully connected network with one bias unit per layer, usable for
    both full-batch (``train``) and sample-by-sample training.

    ``weights[i]`` has shape ``(neurons[i + 1], neurons[i] + 1)``; column 0
    holds the bias weights.  ``outputs[i]`` stores the activations of layer
    ``i`` with a leading column of ones (except for the last layer), and
    ``inputs[i]`` stores the pre-activations.  The same activation
    (``'sigmoid'`` or ``'tanh'``) is used on every layer, including the
    output layer.
    """

    def __init__(self, neurons, activation):
        """Create the network.

        Args:
            neurons: layer sizes, input layer first (e.g. ``[1, 6, 1]``).
            activation: ``'sigmoid'`` or ``'tanh'``.
        """
        self.weights = []
        self.inputs = []
        self.outputs = []
        self.errors = []
        self.rate = 0.2
        self.activation = activation  # 'sigmoid' or 'tanh'
        self.neurons = neurons
        self.L = len(self.neurons)  # number of layers
        eps = 0.12  # weights are drawn uniformly from [-eps, eps]
        for layer in range(self.L - 1):
            self.weights.append(
                np.random.uniform(
                    -eps, eps, size=(neurons[layer + 1], neurons[layer] + 1)
                )
            )

    def _pad_buffers(self):
        """Make sure every per-layer list has at least ``L`` slots."""
        for buf in (self.inputs, self.outputs, self.errors):
            buf.extend([None] * (self.L - len(buf)))

    def _activate(self, z):
        """Apply the configured activation to the pre-activations ``z``."""
        if self.activation == 'sigmoid':
            return self.sigmoid(z)
        if self.activation == 'tanh':
            return np.tanh(z)
        raise ValueError("unknown activation: %r" % (self.activation,))

    def _hidden_slope(self, i):
        """Return the activation derivative of hidden layer ``i``."""
        if self.activation == 'sigmoid':
            return self.sigmoid_prime(self.inputs[i])
        if self.activation == 'tanh':
            hidden = self.outputs[i][:, 1:]  # drop the bias column
            return 1 - hidden * hidden
        raise ValueError("unknown activation: %r" % (self.activation,))

    def train(self, X, Y, iter_count):
        """Run ``iter_count`` full-batch gradient steps.

        Args:
            X: array of shape ``(m, neurons[0])``.
            Y: targets of shape ``(m, neurons[-1])``.
            iter_count: number of gradient steps.

        Returns:
            Array of shape ``(iter_count, 1)`` with the cost measured before
            each step, so the caller can plot it to check the descent.
        """
        # Start from fresh buffers so repeated calls do not grow the lists.
        self.inputs = [None] * self.L
        self.outputs = [None] * self.L
        self.errors = [None] * self.L
        # accumulate the cost function
        J_history = np.zeros([iter_count, 1])
        for i in range(iter_count):
            self.feedforward(X)
            J_history[i, 0] = self.cost(Y, self.outputs[self.L - 1])
            self.backpropagate(Y)
        return J_history

    def cost(self, Y, H):
        """Return the halved mean squared error between ``Y`` and ``H``.

        The sample count is taken from ``Y`` itself rather than from a
        module-level variable.
        """
        m = Y.shape[0]
        return np.sum(np.power(Y - H, 2)) / (2 * m)

    def cost_online(self, min_x, max_x, iter_number, target_fn=np.sin):
        """Estimate the cost on ``iter_number`` fresh random samples.

        Args:
            min_x, max_x: bounds of the uniform distribution the samples are
                drawn from.
            iter_number: number of samples.
            target_fn: maps the ``(iter_number, 1)`` sample array to the
                expected network output.  It must produce values on the same
                scale the network was trained on; the default is the raw
                ``np.sin``.

        Returns:
            The halved mean squared error over the samples.

        All samples are evaluated in one batch, which leaves that batch in
        the ``inputs`` / ``outputs`` buffers; call ``feedforward`` again
        before ``backpropagate``.
        """
        x = np.random.uniform(min_x, max_x, iter_number).reshape(iter_number, 1)
        self.feedforward(x)
        h = self.outputs[self.L - 1]
        y = target_fn(x)
        return np.sum(np.power(y - h, 2)) / (2 * iter_number)

    def feedforward(self, X):
        """Propagate the batch ``X``; the result is ``outputs[L - 1]``."""
        self._pad_buffers()
        m = X.shape[0]
        bias = np.ones([m, 1])
        self.outputs[0] = np.concatenate((bias, X), axis=1)
        for i in range(1, self.L):
            self.inputs[i] = np.dot(self.outputs[i - 1], self.weights[i - 1].T)
            activated = self._activate(self.inputs[i])
            if i < self.L - 1:
                self.outputs[i] = np.concatenate((bias, activated), axis=1)
            else:
                self.outputs[i] = activated

    def backpropagate(self, Y):
        """Take one gradient step for the batch last passed to feedforward.

        The output-layer error is ``outputs - Y`` with no activation
        derivative applied; hidden layers use the derivative of the
        configured activation.  All errors are computed before any weight
        is changed.
        """
        self._pad_buffers()
        m = Y.shape[0]
        last = self.L - 1
        self.errors[last] = self.outputs[last] - Y
        for i in range(self.L - 2, 0, -1):
            # Column 0 of the weights belongs to the bias unit, which has no
            # incoming error.
            back = np.dot(self.errors[i + 1], self.weights[i][:, 1:])
            self.errors[i] = back * self._hidden_slope(i)
        for i in range(last):
            grad = np.dot(self.errors[i + 1].T, self.outputs[i]) / m
            self.weights[i] = self.weights[i] - self.rate * grad

    def sigmoid(self, z):
        """Logistic function ``1 / (1 + exp(-z))``."""
        return 1.0 / (1.0 + np.exp(-z))

    def sigmoid_prime(self, z):
        """Derivative of the logistic function at ``z``."""
        s = self.sigmoid(z)
        return s * (1 - s)

    def predict(self, X, weights):
        """Install ``weights`` and return the network output for ``X``."""
        self.inputs = [None] * self.L
        self.outputs = [None] * self.L
        self.weights = weights
        self.feedforward(X)
        return self.outputs[self.L - 1]
###################################################################################################
# MAIN PART
# Fit a noisy sine either with full-batch ('offline') or sample-by-sample
# ('online') training, then plot the predictions against the ideal curve.
activation1 = 'sigmoid'  # targets are scaled into [0..1]
activation2 = 'tanh'  # targets are scaled into [-1..1]
activation = activation1
net = Neuralnet([1, 6, 1], activation)  # structure of the NN and its activation function
method1 = 'online'
method2 = 'offline'
method = method1
kx = 0.1  # noise parameter


def scale_target(y):
    """Map a (noisy) sine value into the output range of the activation."""
    if activation == 'sigmoid':
        return (y / (1 + kx) + 1) / 2.0
    if activation == 'tanh':
        return y / (1 + kx)
    raise ValueError("unknown activation: %r" % (activation,))


def unscale_output(h):
    """Inverse of ``scale_target``: map network output back to sine scale."""
    if activation == 'sigmoid':
        return (2.0 * h - 1.0) * (1 + kx)
    if activation == 'tanh':
        return h * (1 + kx)
    raise ValueError("unknown activation: %r" % (activation,))


def noisy_sine(x):
    """Return sin(x) plus uniform noise in [-kx, kx] for a column vector x."""
    count = x.shape[0]
    noise = (2.0 * np.random.uniform(0, kx, count) - kx).reshape(count, 1)
    return np.sin(x) + noise


def monitored_cost(sample_count):
    """Halved mean squared error on fresh samples, measured on the scaled
    target the network is trained on (comparing the scaled output with the
    raw sine would give a cost that cannot approach zero).
    """
    x_val = np.random.uniform(0, 4 * math.pi, sample_count).reshape(sample_count, 1)
    net.feedforward(x_val)
    h_val = net.outputs[net.L - 1]
    return np.sum(np.power(scale_target(np.sin(x_val)) - h_val, 2)) / (2.0 * sample_count)


# TRAINING
# NOTE: keep the module-level name ``m``; depending on the Neuralnet version
# in use, its cost/backpropagate methods may read it as the batch size.
if method == 'offline':
    m = 1000  # size of the training set
    X = np.linspace(0, 4 * math.pi, num=m).reshape(m, 1)  # input training set
    Y = noisy_sine(X)  # noisy target
    Y_scaled = scale_target(Y)
    iter_count = 20000  # number of iterations for the training stage
    net.train(X, Y_scaled, iter_count)
elif method == 'online':
    sampling_count = 100000  # number of samplings during the training stage
    m = 1  # batch size
    # Integer division: range() needs an int under Python 3 as well.
    iter_count = sampling_count // m
    # Give the per-layer lists one slot per layer before the first
    # feedforward call.
    for layer in range(net.L):
        net.inputs.append(np.empty([m, net.neurons[layer]]))
        net.errors.append(np.empty([m, net.neurons[layer]]))
        if layer < net.L - 1:
            net.outputs.append(np.empty([m, net.neurons[layer] + 1]))
        else:
            net.outputs.append(np.empty([m, net.neurons[layer]]))
    J_history = []
    step_history = []
    for i in range(iter_count):
        X = np.random.uniform(0, 4 * math.pi, m).reshape(m, 1)
        Y = noisy_sine(X)  # noisy target
        Y_scaled = scale_target(Y)
        net.feedforward(X)
        net.backpropagate(Y_scaled)
        if i % 1000 == 0:
            J_history.append(monitored_cost(1000))
            step_history.append(i)
    plt.plot(step_history, J_history)
    plt.title('Batch size ' + str(m) + ', rate ' + str(net.rate) + ', samples ' + str(sampling_count))
    plt.show()
# gained weights
trained_weights = net.weights

# PREDICTION
m_new = 40  # size of the prediction set
X_new = np.linspace(0, 4 * math.pi, num=m_new).reshape(m_new, 1)
Y_new = unscale_output(net.predict(X_new, trained_weights))

# visualization
if method == 'online':
    # clean sine curve to show the ideal signal
    X = np.linspace(0, 4 * math.pi, num=100)
    Y = np.sin(X)
plt.plot(X, Y)
plt.plot(X_new, Y_new, 'ro')
if method == 'online':
    plt.title('Batch size ' + str(m) + ', rate ' + str(net.rate) + ', samples ' + str(sampling_count))
plt.ylim([-1.5, 1.5])
plt.show()
# raw_input only exists on Python 2; fall back to input on Python 3.
try:
    wait_for_key = raw_input
except NameError:
    wait_for_key = input
wait_for_key('press any key to exit')
Now I have some remarks on your current code:
Your sine function looks like this:
def sine_example():
    """Train on tanh(sin(x)) for normally distributed x, then print
    ``net.outputs[-1]`` after feeding the value 3.
    """
    net = Neuralnet([1, 6, 1])
    total_steps = 100000
    for _ in range(total_steps):
        x = np.random.normal()
        target = np.tanh(np.sin(x))
        net.feedforward([x])
        net.backpropagate([target])
    query = [3]
    net.feedforward(query)
    print(net.outputs[-1])
I don't know why you use tanh in your target input. If you really want to use tanh of sine as target, you need to scale it to [-1..1], because tanh(sin(x)) returns values in range [-0.76..0.76].
The next thing is the range of your training set. You use x = np.random.normal() to generate the samples. Here is the distribution of such an input:
After that, you want your network to predict the sine of 3, but the network has almost never seen this number during the training stage. I would instead draw the samples from a uniform distribution over a wider range.

Categories