I am very new to neural networks, I'm learning the basics, and I'm stuck at this point.
I have played with many parameters, including the initial values of the dynamic system. I dropped the learning rate, changed the number of epochs and the batch size, and changed the number of samples I give to the fit function. I also added a hidden layer and removed it again. After all that, nothing really helped: sometimes the predicted value is pretty close, and sometimes it is really far from the tested value. Am I missing something? How can I improve neural networks in general to get the predictions I want? Do I have to find a "sweet spot"?
Below you can find my code. I would be glad for new ideas to improve the network. I am a beginner and this is my first Stack Overflow post.
######################### import stuff ##########################
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import time
import matplotlib.pyplot as p
from keras import backend as K
###Model
k_dim =1000 # Number of timestamps
seq_dim = 1000 # Number of samples
def g_func(x):
    return np.power(x, 2)
input1 = keras.Input(shape=(k_dim,))
input2 = keras.Input(shape=(k_dim,))
merged = layers.concatenate([input1,input2])
alpha_pred = layers.Dense(32,input_dim=2, activation='relu')(merged)
x = layers.Dense(16,activation='sigmoid')(alpha_pred)
output_a = layers.Dense(1)(x)
model = keras.Model(inputs=[input1,input2], outputs=output_a)
model.compile(loss='mean_absolute_error', optimizer='sgd', metrics=['mean_squared_error'])
model.summary()
######################## Simulating Input and Output data ########################
n = 2
alpha = np.random.rand(1,seq_dim)
m = 1
X_train = np.random.rand(seq_dim,k_dim)
X_train[:,0] = 0
u = [float] * seq_dim
y_train = np.zeros((seq_dim,k_dim))
for j in range(seq_dim):
    u = X_train[j, :]
    for k in range(k_dim-1):
        for i in range(n-1):
            y_train[j, k+1] = alpha[0, i] * y_train[j, k-i] + g_func(u[k])
alpha = np.transpose(alpha)
print('Learning rate before first fit:', model.optimizer.learning_rate.numpy())
history = model.fit([X_train,y_train], alpha, batch_size=64, epochs=3000)
print("Learning rate before second fit:", model.optimizer.learning_rate.numpy())
K.set_value(model.optimizer.learning_rate, 0.001)
history = model.fit([X_train,y_train], alpha, batch_size=64, epochs=1000)
# Plot the loss function and mean squared error
p.plot(history.history['loss'], 'b-', label='LOSS')
p.plot(history.history['mean_squared_error'], 'r-', label='Mean squared error')
p.legend(loc='best')
p.draw()
print('Model trained...')
time.sleep(2)
alpha = None
X_train = None
y_train = None
u = None
seq_dim = 1
#####
###Model has been trained. Let's test with new x and y to get one alpha###
####
X_train = np.random.rand(seq_dim,k_dim)
u = [float] * seq_dim
y_train = np.zeros((seq_dim,k_dim))
alpha = np.array([0.9])
for j in range(seq_dim):
    u = X_train[j, :]
    for k in range(k_dim-1):
        for i in range(n-1):
            y_train[j, k+1] = alpha[i] * y_train[j, k-i] + g_func(u[k])
z = model.predict([X_train, y_train])
#Compare the real value with the predicted value
print('Comparing real value with predicted value')
for i, j in zip(alpha, z):
    print('{} => {}'.format(i, j))
p.show()
The problem might be the use of sigmoid. Its derivative lies in the range 0-0.25, which frequently causes vanishing gradients (and occasionally exploding gradients) during backpropagation.
You also have to know how your data is spread: plot it, and remove outliers in regression problems.
If the input data is too spread out, the model will not predict close to the correct value every time.
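As a minimal sketch (assuming the same two-input model from the question), you could replace the sigmoid hidden layer with ReLU and keep everything else unchanged:

from tensorflow import keras
from tensorflow.keras import layers

k_dim = 1000
input1 = keras.Input(shape=(k_dim,))
input2 = keras.Input(shape=(k_dim,))
merged = layers.concatenate([input1, input2])
x = layers.Dense(32, activation='relu')(merged)
x = layers.Dense(16, activation='relu')(x)   # was 'sigmoid'
output_a = layers.Dense(1)(x)                # linear output for regression
model = keras.Model(inputs=[input1, input2], outputs=output_a)
model.compile(loss='mean_absolute_error', optimizer='sgd', metrics=['mean_squared_error'])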
I fit an MLP neural network model on the data. To optimize 4 variables, a function based on the MLP model is defined, and simulated annealing is run on this function. I don't know why I get this error (attached below).
Neural network code:
# mlp for regression
from numpy import sqrt
from pandas import read_csv
from sklearn.model_selection import train_test_split
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
import tensorflow
from tensorflow import keras
from matplotlib import pyplot
from keras.layers import Dropout
from tensorflow.keras import regularizers
# determine the number of input features
n_features = X_train.shape[1]
# define model
model = Sequential()
model.add(Dense(150, activation='tanh', kernel_initializer='zero',kernel_regularizer=regularizers.l2(0.001), input_shape=(n_features,))) #relu/softmax/tanh
model.add(Dense(100, activation='tanh', kernel_initializer='zero',kernel_regularizer=regularizers.l2(0.001)))
model.add(Dense(50, activation='tanh', kernel_initializer='zero',kernel_regularizer=regularizers.l2(0.001)))
model.add(Dropout(0.0))
model.add(Dense(1))
# compile the model
opt= keras.optimizers.Adam(learning_rate=0.001)
#opt = tensorflow.keras.optimizers.RMSprop(learning_rate=0.001,rho=0.9,momentum=0.0,epsilon=1e-07,centered=False,name="RMSprop")
model.compile(optimizer=opt, loss='mse')
# fit the model
history=model.fit(X_train, y_train, validation_data = (X_test,y_test), epochs=100, batch_size=10, verbose=0,validation_split=0.3)
# evaluate the model
error = model.evaluate(X_test, y_test, verbose=0)
print('MSE: %.3f, RMSE: %.3f' % (error, sqrt(error)))
# plot learning curves
pyplot.title('Learning Curves')
pyplot.xlabel('Epoch')
pyplot.ylabel('Cross Entropy')
pyplot.plot(history.history['loss'], label='train')
pyplot.plot(history.history['val_loss'], label='val')
pyplot.legend()
pyplot.show()
function code:
import pandas as pd

def objective_function(X):
    wob = X[0]
    torque = X[1]
    RPM = X[2]
    pump = X[3]
    input = [wob, torque, RPM, 0.00017, 0.027, pump, 0, 0.5, 0.386, 0.026, 0.0119, 0.33, 0.83, 0.48]
    input = pd.DataFrame(input)
    obj = model.predict(input)
    return obj
simulated annealing for optimization:
import time
import random
import math
import numpy as np
## custom section
initial_temperature = 100
cooling = 0.8 # cooling coef.
number_variables = 4
upper_bounds = [1,1,1,1]
lower_bounds = [0,0,0,0]
computing_time = 1 # seconds
## Simulated annealing algorithm
## 1. Generate an initial solution randomly
initial_solution = np.zeros((number_variables))
for v in range(number_variables):
    initial_solution[v] = random.uniform(lower_bounds[v], upper_bounds[v])
current_solution = initial_solution
best_solution = initial_solution
n = 1  # number of solutions accepted
best_fitness = objective_function(best_solution)
current_temperature = initial_temperature  # current temperature
start = time.time()
no_attempts = 100  # number of attempts at each temperature level
record_best_fitness = []
for i in range(9999999):
    for j in range(no_attempts):
        for k in range(number_variables):
            ## 2. generate a candidate solution y randomly based on solution x
            current_solution[k] = best_solution[k] + 0.1*(random.uniform(lower_bounds[k], upper_bounds[k]))
            current_solution[k] = max(min(current_solution[k], upper_bounds[k]), lower_bounds[k])  # repair the solution to respect the bounds
        ## 3. check if y is better than x
        current_fitness = objective_function(current_solution)
        E = abs(current_fitness - best_solution)
        if i == 0 and j == 0:
            EA = E
        if current_fitness < best_fitness:
            p = math.exp(-E/(EA*current_temperature))
            # make a decision to accept the worse solution or not
            ## 4. make a decision whether r < p
            if random.random() < p:
                accept = True   # this worse solution is accepted
            else:
                accept = False  # this worse solution is not accepted
        else:
            accept = True  # accept the better solution
        ## 5. update the current best if the solution is accepted
        if accept == True:
            best_solution = current_solution  # update the best solution
            best_fitness = objective_function(best_solution)
            n = n + 1  # count the solutions accepted
            EA = (EA*(n-1)+E)/n  # update EA
    print('iteration : {}, best_solution:{}, best_fitness:{}'.format(i, best_solution, best_fitness))
    record_best_fitness.append(best_fitness)
    ## 6. decrease the temperature
    current_temperature = current_temperature * cooling
    ## 7. stop condition of the outer loop
    end = time.time()
    if end - start >= computing_time:
        break
The error picture:
It's your input shape: the MLP expects inputs of shape [None, 14], but your function builds an input of shape [14, 1], so you need to transpose it.
def objective_function(X):
    wob = X[0]
    torque = X[1]
    RPM = X[2]
    pump = X[3]
    input = [wob, torque, RPM, 0.00017, 0.027, pump, 0, 0.5, 0.386, 0.026, 0.0119, 0.33, 0.83, 0.48]
    input = pd.DataFrame(input)
    input = input.T                 # transpose: shape (1, 14), one sample with 14 features
    obj = model.predict(input)
    return obj
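For illustration, here is a quick check of the shapes involved (the values below are placeholders, just to show why the transpose matters):

import pandas as pd

row = [0.1] * 14      # hypothetical 14 feature values
df = pd.DataFrame(row)
print(df.shape)       # (14, 1) -- what the original objective_function passed to predict
print(df.T.shape)     # (1, 14) -- one sample with 14 features, matching the model's input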
I am trying to use Keras dense neural networks to forecast some time series.
When fitting my model on complex real datasets, the model converges toward a constant output, i.e. whatever the input, the model gives the same output (which seems to be a reasonable estimate of the mean of my dataset).
I reduced the problem down to a very simple simulated dataset and still have the same issue. Here is a minimal working example:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
import matplotlib.pyplot as plt
X = []
Y = []
for jh in range(10000):
    x = np.arange(-1, 1, 0.01)
    y = 1 + x*((np.random.random()-0.5))
    y += np.random.randn(len(x))/(100)
    X.append(y[:100])
    Y.append(y[100:])
X = np.array(X)[:,:,None]
Y = np.array(Y)[:,:,None]
model = models.Sequential()
model.add(layers.Input((100,1,)))
model.add(layers.Flatten())
model.add(layers.Dense(100, activation='sigmoid'))
model.add(layers.Dense(100, activation='sigmoid'))
model.add(layers.Dense(100, activation='sigmoid'))
model.add(tf.keras.layers.Reshape((100,1)))
model.compile(loss = tf.keras.losses.MeanSquaredError(),optimizer="adam")
# model.summary()
print("Fit model on training data")
print("Fit model on training data")
history = model.fit(x=X, y=Y, batch_size=10000, epochs=200)
for k in np.arange(0, 10000, 1000):
    plt.plot(np.arange(len(X[k])), X[k])
    plt.plot(np.arange(len(X[k]), len(X[k])+len(Y[k])), model(X)[k])
    plt.plot(np.arange(len(X[k]), len(X[k])+len(Y[k])), Y[k])
In this example, the model returns exactly the same output regardless of the input.
I tried changing the number of layers, the loss function, the learning rate, the batch size and the number of epochs, without any noticeable improvement.
Do you have any suggestions on this issue?
If you rearrange your random inputs to be like
y = np.array(1. + x)
y += 1. / 100.
also
J, K = [], []
for jh in range(10000):
    j = np.arange(-1, 1, 0.01)
    k = -np.array(1. - j)
    k += 1. / 100
    J.append(k[:100])
    K.append(k[100:])
J = np.array(J)[:, :, None]
K = np.array(K)[:, :, None]
and finally add
plt.plot(np.arange(len(X[k]), len(X[k]) + len(Y[k])), model(J)[k])
in the plotting loop, then you will see two different results. You should probably check your dataset's diversity.
I'm trying to learn some PyTorch and am referencing this discussion here
The author provides a minimum working piece of code that illustrates how you can use PyTorch to solve for an unknown linear function that has been polluted with random noise.
This code runs fine for me.
However, when I change the function such that I want t = X^2, the parameters do not seem to converge.
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
# Let's make some data for a linear regression.
A = 3.1415926
b = 2.7189351
error = 0.1
N = 100 # number of data points
# Data
X = Variable(torch.randn(N, 1))
# (noisy) Target values that we want to learn.
t = X * X + Variable(torch.randn(N, 1) * error)
# Creating a model, making the optimizer, defining loss
model = nn.Linear(1, 1)
optimizer = optim.SGD(model.parameters(), lr=0.05)
loss_fn = nn.MSELoss()
# Run training
niter = 50
for _ in range(0, niter):
    optimizer.zero_grad()
    predictions = model(X)
    loss = loss_fn(predictions, t)
    loss.backward()
    optimizer.step()
    print("-" * 50)
    print("error = {}".format(loss.data[0]))
    print("learned A = {}".format(list(model.parameters())[0].data[0, 0]))
    print("learned b = {}".format(list(model.parameters())[1].data[0]))
When I execute this code, the new A and b parameters are seemingly random, so the model does not converge. I think it should converge, because you can approximate any function with a slope and an offset. My theory is that I'm using PyTorch incorrectly.
Can anyone identify a problem with my t = X * X + Variable(torch.randn(N, 1) * error) line of code?
You cannot fit a 2nd-degree polynomial with a linear function, so you cannot expect better than random results (since you have random samples from the polynomial).
What you can do is use two inputs, x and x^2, and fit from them:
model = nn.Linear(2, 1) # you have 2 inputs now
X_input = torch.cat((X, X**2), dim=1) # have 2 inputs per entry
# ...
predictions = model(X_input) # 2 inputs -> 1 output
loss = loss_fn(predictions, t)
# ...
# learning t = c*x^2 + a*x + b
print("learned a = {}".format(list(model.parameters())[0].data[0, 0]))
print("learned c = {}".format(list(model.parameters())[0].data[0, 1]))
print("learned b = {}".format(list(model.parameters())[1].data[0]))
I've been working on this neural network with the intent to predict the TBA (time-based availability) of simulated windmill parks based on certain attributes. The neural network runs just fine and gives me some predictions; however, I'm not quite satisfied with the results. It fails to notice some very obvious correlations that I can clearly see myself. Here is my current code:
# Import
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
maxi = 0.96
mini = 0.7
# Make data a np.array
data = pd.read_csv('datafile_ML_no_avg.csv')
data = data.values
# Shuffle the data
shuffle_indices = np.random.permutation(np.arange(len(data)))
data = data[shuffle_indices]
# Training and test data
data_train = data[0:int(len(data)*0.8),:]
data_test = data[int(len(data)*0.8):int(len(data)),:]
# Scale data
scaler = MinMaxScaler(feature_range=(mini, maxi))
scaler.fit(data_train)
data_train = scaler.transform(data_train)
data_test = scaler.transform(data_test)
# Build X and y
X_train = data_train[:, 0:5]
y_train = data_train[:, 6:7]
X_test = data_test[:, 0:5]
y_test = data_test[:, 6:7]
# Number of stocks in training data
n_args = X_train.shape[1]
multi = int(8)
# Neurons
n_neurons_1 = 8*multi
n_neurons_2 = 4*multi
n_neurons_3 = 2*multi
n_neurons_4 = 1*multi
# Session
net = tf.InteractiveSession()
# Placeholder
X = tf.placeholder(dtype=tf.float32, shape=[None, n_args])
Y = tf.placeholder(dtype=tf.float32, shape=[None,1])
# Initialize1s
sigma = 1
weight_initializer = tf.variance_scaling_initializer(mode="fan_avg", distribution="uniform", scale=sigma)
bias_initializer = tf.zeros_initializer()
# Hidden weights
W_hidden_1 = tf.Variable(weight_initializer([n_args, n_neurons_1]))
bias_hidden_1 = tf.Variable(bias_initializer([n_neurons_1]))
W_hidden_2 = tf.Variable(weight_initializer([n_neurons_1, n_neurons_2]))
bias_hidden_2 = tf.Variable(bias_initializer([n_neurons_2]))
W_hidden_3 = tf.Variable(weight_initializer([n_neurons_2, n_neurons_3]))
bias_hidden_3 = tf.Variable(bias_initializer([n_neurons_3]))
W_hidden_4 = tf.Variable(weight_initializer([n_neurons_3, n_neurons_4]))
bias_hidden_4 = tf.Variable(bias_initializer([n_neurons_4]))
# Output weights
W_out = tf.Variable(weight_initializer([n_neurons_4, 1]))
bias_out = tf.Variable(bias_initializer([1]))
# Hidden layer
hidden_1 = tf.nn.relu(tf.add(tf.matmul(X, W_hidden_1), bias_hidden_1))
hidden_2 = tf.nn.relu(tf.add(tf.matmul(hidden_1, W_hidden_2), bias_hidden_2))
hidden_3 = tf.nn.relu(tf.add(tf.matmul(hidden_2, W_hidden_3), bias_hidden_3))
hidden_4 = tf.nn.relu(tf.add(tf.matmul(hidden_3, W_hidden_4), bias_hidden_4))
# Output layer (transpose!)
out = tf.transpose(tf.add(tf.matmul(hidden_4, W_out), bias_out))
# Cost function
mse = tf.reduce_mean(tf.squared_difference(out, Y))
# Optimizer
opt = tf.train.AdamOptimizer().minimize(mse)
# Init
net.run(tf.global_variables_initializer())
# Fit neural net
batch_size = 10
mse_train = []
mse_test = []
# Run
epochs = 10
for e in range(epochs):
    # Shuffle training data
    shuffle_indices = np.random.permutation(np.arange(len(y_train)))
    X_train = X_train[shuffle_indices]
    y_train = y_train[shuffle_indices]
    # Minibatch training
    for i in range(0, len(y_train) // batch_size):
        start = i * batch_size
        batch_x = X_train[start:start + batch_size]
        batch_y = y_train[start:start + batch_size]
        # Run optimizer with batch
        net.run(opt, feed_dict={X: batch_x, Y: batch_y})
        # Show progress
        if np.mod(i, 50) == 0:
            mse_train.append(net.run(mse, feed_dict={X: X_train, Y: y_train}))
            mse_test.append(net.run(mse, feed_dict={X: X_test, Y: y_test}))

pred = net.run(out, feed_dict={X: X_test})
print(pred)
I have tried tweaking the number of hidden layers, the number of nodes per layer, the number of epochs, and different activation functions and optimizers. However, I am quite new to neural networks, so there might be something very obvious that I'm missing.
Thanks in advance to anyone who managed to read through all of that.
It would be much easier if you shared a small dataset that illustrates the problem. However, I will describe some of the issues with non-standard datasets and how to overcome them.
Possible solutions
Regularization and validation-based optimization - methods that are always good to try when looking for some extra accuracy. See dropout methods here (original paper), and some overview here. (A minimal dropout sketch is shown after this list.)
Unbalanced data - Sometimes the time-series categories/events behave like anomalies, or are simply distributed in an unbalanced way. If you read a book, words like the or it will appear far more often than warehouse. This can become a problem if your main task is to detect the word warehouse and you train your network (even LSTMs) in the traditional way. A way to overcome this is to balance the samples (create balanced datasets) or to give more weight to low-frequency categories.
Model structure - sometimes fully connected layers are not enough. See computer vision problems, for instance, where we train using convolution layers. The convolution and pooling layers enforce structure on the model, which is suitable for images. This is also a form of regularization, since we have fewer parameters in those layers. In time-series problems, convolutions are also possible, and it turns out they work just fine. See the example in Conditional Time Series Forecasting with Convolutional Neural Networks.
The above suggestions are presented in the order I would suggest to try.
Good luck!
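As a rough illustration of the regularization point above, here is a minimal Keras sketch of the question's 64-32-16-8 MLP with dropout added after the first two hidden layers. The layer sizes and the 5 input features are taken from the question; everything else (dropout rate, optimizer, epochs) is an assumption.

from tensorflow.keras import layers, models

# Sketch: the 64-32-16-8 MLP from the question, with dropout for regularization.
model = models.Sequential([
    layers.Input(shape=(5,)),             # 5 input features, as in X_train[:, 0:5]
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.2),                  # assumed dropout rate
    layers.Dense(32, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(16, activation='relu'),
    layers.Dense(8, activation='relu'),
    layers.Dense(1)                       # linear output for the TBA regression target
])
model.compile(optimizer='adam', loss='mse')
# model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=50, batch_size=10)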
Last semester I took an online machine learning course from Stanford taught by Professor Ng: http://www.ml-class.org/course/auth/welcome I thought it was pretty informative. To brush up on and understand neural networks better, I tried to write my own in Python. Here it is:
import numpy

class NN:
    def __init__(self, sl):
        #sl = number of units (not counting bias unit) in layer l
        self.sl = sl
        self.layers = len(sl)
        #Create weights
        self.weights = []
        for idx in range(1, self.layers):
            self.weights.append(numpy.matrix(numpy.random.rand(self.sl[idx-1]+1, self.sl[idx])/5))
        self.cost = []

    def update(self, input):
        if input.shape[1] != self.sl[0]:
            raise ValueError, 'The first layer must have a node for every feature'
        self.z = []
        self.a = []
        #Input activations. I'm expecting inputs as numpy matrix (Examples x Features)
        self.a.append(numpy.hstack((numpy.ones((input.shape[0], 1)), input)))  #Set inputs ai + bias unit
        #Hidden activations
        for weight in self.weights:
            self.z.append(self.a[-1]*weight)
            self.a.append(numpy.hstack((numpy.ones((self.z[-1].shape[0], 1)), numpy.tanh(self.z[-1]))))  #tanh is a fancy sigmoid
        #Output activation
        self.a[-1] = self.z[-1]  #Not logistic regression thus no sigmoid function
        del self.z[-1]

    def backPropagate(self, targets, lamda):
        m = float(targets.shape[0])  #m is number of examples
        #Calculate cost
        Cost = -1/m*sum(numpy.power(self.a[-1] - targets, 2))
        for weight in self.weights:
            Cost = Cost + lamda/(2*m)*numpy.power(weight[1:, :], 2).sum()
        self.cost.append(abs(float(Cost)))
        #Calculate error for each layer
        delta = []
        delta.append(self.a[-1] - targets)
        for idx in range(1, self.layers-1):  #No delta for the input layer because it is the input
            weight = self.weights[-idx][1:, :]  #Ignore bias unit
            dsigmoid = numpy.multiply(self.a[-(idx+1)][:,1:], 1-self.a[-(idx+1)][:,1:])  #dsigmoid is a(l).*(1-a(l))
            delta.append(numpy.multiply(delta[-1]*weight.T, dsigmoid))  #Ignore Regularization
        Delta = []
        for idx in range(self.layers-1):
            Delta.append(self.a[idx].T*delta[-(idx+1)])
        self.weight_gradient = []
        for idx in range(len(Delta)):
            self.weight_gradient.append(numpy.nan_to_num(1/m*Delta[idx] + numpy.vstack((numpy.zeros((1, self.weights[idx].shape[1])), lamda/m*self.weights[idx][1:, :]))))

    def train(self, input, targets, alpha, lamda, iterations = 1000):
        #alpha: learning rate
        #lamda: regularization term
        for i in range(iterations):
            self.update(input)
            self.backPropagate(targets, lamda)
            self.weights = [self.weights[idx] - alpha*self.weight_gradient[idx] for idx in range(len(self.weights))]

    def predict(self, input):
        self.update(input)
        return self.a[-1]
But it doesn't work =(. Inspecting the cost vs. iteration, I can see a blip in the cost, and the prediction for A is all the same. Can someone help me understand why my neural network is not converging?
Thanks,
Sorry about the amount of code (maybe someone will find it useful).
Update:
Instead of using random data I've got some structured data from the UCI Machine Learning Repository. The particular data set is the burned area of forest fires, in the northeast region of Portugal, using meteorological and other data: http://archive.ics.uci.edu/ml/datasets/Forest+Fires I modified the data so that days and months were numbers: https://docs.google.com/spreadsheet/ccc?key=0Am3oTptaLsExdC1PeXl1eTczRnRNejl3QUo5RjNLVVE
data = numpy.loadtxt(open('FF-data.csv', 'rb'), delimiter = ',', skiprows = 1)
features = data[:,0:11]
targets = numpy.matrix(data[:,12]).T
nfeatures = (features-features.mean(axis=0))/features.std(axis=0)
n = NN([11, 10, 1]) #The class takes the list of how many nodes in each layer
n.train(nfeatures, targets, 0.003, 0.0)
import matplotlib.pyplot
matplotlib.pyplot.subplot(221)
matplotlib.pyplot.plot(n.cost)
matplotlib.pyplot.title('Cost vs. Iteration')
matplotlib.pyplot.subplot(222)
matplotlib.pyplot.scatter(n.predict(nfeatures), targets)
matplotlib.pyplot.title('Data vs. Predicted')
matplotlib.pyplot.savefig('Report.png', format = 'png')
matplotlib.pyplot.close()
Why does the cost bottom out around 4000 and why does the Data Vs. Predicted not have any trend? You can see the graphs here: https://docs.google.com/open?id=0B23oTptaLsExMTQ0OTAxNWEtYjE2NS00MjA5LTg1MjMtNDBhYjVmMTFhZDhm
(Sorry, I don't have enough rep to add comments, so I'll just keep posting answers instead.)
Yes, it does seem strange. If, however, after training you generate a new matrix B:
B = numpy.random.rand(5, 4)/5
Targets = B*X
print n.predict(B)
print B*X
it will work fine (most of the time; sometimes it will still give the average of Targets as the answer).
Note: I switched from using 100 features to using just 4 in my example.
Also, I don't think that running 5000 iterations on 50 elements of the dataset will do you any good. You should generally try to use as much training data as you can, and here you can use as much as you want, but you use even fewer examples than you have features.
This is fun, I'll think about it some more :) I was using your network for a more simple example - as Input I provided two numbers, and expected their sum as Output. It worked more or less okay.
The neural network was unable to train on the Forest Fire data https://docs.google.com/spreadsheet/ccc?key=0Am3oTptaLsExdC1PeXl1eTczRnRNejl3QUo5RjNLVVE for a few reasons.
First, the numpy.tanh() sigmoid function is not behaving as expected. The code should be changed from:
self.a.append(numpy.hstack((numpy.ones((self.z[-1].shape[0], 1)),numpy.tanh(self.z[-1])))) #tanh is a fancy sigmoid
To:
self.a.append(numpy.hstack((numpy.ones((self.z[-1].shape[0], 1)), 1/(1+numpy.exp(-self.z[-1])))))
Second, numpy and matplotlib are not playing nicely. The numpy matrices seem to be plotted backwards. This can be fixed by using matrix.tolist(). Code changed from:
matplotlib.pyplot.scatter(n.predict(nfeatures), targets)
To:
matplotlib.pyplot.scatter(n.predict(nfeatures).tolist(), targets.tolist())
Finally, the number of hidden nodes should be approximately 10% of the example size. Instead of 10, it is better to use 50 nodes.
The working neural network code is posted below with a new function autoparam which tries to find the best learning rate and regularization constant. You can see the graphs for the Forest Fire cost vs iteration and data vs predicted here: https://docs.google.com/open?id=0B23oTptaLsExMWQ4ZWM1ODYtZDMzMC00M2VkLWI1OWUtYzg3NzgxNWYyMTIy
Thanks for reading! I hope my neural network can help people.
import numpy

class NN:
    def __init__(self, sl):
        #sl = number of units (not counting bias unit) in layer l
        self.sl = sl
        self.layers = len(sl)
        #Create weights
        self.weights = []
        for idx in range(1, self.layers):
            self.weights.append(numpy.matrix(numpy.random.rand(self.sl[idx-1]+1, self.sl[idx]))/5)
        self.cost = []

    def update(self, input):
        if input.shape[1] != self.sl[0]:
            raise ValueError, 'The first layer must have a node for every feature'
        self.z = []
        self.a = []
        #Input activations. Expected inputs as numpy matrix (Examples x Features)
        self.a.append(numpy.hstack((numpy.ones((input.shape[0], 1)), input)))  #Set inputs ai + bias unit
        #Hidden activations
        for weight in self.weights:
            self.z.append(self.a[-1]*weight)
            self.a.append(numpy.hstack((numpy.ones((self.z[-1].shape[0], 1)), 1/(1+numpy.exp(-self.z[-1])))))  #sigmoid
        #Output activation
        self.a[-1] = self.z[-1]  #Not logistic regression thus no sigmoid function
        del self.z[-1]

    def backPropagate(self, targets, lamda):
        m = float(targets.shape[0])  #m is number of examples
        #Calculate cost
        Cost = -1/m*sum(numpy.power(self.a[-1] - targets, 2))
        for weight in self.weights:
            Cost = Cost + lamda/(2*m)*numpy.power(weight[1:, :], 2).sum()
        self.cost.append(abs(float(Cost)))
        #Calculate error for each layer
        delta = []
        delta.append(self.a[-1] - targets)
        for idx in range(1, self.layers-1):  #No delta for the input layer because it is the input
            weight = self.weights[-idx][1:, :]  #Ignore bias unit
            dsigmoid = numpy.multiply(self.a[-(idx+1)][:,1:], 1-self.a[-(idx+1)][:,1:])  #dsigmoid is a(l).*(1-a(l))
            delta.append(numpy.multiply(delta[-1]*weight.T, dsigmoid))  #Ignore Regularization
        Delta = []
        for idx in range(self.layers-1):
            Delta.append(self.a[idx].T*delta[-(idx+1)])
        self.weight_gradient = []
        for idx in range(len(Delta)):
            self.weight_gradient.append(numpy.nan_to_num(1/m*Delta[idx] + numpy.vstack((numpy.zeros((1, self.weights[idx].shape[1])), lamda/m*self.weights[idx][1:, :]))))

    def train(self, input, targets, alpha, lamda, iterations = 1000):
        #alpha: learning rate
        #lamda: regularization term
        for i in range(iterations):
            self.update(input)
            self.backPropagate(targets, lamda)
            self.weights = [self.weights[idx] - alpha*self.weight_gradient[idx] for idx in range(len(self.weights))]

    def autoparam(self, data, alpha = [0.001, 0.003, 0.01, 0.03, 0.1, 0.3], lamda = [0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1, 3, 10]):
        #data: numpy matrix with targets in last column
        #alpha: learning rate
        #lamda: regularization term
        #Create training, cross validation, and test sets
        while 1:
            try:
                numpy.seterr(invalid = 'raise')
                numpy.random.shuffle(data)  #Shuffle data
                training_set = data[0:data.shape[0]/10*6, 0:-1]
                self.ntraining_set = (training_set-training_set.mean(axis=0))/training_set.std(axis=0)
                self.training_tgt = numpy.matrix(data[0:data.shape[0]/10*6, -1]).T
                cv_set = data[data.shape[0]/10*6:data.shape[0]/10*8, 0:-1]
                self.ncv_set = (cv_set-cv_set.mean(axis=0))/cv_set.std(axis=0)
                self.cv_tgt = numpy.matrix(data[data.shape[0]/10*6:data.shape[0]/10*8, -1]).T
                test_set = data[data.shape[0]/10*8:, 0:-1]
                self.ntest_set = (test_set-test_set.mean(axis=0))/test_set.std(axis=0)
                self.test_tgt = numpy.matrix(data[data.shape[0]/10*8:, -1]).T
                break
            except FloatingPointError:
                pass
        numpy.seterr(invalid = 'warn')
        cost = 999999
        for i in alpha:
            for j in lamda:
                self.__init__(self.sl)
                self.train(self.ntraining_set, self.training_tgt, i, j, 2000)
                current_cost = 1/float(cv_set.shape[0])*sum(numpy.square(self.predict(self.ncv_set) - self.cv_tgt)).tolist()[0][0]
                print current_cost
                if current_cost < cost:
                    cost = current_cost
                    self.learning_rate = i
                    self.regularization = j
        self.__init__(self.sl)

    def predict(self, input):
        self.update(input)
        return self.a[-1]
Loading data, Plotting, etc...
data = numpy.loadtxt(open('FF-data.csv', 'rb'), delimiter = ',', skiprows = 1)#Load
numpy.random.shuffle(data)
features = data[:,0:11]
nfeatures = (features-features.mean(axis=0))/features.std(axis=0)
targets = numpy.matrix(data[:, 12]).T
n = NN([11, 50, 1])
n.train(nfeatures, targets, 0.07, 0.0, 2000)
import matplotlib.pyplot
matplotlib.pyplot.subplot(221)
matplotlib.pyplot.plot(n.cost)
matplotlib.pyplot.title('Cost vs. Iteration')
matplotlib.pyplot.subplot(222)
matplotlib.pyplot.scatter(n.predict(nfeatures).tolist(), targets.tolist())
matplotlib.pyplot.plot(targets.tolist(), targets.tolist(), c = 'r')
matplotlib.pyplot.title('Data vs. Predicted')
matplotlib.pyplot.savefig('Report.png', format = 'png')
matplotlib.pyplot.close()
I think that your bias should be subtracted somewhere from the weighted inputs (or set to -1). From what I see in your code, the neurons add all the inputs, including the bias (which is set to +1).