When training my binary neural network I'm observing something curious. Despite the test and training data and labels being balanced and symmetric, the network's predictions are not.
After 100 epochs this is what I get:
1 prediction: 0.89635 0 prediction: 0.4742
I was expecting an even 0.5, 0.5 split.
Why does the network skew towards one side?
My network is trying to predict the winner of a basketball game given an input vector of the scores of all 10 players. The output is a sigmoid indicating whether team 1 is winning. The network should be symmetric, i.e., if the input [team1_scores, team2_scores] yields 1, then the swapped input [team2_scores, team1_scores] should yield 0. To ensure this, I flip the training data and labels so that the winning and the losing team each appear in both positions of the input vector.
Here is my code:
from tflearn.layers.core import fully_connected, input_data
from tflearn.layers.estimator import regression
import tflearn
import numpy as np
#flip data so that [team1_scores, team2_scores] becomes [team2_scores, team1_scores]
def flip(x, team_size=5):
    """Swap the two teams' score columns in every row of ``x``.

    The first ``team_size`` columns are moved to the end of each row, so
    ``[team1_scores, team2_scores]`` becomes ``[team2_scores, team1_scores]``.

    Args:
        x: 2-D array-like of shape (samples, columns).
        team_size: Number of leading columns that belong to the first team.
            Defaults to 5, the value that used to be hard-coded.

    Returns:
        A new array with the same shape as ``x`` and the column groups swapped.
    """
    x = np.asarray(x)
    return np.concatenate([x[:, team_size:], x[:, :team_size]], axis=1)
#this function interweaves 2 vectors so that [0,0,0] and [1,1,1] becomes [0,1,0,1,0,1]
def interweave(a, b):
    """Interleave two equally shaped arrays along their first axis.

    Element ``i`` of ``a`` lands at index ``2 * i`` of the result and element
    ``i`` of ``b`` at index ``2 * i + 1``. Works for 1-D vectors
    (``[0, 0, 0]`` and ``[1, 1, 1]`` give ``[0, 1, 0, 1, 0, 1]``) as well as
    for arrays with trailing dimensions.

    Args:
        a: Array-like providing the even-indexed entries.
        b: Array-like providing the odd-indexed entries; same shape as ``a``.

    Returns:
        A new array with twice as many entries along axis 0.

    Raises:
        ValueError: If the inputs are scalars or their shapes differ.
    """
    a = np.asarray(a)
    b = np.asarray(b)
    if a.ndim == 0 or a.shape != b.shape:
        raise ValueError(
            f"interweave needs two arrays of identical shape, got {a.shape} and {b.shape}"
        )
    # Use a common dtype so that values of ``b`` are not silently cast to
    # ``a.dtype`` (e.g. floats truncated to ints).
    c = np.empty((2 * a.shape[0],) + a.shape[1:], dtype=np.result_type(a.dtype, b.dtype))
    c[0::2] = a
    c[1::2] = b
    return c
# Network definition: 10 inputs -> 32 relu units -> 16 relu units -> 1 sigmoid unit,
# trained with binary cross-entropy.
net = input_data(shape=[None, 10])
net = fully_connected(net, 32, activation='relu')
net = fully_connected(net, 16, activation='relu')
net = fully_connected(net, 1, activation='sigmoid')
net = regression(net, shuffle_batches=True, loss='binary_crossentropy')
model = tflearn.DNN(net)
# Load the scores and build the mirrored copy with the two halves of each row swapped.
x = np.load("scores.npy")
x_flipped = flip(x)
#x is sorted such that the winning team always comes first in the input vector, so the labels are all 1
y = np.ones((x.shape[0], 1))
# The mirrored rows put the winning team second, so their labels are all 0.
y_flipped = np.zeros((x.shape[0], 1))
# Interleave originals and mirrored rows: every original row is followed by its flipped copy.
x_symmetric = interweave(x, x_flipped)
y_symmetric = interweave(y, y_flipped)
# Train one epoch per iteration for 100 iterations.
# NOTE(review): indentation was lost in this excerpt, so it is not clear whether the
# evaluate/print lines below ran inside the loop or once after it - confirm.
for epoch in range(100):
model.fit(x_symmetric, y_symmetric, n_epoch=1, shuffle=True, validation_set=None, show_metric=True, batch_size=128)
# Score the original (all-ones) and mirrored (all-zeros) halves separately;
# [0] takes the first value returned by evaluate - presumably accuracy, confirm.
acc_reg = model.evaluate(x, y)[0]
acc_flip = model.evaluate(x_flipped, y_flipped)[0]
print(f"1 prediction: {acc_reg} 0 prediction: {acc_flip}")
And here is the training data: scores.npy
The training data is standardized and sorted so that the winning team comes before the losing team. Thus all labels are 1
I tried to implement my own Neural Network for predicting the classes for MNIST Dataset. As I was just getting started, I used sigmoid as the activation function instead of softmax to check if the general structure I implemented was right. My output layer consists of 10 nodes which I intended to produce an output such as this:
[0.01213, 0.23432, 0.1412 .... 0.8123, 0.02323]
where the argmax will give me the prediction of which number the image represents.
Instead, the model predicted very similar values across different inputs
>> pred= model.feedforward(testX.T, model.sigmoid)
>> pred.argmax(axis=0)
Output: array([5, 5, 5, ..., 5, 5, 5])
>> print(testY[0])
>> print(pred[:, 0])
Output: [0 0 0 0 0 0 0 1 0 0]
[0.37764784 0.28244381 0.2299524 0.54792815 0.73187166 0.86594748
0.09725093 0.45846115 0.65113219 0.22196521]
>> print(testY[1])
>> print(pred[:, 1])
Output: [0 1 0 0 0 0 0 0 0 0]
[0.37654529 0.28523675 0.23014656 0.54476446 0.72874274 0.86488462
0.09627901 0.45961867 0.65288013 0.22308308]
I used Andrew NG's course as a reference while making the model. Below is the code I implemented:
class Sequential():
    """Minimal fully connected network trained with full-batch gradient descent.

    Data is laid out column-wise: inputs have shape (features, samples) and
    targets have shape (outputs, samples). Layers are created with ``Dense``
    and registered with ``add``::

        model = Sequential()
        model.add(model.Dense(16, input_features=64))
        model.add(model.Dense(10))
        model.compile(lr=0.1)
        model.train(X, Y, epochs=500)

    Attributes are plain lists/values: ``nodes`` holds the layer widths
    (input width first), ``weights[i]`` has shape (nodes[i], nodes[i + 1])
    and ``biases[i]`` has shape (nodes[i + 1], 1).
    """

    def __init__(self):
        self.nodes = []
        self.weights = []
        self.biases = []
        self.total_layers = -1  # -1 means "no input layer defined yet"
        self.lr = None
        self.loss = None

    def Dense(self, n, input_features=None):
        """Create the parameters of a layer with ``n`` units.

        Args:
            n: Number of units in the new layer.
            input_features: Width of the layer input. Required for the first
                layer; for later layers it is optional and only checked
                against the previous layer's width.

        Returns:
            Tuple ``(w, b)`` with ``w`` of shape (inputs, n) and ``b`` of
            shape (n, 1). Pass it to ``add``.

        Raises:
            ValueError: If the first layer has no ``input_features`` or a
                later layer's ``input_features`` contradicts the previous
                layer's width.
        """
        if self.total_layers == -1:
            if input_features is None:
                raise ValueError("Input shape of Sequence not defined")
            r = input_features
            self.nodes.append(input_features)
            self.total_layers = 0  # input layer done
        else:
            r = self.nodes[-1]  # output width of the previous layer
            if input_features is not None and input_features != r:
                raise ValueError("Previous layer output and given input shape doesnt match")
        self.nodes.append(n)
        # Scaled normal initialisation keeps sigmoid pre-activations small.
        w = np.random.randn(r, n) * np.sqrt(1 / r)
        b = np.random.randn(n, 1)
        return (w, b)

    def add(self, layer):
        """Register the ``(w, b)`` pair returned by ``Dense`` as the next layer."""
        w, b = layer
        self.total_layers += 1
        self.weights.append(w)
        self.biases.append(b)

    def sigmoid(self, x):
        """Logistic function applied element-wise."""
        return 1.0 / (1.0 + np.exp(-x))

    def sigmoid_bp(self, x):
        """Sigmoid derivative expressed through the sigmoid *output* ``x``."""
        return x * (1 - x)

    def feedforward(self, A, activation_fn):
        """Propagate ``A`` (features x samples) through every layer.

        Raises:
            ValueError: If no layer exists or ``A`` has the wrong row count.
        """
        if not self.nodes or A.shape[0] != self.nodes[0]:
            raise ValueError("Given X doesnt match input shape")
        for w, b in zip(self.weights, self.biases):
            A = activation_fn(np.dot(w.T, A) + b)
        return A

    def compile(self, loss="crossentropy", lr=0.001):
        """Store the training configuration; must be called before ``train``.

        NOTE: ``loss`` is only recorded. ``SGD`` always uses the
        squared-error/sigmoid output delta, whatever name is given here.
        """
        self.loss = loss
        self.lr = lr

    def train(self, X, y, epochs, activation_fn=None):
        """Run ``epochs`` full-batch gradient steps.

        Args:
            X: Inputs, shape (features, samples).
            y: Targets, shape (outputs, samples).
            epochs: Number of gradient steps.
            activation_fn: Element-wise activation; defaults to ``self.sigmoid``.

        Raises:
            RuntimeError: If ``compile`` has not set a learning rate.
        """
        if not self.lr:
            raise RuntimeError("Compile the Model before running")
        if activation_fn is None:
            activation_fn = self.sigmoid
        for _ in range(epochs):
            self.SGD(X, y, activation_fn)

    def SGD(self, X, Y, activation_fn):
        """Perform one gradient step on the whole batch.

        All gradients are computed with the weights as they were at the start
        of the step and applied afterwards, so back-propagation never runs
        through a layer that has already been updated.
        """
        l = X.shape[1]  # number of samples, used to average the update
        # A[0] is the input, A[k + 1] the output of layer k.
        A = [X]
        for w, b in zip(self.weights, self.biases):
            A.append(activation_fn(np.dot(w.T, A[-1]) + b))

        # Output-layer delta for squared error with a sigmoid output.
        dZ = (A[-1] - Y) * self.sigmoid_bp(A[-1])
        gradients = []
        for k in range(len(self.weights) - 1, -1, -1):
            dW = np.dot(A[k], dZ.T)
            db = np.sum(dZ, axis=1).reshape(-1, 1)
            gradients.append((k, dW, db))
            if k > 0:
                # Delta of the layer below, still using the old weights.
                dA = np.dot(self.weights[k], dZ)
                dZ = dA * self.sigmoid_bp(A[k])

        step = self.lr / l
        for k, dW, db in gradients:
            self.weights[k] -= step * dW
            self.biases[k] -= step * db
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn import datasets
# Load the 8x8 digits dataset and min-max scale all pixel values into [0, 1].
digits = datasets.load_digits()
data = digits.data.astype("float")
data = (data - data.min()) / (data.max() - data.min())
(trainX, testX, trainY, testY) = train_test_split(data,
                                                  digits.target, test_size=0.25)
# One-hot encode the labels. The binarizer is fitted on the training labels
# only and then reused for the test labels, so both matrices share the same
# column order even if a class happens to be absent from the test split.
label_binarizer = LabelBinarizer()
trainY = label_binarizer.fit_transform(trainY)
testY = label_binarizer.transform(testY)
import numpy as np
# Build the network and train it on column-major data (features x samples).
model= Sequential()
# First layer: 64 input pixels (8**2) feeding 16 units.
# NOTE(review): no further layers are added in this excerpt, while the pasted
# predictions have 10 values per sample - an output layer appears to be missing.
model.add(model.Dense(16, input_features= 8**2))
# NOTE(review): train() raises unless a learning rate has been set, and no
# compile() call is visible before this line - confirm against the full script.
model.train(trainX.T, trainY.T, 500, model.sigmoid)
On checking other solutions, I tried varying the learning rate between 0.1 to 1e-05, but the outputs are all similar regardless. I have also initialized the weights (as per the formula meant for sigmoid fn), varied the number of nodes in the hidden layers, altered the number of epochs, verified whether the dimensions were right, but none of it seemed to correct the output. I have also trained the model on randomly generated inputs and outputs, but the results are similar to one another.
I believe that there is an error in my implementation, but I haven't been able to figure out where I am going wrong. I hope you can point me in the right direction.
I have an Encoder-Decoder LSTM model that learns to predict 12 months of data in advance while looking back 12 months. If it helps at all, my dataset covers around 10 years in total (120 months). I keep 8 years for training/validation and 2 years for testing. My understanding is that my model does not have access to the testing data at training time.
The puzzling thing is that my model predictions are simply a shift of previous points. But how did my model know the actual previous points at the time of prediction? I did not give the monthly values in the testing set to the model! If we say that it simply copies the previous point which you give as input, then I am saying that I am giving it 12 months with completely different values than the ones it predicts (so it does not copy the 12 months I am giving), but the forecasted values are shifts of actual ones (which have never been seen).
Below is an example:
My code source is from here:
Below is my code:
#train/test splitting
# Index separating the first 80% of scaled_data from the remaining 20%.
split_position=int(len(scaled_data)*0.8)# 8 years for training
# NOTE(review): train and test are not assigned anywhere in this excerpt;
# presumably they are scaled_data[:split_position] and
# scaled_data[split_position:] - confirm against the full script.
print('length of train=',len(train))
print('length of test=',len(test))
# split train and test data into yearly train/test sets (3d)[observation,year, month]
def split_data_yearly(train, test, period=12):
    """Restructure two series into stacked windows of ``period`` rows each.

    Args:
        train: Array whose first axis is time; its length must be a positive
            multiple of ``period``.
        test: Same requirements as ``train``.
        period: Rows per window. Defaults to 12 (one year of monthly data).

    Returns:
        Tuple ``(train, test)`` where each element has shape
        (windows, period, ...).

    Raises:
        ValueError: If a series is shorter than one window or its length is
            not a multiple of ``period``.
    """
    def _to_windows(series):
        # Integer arithmetic avoids handing a float section count to split().
        n_windows, leftover = divmod(len(series), period)
        if n_windows == 0 or leftover:
            raise ValueError(
                f"series length {len(series)} is not a positive multiple of {period}"
            )
        return array(split(series, n_windows))

    return _to_windows(train), _to_windows(test)
# evaluate one or more yearly forecasts against expected values
# Compare forecasts with the expected values.
# actual and predicted are indexed as [sample, step]; the function returns
# (score, scores, inv_scores): the overall RMSE, the per-step RMSE list and the
# list of RMSE values computed after undoing the scaling.
# NOTE(review): indentation and several statements were lost in this excerpt;
# the individual notes below mark what is visibly missing.
def evaluate_forecasts(actual, predicted):
scores = list()
# calculate an RMSE score for each day
for i in range(actual.shape[1]):
# calculate mse
mse = mean_squared_error(actual[:, i], predicted[:, i])
# calculate rmse
rmse = math.sqrt(mse)
# store
# NOTE(review): nothing is appended to scores in the visible code, so the
# returned list stays empty - a scores.append(rmse) line appears to be missing.
# calculate overall RMSE
s = 0
for row in range(actual.shape[0]):
for col in range(actual.shape[1]):
s += (actual[row, col] - predicted[row, col])**2
score = math.sqrt(s / (actual.shape[0] * actual.shape[1]))
################plot prediction vs actual###############################
inv_scores = list()
for i in range(len(predicted)):
sample_predicted = predicted[i,:]
# NOTE(review): sample_actual is never assigned in the visible code;
# presumably it is actual[i,:] - confirm.
#inverse normalization
# scaler is read from the enclosing scope; each sample is reshaped to one column first.
sample_predicted_inv= scaler.inverse_transform(sample_predicted.reshape(-1, 1))
sample_actual_inv= scaler.inverse_transform(sample_actual.reshape(-1, 1))
#inverse differencing
# NOTE(review): no inverse-differencing statement is visible under the heading above.
# months is read from the enclosing scope and is not defined in this excerpt.
pyplot.plot( months,sample_actual_inv,'b-',label='Actual')
pyplot.plot(months,sample_predicted_inv,'--', color="orange",label='Predicted')
pyplot.title('Encoder Decoder LSTM Prediction', y=1.08)
################### determine RMSE after inversion ################################
mse = mean_squared_error(sample_actual_inv, sample_predicted_inv)
rmse = math.sqrt(mse)
# NOTE(review): inv_scores is never appended to in the visible code either.
return score, scores,inv_scores
# summarize scores
def summarize_scores(name, score, scores):
    """Print one summary line: ``<name>: [<score>] <s1>, <s2>, ...``.

    Args:
        name: Label printed in front of the scores.
        score: Overall score, printed with three decimals.
        scores: Iterable of per-step scores, each printed with one decimal.
    """
    per_step = ', '.join(f'{s:.1f}' for s in scores)
    print(f'{name}: [{score:.3f}] {per_step}')
# convert history into inputs and outputs
def to_supervised(train, n_input, n_out=12):
    """Turn windowed history into sliding-window (input, target) samples.

    The windows of ``train`` are flattened back into one continuous series.
    A window of ``n_input`` rows then slides over it one step at a time; the
    ``n_out`` rows that follow each window (first feature only) form its
    target.

    Args:
        train: Array of shape (windows, steps, features).
        n_input: Number of time steps in every input sample.
        n_out: Number of time steps to predict. Defaults to 12.

    Returns:
        Tuple ``(X, y)`` with ``X`` of shape (samples, n_input, features) and
        ``y`` of shape (samples, n_out). Both are empty arrays when the
        series is shorter than ``n_input + n_out``.
    """
    # flatten data
    data = train.reshape((train.shape[0] * train.shape[1], train.shape[2]))
    X, y = [], []
    # Last start index that still leaves room for a full input and output window.
    last_start = len(data) - n_input - n_out
    for in_start in range(last_start + 1):
        in_end = in_start + n_input
        X.append(data[in_start:in_end, :])
        y.append(data[in_end:in_end + n_out, 0])
    return array(X), array(y)
# train the model
# Build and fit the encoder-decoder LSTM on sliding-window samples derived from
# train, and return the fitted model.
# NOTE(review): indentation and several statements were lost in this excerpt;
# the notes below mark what is visibly missing or inconsistent.
def build_model(train, n_input):
# prepare data
train_x, train_y = to_supervised(train, n_input)
#take portion for validation
# NOTE(review): val_size is not defined in this excerpt. The validation samples
# are the tail of train_x/train_y and are not removed from the training arrays,
# so the model is also fitted on them.
test_x,test_y=train_x[-val_size:], train_y[-val_size:]
# define parameters
verbose, epochs, batch_size = 1,25, 8
n_timesteps, n_features, n_outputs = train_x.shape[1], train_x.shape[2], train_y.shape[1]
# reshape output into [samples, timesteps, features]
# NOTE(review): test_y was sliced before this reshape, so it keeps its 2-D shape
# while train_y becomes 3-D.
train_y = train_y.reshape((train_y.shape[0], train_y.shape[1], 1))
# define model
model = Sequential()
# NOTE(review): the first LSTM does not return sequences while the next layer is
# another LSTM; presumably a RepeatVector(n_outputs) line sat between them, and a
# final TimeDistributed(Dense(1)) after the Dense(100) layer - confirm.
model.add(LSTM(64, activation='relu', input_shape=(n_timesteps, n_features)))
model.add(LSTM(64, activation='relu', return_sequences=True))
model.add(TimeDistributed(Dense(100, activation='relu')))
#sgd = optimizers.SGD(lr=0.004, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(loss='mse', optimizer='adam')
# fit network
train_history= model.fit(train_x, train_y, epochs=epochs, batch_size=batch_size, validation_data=(test_x, test_y),verbose=verbose)
# Loss curves recorded by fit; no plot call using them is visible here.
loss = train_history.history['loss']
val_loss = train_history.history['val_loss']
pyplot.legend(['loss', 'val_loss'])
return model
# make a forecast
def forecast(model, history, n_input):
    """Predict the next output sequence from the most recent observations.

    Args:
        model: Object exposing ``predict(x, verbose=0)``; it receives an
            array of shape (1, n_input, features).
        history: Sequence of equally shaped windows, each (steps, features).
        n_input: Number of most recent time steps fed to the model.

    Returns:
        The first (and only) element of the model's batch prediction.

    Raises:
        ValueError: If ``n_input`` is not positive or exceeds the number of
            time steps available in ``history``.
    """
    # flatten the windows back into one continuous series
    data = array(history)
    data = data.reshape((data.shape[0] * data.shape[1], data.shape[2]))
    # Guard the slice below: data[-0:] would return the whole series and an
    # oversized n_input would silently yield a shorter window.
    if n_input <= 0 or n_input > len(data):
        raise ValueError(
            f"n_input must be between 1 and {len(data)}, got {n_input}"
        )
    # retrieve the last observations and add the batch dimension
    input_x = data[-n_input:, :]
    input_x = input_x.reshape((1, input_x.shape[0], input_x.shape[1]))
    yhat = model.predict(input_x, verbose=0)
    # we only want the vector forecast
    return yhat[0]
# evaluate a single model
def evaluate_model(train, test, n_input):
    """Fit a model on ``train`` and score it on ``test`` by walk-forward validation.

    For every test window the model forecasts from the most recent ``n_input``
    observations in ``history``. Afterwards the *real* observations of that
    window are appended to ``history``, so the next forecast is conditioned on
    actual test values, not on earlier predictions.

    Args:
        train: Training windows, shape (windows, steps, features).
        test: Test windows, shape (windows, steps, features).
        n_input: Number of time steps fed to the model per forecast.

    Returns:
        Tuple ``(score, scores, inv_scores)`` as produced by
        ``evaluate_forecasts``.
    """
    # fit model
    model = build_model(train, n_input)
    # history is a list of windows, seeded with the training data
    history = list(train)
    # walk-forward validation over each test window
    predictions = []
    for i in range(len(test)):
        # predict the window
        yhat_sequence = forecast(model, history, n_input)
        # store the prediction
        predictions.append(yhat_sequence)
        # add the real observation to history for predicting the next window
        history.append(test[i, :])
    # evaluate all predictions against the first feature of the test windows
    predictions = array(predictions)
    score, scores, inv_scores = evaluate_forecasts(test[:, :, 0], predictions)
    return score, scores, inv_scores
# split into train and test
# Reshape both series into 12-step windows.
# NOTE(review): train and test must already exist as flat series at this point;
# their assignment is not visible in this excerpt.
train, test = split_data_yearly(train, test)
# evaluate model and get scores
# n_input is the number of past time steps fed to the model for every forecast
n_input = 12
score, scores, inv_scores = evaluate_model(train, test, n_input)
# summarize scores
summarize_scores('lstm', score, scores)
print('RMSE score after inversion:',inv_scores)
# plot scores
#pyplot.plot(months, scores, marker='o', label='lstm')
Differencing is the key here!
After further investigation, I found that my model produces values that are almost zero before the differencing is inverted (i.e., it is not learning). When I invert the differencing, I am therefore adding roughly zero to the actual value from the previous time step, which results in the shifted pattern shown above.
Therefore, I need to tune my LSTM model so that it actually learns, or perhaps remove the zero-valued parts of the data itself, since I have many of those.
I'm trying to execute a Bayesian Neural Network that I found on the paper "Uncertainty on Deep Learning", Yarin Gal. I found this code on GitHub:
import math
import time

import numpy as np
from keras import Input
from keras import Model
from keras.layers import Dense
from keras.layers import Dropout
from keras.regularizers import l2

try:
    from scipy.special import logsumexp
except ImportError:  # SciPy releases that only provide it in scipy.misc
    from scipy.misc import logsumexp
class net:
    """Regression network whose dropout layers stay active at prediction time.

    Every ``Dropout`` layer is called with ``training=True``, so each call to
    ``model.predict`` is one stochastic forward pass. ``predict`` averages
    many such passes.
    """

    def __init__(self, X_train, y_train, n_hidden, n_epochs=40,
                 normalize=False, tau=1.0, dropout=0.05):
        """Build and fit the network.

        Args:
            X_train: Matrix with the features for the training data,
                shape (samples, features).
            y_train: Vector with the target variables for the training data.
            n_hidden: Sequence with the number of neurons for each hidden
                layer, e.g. ``[100, 150, 100]`` for three hidden layers.
            n_epochs: Number of epochs for which to train the network.
            normalize: Whether to standardise the input features to zero mean
                and unit standard deviation using training-set statistics.
            tau: Tau value used for regularization and in the test
                log-likelihood.
            dropout: Dropout rate for all the dropout layers in the network.
        """
        # We normalize the training data to have zero mean and unit standard
        # deviation in the training set if necessary
        if normalize:
            self.std_X_train = np.std(X_train, 0)
            self.std_X_train[self.std_X_train == 0] = 1
            self.mean_X_train = np.mean(X_train, 0)
        else:
            # Identity statistics so that predict() can apply the same formula.
            self.std_X_train = np.ones(X_train.shape[1])
            self.mean_X_train = np.zeros(X_train.shape[1])

        X_train = (X_train - np.full(X_train.shape, self.mean_X_train)) / \
            np.full(X_train.shape, self.std_X_train)

        self.mean_y_train = np.mean(y_train)
        self.std_y_train = np.std(y_train)
        if self.std_y_train == 0:
            # Constant targets: avoid dividing by zero, mirroring the feature handling.
            self.std_y_train = 1.0

        y_train_normalized = (y_train - self.mean_y_train) / self.std_y_train
        y_train_normalized = np.array(y_train_normalized, ndmin=2).T

        # We construct the network
        N = X_train.shape[0]
        batch_size = 128
        lengthscale = 1e-2
        reg = lengthscale**2 * (1 - dropout) / (2. * N * tau)

        inputs = Input(shape=(X_train.shape[1],))
        inter = Dropout(dropout)(inputs, training=True)
        inter = Dense(n_hidden[0], activation='relu', kernel_regularizer=l2(reg))(inter)
        for width in n_hidden[1:]:
            inter = Dropout(dropout)(inter, training=True)
            inter = Dense(width, activation='relu', kernel_regularizer=l2(reg))(inter)
        inter = Dropout(dropout)(inter, training=True)
        outputs = Dense(y_train_normalized.shape[1], kernel_regularizer=l2(reg))(inter)
        model = Model(inputs, outputs)

        model.compile(loss='mean_squared_error', optimizer='adam')

        # We iterate the learning process
        start_time = time.time()
        model.fit(X_train, y_train_normalized, batch_size=batch_size, epochs=n_epochs, verbose=0)
        self.model = model
        self.tau = tau
        self.running_time = time.time() - start_time

    def predict(self, X_test, y_test, T=10000):
        """Evaluate the network on test data.

        Args:
            X_test: The matrix of features for the test data.
            y_test: The target values for the test data.
            T: Number of stochastic forward passes to average. Defaults to
                10000, the value that used to be hard-coded.

        Returns:
            Tuple ``(rmse_standard_pred, rmse, test_ll)``: the RMSE of a
            single forward pass, the RMSE of the mean over ``T`` passes, and
            the mean test log-likelihood.
        """
        X_test = np.array(X_test, ndmin=2)
        y_test = np.array(y_test, ndmin=2).T

        # We normalize the test set with the training-set statistics
        X_test = (X_test - np.full(X_test.shape, self.mean_X_train)) / \
            np.full(X_test.shape, self.std_X_train)

        # We compute the predictive mean for the target variables of the
        # test data
        model = self.model

        # Single pass; dropout is active here as well, so this is one sample.
        standard_pred = model.predict(X_test, batch_size=500, verbose=1)
        standard_pred = standard_pred * self.std_y_train + self.mean_y_train
        rmse_standard_pred = np.mean((y_test.squeeze() - standard_pred.squeeze())**2.)**0.5

        Yt_hat = np.array([model.predict(X_test, batch_size=500, verbose=0) for _ in range(T)])
        Yt_hat = Yt_hat * self.std_y_train + self.mean_y_train
        MC_pred = np.mean(Yt_hat, 0)
        rmse = np.mean((y_test.squeeze() - MC_pred.squeeze())**2.)**0.5

        # We compute the test log-likelihood
        ll = (logsumexp(-0.5 * self.tau * (y_test[None] - Yt_hat)**2., 0) - np.log(T)
              - 0.5*np.log(2*np.pi) + 0.5*np.log(self.tau))
        test_ll = np.mean(ll)

        # We are done!
        return rmse_standard_pred, rmse, test_ll
I'm new to programming, so I have to study classes in Python to understand the code. My question arises when I try to execute the code: it asks for a "vector with the number of neurons for each hidden layer", and I don't know how to create this vector or what it means for the code. I've tried creating different vectors, like
vector = np.array([1, 2, 3]), but honestly I don't know the correct answer. All I have is the feature data and the target data. I hope you can help me.
That syntax is correct vector = np.array([1, 2, 3]). That is the way to define a vector in python's numpy.
A neural network can have any number of hidden (internal) layers, and each layer has a certain number of neurons.
So in this code, a vector=np.array([100, 150, 100]), means that the network should have 3 hidden layers (because the vector has 3 values), and the hidden layers should have, from input to output 100, 150, 100 neurons respectively.
Hello, I'm trying to complete an assignment on training a perceptron (without any hidden layer) to perform binary classification using a sigmoid activation function, but for some reason my code is not working correctly. Although the error decreases after each epoch, the accuracy does not increase. My target labels are 1 and 0, but my predicted labels are almost all close to one; none of my predicted labels represents the 0 class.
Below is my code. Could anyone please tell me what I have done wrong?
# Create a Neural_Network class
class Neural_Network(object):
    """Single-layer perceptron with sigmoid outputs for binary classification.

    The bias is folded into the weight matrix: ``W1`` has shape
    (inputSize + 1, outputSize) and every input row is prefixed with a
    constant 1 before the dot product.
    """

    def __init__(self, inputSize=2, outputSize=1):
        # size of layers
        self.inputSize = inputSize
        self.outputSize = outputSize
        # small random weights; the extra row is the bias
        self.W1 = 0.01 * np.random.randn(inputSize + 1, outputSize)

    def _with_bias(self, X):
        """Return ``X`` as a 2-D float array with a leading column of ones."""
        X = np.asarray(X, dtype=float)
        if X.ndim == 1:
            X = X.reshape(1, -1)
        return np.hstack((np.ones((X.shape[0], 1)), X))

    def _as_column(self, Y):
        """Return labels as (samples, outputs).

        A 1-D label vector must not be combined directly with the (n, 1)
        predictions: broadcasting would produce an (n, n) matrix.
        """
        Y = np.asarray(Y, dtype=float)
        return Y.reshape(Y.shape[0], -1)

    def _accuracy_score(self, X, Y):
        """Percentage of rows whose predicted labels equal ``Y`` (no printing)."""
        expected = self._as_column(Y)
        if expected.shape[0] == 0:
            return 0.0
        predicted = self.predict(X)
        return float(np.mean(np.all(predicted == expected, axis=1)) * 100.0)

    def feedforward(self, X):
        """Forward propagation: sigmoid of the biased input times ``W1``."""
        Xnew = self._with_bias(X)
        self.product = np.dot(Xnew, self.W1)
        output = self.sigmoid(self.product)
        return output

    def sigmoid(self, s):
        """Sigmoid activation applied element-wise."""
        return 1. / (1. + np.exp(-s))

    def sigmoid_derivative(self, s):
        """Derivative of the sigmoid, where ``s`` is already ``sigmoid(x)``."""
        return s * (1 - s)

    def backwardpropagate(self, X, Y, y_pred, lr):
        """Apply one gradient step and record the current loss.

        For a sigmoid output trained with cross-entropy the sigmoid
        derivative cancels, so the descent direction with respect to the
        pre-activation is simply ``Y - y_pred``. Multiplying the scalar loss
        by the sigmoid derivative instead gives a step that never changes
        sign and pushes every prediction the same way.
        """
        Y = self._as_column(Y)
        self.output_error = self.crossentropy(Y, y_pred)
        self.error_deriv = Y - y_pred
        Xnew = self._with_bias(X)
        # W1 = W1 + learningrate * input^T * (target - prediction)
        self.W1 += lr * (Xnew.T.dot(self.error_deriv))

    def crossentropy(self, Y, Y_pred):
        """Mean binary cross-entropy; 1e-6 is added to avoid log(0)."""
        Y = self._as_column(Y)
        N = Y_pred.shape[0]
        eps = 1e-6
        cr_entropy = -np.sum(
            Y * np.log(Y_pred + eps) + (1 - Y) * np.log(1 - Y_pred + eps)
        ) / N
        return cr_entropy

    def train(self, trainX, trainY, epochs=100, learningRate=0.001, plot_err=True,
              validationX=None, validationY=None):
        """Train with full-batch gradient descent.

        Args:
            trainX: Training inputs, shape (samples, inputSize).
            trainY: Training labels (0/1), shape (samples,) or (samples, outputSize).
            epochs: Number of passes over the training data.
            learningRate: Step size of each weight update.
            plot_err: If true, plot training error against epoch using the
                module-level ``plt``.
            validationX: Optional validation inputs.
            validationY: Optional validation labels; when both validation
                arguments are given, validation error and accuracy are
                printed about ten times during training.
        """
        has_validation = validationX is not None and validationY is not None
        report_every = max(1, epochs // 10)
        tr_error = []
        epocharray = []
        for i in range(epochs):
            # feed forward, then backpropagate with the predicted values
            y_pred = self.feedforward(trainX)
            self.backwardpropagate(trainX, trainY, y_pred, learningRate)
            tr_error.append(self.output_error)
            epocharray.append(i)
            if has_validation and (i % report_every == 0 or i == epochs - 1):
                val_error = self.crossentropy(validationY, self.feedforward(validationX))
                val_accuracy = self._accuracy_score(validationX, validationY)
                print(i, "validation error:", val_error, "validation accuracy:", val_accuracy, "%")
        # plot error of the model if plot_err is true
        if plot_err:
            plt.plot(epocharray, tr_error, 'r', linewidth=3.0)  # error vs. no. of epochs
            plt.xlabel('No. of Epochs')
            plt.ylabel('Cross Entropy Error')
            plt.title('Error Vs. Epoch')

    def predict(self, testX):
        """Return 0/1 class labels, shape (samples, outputSize), thresholded at 0.5."""
        return (self.feedforward(testX) >= 0.5).astype(int)

    def accuracy(self, testX, testY):
        """Print and return the percentage of correctly classified rows."""
        accuracy = self._accuracy_score(testX, testY)
        print("Percentage Accuracy is", accuracy, "%")
        return accuracy
# generating dataset point
no_of_samples = 2000
dims = 2
# Generating random points of values between 0 and 1.
# To add separability, one class is shifted by a bias of 1.1.
# NOTE(review): the statements creating class1/class2 were missing from this
# excerpt; which class receives the shift is an assumption - confirm.
class1 = np.random.rand(no_of_samples, dims) + 1.1
class2 = np.random.rand(no_of_samples, dims)
# Labels: 1 for every class-1 point, 0 for every class-2 point.
class_1_label = np.ones(no_of_samples, dtype=int)
class_2_label = np.zeros(no_of_samples, dtype=int)
# Let's visualize the dataset
plt.scatter(class1[:, 0], class1[:, 1], marker='^', label="class 1")
plt.scatter(class2[:, 0], class2[:, 1], marker='o', label="class 2")
plt.xlabel('Dimension 1')
plt.ylabel('Dimension 2')
# The scatter calls set labels, so draw the legend that displays them.
plt.legend()
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
# Data concatenation
# Stack both classes into one feature matrix and one 1-D label vector.
data = np.concatenate((class1,class2),axis=0)
label = np.concatenate((class_1_label,class_2_label),axis=0)
#Note: shuffle this dataset before dividing it into three parts
# now using train_test_split command to split data into 60% training data, 20% testing data and 20% validation data
# First hold out 20% for testing, then 25% of the remaining 80% (20% of the total) for validation.
trainX, testX, trainY, testY = train_test_split(data, label, test_size=0.2, random_state=1)
trainX, validX, trainY, validY = train_test_split(trainX, trainY, test_size=0.25, random_state=1)
# Two input features, one sigmoid output.
model = Neural_Network(2,1)
# try different combinations of epochs and learning rate
model.train(trainX, trainY, epochs = 100, learningRate = 0.000001, validationX = validX, validationY = validY)
model.accuracy( testX,testY)
the Results are coming like this(no label going near 0)
0 [[0.49670809]
[0.4958389 ]
[0.4966064 ]
[0.4961255 ]]
0 828.1069658303942
0 [[0.48311074]
1 [[0.69813116]
1 250.96538025031356
1 [[0.56983781]
2 [[0.72602796]
2 210.645081151866
2 [[0.63353102]
3 [[0.74507968]
3 186.2933734713245
3 [[0.6846678 ]
4 [[0.75952936]
4 169.32091332021724
4 [[0.72771826]
5 [[0.77112943]
5 156.53923256347372
Please help me to solve this problem
I see that you have set the learning rate too small. Set it to 0.001 and increase the number of epochs to 20k, and you will see your model learning well.
Plotting the error against the number of epochs should give you a better idea of where to stop.
I'm trying to perform a hyperparameter optimization on a neural net, but as soon as I try a larger number of hidden layers, my neural network will always predict the same output, so my list of (negative) losses looks like:
this is my neural net:
def nn(learningRate, layers, neurons, dropoutIn, dropoutHidden, miniBatch, activationFun, epoch):
    """Train a dense network with the given hyperparameters and score it.

    Inputs are rescaled to [-1, 1], the first 75% of the samples are used
    for training and the rest for testing.

    Args:
        learningRate: Learning rate of the Adam optimizer.
        layers: Number of hidden Dense layers (the first one plus
            ``layers - 1`` further ones).
        neurons: Units per hidden layer.
        dropoutIn: Dropout rate applied to the inputs.
        dropoutHidden: Dropout rate for hidden layers.
            NOTE(review): not used by the visible code - confirm whether
            hidden Dropout layers were lost from this excerpt.
        miniBatch: Batch size.
        activationFun: Activation of the hidden layers added in the loop.
        epoch: Number of training epochs.

    Returns:
        Negative loss in [-1, 0]: ``-1 + fraction of test rows whose rounded
        prediction matches the target in every output``.
    """
    session_conf = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
    # NOTE(review): the session is created but never registered with Keras in
    # the visible code - confirm whether a set_session call is missing.
    sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)

    # Load the inputs and rescale them to [-1, 1].
    x_data = np.loadtxt('../4nodes_demand_vector')
    x_data = x_data - x_data.min()
    x_data = x_data / x_data.max() * 2
    x_data = x_data - 1

    y_data = np.loadtxt('../4nodes_vlink_vector')

    input_dim = x_data.shape[1]
    output_dim = y_data.shape[1]

    split_ratio = 0.75
    number_of_samples = x_data.shape[0]
    split_index = int(number_of_samples * split_ratio)

    # train data
    x_data_train = x_data[:split_index, ]
    y_data_train = y_data[:split_index, ]
    # test data
    x_data_test = x_data[split_index:, ]
    y_data_test = y_data[split_index:, ]

    adam = Adam(lr=learningRate)

    model = Sequential()
    model.add(Dropout(dropoutIn, input_shape=(input_dim,)))
    # NOTE(review): this first Dense layer has no activation in the original.
    model.add(Dense(units=neurons, input_shape=(input_dim,), kernel_constraint=maxnorm(3)))
    for _ in range(layers - 1):
        model.add(Dense(units=neurons, activation=activationFun, kernel_constraint=maxnorm(3)))
    model.add(Dense(units=output_dim, activation='sigmoid'))

    # A Keras model must be compiled before fit(). Independent sigmoid
    # outputs scored by rounding are a per-output binary decision, so binary
    # cross-entropy is used; the Adam instance above supplies the learning rate.
    model.compile(loss='binary_crossentropy', optimizer=adam)

    model.fit(x_data_train, y_data_train, batch_size=miniBatch, validation_split=0.1, epochs=epoch, verbose=2)

    predict = model.predict(x_data_test)
    round_predict = np.round(predict)

    # A test row counts as correct only if every output matches.
    correct = np.sum(np.all(round_predict == y_data_test, axis=1))
    number_of_test_data = x_data_test.shape[0]

    loss = -1.0 + (correct / float(number_of_test_data))
    print("Loss: ", loss)
    return loss
The neural net is trained on (unfortunately) private data, with 12 input neurons and 12 output neurons and I have 43000 data samples.
The idea of setting kernel_constraint to maxnorm(3) came from http://jmlr.org/papers/v15/srivastava14a.html as I was running in several NaN problems.
I know this question is from 2 years ago, but...
My guess is that the problem is because you are using a final activation of 'sigmoid' (typical for classification) with a loss function of 'mean_squared_error' (a regression loss)
Based on the final loss calculation it looks like you are trying to do a binary classification. So maybe try changing the loss function to binary crossentropy.