tf.nn.softmax_cross_entropy_with_logits for binary classification?

tf.nn.softmax_cross_entropy_with_logits for binary classification? - python

I am working on a model where I have to classify my data into two classes. Most of the codes use tf.nn.sigmoid_cross_entropy_with_logits for calculating cross entropy for binary classification.
When I use the same function to train my model, I get negative values of entropy. I want to ask if I can use tf.nn.softmax_cross_entropy_with_logits to overcome the negative entropy?
# Two-layer network: a sigmoid hidden layer followed by a linear output layer
# that produces raw (unscaled) scores in `y`.
x = tf.placeholder(tf.float32, [None, Pixels])
W1 = tf.Variable(tf.random_normal([Pixels, Nodes1], stddev=0.01))
b1 = tf.Variable(tf.zeros([Nodes1]))
y1 = tf.nn.sigmoid(tf.matmul(x, W1) + b1)
W2 = tf.Variable(tf.random_normal([Nodes1, Labels], stddev=0.01))
b2 = tf.Variable(tf.zeros([Labels]))
y = tf.matmul(y1, W2) + b2
# NOTE(review): `y` is the raw network output computed above, yet it is passed
# as `labels`, while `y_` (not defined in this snippet; presumably the target
# placeholder) is passed as `logits`. The two arguments look swapped, and
# labels outside [0, 1] can make this loss negative -- confirm.
cross_entropy = tf.reduce_mean(
    tf.nn.sigmoid_cross_entropy_with_logits(labels=y, logits=y_))

Related

Specific linear classifier in TensorFlow: input element as vector

How can I implement such linear classifier in TensorFlow:
x1*w1 + x2*w2 + x3*w3 = y_pred,
where x1, x2, x3 - vectors and w1, w2 and w3 - scalars?
I have nice tutorial for case where x1, x2, x3 - scalars (link),
but for case where x1, x2, x3 are vectors I have no realization ideas.
UPDATE
That is I am trying to implement the following model:
x1*w1+ x2*w1+x3*w1+x4*w2+x5*w2+x6*w2+x7*w3+x8*w3+x9*w3=y_pred,
where x1..x9 and w1..w3 are scalars.

The linear multiclass classifier to be implemented:
pred = w1 * (x1 + x2 + x3) + w2 * (x4 + x5 + x6) + w3 * (x7 + x8 + x9)
in which all variables are scalars.
In this model, since pred is a scalar, you cannot use cross-entropy loss for training the classifier (pred is not a distribution). You have to treat it as a regression problem.
Example dataset
import numpy as np
# Toy data: three constant feature groups, one group per weight.
x1 = np.full((100, 3), 1.0)  # for w1
x2 = np.full((100, 3), 2.0)  # for w2
x3 = np.full((100, 3), 3.0)  # for w3
# set(y) is {0, 1, 2, 3}, corresponds to the four class labels
y = np.random.randint(0, 4, size=(100, 1))
Example tensorflow code:
import tensorflow as tf
tf.reset_default_graph()

# One placeholder per feature group; each row holds the three inputs that
# share a single weight.
f1 = tf.placeholder('float32', shape=[None, 3], name='f1')
f2 = tf.placeholder('float32', shape=[None, 3], name='f2')
f3 = tf.placeholder('float32', shape=[None, 3], name='f3')
target = tf.placeholder('float32', shape=[None, 1], name='target')

# the three scalars
w1 = tf.get_variable('w1', shape=[1], initializer=tf.random_normal_initializer())
w2 = tf.get_variable('w2', shape=[1], initializer=tf.random_normal_initializer())
w3 = tf.get_variable('w3', shape=[1], initializer=tf.random_normal_initializer())

# Each term is w_k * (sum of the three inputs in group k), one value per row.
pred_1 = tf.reduce_sum(tf.multiply(f1, w1), axis=1)
pred_2 = tf.reduce_sum(tf.multiply(f2, w2), axis=1)
pred_3 = tf.reduce_sum(tf.multiply(f3, w3), axis=1)

# till now the linear classifier has been constructed
# pred = w1(x1 + x2 + x3) + w2(x4 + x5 + x6) + w3(x7 + x8 + x9)
# Reshape to [batch, 1] so that `pred - target` is element-wise; a [batch]
# vector minus a [batch, 1] column would broadcast to [batch, batch].
pred = tf.reshape(tf.add_n([pred_1, pred_2, pred_3]), [-1, 1])

# treat it as a regression problem
loss = tf.reduce_mean(tf.square(pred - target))
optimizer = tf.train.GradientDescentOptimizer(1e-5)
updates = optimizer.minimize(loss)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for t in range(50):
        loss_val, _ = sess.run([loss, updates],
                               feed_dict={f1: x1, f2: x2, f3: x3, target: y})
        print(t, loss_val)
Below is a simple example that uses cross-entropy loss for training a multiclass classifier. As you can notice, this model is a neural network model:
import numpy as np
import tensorflow as tf
x1 = np.ones((100, 3))  # for w1
x2 = np.ones((100, 3)) * 2  # for w2
x3 = np.ones((100, 3)) * 3  # for w3
# Draw one class index per sample and one-hot encode it: the loss below takes
# `onehot_labels`, so each row must be a one-hot vector over the four classes
# rather than four independent random integers.
y = np.eye(4)[np.random.randint(0, 4, 100)]

tf.reset_default_graph()

f1 = tf.placeholder('float32', shape=[None, 3], name='f1')
f2 = tf.placeholder('float32', shape=[None, 3], name='f2')
f3 = tf.placeholder('float32', shape=[None, 3], name='f3')
target = tf.placeholder('float32', shape=[None, 4], name='target')

# the three scalars
w1 = tf.get_variable('w1', shape=[1], initializer=tf.random_normal_initializer())
w2 = tf.get_variable('w2', shape=[1], initializer=tf.random_normal_initializer())
w3 = tf.get_variable('w3', shape=[1], initializer=tf.random_normal_initializer())
# Output layer mapping the three group scores to four class logits.
w = tf.get_variable('w', shape=[3, 4], initializer=tf.random_normal_initializer())

pred_1 = tf.reduce_sum(tf.multiply(f1, w1), axis=1)
pred_2 = tf.reduce_sum(tf.multiply(f2, w2), axis=1)
pred_3 = tf.reduce_sum(tf.multiply(f3, w3), axis=1)
# Stack the per-group scores into a [batch, 3] matrix, then project to
# [batch, 4] logits.
pred = tf.stack([pred_1, pred_2, pred_3], axis=1)
pred = tf.matmul(pred, w)

loss = tf.losses.softmax_cross_entropy(onehot_labels=target, logits=pred)
optimizer = tf.train.GradientDescentOptimizer(1e-5)
updates = optimizer.minimize(loss)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for t in range(50):
        loss_val, _ = sess.run([loss, updates],
                               feed_dict={f1: x1, f2: x2, f3: x3, target: y})
        print(t, loss_val)

I created an array that looks like [w1, w1, w1, w2, w2, w2 ...] and multiplied it (element-wise) by x before summing all terms up. I could not get model.fit to work, so I copied the train_step code from https://www.tensorflow.org/tutorials/quickstart/advanced. It seems to work just fine. I left my test code at the bottom for you to inspect.
This makes use of TensorFlow 2.0 and its integration with Keras models:
import numpy as np
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.losses import MeanSquaredError
from tensorflow.keras.optimizers import Adam
print(tf.executing_eagerly())


class ProductAdd(Model):
    """Linear model with three weights, each shared by three consecutive inputs.

    For a nine-element input ``x`` it computes
    ``w1*(x1+x2+x3) + w2*(x4+x5+x6) + w3*(x7+x8+x9)``.
    """

    def __init__(self):
        super(ProductAdd, self).__init__()
        # Creates 3 variables to act as weights, each starting at a random
        # value and named var0, var1, var2.
        self.vars = [
            tf.Variable(np.random.standard_normal(), name='var' + str(i))
            for i in range(3)
        ]

    def call(self, x):
        """Return the weighted sum of the nine inputs in ``x`` as a scalar tensor."""
        # "Extends" the weights to [w1, w1, w1, w2, w2, w2, w3, w3, w3].
        extended_vars = tf.stack([self.vars[i // 3] for i in range(9)])
        # Use TensorFlow ops (not NumPy) for the element-wise product and the
        # sum, so the whole forward pass stays in TensorFlow and can be
        # recorded by tf.GradientTape.
        return tf.reduce_sum(tf.multiply(x, extended_vars))
loss_object = MeanSquaredError()  # Create loss and optimizer
optimizer = Adam()


# tf.function
def train_step(images, labels):
    """Run one optimisation step on a single (input, target) pair.

    Adapted from https://www.tensorflow.org/tutorials/quickstart/advanced.
    Uses the module-level ``model``, ``loss_object`` and ``optimizer``.
    """
    # Record the forward pass so the loss can be differentiated afterwards.
    with tf.GradientTape() as tape:
        predictions = model(images)
        loss = loss_object(labels, predictions)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))


model = ProductAdd()
# Test code: fit the model so that the inputs 1..9 map to a target of 0.
for _ in range(100):
    train_step([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0], [0.0])
print(model([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0]).numpy())

This question is ill-posed. You say you want x_1, x_2, x_3 to be vectors, however it's not clear what you would do with w_1, w_2, w_3. There are two possibilities.
If you want to keep them as scalars, as your question seems to imply, then the model is not really a vector model, you're just doing the same scalar operation on all the entries of the x vectors, but at once. This is equivalent to a scalar model.
Otherwise, you can define w_1, w_2, w_3 as matrices, or row vectors, if the label is scalar. In this case, there is no reason to write the equation as you wrote it, because you could stack the xs in a single vector and the ws in a single vector and write wx = y. In any case, this is a multivariate linear regression, of which you can find many examples, and tutorials on how to solve it in Tensorflow and Torch.
Update, given OP's clarification
In your comment, you now say you're interested in solving the following equation:
w1*(x1 + x2 + x3) + w2*(x4 + x5 + x6) + w3*(x7 + x8 + x9) == y
where all variables are scalars. Note that the x variables are known, so we can define (a simple arithmetic operation):
z1 = x1 + x2 + x3; z2 = x4 + x5 + x6; z3 = x7 + x8 + x9
And the equation becomes
w1*z1 + w2*z2 + w3*z3 = y.
So this is more like a linear algebra question rather than a tensorflow/torch question, because this equation can be solved analytically, and does not require numerical fitting. However, it is still ill-defined, because it has 3 unknowns (w1, w2, w3) for one linear equation. So it will not have a unique solution, but a two-dimensional space of solutions (it identifies a plane in the 3-dimensional w-space). To get some solutions, you can arbitrarily decide to set, for example, w1 = w2 = 0, from which you automatically get w3 = y/z3 (provided z3 is non-zero). Then do the same for the other two, and you'll get three different and linearly independent solutions.
Hope this helps. In summary, you don't need code at all.
Second update (from comment)
Why does it need to be solved using optimization? If the problem is as you presented it, it clearly does not. Unless you mean you have many values for the Xs and Ys. In that case, you're doing multivariate linear regression. MLR can be solved using ordinary least squares, see for example https://towardsdatascience.com/simple-and-multiple-linear-regression-in-python-c928425168f9

How to extract the output of tensorflow model?

I've been trying to train a model as usual with train/test data. I was able to have my accuracy, cost + the valid accuracy and cost. So I presume that the model is working and the result is enough with an 85%.
Now, after I finished with my train/test data, I have a csv file with the same type and structure of data but without one column (default -indicate if client will pay or be delayed). I'm trying to predict this value with the model. I'm bugging on how to insert those data and get back with the missing column.
Problem section :
This is my code for restoring and predict on the new data -> (y_pred [5100x41])
with tf.Session() as sess:
    # Rebuild the saved graph from the .meta file, then load the weights of
    # the most recent checkpoint in the current directory.
    saver = tf.train.import_meta_graph('my_test_model101.meta')
    print("Model found.")
    saver.restore(sess, tf.train.latest_checkpoint('./'))
    print("Model restored compl.")
    # NOTE(review): `z` is a brand-new placeholder created here; nothing in
    # this snippet connects it to the restored model, yet it is the tensor
    # being fetched below while only `x` is fed.
    z = tf.placeholder(tf.float32, shape=(None, 5100))
    y_pred = y_pred.as_matrix()
    output = sess.run(z, feed_dict={x: y_pred})
    print(output)
Can anyone help me understand what I am doing wrong here?
Error message is:
InvalidArgumentError (see above for traceback): You must feed a value for placeholder tensor 'Placeholder_4' with dtype float and shape [?,5100]
[[Node: Placeholder_4 = Placeholder[dtype=DT_FLOAT, shape=[?,5100], _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]
Expecting:
My input [5100 x 41] but the last column had initially Nan value, I want it with the predicted value which is supposed to be 0 or 1.
To see the trained model architecure :
Model architecture :
# Feed-forward classifier: three sigmoid hidden layers (dropout after the
# third) followed by a 2-way softmax output.
# Number of input nodes.
input_nodes = 41
# Multiplier maintains a fixed ratio of nodes between each layer.
mulitplier = 3
# Number of nodes in each hidden layer
hidden_nodes1 = 41
# With the values above these evaluate to 123 and 369 nodes respectively.
hidden_nodes2 = round(hidden_nodes1 * mulitplier)
hidden_nodes3 = round(hidden_nodes2 * mulitplier)
# Percent of nodes to keep during dropout.
pkeep = tf.placeholder(tf.float32)
# input
x = tf.placeholder(tf.float32, [None, input_nodes])
# layer 1
W1 = tf.Variable(tf.truncated_normal([input_nodes, hidden_nodes1], stddev = 0.15))
b1 = tf.Variable(tf.zeros([hidden_nodes1]))
y1 = tf.nn.sigmoid(tf.matmul(x, W1) + b1)
# layer 2
W2 = tf.Variable(tf.truncated_normal([hidden_nodes1, hidden_nodes2], stddev = 0.15))
b2 = tf.Variable(tf.zeros([hidden_nodes2]))
y2 = tf.nn.sigmoid(tf.matmul(y1, W2) + b2)
# layer 3
W3 = tf.Variable(tf.truncated_normal([hidden_nodes2, hidden_nodes3], stddev = 0.15))
b3 = tf.Variable(tf.zeros([hidden_nodes3]))
y3 = tf.nn.sigmoid(tf.matmul(y2, W3) + b3)
# Dropout is applied only to the third hidden layer's activations.
y3 = tf.nn.dropout(y3, pkeep)
# layer 4
W4 = tf.Variable(tf.truncated_normal([hidden_nodes3, 2], stddev = 0.15))
b4 = tf.Variable(tf.zeros([2]))
# Softmax turns the two output scores into probabilities.
y4 = tf.nn.softmax(tf.matmul(y3, W4) + b4)
# output
y = y4
# Target placeholder with two columns -- presumably one-hot labels; confirm
# against the training data.
y_ = tf.placeholder(tf.float32, [None, 2])
After building the model, I understand you need to add Placeholder to stock what you're looking for. So :
# Parameters
# NOTE(review): training_epochs, training_dropout, display_step, n_samples and
# batch_size are not used in the visible snippet; presumably the (omitted)
# training loop reads them.
training_epochs = 5 # These proved to be enough to let the network learn
training_dropout = 0.9
display_step = 1 # 10
n_samples = y_train.shape[0]
batch_size = 2048
learning_rate = 0.001
# Cost function: Cross Entropy
# Summed (not averaged) over every element of y_ * log(y).
# NOTE(review): tf.log(y) is -inf if a softmax output reaches exactly 0, which
# would turn the cost into NaN/inf -- consider clipping y; confirm.
cost = -tf.reduce_sum(y_ * tf.log(y))
# We will optimize our model via AdamOptimizer
optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)
# Correct prediction if the most likely value (default or non Default) from softmax equals the target value.
correct_prediction = tf.equal(tf.argmax(y,1), tf.argmax(y_,1))
# Fraction of rows whose predicted class matches the target class.
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
Till now everything is working well and I saved the model. I was able to restore this model (printed the variables and all was there---So restore is fine)

The placeholder 'z' has nothing in it and nothing is assigned to it. So when you run the session, nothing needs to be done because 'z' depends on nothing in the model. I think you want,
# Fetch the model's output tensor `y` while feeding the new rows through the
# input placeholder `x`.
output =sess.run(y,feed_dict={x: y_pred})
Because 'y' is the output tensor.
Having said that, I think you might want to read up a little more on the flow graph used by tensorflow to understand how the calculations happen. Currently, it doesn't sound like you have fully understood the placeholder variables.

What is wrong with my neural network model?

I got a dataset of 178 elements, and each contains 13 features and 1 label.
Label is stored as one-hot array. My training dataset is made of 158 elements.
Here is what my model looks like :
# Inputs: one row per sample; labels are one-hot rows.
x = tf.placeholder(tf.float32, [None, training_data.shape[1]])
y_ = tf.placeholder(tf.float32, [None, training_data_labels.shape[1]])

# Layer widths: three hidden layers of 300 units and a 3-way output.
node_1 = 300
node_2 = 300
node_3 = 300
out_n = 3

#1
W1 = tf.Variable(tf.random_normal([training_data.shape[1], node_1]))
B1 = tf.Variable(tf.random_normal([node_1]))
y1 = tf.add(tf.matmul(x, W1), B1)
y1 = tf.nn.relu(y1)

#2
W2 = tf.Variable(tf.random_normal([node_1, node_2]))
B2 = tf.Variable(tf.random_normal([node_2]))
y2 = tf.add(tf.matmul(y1, W2), B2)
y2 = tf.nn.relu(y2)

#3
W3 = tf.Variable(tf.random_normal([node_2, node_3]))
B3 = tf.Variable(tf.random_normal([node_3]))
y3 = tf.add(tf.matmul(y2, W3), B3)
y3 = tf.nn.relu(y3)

#output
W4 = tf.Variable(tf.random_normal([node_3, out_n]))
B4 = tf.Variable(tf.random_normal([out_n]))
y4 = tf.add(tf.matmul(y3, W4), B4)
y = tf.nn.softmax(y4)

# NOTE(review): `y` is already a softmax output, but it is passed here as
# `logits` to an op that applies softmax itself -- this is the behaviour the
# question is about, so it is left as posted.
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y))
optimizer = tf.train.GradientDescentOptimizer(0.01).minimize(loss)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # Full-batch gradient descent: every step uses the whole training set.
    for i in range(200):
        sess.run(optimizer, feed_dict={x: training_data, y_: training_data_labels})
    # Compare the predicted class with the labelled class on held-out data.
    correct = tf.equal(tf.argmax(y_, 1), tf.argmax(y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct, 'float'))
    print('Accuracy:', accuracy.eval({x: eval_data, y_: eval_data_labels}))
But the accuracy is very low, i tried increase the range 200 to some higher number but it still remains low.
What could I do to improve the results ?

The problem is that you're taking the softmax of y4 and then passing that to tf.nn.softmax_cross_entropy_with_logits. This error is common enough that there's actually a note about it in the documentation for softmax_cross_entropy_with_logits:
WARNING: This op expects unscaled logits, since it performs a softmax on logits internally
for efficiency. Do not call this op with the output of softmax, as it will produce
incorrect results.
The rest of your code looks fine, so just replace y4 with y and get rid of y = tf.nn.softmax(y4).

Classification NN defaults to one answer - Better cost function

Above picture is the distributions of a binary output from a Neural Network that I wrote. (Y-Axis is percentage of Yes output, X-Axis is iteration)
I trained a NN on a binary classification dataset that has labels 0-No, 1-Yes
However I try to change the parameters of the NN (tried different layer count, layer size, activation function) the NN output oscillates from outputting all Zeros or all Ones.
I am using tensorflow to build the NN with this as the cost function
# Softmax cross-entropy between the labels `y` and the network output
# `prediction`, averaged with reduce_mean.
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=prediction))
I am 90% sure that the NN is sound in term of architecture as it is the same one I used for the MNIST dataset. Also have tried many different datasets and it seems to be able to classify datasets well.
The training dataset contains 58% of No labels and 42% of Yes labels.
I have a suspicion that the feature inputted into the NN is uninformative to the NN to make a decision hence why defaulting to one answer is better for it.
I would like to know what kind of Cost function would better penalize the current behavior of the network?
Code:
def neural_network_model(data):
    """Build a two-hidden-layer ReLU network and return its output tensor.

    Layer sizes come from the module-level ``n_input``, ``n_nodes_hl1``,
    ``n_nodes_hl2`` and ``n_output``. The returned tensor is the final affine
    layer's output with no activation applied, i.e. unscaled logits.
    """
    # Weights and biases are registered in custom "weights"/"bias" collections
    # in addition to the global-variables collection.
    w1 = tf.Variable(tf.random_normal([n_input, n_nodes_hl1]), name="w1", collections=["weights", tf.GraphKeys.GLOBAL_VARIABLES])
    b1 = tf.Variable(tf.random_normal([n_nodes_hl1]), name="b1", collections=["bias", tf.GraphKeys.GLOBAL_VARIABLES])
    w2 = tf.Variable(tf.random_normal([n_nodes_hl1, n_nodes_hl2]), name="w2", collections=["weights", tf.GraphKeys.GLOBAL_VARIABLES])
    b2 = tf.Variable(tf.random_normal([n_nodes_hl2]), name="b2", collections=["bias", tf.GraphKeys.GLOBAL_VARIABLES])
    wout = tf.Variable(tf.random_normal([n_nodes_hl2, n_output]), name="wout", collections=["weights", tf.GraphKeys.GLOBAL_VARIABLES])
    bout = tf.Variable(tf.random_normal([n_output]), name="bout", collections=["bias", tf.GraphKeys.GLOBAL_VARIABLES])

    # Hidden layer 1: affine transform followed by ReLU.
    l1 = tf.add(tf.matmul(data, w1), b1)
    l1 = tf.nn.relu(l1)

    # Hidden layer 2: affine transform followed by ReLU.
    l2 = tf.add(tf.matmul(l1, w2), b2)
    l2 = tf.nn.relu(l2)

    # Output layer: affine transform only.
    output = tf.matmul(l2, wout) + bout
    return output
# Network dimensions and training settings.
n_input = len(features[1, :])  # number of columns in the feature matrix
n_nodes_hl1 = 2000
n_nodes_hl2 = 2000
n_output = 2
batch_size = 100
iterations = 10000000

x = tf.placeholder("float", name="x")
y = tf.placeholder("float", name="y")

prediction = neural_network_model(x)
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=prediction))
optimizer = tf.train.AdamOptimizer().minimize(cost)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for iteration in range(iterations):
        iter_x, iter_y = nextbatch(features, label, batch_size, iteration)
        _, c = sess.run([optimizer, cost], feed_dict={x: iter_x, y: iter_y})
        # Report progress every 500 iterations.
        if iteration % 500 == 0:
            print("Iter:", iteration, "Cost:", c)
            # NOTE(review): `correct` is not defined anywhere in this snippet;
            # presumably it is a tensor counting correct predictions -- confirm.
            print("Count:", correct.eval({x: val_features, y: val_label}) / len(val_label))
EDIT:
After shuffling the time-series input as suggested by @FlorentinHennecker, the oscillation seems to be reduced.

ReluGrad input is not finite on multi-layer network in TensorFlow

I'm doing the Udacity course on TensorFlow, and I'm trying to train a neural network on the notMNIST set.
When using a 1-hidden layer network all works fine, but when I try to add another layer, after ~150 steps I get this error:
InvalidArgumentError: ReluGrad input is not finite. : Tensor had NaN values
This is the network model:
def model(x, w_h,w_h2,w_0,b_h,b_h2,b_0,p_drop):
    """Two-hidden-layer ReLU network with dropout after each hidden layer.

    Args:
        x: input batch.
        w_h, b_h: weights and biases of the first hidden layer.
        w_h2, b_h2: weights and biases of the second hidden layer.
        w_0, b_0: weights and biases of the output layer.
        p_drop: value forwarded as the second argument of ``tf.nn.dropout``.
            NOTE(review): in the TF 1.x API that argument is the *keep*
            probability, despite this parameter's name -- confirm.

    Returns:
        The output layer's pre-activation values (logits).
    """
    h = tf.nn.relu(tf.matmul(x, w_h) + b_h)
    h = tf.nn.dropout(h, p_drop)
    h2 = tf.nn.relu(tf.matmul(h, w_h2) + b_h2)
    h2 = tf.nn.dropout(h2, p_drop)
    return (tf.matmul(h2, w_0) + b_0)
And the error is pointing at a specific line:
# First hidden layer: affine transform of the input followed by ReLU.
h = tf.nn.relu(tf.matmul(x,w_h)+b_h)
I guess that with the two-layer network the w_h values are becoming very small, so the matmul product goes to zero, but I don't understand how I can solve it.
Notice that I'm using this optimizer:
# Build the training graph; 0.5 is the value forwarded to both dropout layers.
net = model(tf_train_dataset,w_h,w_h2,w_0,b_h,b_h2,b_0,0.5)
# Pass logits and labels by keyword so the call does not depend on the
# positional order of the op's arguments.
loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(logits=net, labels=tf_train_labels))
global_step = tf.Variable(0)  # count the number of steps taken.
# Learning rate starts at 0.5 and decays by a factor of 0.95 per 100 steps.
learning_rate = tf.train.exponential_decay(0.5, global_step, 100, 0.95)
# Passing global_step makes minimize() increment it on every update.
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss,global_step=global_step)
The net is 784->1024->512->10
Any help would be appreciated...

I was having the same problem when my weights were initialized randomly and my biases with zeros. Using Xavier and Yoshua's initialization solved the problem, and here is my full example:
# Width of every hidden layer in the network below.
hidden_size = 1024
# Number of samples per training minibatch (fixes the placeholder shapes).
batch_size = 256
def multilayer(x, w, b):
    """Apply a stack of fully connected layers to ``x`` and return the logits.

    Args:
        x: input tensor.
        w: sequence of weight matrices, one per layer.
        b: sequence of bias vectors, paired with ``w``.

    Returns:
        The output of the last layer. Every layer except the last is followed
        by ReLU; the last one is left linear so its output can be used as
        logits. With no layers at all, ``x`` is returned unchanged.
    """
    out = x
    last = len(w) - 1
    for i, (wi, bi) in enumerate(zip(w, b)):
        out = tf.matmul(out, wi) + bi
        # The final layer stays linear, including when it is the only layer.
        if i != last:
            out = tf.nn.relu(out)
        print(out.shape, x.shape)
    return out
graph = tf.Graph()
with graph.as_default():
    # Input data. For the training data, we use a placeholder that will be fed
    # at run time with a training minibatch.
    tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)

    # Defining Xavier and Yoshua's initializer
    initializer = tf.contrib.layers.xavier_initializer()

    # Variables: four hidden layers of `hidden_size` units and one output
    # layer; weights and biases are both drawn from the initializer above.
    W1 = tf.Variable(initializer([image_size * image_size, hidden_size]))
    b1 = tf.Variable(initializer([hidden_size]))
    W2 = tf.Variable(initializer([hidden_size, hidden_size]))
    b2 = tf.Variable(initializer([hidden_size]))
    W3 = tf.Variable(initializer([hidden_size, hidden_size]))
    b3 = tf.Variable(initializer([hidden_size]))
    W4 = tf.Variable(initializer([hidden_size, hidden_size]))
    b4 = tf.Variable(initializer([hidden_size]))
    W5 = tf.Variable(initializer([hidden_size, num_labels]))
    b5 = tf.Variable(initializer([num_labels]))
    Ws = [W1, W2, W3, W4, W5]
    bs = [b1, b2, b3, b4, b5]

    # Training computation
    logits = multilayer(tf_train_dataset, Ws, bs)
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=tf_train_labels))
    # NOTE loss is actually a scalar value that represents the effectiveness of the
    # current prediction. A minimized loss means that the weights and biases
    # are adjusted at their best for the training data.

    # Optimizer
    optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)

    # Predictions for the training, validation, and test data.
    # The validation and test passes reuse the same weight/bias variables.
    train_prediction = tf.nn.softmax(logits)
    valid_prediction = tf.nn.softmax(multilayer(tf_valid_dataset, Ws, bs))
    test_prediction = tf.nn.softmax(multilayer(tf_test_dataset, Ws, bs))

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.