Distributed TF: multi-PS version LR doesn't converge - python

I built a custom logistic regression (LR) model that supports sparse feature vectors with the following code:
def custom_model_fn(features, labels, mode, params):
    linear_bias = tf.get_variable(name='linear_bias',
                                  shape=[1],
                                  dtype=tf.float32,
                                  initializer=tf.random_normal_initializer(stddev=0.0001))
    linear_w = tf.get_variable(name='linear_w',
                               shape=[params['feature_size'], 1],
                               dtype=tf.float32,
                               initializer=tf.random_normal_initializer(stddev=0.0001),
                               partitioner=self.partitioner)
    # wx
    # size: [batch_size, 1]
    logits_wide = tf.nn.embedding_lookup_sparse(params=linear_w,
                                                sp_ids=features['featureID'],
                                                sp_weights=None,
                                                combiner='sum')
    # wx + b
    logits = linear_bias + logits_wide
    logits_adjusted = logits + tf.math.log(params['negative_sampling_rate'])
    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            'probabilities': tf.nn.sigmoid(logits_adjusted),
            'logits': logits,
            'logits_adjusted': logits_adjusted
        }
        return tf.estimator.EstimatorSpec(mode, predictions=predictions)
    else:
        loss = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.cast(labels, dtype=tf.float32),
                                                    logits=logits))
        if mode == tf.estimator.ModeKeys.EVAL:
            auc = tf.metrics.auc(
                labels=labels,
                predictions=1 / (1 + tf.math.exp(-logits_adjusted)),
                num_thresholds=400,
                curve='ROC',
                summation_method='careful_interpolation')
            logloss = tf.metrics.mean(tf.nn.sigmoid_cross_entropy_with_logits(
                labels=tf.cast(labels, dtype=tf.float32),
                logits=logits_adjusted))
            # tf.metrics.* return (value, update_op) tuples; log the update op's output
            tf.summary.scalar('True_AUC', auc[1])
            tf.summary.scalar('True_Logloss', logloss[1])
            metrics = {
                'True_AUC': auc,
                'True_Logloss': logloss
            }
            predictions = {
                'probabilities': tf.nn.sigmoid(logits_adjusted),
                'logits': logits,
                'logits_adjusted': logits_adjusted
            }
            return tf.estimator.EstimatorSpec(mode, loss=loss, predictions=predictions,
                                              eval_metric_ops=metrics)
        elif mode == tf.estimator.ModeKeys.TRAIN:
            train_op = self.optimizer.minimize(loss, global_step=tf.train.get_global_step())
            return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
The partitioner I used is tf.fixed_size_partitioner, with the number of parameter servers as its argument. When I run this code with only one PS, I get AUC = 0.87, which is correct. However, when I use multiple PS (ps_num > 1), I always get AUC = 0.5. I have checked the graph, and the partitioner successfully distributes linear_w among the PS. The global_step also climbed past 30,000, which suggests the optimizer is working. Is there anything I am missing in distributed TF that could cause this?

You need to keep the graph identical between training and evaluation. That means you must build the variable with the same partitioner, e.g. tf.get_variable(..., partitioner=tf.fixed_size_partitioner(6)), even when evaluating on a single worker; the 6 must match the number of shards you used during training.
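A minimal sketch of what that looks like, assuming the shard count is carried in params (the params['ps_num'] key and the helper name are illustrative, not from the original post):

import tensorflow as tf

def build_linear_w(params):
    # Build linear_w with the SAME number of shards in every mode. Per the
    # answer above, evaluation must construct the variable with the same
    # partitioner that training used, even on a single worker.
    partitioner = tf.fixed_size_partitioner(num_shards=params['ps_num'])
    return tf.get_variable(name='linear_w',
                           shape=[params['feature_size'], 1],
                           dtype=tf.float32,
                           initializer=tf.random_normal_initializer(stddev=0.0001),
                           partitioner=partitioner)

The same params dict would then be passed to the Estimator for both the training job and the (possibly single-worker) evaluation job, so ps_num stays fixed in both graphs.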

Related

TensorFlow training - matching results with simple one layer estimator

I am new to TensorFlow but experienced at analytics. I am attempting to reproduce in TensorFlow the performance I obtain with a legacy neural network trainer, comprising a single hidden layer trained with back-propagation, momentum updates, and a mean-squared-error objective.
Unfortunately all my attempts have so far failed: the TensorFlow models range from roughly "only" 10-15% worse than my home-brewed trainer when I use the canned tf.estimator.DNNClassifier, to much, much worse when I try to construct the net myself.
Can someone identify the mistakes in how I am calling the tool in the code below? Using a command-line input, I toggle between the "canned_dnn" and my "custom" setup:
def get_classifier(model_name, cur_model_dir, feature_columns):
    if model_name == 'canned_dnn':
        return tf.estimator.DNNClassifier(feature_columns=feature_columns,
                                          model_dir=cur_model_dir,
                                          hidden_units=[18],
                                          activation_fn=tf.nn.relu)
    elif model_name == 'custom':
        return tf.estimator.Estimator(model_fn=my_model,
                                      model_dir=cur_model_dir,
                                      params={
                                          'feature_columns': feature_columns,
                                          'n_classes': 2
                                      })
    else:
        raise RuntimeError()
def my_model(features, labels, mode, params):
    # input layer
    net = tf.feature_column.input_layer(features, params['feature_columns'])
    # hidden layer 1
    net = tf.layers.dense(net, units=18, activation=tf.nn.relu, use_bias=True,
                          bias_initializer=tf.zeros_initializer(), name='LEARN_1')
    # output layer computes logits
    logits = tf.layers.dense(net, params['n_classes'], activation=tf.nn.sigmoid, name='OUTPUT')
    # compute predictions
    predicted_classes = tf.argmax(logits, 1)
    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            'class_ids': predicted_classes[:, tf.newaxis],
            'probabilities': tf.nn.softmax(logits),
            'logits': logits,
        }
        return tf.estimator.EstimatorSpec(mode, predictions=predictions)
    # mean squared error
    predicted_classes = tf.argmax(logits, 1)
    loss = tf.losses.mean_squared_error(labels=tf.one_hot(labels, 2), predictions=logits)
    # metrics
    # loss report
    tf.summary.scalar('loss', loss)
    # accuracy
    accuracy = tf.metrics.accuracy(labels=labels,
                                   predictions=predicted_classes,
                                   name='acc_op')
    metrics = {'accuracy': accuracy}
    tf.summary.scalar('accuracy', accuracy[1])
    # EVALUATION MODE
    if mode == tf.estimator.ModeKeys.EVAL:
        return tf.estimator.EstimatorSpec(
            mode, loss=loss, eval_metric_ops=metrics)
    # TRAINING MODE
    assert mode == tf.estimator.ModeKeys.TRAIN
    with tf.name_scope('OPTIMIZE'):
        optimizer = tf.train.MomentumOptimizer(learning_rate=.02, momentum=.5)
        train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
    return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)

Using a placeholder to initialize the initial state for a RNN Cell in a TensorFlow Estimator model_fn

I'm having a problem making the cell initial_state configurable so I can use different batch sizes for training and prediction. Essentially at training, I am going to feed fixed size mini batches, while at prediction, I'll predict one input at a time, and feed that back into the model to get the next output.
However, I am unable to create a graph in which the first dimension of the cell's initial_state is configurable. Here is a simple model_fn that models character input:
def model_fn(features, labels, mode, params):
    inputs = tf.one_hot(features, params["VOCAB_SIZE"], 1.0, 0.0)
    cell = tf.nn.rnn_cell.MultiRNNCell([
        tf.nn.rnn_cell.GRUCell(params["INTERNAL_SIZE"]) for _ in range(params["NUM_LAYERS"])
    ], state_is_tuple=False)
    pkeep = params["DROPOUT_PKEEP"] if mode == tf.estimator.ModeKeys.TRAIN else 1.0
    cell = tf.nn.rnn_cell.DropoutWrapper(cell, input_keep_prob=pkeep)
    initial_state = tf.get_variable(
        "initial_state",
        dtype=tf.float32,
        initializer=cell.zero_state(params["BATCH_SIZE"], dtype=tf.float32),
    )
    if mode == tf.estimator.ModeKeys.EVAL:
        initial_state = cell.zero_state(1, dtype=tf.float32)
    outputs, final_state = tf.nn.dynamic_rnn(cell, inputs, initial_state=initial_state)
    if mode != tf.estimator.ModeKeys.EVAL:
        tf.assign(initial_state, final_state)
    logits = ...
    if mode == tf.estimator.ModeKeys.PREDICT:
        logits = tf.reshape(logits, [-1, 1, 98])
    else:
        logits = tf.reshape(logits, [-1, features.shape[1], 98])
    probabilities = tf.nn.softmax(logits)
    predictions = tf.argmax(probabilities, 2)
    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {"predictions": predictions, "probabilities": probabilities}
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)
    loss = ...
    if mode == tf.estimator.ModeKeys.EVAL:
        accuracy = tf.metrics.accuracy(labels=labels, predictions=predictions)
        return tf.estimator.EstimatorSpec(mode, loss=loss, eval_metric_ops={
            "accuracy": accuracy,
        })
    optimizer = tf.train.AdamOptimizer(learning_rate=params["LEARNING_RATE"])
    train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
    return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)
The problem is that this requires me to pass BATCH_SIZE in params to define the initial_state, and at train time that is a value like 200. At prediction time, with a single-example batch, it gives an error saying that a [1, 384] tensor cannot be assigned to a [200, 384] variable. How do I make the batch dimension of initial_state configurable based on the mode?
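The thread does not include an answer; one common workaround is to derive the batch size from the input tensor itself rather than from params["BATCH_SIZE"], along these lines (sketch only; make_zero_state is an illustrative helper, not part of the original code):

import tensorflow as tf

def make_zero_state(cell, inputs):
    # Read the batch dimension from the input tensor at run time instead of
    # baking a fixed BATCH_SIZE into the graph. tf.shape() returns the dynamic
    # shape, so the same graph handles batches of 200 or of 1.
    dynamic_batch_size = tf.shape(inputs)[0]
    return cell.zero_state(dynamic_batch_size, dtype=tf.float32)

This removes the shape mismatch, but it also drops the tf.get_variable that was holding the state, so carrying final_state across prediction calls would have to happen some other way, for example by feeding the previous state back in through the features dict.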

Divergence of tensorflow model while trying to stop batch_norm training during calibration

I am trying to stop the training of the batch normalization variables at a given step during training. To do so, I wrote the following code:
early_training_stop = 1000
global_step = tf.train.get_or_create_global_step()
train_phase = tf.cond(global_step < early_training_stop, lambda: True, lambda: False)

out = tf.layers.conv2d(inputs, filters=16, kernel_size=4)
out_bn = tf.contrib.layers.batch_norm(
    input_layer,
    decay=decay,
    scale=scale,
    epsilon=epsilon,
    is_training=train_phase,
    fused=True,
    data_format=self.data_format,
    scope=scope)

loss_op = tf.losses.mean_squared_error(labels=targets, predictions=inputs)
learning_rate = 1e-4
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
    train_op = optimizer.minimize(loss_op, global_step=global_step)
When training the network, it starts properly (the batch norm variables are trained). Unfortunately, when global_step reaches early_training_stop, I get the following error: Model diverged with loss = NaN. How can I avoid this? Maybe by removing the batch_norm ops from the trainable variables? (I thought my code would already do that.)
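The thread does not contain an accepted fix. As a sketch of the idea mentioned above (excluding the batch-norm parameters from the optimizer's updates), one could filter the var_list passed to minimize; the 'BatchNorm' name filter assumes the default variable scope of tf.contrib.layers.batch_norm, and whether this alone prevents the divergence is not established here:

import tensorflow as tf

# Train everything except the batch-norm parameters (beta/gamma).
# 'BatchNorm' is the default scope name used by tf.contrib.layers.batch_norm;
# adjust the filter if a custom scope is passed.
bn_free_vars = [v for v in tf.trainable_variables() if 'BatchNorm' not in v.name]

optimizer = tf.train.AdamOptimizer(learning_rate=1e-4)
with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
    # loss_op and global_step are the ones defined in the snippet above.
    train_op = optimizer.minimize(loss_op, global_step=global_step,
                                  var_list=bn_free_vars)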

How to set the weight of part of the positive samples in TensorFlow for binary classification

I want to set the same weight for a subset of the positive samples. However, as far as I can tell, tf.nn.weighted_cross_entropy_with_logits can only set one weight for all positive samples.
For example, in CTR prediction I want to give the order samples a weight of 10, while the weight of the click samples and the non-click samples stays 1.
Here is my unweighted code:
def my_model(features, labels, mode, params):
    net = tf.feature_column.input_layer(features, params['feature_columns'])
    for units in params['hidden_units']:
        net = tf.layers.dense(net, units=units, activation=params["activation"])
    logits = tf.layers.dense(net, params['n_classes'], activation=None)
    predicted_classes = tf.argmax(logits, 1)
    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            'class_ids': predicted_classes,  # predicted_classes[:, tf.newaxis],
            'probabilities': tf.nn.softmax(logits),
            'logits': logits,
        }
        return tf.estimator.EstimatorSpec(mode, predictions=predictions)
    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
    metrics = {'auc': tf.metrics.auc(labels=labels, predictions=tf.nn.softmax(logits)[:, 1])}
    if mode == tf.estimator.ModeKeys.EVAL:
        return tf.estimator.EstimatorSpec(mode, loss=loss, eval_metric_ops=metrics)
    assert mode == tf.estimator.ModeKeys.TRAIN
    optimizer = tf.train.AdagradOptimizer(learning_rate=0.1)
    train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
    return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
Train
train_input_fn = tf.estimator.inputs.pandas_input_fn(x=data_train, y=data_train_click, batch_size = 1024, num_epochs=1, shuffle=False)
classifier.train(input_fn=train_input_fn)
Here data_train_click is a Series in which the click samples are 1 and the unclicked samples are 0. I also have a Series named data_train_order, in which the order samples are 1 and the others are 0.
The easiest way to do this is to use Keras:
https://keras.io/models/model/
The fit function has a sample_weight parameter.
You can weight each sample differently by passing a weights argument to the loss function: a tensor of shape [batch_size] containing the corresponding weight for each sample.
loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits, weights=weights)
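For example, if the order indicator is fed in as one of the features, the per-sample weight tensor could be built from it directly (sketch only; the 'is_order' key is hypothetical and would correspond to the data_train_order Series added as a column of x):

import tensorflow as tf

def weighted_loss(labels, logits, features):
    # Hypothetical 'is_order' feature: 1 for order samples, 0 otherwise.
    # Flatten to shape [batch_size] so it lines up with the labels.
    is_order = tf.reshape(tf.cast(features['is_order'], tf.float32), [-1])
    # Order samples get weight 10; clicked and unclicked samples keep weight 1.
    weights = 1.0 + 9.0 * is_order
    return tf.losses.sparse_softmax_cross_entropy(labels=labels,
                                                  logits=logits,
                                                  weights=weights)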

tf.accuracy is all over the place for binary classification in tensorflow

Beginner here. I'm using TensorFlow for a binary classification problem and my tf.metrics.accuracy is all over the place. It should be at least 0.5 for random guessing, but somehow I've managed to get 0.000 or 1.000 (and I'm positive that's not because of the supreme quality of my estimator). Here is my model function:
def my_model(features, labels, mode, params):
    net = tf.feature_column.input_layer(features, params['feature_columns'])
    for units in params['hidden_units']:
        net = tf.layers.dense(net, units=units, activation=tf.nn.sigmoid)
    # output layer
    sigmoid_activations = tf.layers.dense(net, params['n_classes'], activation=tf.sigmoid)
    # Compute predictions.
    predicted_classes = tf.round(sigmoid_activations)
    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            'class_ids': predicted_classes[:, tf.newaxis],
            'probabilities': sigmoid_activations,
        }
        return tf.estimator.EstimatorSpec(mode, predictions=predictions)
    # Compute loss.
    loss = tf.losses.log_loss(labels, sigmoid_activations)
    # Compute evaluation metrics.
    accuracy = tf.metrics.accuracy(labels=labels,
                                   predictions=predicted_classes,
                                   name='acc_op')
    metrics = {'accuracy': accuracy}
    tf.summary.scalar('accuracy', accuracy[1])
    if mode == tf.estimator.ModeKeys.EVAL:
        return tf.estimator.EstimatorSpec(
            mode, loss=loss, eval_metric_ops=metrics)
    # Create training op.
    assert mode == tf.estimator.ModeKeys.TRAIN
    optimizer = tf.train.AdagradOptimizer(learning_rate=LEARNING_RATE)
    train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
    return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
The following configuration gives an accuracy of ~0.5 after the first epoch (there may be no signal in my data, but that's a different issue):
HIDDEN_UNITS=[10]
LEARNING_RATE=0.1
BATCH_SIZE=10
EPOCHS=3
Playing around with hyperparameters and changing the hidden units to
HIDDEN_UNITS=[128,64,32]
gives an accuracy of 0.000, which makes no sense, because even random guessing should score around 0.5. What am I missing here?
Answering my own question:
Turns out my test set was shuffled poorly and only had positive samples in it. For some reason (a different issue) my model always predicts 1, so I got a perfect 1.0 accuracy with a terrible model.
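A quick sanity check along those lines (sketch only; y_eval stands in for the evaluation label array, which is not named in the original post):

import numpy as np

# Confirm the evaluation split actually contains both classes before trusting
# accuracy; a split with only one class makes 0.000 or 1.000 trivially possible.
values, counts = np.unique(y_eval, return_counts=True)
print(dict(zip(values, counts)))  # a healthy binary split reports both 0 and 1

Shuffling the data before splitting it into train and test sets avoids the skew described in the answer.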
