Inconsistent gradient - python

I have a very simple Keras model and I want to compute the gradient of the different layers using TensorFlow.
I start by creating the computational graph in the first cell of a Jupyter notebook.
Here is the code of the computational graph:
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow.keras.backend as K
import numpy as np
from tensorflow.keras.layers import Dense, Input, Layer
from tensorflow.keras.models import Model
input_tensor = Input(shape=(20,), name="input")
print(input_tensor.name)
hidden = Dense(100, activation='relu')(input_tensor)
out1 = Dense(10, activation='relu', name="out1")(hidden)
model = Model(inputs=input_tensor, outputs=[out1])
grad = []
for i in range(4):
grad.append(tf.gradients(out1, model.trainable_weights[i]))
model.compile(loss={"out1": "mse"},
optimizer=tf.train.AdamOptimizer(learning_rate=0.001))
np.random.seed(0)
X = np.random.random((3, 20)).astype(np.float32)
Y = np.random.random((3, 10)).astype(np.float32)
model.fit(x={'input' : X}, y={'out1' : Y}, batch_size=1, epochs=10)
Then each time I run the tf.gradients operator, I get a different gradient vector (the gradient changes). This result is not reasonable. Where is the problem in my code?
And here is the code for the created Session:
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
out_grad = sess.run(grad, feed_dict={'input:0':X})
print(out_grad)

I solved this problem by using tf.keras.backend.get_session().
I think that the problem is that tf.global_variables_initializer() reinitialize the training weights each time it is called.
When I run this code multiple times, I get consistent results.
sess = tf.keras.backend.get_session({'input:0':X})
out_grad = sess.run(grad, feed_dict={'input:0':X})

Related

Convolutional LSTM Model Dimension Incompatibility when making predictions & prediction dimension issues

I structured a Convolutional LSTM model to predict the forthcoming Bitcoin price data, using the analyzed past data of the Bitcoin close price and other features.
Let me jump straight to the code:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import tensorflow as tf
import tensorflow.keras as keras
import keras_tuner as kt
from keras_tuner import HyperParameters as hp
from keras.models import Sequential
from keras.layers import InputLayer, ConvLSTM1D, LSTM, Flatten, RepeatVector, Dense, TimeDistributed
from keras.callbacks import EarlyStopping
from tensorflow.keras.metrics import RootMeanSquaredError
from tensorflow.keras.optimizers import Adam
import keras.backend as K
from keras.losses import Huber
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
DIR = '../input/btc-features-targets'
SEG_DIR = '../input/segmented'
segmentized_features = os.listdir(SEG_DIR)
btc_train_features = []
for seg in segmentized_features:
train_features = pd.read_csv(f'{SEG_DIR}/{seg}')
train_features.set_index('date', inplace=True)
btc_train_features.append(scaler.fit_transform(train_features.values))
btc_train_targets = pd.read_csv(f'{DIR}/btc_train_targets.csv')
btc_train_targets.set_index('date', inplace=True)
btc_test_features = pd.read_csv(f'{DIR}/btc_test_features.csv')
btc_tef1 = btc_test_features.iloc[:111]
btc_tef2 = btc_test_features.iloc[25:]
btc_tef1.set_index('date', inplace=True)
btc_tef2.set_index('date', inplace=True)
btc_test_targets = pd.read_csv(f'{DIR}/btc_test_targets.csv')
btc_test_targets.set_index('date', inplace=True)
btc_trt_log = np.log(btc_train_targets)
btc_tefs1 = scaler.fit_transform(btc_tef1.values)
btc_tefs2 = scaler.fit_transform(btc_tef2.values)
btc_tet_log = np.log(btc_test_targets)
scaled_train_features = []
for features in btc_train_features:
shape = features.shape
scaled_train_features.append(np.expand_dims(features, [0,3]))
shape_2 = btc_tefs1.shape
btc_tefs1 = np.expand_dims(btc_tefs1, [0,3])
shape_3 = btc_tefs2.shape
btc_tefs2 = np.expand_dims(btc_tefs2, [0,3])
btc_trt_log = btc_trt_log.values[0]
btc_tet_log = btc_tet_log.values[0]
def build(hp):
model = keras.Sequential()
# Input Layer
model.add(InputLayer(input_shape=(111,32,1)))
# ConvLSTM1D
convLSTM_hp_filters = hp.Int(name='convLSTM_filters', min_value=32, max_value=512, step=32)
convLSTM_hp_kernel_size = hp.Choice(name='convLSTM_kernel_size', values=[3,5,7])
convLSTM_activation = hp.Choice(name='convLSTM_activation', values=['selu', 'relu'])
model.add(ConvLSTM1D(filters=convLSTM_hp_filters,
kernel_size=convLSTM_hp_kernel_size,
padding='same',
activation=convLSTM_activation,
use_bias=True,
bias_initializer='zeros'))
# Flatten
model.add(Flatten())
# RepeatVector
model.add(RepeatVector(5))
# LSTM
LSTM_hp_units = hp.Int(name='LSTM_units', min_value=32, max_value=512, step=32)
LSTM_activation = hp.Choice(name='LSTM_activation', values=['selu', 'relu'])
model.add(LSTM(units=LSTM_hp_units, activation=LSTM_activation, return_sequences=True))
# TimeDistributed Dense
dense_units = hp.Int(name='dense_units', min_value=32, max_value=512, step=32)
dense_activation = hp.Choice(name='dense_activation', values=['selu', 'relu'])
model.add(TimeDistributed(Dense(units=dense_units, activation=dense_activation)))
# TimeDistributed Dense_Output
model.add(Dense(1))
# Set Learning Rate
hp_learning_rate = hp.Choice(name='learning_rate', values=[1e-2, 1e-3, 1e-4])
# Compile Model
model.compile(optimizer=Adam(learning_rate=hp_learning_rate),
loss=Huber(),
metrics=[RootMeanSquaredError()])
return model
tuner = kt.Hyperband(build,
objective=kt.Objective('root_mean_squared_error', direction='min'),
max_epochs=10,
factor=3)
early_stop = EarlyStopping(monitor='root_mean_squared_error', patience=5)
opt_hps = []
for train_features in scaled_train_features:
tuner.search(train_features, btc_trt_log, epochs=50, callbacks=[early_stop])
opt_hps.append(tuner.get_best_hyperparameters(num_trials=1)[0])
models, epochs = ([] for _ in range(2))
for hps in opt_hps:
model = tuner.hypermodel.build(hps)
models.append(model)
history = model.fit(train_features, btc_trt_log, epochs=70, verbose=0)
rmse = history.history['root_mean_squared_error']
best_epoch = rmse.index(min(rmse)) + 1
epochs.append(best_epoch)
hypermodel = tuner.hypermodel.build(opt_hps[0])
for train_features, epoch in zip(scaled_train_features, epochs): hypermodel.fit(train_features, btc_trt_log, epochs=epoch)
tp1 = hypermodel.predict(btc_tefs1).flatten()
tp2 = hypermodel.predict(btc_tefs2).flatten()
test_predictions = np.concatenate((tp1, tp2[86:]), axis=None)
The hyperparameters of the model are configured using keras_tuner; as there were ResourceExhaustError issues output by the notebook when training is done with the full features dataset, sequentially segmented datasets are used instead (and apparently, referring to the study done utilizing the similar model architecture, training is able to be efficiently done through this training approach).
The input dimension of each segmented dataset is (111,32,1).
There aren't any issues reported until before the last code block. The models work fine. Yet, when the .predict() function is executed, the notebook prints out an error, which states that the dimension of the input features for making predictions is incompatible with the dimension of the input features used while training. I did not understand the reason behind its occurrence, since as far as I know, the input dimensions of a train dataset for a DNN model cannot be identical as the input dimensions of a test dataset.
Even though all the price data from 2018 to early 2021 are used as training datasets, predictions are only needed for the mid 2021 timeframe.
The dataset used for prediction has a dimension of (136,32,1).
I tried matching the dimension of this dataset to (111,32,1), through index slicing.
Now this showed issues in the output dimension. While predictions should be made for 136 data points, the result only returned 10.
Are there any issues relevant to the model configuration? Cannot interpret the current situation.

How to mix many distributions in one tensorflow probability layer?

I have several DistributionLambda layers as the outputs of one model, and I would like to make a Concatenate-like operation into a new layer, in order to have only one output that is the mix of all the distributions, assuming they are independent. Then, I can apply a log-likelihood loss to the output of the model. Otherwise, I cannot apply the loss over a Concatenate layer, because it lost the log_prob method. I have been trying with the Blockwise distribution, but with no luck so far.
Here an example code:
from tensorflow.keras import layers
from tensorflow.keras import models
from tensorflow.keras import optimizers
from tensorflow_probability import distributions
from tensorflow_probability import layers as tfp_layers
def likelihood_loss(y_true, y_pred):
"""Adding negative log likelihood loss."""
return -y_pred.log_prob(y_true)
def distribution_fn(params):
"""Distribution function."""
return distributions.Normal(
params[:, 0], math.log(1.0 + math.exp(params[:, 1])))
output_steps = 3
...
lstm_layer = layers.LSTM(10, return_state=True)
last_layer, l_h, l_c = lstm_layer(last_layer)
lstm_state = [l_h, l_c]
dense_layer = layers.Dense(2)
last_layer = dense_layer(last_layer)
last_layer = tfp_layers.DistributionLambda(
make_distribution_fn=distribution_fn)(last_layer)
output_layers = [last_layer]
# Get output sequence, re-injecting the output of each step
for number in range(1, output_steps):
last_layer = layers.Reshape((1, 1))(last_layer)
last_layer, l_h, l_c = lstm_layer(last_layer, initial_state=lstm_states)
# Storing state for next time step
lstm_states = [l_h, l_c]
last_layer = tfp_layers.DistributionLambda(
make_distribution_fn=distribution_fn)(dense_layer(last_layer))
output_layers.append(last_layer)
# This does not work
# last_layer = distributions.Blockwise(output_layers)
# This works for the model but cannot compute loss
# last_layer = layers.Concatenate(axis=1)(output_layers)
the_model = models.Model(inputs=[input_layer], outputs=[last_layer])
the_model.compile(loss=likelihood_loss, optimizer=optimizers.Adam(lr=0.001))
The problem is your Input, not your output layer ;)
Input:0 is referenced in your error message.
Could you try to be more specific about your input?

Problem initializing Tensorflow variables

I am trying to compute the input signal "maximizing" the activation of a given neuron of an encoder NN (the goal is to understand what my latent features are modelling).
I wrote a little python script which loads the .h5 file with the trained encoder model and builds a tensorflow graph to compute iteratively the "best activation signal".
It seems like my tensorflow implementation is not right. Despite the fact that I run tf.initialize_all_variables(), a FailedPreconditionError: Attempting to use uninitialized value X error is raised.
I am a little new in the use of tensorflow without using keras so this may be a trivial mistake but I could really use some help on this. Here is my code. Thanks a lot.
import tensorflow as tf
import tensorflow.keras as keras
import numpy as np
import matplotlib.pyplot as plt
input_sequence_size = 20
input_dim = 4
encoding_dim = 10
model_save = 'siple_autoencoder_encoder.h5'
model = keras.models.load_model(model_save)
lambda_param = 0.1
n_steps = 100
X = tf.Variable(tf.random_uniform([1, input_sequence_size * input_dim], -1.0, 1.0), name = 'X')
prediction = model.predict(X, steps = 1)
y = tf.gather_nd(prediction, [[0]], batch_dims=0, name=None)
gradient = tf.gradients(y, [X])[0]
step = tf.assign(X, X + lambda_param * gradient)
init = tf.initialize_all_variables()
with tf.Session() as sess:
sess.run(init)
# output = y.eval()
for i in range(n_steps):
sess.run(step)
activation_signal_1 = X.eval()

How to generate CNN heatmaps using built-in Keras in TF2.0 (tf.keras)

I used to generate heatmaps for my Convolutional Neural Networks, based on the stand-alone Keras library on top of TensorFlow 1. That worked fine, however, after my switch to TF2.0 and built-in tf.keras implementation (with eager execution) I cannot use my old heatmap generation code any longer.
So I re-wrote parts of my code for TF2.0 and ended up with the following:
from tensorflow.keras.applications.vgg16 import preprocess_input
from tensorflow.keras.preprocessing.image import load_img
from tensorflow.keras.models import load_model
from tensorflow.keras import preprocessing
from tensorflow.keras import backend as K
from tensorflow.keras import models
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
image_size = 150
image_path = "/tmp/images/test-image.jpg"
model_path = "/tmp/models/prototype/basic_vgg16.h5"
# Load pre-trained Keras model and the image to classify
model = load_model(model_path) # VGG16 CNN with custom classifier head
image = load_img(image_path, target_size=(image_size, image_size))
img_tensor = preprocessing.image.img_to_array(image)
img_tensor = np.expand_dims(img_tensor, axis=0)
img_tensor = preprocess_input(img_tensor)
input_layer = model.get_layer("model_input")
conv_layer = model.get_layer("block5_conv3")
heatmap_model = models.Model([model.inputs], [conv_layer.output, model.output])
# Get gradient of the winner class w.r.t. the output of the (last) conv. layer
with tf.GradientTape() as gtape:
conv_output, predictions = heatmap_model(img_tensor)
loss = predictions[:, np.argmax(predictions[0])]
grads = gtape.gradient(loss, conv_output)
pooled_grads = K.mean(grads, axis=(0, 1, 2))
# Get values of pooled grads and model conv. layer output as Numpy arrays
iterate = K.function([model.inputs], [pooled_grads, conv_layer.output[0]])
pooled_grads_value, conv_layer_output_value = iterate([img_tensor])
# Multiply each channel in the feature-map array by "how important it is"
for i in range(pooled_grads_value.shape[0]):
conv_layer_output_value[:, :, i] *= pooled_grads_value[i]
# Channel-wise mean of resulting feature-map is the heatmap of class activation
heatmap = np.mean(conv_layer_output_value, axis=-1)
heatmap = np.maximum(heatmap, 0)
max_heat = np.max(heatmap)
if max_heat == 0:
max_heat = 1e-10
heatmap /= max_heat
# Render heatmap via pyplot
plt.matshow(heatmap)
plt.show()
But now the following line:
iterate = K.function([model.inputs], [pooled_grads, conv_layer.output[0]])
leads to this error message:
AttributeError: Tensor.op is meaningless when eager execution is enabled.
I always used Keras and did not work with TF directly, so I am bit lost here.
Any ideas what could be the problem here?
PS: If you want to c&p this code, you can create the VGG16-based model like so:
# Create Keras model from pre-trained VGG16 and custom classifier
input_layer = layers.Input(shape=(image_size, image_size, 3), name="model_input")
vgg16_model = VGG16(weights="imagenet", include_top=False, input_tensor=input_layer)
model_head = vgg16_model.output
model_head = layers.Flatten(name="model_head_flatten")(model_head)
model_head = layers.Dense(256, activation="relu")(model_head)
model_head = layers.Dense(3, activation="softmax")(model_head)
model = models.Model(inputs=input_layer, outputs=model_head)
model.compile(loss="categorical_crossentropy", optimizer=optimizers.Adam(), metrics=["accuracy"])
At the end of the GradientTape loop, conv_output and grads already holds the value. The iterate function is no longer need to compute the values.
Working example below:
from tensorflow.keras.applications.vgg16 import preprocess_input
from tensorflow.keras.preprocessing.image import load_img
from tensorflow.keras.models import load_model
from tensorflow.keras import preprocessing
from tensorflow.keras import backend as K
from tensorflow.keras import models
import tensorflow as tf
import numpy as np
image_size = 224
# Load pre-trained Keras model and the image to classify
model = tf.keras.applications.vgg16.VGG16()
image = np.random.random((image_size, image_size, 3))
img_tensor = preprocessing.image.img_to_array(image)
img_tensor = np.expand_dims(img_tensor, axis=0)
img_tensor = preprocess_input(img_tensor)
conv_layer = model.get_layer("block5_conv3")
heatmap_model = models.Model([model.inputs], [conv_layer.output, model.output])
# Get gradient of the winner class w.r.t. the output of the (last) conv. layer
with tf.GradientTape() as gtape:
conv_output, predictions = heatmap_model(img_tensor)
loss = predictions[:, np.argmax(predictions[0])]
grads = gtape.gradient(loss, conv_output)
pooled_grads = K.mean(grads, axis=(0, 1, 2))
heatmap = tf.reduce_mean(tf.multiply(pooled_grads, conv_output), axis=-1)
heatmap = np.maximum(heatmap, 0)
max_heat = np.max(heatmap)
if max_heat == 0:
max_heat = 1e-10
heatmap /= max_heat
print(heatmap.shape)
one more issue to view the heatmap
import matplotlib.pyplot as plt
hm=np.squeeze(heatmap)
hm.shape
(14, 14)
plt.imshow(hm)

Batch Normalization results inconsistent across Keras and Tensorflow for toy example: why?

I'm trying to get BatchNormalization working properly in Keras (2.2.4), and haven't had luck. Its behavior seems inconsistent across model.fit() and model.predict()/evaluate().
My original problem was in the context of a complex GAN setup with various layers that were switching between frozen and unfrozen. In an attempt to grok why things were failing, I created this toy example, in which there is only a single BatchNormalization layer trying to learn the identity function, and there is no more freezing/unfreezing nonsense:
import numpy as np
import keras
from keras.layers import Input, BatchNormalization
if __name__ == '__main__':
x = np.random.uniform(low=-1.0, high=1.0, size=(20,1,))
y = x
ip = tx =Input(shape=[1,])
tx = BatchNormalization()(tx)
mod = keras.Model(inputs=ip, outputs=tx)
mod.compile(optimizer=keras.optimizers.SGD(lr=0.01), loss="mse")
mod.fit(x, y, epochs=2000)
print("mod evaluate", mod.evaluate(x, y))
I also created a pure tensorflow implementation that tries to do the exact same thing:
import tensorflow as tf
import numpy as np
if __name__ == '__main__':
x = np.random.uniform(low=-1.0, high=1.0, size=(20,1))
y = x
input = tx = tf.Variable(initial_value=x)
is_training = tf.Variable(initial_value=False)
tx = tf.layers.batch_normalization(tx, training=is_training)
output = tx
train_output = tf.placeholder(tf.float32, shape=(None, 1))
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
loss = tf.losses.mean_squared_error(train_output, output)
train_op = tf.train.MomentumOptimizer(learning_rate=0.01, momentum=0.9).minimize(loss)
train_op = tf.group([train_op, update_ops])
sess = tf.Session()
sess.run(tf.global_variables_initializer())
is_training.load(False, sess)
py = sess.run(output)
print("py", py)
print("mse", np.mean(np.square(py - x)))
print("training...")
for i in range(2000):
is_training.load(True, sess)
sess.run(train_op, {train_output: y})
is_training.load(False, sess)
py = sess.run(output)
print("step", i, "mse", np.mean(np.square(py - x)))
is_training.load(False, sess)
py = sess.run(output)
print("mse", np.mean(np.square(py - x)))
I expect the Keras and Tensorflow results to be similar. But the Keras code prints out a MSE error approx 0.0002 at the end, while showing a loss of about 1e-12 during training. The Tensorflow code gives a way lower error, at approx 1e-16 for both training and test. The Tensorflow result is the expected one, since it should be trivial to learn the identity function, and once the moving mean and variance of BN converge, the results should be identical across train/test.
Why is this happening? Why is the behavior inconsistent across Tensorflow and Keras, and how can I get a lower error using pure Keras? Any insight into the situation would be appreciated.

Categories