I have an LSTM network that outputs a decision via tf.tanh. I then use tf.sign to get a binary action of 1, -1, or 0. However, I do not want the model to take a non-zero action when the output of tf.tanh is small, so I want to apply a threshold T before tf.sign so that only outputs larger than T or smaller than -T are transformed to +1 and -1.
Example tensor output from tf.tanh:
logits = [0.6,0.4,-0.6,-0.4]
threshold = 0.5
Desired tensor output while preserving the gradient:
action = [1,0,-1,0]
It's sort of like a double-sided ReLU with a threshold, but I don't know how to do it inside a TensorFlow graph. Any help is appreciated.
Here is something that works for me. The code is pretty self-explanatory if you look at the variable names. Let me know if something is unclear.
import tensorflow as tf
def suppress_range(x, a, b):
    """Zeroes out all the elements of x in the range (a, b]."""
    assert a < b, "a must be less than b"
    significant = tf.logical_or(x <= a, x > b)
    zero = tf.zeros_like(x)
    return tf.where(significant, x, zero)

def main():
    logits = tf.placeholder(tf.float32)
    output = tf.sign(suppress_range(logits, -0.5, 0.5))

    with tf.Session() as sess:
        x = [[0.6, 0.4, -0.6, -0.4], [0.5, 0.1, -0.7, -0.2]]
        print(sess.run(output, feed_dict={logits: x}))

if __name__ == '__main__':
    main()
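A caveat on the question's "preserving the gradient" requirement: tf.sign has a zero gradient almost everywhere, so nothing will flow back through the action as written. One common workaround, sketched below on top of suppress_range (an addition, not part of the original answer), is a straight-through estimator that emits the hard value in the forward pass but behaves like the identity in the backward pass:

def straight_through_sign(x, threshold=0.5):
    # Forward: thresholded sign in {-1, 0, +1}. Backward: gradient of x,
    # because tf.stop_gradient removes (hard - x) from the gradient path.
    hard = tf.sign(suppress_range(x, -threshold, threshold))
    return x + tf.stop_gradient(hard - x)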
While building code to train a TensorFlow deep model, I am using tf.map_fn and tf.py_function as a wrapper to apply a SciPy function as a loss, mapping each pair of corresponding rows from two batches of probability vectors p and q of shape [batch_size, num_classes]. When using the KL divergence over this batch of vectors (p, q), training works fine with this computation and there is no shape incompatibility issue:
tf.reduce_sum(p*(tf.log(p + 1e-16) - tf.log(q + 1e-16)), axis=1) #KL divergence
However, when I tried to use the Wasserstein distance or the energy distance functions from SciPy, I get an error about incompatible shapes [] and [5000], where 5000 is the number of classes (p and q have shape [batch_size, 5000]):
import tensorflow as tf
def compute_kld(p_logit, q_logit, divergence_type):
    p = tf.nn.softmax(p_logit)
    q = tf.nn.softmax(q_logit)
    if divergence_type == "KL_divergence":
        return tf.reduce_sum(p * (tf.log(p + 1e-16) - tf.log(q + 1e-16)), axis=1)
    elif divergence_type == "Wasserstein_distance":
        def wasserstein_distance(x, y):
            import scipy
            from scipy import stats
            return stats.wasserstein_distance(x, y)
        #tf.function
        def func(p, q):
            return tf.map_fn(lambda x: tf.py_function(func=wasserstein_distance, inp=[x[0], x[1]], Tout=tf.float32), (p, q), dtype=(tf.float32))  #, parallel_iterations=10)
        return func(p, q)
    elif divergence_type == "energy_distance":  # the Cramer distance
        def energy_distance(x, y):
            import scipy
            from scipy import stats
            return stats.energy_distance(x, y)
        #tf.function
        def func(p, q):
            return tf.map_fn(lambda x: tf.py_function(func=energy_distance, inp=[x[0], x[1]], Tout=tf.float32), (p, q), dtype=(tf.float32))  #, parallel_iterations=10)
        return func(p, q)
This is the code to test the loss functions with a batch of 5 samples and 3 classes; all three work fine individually:
import tensorflow as tf
p = tf.constant([[1, 2, 3], [1, 2, 3], [14, 50, 61], [71, 83, 79], [110, 171, 12]])
q = tf.constant([[1, 2, 3], [1.2, 2.3, 3.2], [4.2, 5.3, 6.4], [7.5, 8.6, 9.4], [11.2, 10.1, 13]])

p = tf.reshape(p, [-1, 3])
q = tf.reshape(q, [-1, 3])

p = tf.cast(p, tf.float32)
q = tf.cast(q, tf.float32)

with tf.Session() as sess:
    divergence_type = "KL_divergence"
    res = compute_kld(p, q, divergence_type=divergence_type)

    divergence_type = "Wasserstein_distance"
    res2 = compute_kld(p, q, divergence_type=divergence_type)

    divergence_type = "energy_distance"
    res3 = compute_kld(p, q, divergence_type=divergence_type)

    print("############################## p")
    print(sess.run(tf.print(p)))
    print("##")
    print(sess.run(tf.print(tf.shape(p))))

    print("############################## KL_divergence")
    print(sess.run(tf.print(res)))
    print("##")
    print(sess.run(tf.print(tf.shape(res))))

    print("############################## Wasserstein_distance")
    print(sess.run(tf.print(res2)))
    print("##")
    print(sess.run(tf.print(tf.shape(res2))))

    print("############################## energy_distance")
    print(sess.run(tf.print(res3)))
    print("##")
    print(sess.run(tf.print(tf.shape(res3))))
This is the output:
############################## p
[[1 2 3]
[1 2 3]
[14 50 61]
[71 83 79]
[110 171 12]]
None
##
[5 3]
None
############################## KL_divergence
[0 0.000939823687 0.367009342 1.1647588 3.09911442]
None
##
[5]
None
############################## Wasserstein_distance
[0 0.0126344115 0.204870835 0.237718046 0.120362818]
None
##
[5]
None
############################## energy_distance
[0 0.0917765796 0.41313991 0.438246906 0.316672504]
None
##
[5]
None
However, when using the Wasserstein distance or the energy distance inside my training code, I get an incompatible shape error:
tensorflow.python.framework.errors_impl.InvalidArgumentError: Tried to set a tensor with incompatible shape at a list index. Item element shape: [] list shape: [5000]
[[{{node gradients/TensorArrayV2Read/TensorListGetItem_grad/TensorListSetItem}}]]
I am wondering whether the dtype I pass to tf.map_fn or tf.py_function is wrong, or whether I have to specify/impose the shape somewhere?
Here is a link for the whole code where I tried to replace KL-divergence with Wasserstein distance in method "compute_kld": https://github.com/shenyuanyuan/IMSAT/blob/master/imsat_cluster.py
Thank you in advance for your kind help!
== UPDATE ==
I inspected all the provided batches and the shapes of p and q seem correct
shape(p)
(?, 5000)
shape(q)
(?, 5000)
However, the type of func's returned object does not look right. Thus, I have tried to reshape it with:
return tf.reshape(func(p, q), [p.shape[0]])
However, this doesn't seem to change anything as the error is still the same. After providing the first batch, the code crashes before starting to process the second batch.
Without seeing your training code, the best I can do is pull up the docs and try to shed some light.
map_fn Transforms elems by applying fn to each element unstacked on axis 0.
If elems is a tuple (or nested structure) of tensors, then those tensors must all have the same outer-dimension size (num_elems); and fn is used to transform each tuple (or structure) of corresponding slices from elems. E.g., if elems is a tuple (t1, t2, t3), then fn is used to transform each tuple of slices (t1[i], t2[i], t3[i]) (where 0 <= i < num_elems).
energy_distance Computes the energy distance between two 1D distributions.
wasserstein_distance Computes the first Wasserstein distance between two 1D distributions.
To begin, you should make sure you are passing only 2D p_logit and q_logit to compute_kld; a sketch follows below.
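For instance, a minimal sketch (the input tensors here are illustrative placeholders, not the asker's real logits): flattening any extra leading dimensions guarantees that tf.map_fn slices off 1-D rows, which is what scipy.stats.wasserstein_distance expects.

import tensorflow as tf

def ensure_2d(t, num_classes):
    # Collapse any leading dimensions into the batch axis so tf.map_fn
    # over (p, q) yields 1-D slices of length num_classes.
    return tf.reshape(t, [-1, num_classes])

# Illustrative stand-ins for the real logits:
p_logit = ensure_2d(tf.random_uniform([5, 5000]), num_classes=5000)
q_logit = ensure_2d(tf.random_uniform([5, 5000]), num_classes=5000)
loss = compute_kld(p_logit, q_logit, divergence_type="Wasserstein_distance")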
I am using TensorFlow to develop a VAE, so the cost I am using for the model is the ELBO (Evidence Lower Bound). In order to apply the error to the gradients, reduce_mean() has to be used in the last step so that the cost function returns a scalar.
def vae_cost(x_true, model, analytic_kl=False, kl_weight=4):
    x_true = tf.cast(x_true, tf.float32)
    z_sample, mu, sd = model.encode(x_true)
    x_recons_logits = model.decoder(z_sample)
    # compute mean squared error
    recons_error = tf.cast(
        tf.reduce_mean((x_true - x_recons_logits) ** 2, axis=[1, 2, 3]),
        tf.float32)
    # compute reverse KL divergence, either analytically
    # or through MC approximation with one sample
    if analytic_kl:
        kl_divergence = -0.5 * tf.math.reduce_sum(
            1 + tf.math.log(tf.math.square(sd)) - tf.math.square(mu) - tf.math.square(sd),
            axis=1)  # shape=(batch_size,)
    else:
        log_pz = normal_log_pdf(z_sample, 0., 1.)  # shape=(batch_size,)
        logqz_x = normal_log_pdf(z_sample, mu, tf.math.square(sd))
        kl_divergence = logqz_x - log_pz
    elbo = tf.reduce_mean(-kl_weight * kl_divergence - recons_error)
    return -elbo
(Note: this is code I took from here and barely modified)
The model trains perfectly; there is no issue in that sense. What I am having problems with is printing the error. I have little knowledge of how TensorFlow works internally, but I know you cannot use Python's built-in print() function, since that prints the computation graph node, if I am not mistaken. Therefore, tf.print() seemed to be the solution, but instead of a single value this is what shows up in the console:
2.72147369
2.37455082
3.83512926
2.00962853
2.3469491
3.15436459
2.25914431
2.40686131
2.98925924
2.75991917
1.94956458
3.1419673
2.06783676
2.53439474
2.18458319
2.31454301
1.79345393
1.81354737
2.27693963
1.60603094
2.71092319
1.90332329
2.64296
1.94370067
2.07476187
2.32125258
And then, if I use python's print():
<tf.Tensor 'Neg:0' shape=() dtype=float32>
If the vector has shape=(), then how is it possible to get so many values with tf.print()? Am I actually confusing how this function works? In that case, how do I actually print the error? I would appreciate it if you could explain what "Neg:0" means as well. Thank you in advance.
A shape of () means the tensor is a scalar, so tf.print() is not printing many elements of one tensor: it prints the scalar loss once each time the op executes, i.e. once per training step. The long column of numbers in your console is simply one loss value per step. Python's built-in print() on a symbolic tensor shows the graph node rather than a value; 'Neg:0' is the name of that node, the negation produced by return -elbo (output index 0). If you also want to inspect the KL and reconstruction terms, return them alongside the scalar cost.
Here is the revised code:
def vae_cost(x_true, model, analytic_kl=False, kl_weight=4):
    x_true = tf.cast(x_true, tf.float32)
    z_sample, mu, sd = model.encode(x_true)
    x_recons_logits = model.decoder(z_sample)
    # compute mean squared error
    recons_error = tf.cast(
        tf.reduce_mean((x_true - x_recons_logits) ** 2, axis=[1, 2, 3]),
        tf.float32)
    # compute reverse KL divergence, either analytically
    # or through MC approximation with one sample
    if analytic_kl:
        kl_divergence = -0.5 * tf.math.reduce_sum(
            1 + tf.math.log(tf.math.square(sd)) - tf.math.square(mu) - tf.math.square(sd),
            axis=1)  # shape=(batch_size,)
    else:
        log_pz = normal_log_pdf(z_sample, 0., 1.)  # shape=(batch_size,)
        logqz_x = normal_log_pdf(z_sample, mu, tf.math.square(sd))
        kl_divergence = logqz_x - log_pz
    elbo = tf.reduce_mean(-kl_weight * kl_divergence - recons_error)
    return -elbo, kl_divergence, recons_error
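As a hedged usage sketch (x_batch and the surrounding training loop are illustrative, not from the original post), you could then log all three terms once per training step:

cost, kl, recons = vae_cost(x_batch, model)
# One line per training step: the scalar cost plus batch-averaged components.
tf.print("loss:", cost,
         "mean KL:", tf.reduce_mean(kl),
         "mean recons:", tf.reduce_mean(recons))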
I'm trying to create a 2-layer neural network. To do that, I first initialize the weights and biases to random floats between 0 and 1 using numpy.random.rand. However, for some reason this process seems to produce floats bigger than 1 for W1 (weight 1), whereas it works correctly for all other weights and biases. I can't understand why this happens; I thought maybe something outside the function was affecting it, but I couldn't detect any part of the function that could be affected from outside.
import numpy as np
### CONSTANTS DEFINING THE MODEL ####
n_x = 12288 # num_px * num_px * 3
n_h = 7
n_y = 1
layers_dims = (n_x, n_h, n_y)
def initialize_parameters_deep(layer_dims):
    """
    Arguments:
    layer_dims -- python array (list) containing the dimensions of each layer in our network

    Returns:
    parameters -- python dictionary containing your parameters "W1", "b1", "W2", "b2"
    """
    np.random.seed(1)
    parameters = {}
    parameters["W1"] = np.random.rand(n_h, n_x)  # (7, 12288)
    parameters["b1"] = np.random.rand(n_h, 1)    # (7, 1)
    parameters["W2"] = np.random.rand(n_y, n_h)  # (1, 7)
    parameters["b2"] = np.random.rand(n_y, 1)    # (1, 1)
    return parameters
parameters = initialize_parameters_deep(layers_dims)
print(parameters)
Output:
{'W1': array([[4.17022005e-01, 7.20324493e-01, 1.14374817e-04, ...,
3.37562919e-01, 1.12292153e-01, 5.37047221e-01],
[7.07934286e-01, 3.37726007e-01, 7.07954162e-01, ...,
4.22040811e-01, 7.78593215e-01, 3.49866021e-01],
[9.01338451e-01, 7.95132845e-03, 1.03777034e-01, ...,
2.78602449e-01, 5.05813021e-02, 8.26828833e-01],
...,
[5.62717083e-03, 6.58208224e-01, 3.88407263e-01, ...,
5.56312618e-01, 8.69650932e-01, 1.00112287e-01],
[4.16278934e-01, 4.56060621e-01, 9.33378848e-01, ...,
9.52798385e-01, 9.41894584e-01, 4.44342962e-01],
[8.89254832e-01, 6.42558949e-01, 2.29427262e-01, ...,
8.05884494e-01, 1.80676088e-01, 6.12694420e-01]]), 'b1': array([[0.11933315],
[0.50073416],
[0.21336813],
[0.14223935],
[0.60809243],
[0.41994954],
[0.43137737]]), 'W2': array([[0.81360697, 0.44638382, 0.41794085, 0.08649817, 0.29957473,
0.33706742, 0.24721952]]), 'b2': array([[0.92363097]])}
It's not generating floats bigger than 1, it's just representing them differently.
4.17022005e-01 is the same as 0.417022005, and 1.14374817e-04 is the same as 0.000114374817.
See here or here.
The e-01, e-02, e-03, etc. at the end of the W1 numbers just mean that the numbers are written in exponential (scientific) notation. So if you have, for example, 2.786e-01, that is the same as (2.786 / 10), which is 0.2786. The same goes for 2.786e-03 == (2.786 / 1000) == 0.002786. In general, e+2 means 10^2 and e-2 means 1/(10^2).
Pay attention to the final few characters printed when you print your weights parameter tensor, which gives e.g. e-01. This represents base-10 exponentiation, i.e. meaning that the value of a given weight is the number printed times 10 to the given power.
All of the powers are negative, meaning the weights have small nonnegative values in the range [0, 1), exactly as numpy.random.rand documents.
For example, 4.17022005e-01 equals 0.417022005.
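A quick sanity check in plain Python/NumPy (a minimal illustration added here, not from the answers above):

import numpy as np

x = 4.17022005e-01
print(x == 0.417022005)                  # True: same number, different notation
print("{:.9f}".format(x))                # 0.417022005
print(np.random.rand(1000).max() < 1.0)  # True: rand() samples from [0, 1)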
I need to calculate the covariance matrix for RGB values across an image dataset, and then apply Cholesky decomposition to the final result.
The covariance matrix for RGB values is a 3x3 matrix M, where M_(i, i) is the variance of channel i and M_(i, j) is the covariance between channels i and j.
The end result should be something like this:
([[0.26, 0.09, 0.02],
[0.27, 0.00, -0.05],
[0.27, -0.09, 0.03]])
I'd prefer to stick to PyTorch functions, even though NumPy has a cov function.
I attempted to recreate the numpy Cov function in PyTorch here based on other cov implementations and clones:
def pytorch_cov(tensor, tensor2=None, rowvar=True):
    if tensor2 is not None:
        tensor = torch.cat((tensor, tensor2), dim=0)
    tensor = tensor.view(1, -1) if tensor.dim() < 2 else tensor
    tensor = tensor.t() if not rowvar and tensor.size(0) != 1 else tensor
    tensor = tensor - torch.mean(tensor, dim=1, keepdim=True)
    return 1 / (tensor.size(1) - 1) * tensor.mm(tensor.t())

def cov_vec(x):
    c = x.size(0)
    m1 = x - torch.sum(x, dim=[1], keepdims=True) / c
    out = torch.einsum('ijk,ilk->ijl', m1, m1) / (c - 1)
    return out
The dataset loading would be like this:
dataset = torchvision.datasets.ImageFolder(data_path)
loader = torch.utils.data.DataLoader(dataset)

for images, _ in loader:
    batch_size = images.size(0)
    ...
For the moment I'm just experimenting with images created with torch.randn(batch_size, 3, height, width).
Edit:
I'm attempting to replicate the matrix from TensorFlow's Lucid here, which is somewhat explained on distill.pub here.
Second Edit:
In order to make the output resemble the example one, you have to do this instead of using Cholesky:
rgb_cov_tensor = rgb_cov_tensor / len(loader.dataset)
U,S,V = torch.svd(rgb_cov_tensor)
epsilon = 1e-10
svd_sqrt = U @ torch.diag(torch.sqrt(S + epsilon))
The resulting matrix can then be used to perform color decorrelation, which is useful for visualizing features (DeepDream). I've implemented it in my project here.
Here is a function for computing the (unbiased) sample covariance matrix of a 3-channel image, named rgb_cov. Cholesky decomposition is then straightforward with torch.cholesky:
import torch
def rgb_cov(im):
    '''
    Assuming im is a torch.Tensor of shape (H, W, 3):
    '''
    im_re = im.reshape(-1, 3)
    im_re -= im_re.mean(0, keepdim=True)
    return 1 / (im_re.shape[0] - 1) * im_re.T @ im_re

# Test:
im = torch.randn(50, 50, 3)
cov = rgb_cov(im)
L_cholesky = torch.cholesky(cov)
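To aggregate over a whole dataset, as the question's second edit does, one rough sketch (assuming the loader yields float image batches of shape (B, 3, H, W); loader and rgb_cov are the names used above) is to average per-image covariances and then take the SVD square root:

import torch

rgb_cov_sum = torch.zeros(3, 3)
n_images = 0
for images, _ in loader:
    for im in images:                 # im: (3, H, W)
        im_hwc = im.permute(1, 2, 0)  # rgb_cov expects (H, W, 3)
        rgb_cov_sum += rgb_cov(im_hwc)
        n_images += 1

rgb_cov_tensor = rgb_cov_sum / n_images
U, S, V = torch.svd(rgb_cov_tensor)
svd_sqrt = U @ torch.diag(torch.sqrt(S + 1e-10))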
I am wondering whether TF has the capacity to temporarily store data during the training phase. Below is an example:
import tensorflow as tf
import numpy as np
def loss_function(values, a, b):
    N = values.shape[0]
    i = tf.constant(0)
    values_array = tf.get_variable(
        "values", values.shape, initializer=tf.constant_initializer(values),
        dtype=tf.float32)  # The temporary data solution in this example
    result = tf.constant(0, dtype=tf.float32)

    def body1(i):
        op2 = tf.assign(values_array[i, 0],
                        234.0)  # Here is where it should be updated. The value being assigned is actually calculated from variables a and b.
        with tf.control_dependencies([op2]):
            return i + 1

    def condition1(i):
        return tf.less(i, N)

    i = tf.while_loop(condition1, body1, [i])
    op1 = tf.assign(values_array[0, 0],
                    9999.0)  # Here is where it should be updated
    result = result + tf.reduce_mean(values_array)  # The final cost is calculated based on the entire values_array
    with tf.control_dependencies([op1]):
        return result

# The parameters we want to calculate in the end
a = tf.Variable(tf.random_uniform([1], 0, 700), name='a')
b = tf.Variable(tf.random_uniform([1], -700, 700), name='b')
values = np.ones([2, 4], dtype=np.float32)

# cost function
cost_function = loss_function(values, a, b)

# training algorithm
optimizer = tf.train.MomentumOptimizer(
    0.1, momentum=0.9).minimize(cost_function)

# initializing the variables
init = tf.global_variables_initializer()

# starting the session
sess = tf.Session()
sess.run(init)

_, training_cost = sess.run([optimizer, cost_function])
print(tf.get_collection(
    tf.GraphKeys.GLOBAL_VARIABLES, scope="values")[0].eval(session=sess))
Currently, what I get from the console is:
[[ 0.98750001 0.98750001 0.98750001 0.98750001]
[ 0.98750001 0.98750001 0.98750001 0.98750001]]
What I expected to get from this example is (if the temporary data can be printed out):
[[ 9999.0 1.0 1.0 1.0]
[ 234.0 1.0 1.0 1.0]]
Overall, what I want is for the cost function to calculate a temporary 2D array based on the input numpy 2D array and the parameters a and b, and then compute the final cost from that temporary 2D array. But I think using a TF variable as temporary storage is probably not correct...
Any help?
Thanks!
Your while loop never runs because i is never used again; use tf.control_dependencies to make it run.
Also, you are adding the mean of values_array, when you seem to just want to add the array as-is. Get rid of reduce_mean to get your desired output.
op1 = tf.assign(values_array[0, 0], 9999.0) wasn't being executed because no op was created inside the following control_dependencies context. Move the dependent ops into the context to ensure the assignment is included in the graph.
def loss_function(values, a, b):
    N = values.shape[0]
    i = tf.constant(0)
    values_array = tf.get_variable(
        "values", values.shape, initializer=tf.constant_initializer(values),
        dtype=tf.float32, trainable=False)
    temp_values_array = tf.get_variable(
        "temp_values", values.shape, dtype=tf.float32)
    # copy previous values for calculations & gradients
    temp_values_array = tf.assign(temp_values_array, values_array)
    result = tf.constant(0, dtype=tf.float32)

    def body1(i):
        op2 = tf.assign(temp_values_array[i, 0],
                        234.0)  # Here is where it should be updated. The value being assigned is actually calculated from variables a and b.
        with tf.control_dependencies([op2]):
            return [i + 1]

    def condition1(i):
        return tf.less(i, N)

    i = tf.while_loop(condition1, body1, [i])
    with tf.control_dependencies([i]):
        op1 = tf.assign(temp_values_array[0, 0],
                        9999.0)  # Here is where it should be updated
        with tf.control_dependencies([op1]):
            result = result + temp_values_array  # The final cost is calculated based on the entire values_array
            # save the calculations for later
            op3 = tf.assign(values_array, temp_values_array)
            with tf.control_dependencies([op3]):
                return tf.identity(result)
Also, you are fetching optimizer in the same run call, so the non-assigned elements of your output are going to be smaller than you expect. Your results would be closer if you did:
training_cost = sess.run([cost_function])
_ = sess.run([optimizer])
This will ensure that you don't optimize before getting the results of cost_function.
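Putting the pieces together, a short sketch of the adjusted run (variable names as in the question's script):

# Evaluate the cost first so the printed array reflects this step's
# assignments, then take the optimizer step separately.
training_cost = sess.run(cost_function)
sess.run(optimizer)
print(sess.run(tf.get_collection(
    tf.GraphKeys.GLOBAL_VARIABLES, scope="values")[0]))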