`tf.svd` fails during GradientTape

I'm trying to contract a network of multiple tensors and use singular value decomposition (SVD) during the contraction to simplify the process. While this works perfectly when I'm not taking any gradient, it fails once a gradient tape starts to watch the tensors (I'm not sure why this is related). Below are my simple contraction function and the function with which I take the SVD:
import tensorflow as tf

# @tf.function
def contraction_step(network, max_singular_values: int):
    bottom = network[-1]
    upper = network[-2]

    def contract_up_down(up, dn):
        shu = tf.shape(up)
        shd = tf.shape(dn)
        c = tf.einsum("ijkxlm,nkpyqr->injpxylqmr", up, dn)
        return tf.reshape(c, (
            shu[0]*shd[0], shu[1], shd[2], shu[-3], shd[-3], shu[-2]*shd[-2], shu[-1]*shd[-1]
        ))

    new = []
    multiplier = tf.eye(tf.shape(bottom[-1])[-1]*tf.shape(upper[-1])[-1], dtype=upper[-1].dtype)
    for ix in reversed(range(len(bottom))):
        tensor = contract_up_down(upper[ix], bottom[ix])
        t = tf.einsum("ludpxor,rj->ludpxoj", tensor, multiplier)
        u, s, vh = svd(t, 1, max_singular_values=max_singular_values)
        multiplier = tf.tensordot(u, s/tf.norm(s), axes=(-1, 0))
        new.insert(0, vh)
    new[-1] = tf.tensordot(new[-1], multiplier, axes=(-1, 0))
    return network[:-2] + [new]
def svd(tensor, pivot, max_singular_values=None, cutoff=0.0):
    left_dims = tf.shape(tensor)[:pivot]
    right_dims = tf.shape(tensor)[pivot:]
    tensor = tf.reshape(tensor, (tf.reduce_prod(left_dims), tf.reduce_prod(right_dims)))
    s, u, v = tf.linalg.svd(tensor)
    s_shape = tf.math.count_nonzero(
        tf.cast(s >= cutoff, dtype=tf.int32), dtype=tf.int32
    )
    if max_singular_values is None:
        max_singular_values = s_shape
    else:
        max_singular_values = tf.cast(tf.constant(max_singular_values), dtype=tf.int32)
    num_sing_vals_keep = tf.maximum(
        tf.minimum(max_singular_values, s_shape), tf.constant(1, dtype=tf.int32)
    )
    s = tf.slice(s, [0], [num_sing_vals_keep])
    u = tf.slice(u, [0, 0], [tf.shape(u)[0], num_sing_vals_keep])
    v = tf.slice(v, [0, 0], [tf.shape(v)[0], num_sing_vals_keep])
    vh = tf.linalg.adjoint(v)
    dim_s = tf.shape(s)[0]  # must use tf.shape (not s.shape) to compile
    u = tf.reshape(u, tf.concat([left_dims, [dim_s]], axis=-1))
    vh = tf.reshape(vh, tf.concat([[dim_s], right_dims], axis=-1))
    return u, tf.linalg.diag(s), vh
These functions work perfectly when used standalone:
upper = [tf.random.uniform((5,3,3,2,1,5), dtype=tf.float64) for _ in range(5)]
lower = [tf.random.uniform((5,3,3,2,1,5), dtype=tf.float64) for _ in range(5)]
contracted = contraction_step([upper, lower], 2)[0]
print(f"shapes: {', '.join([str(x.shape) for x in contracted])}")
# shapes: (2, 3, 3, 2, 2, 1, 2), (2, 3, 3, 2, 2, 1, 2), (2, 3, 3, 2, 2, 1, 2), (2, 3, 3, 2, 2, 1, 2), (2, 3, 3, 2, 2, 1, 2)
However, with the gradient, I get the following error:
with tf.GradientTape() as tape:
    tape.watch(upper + lower)
    contracted = contraction_step([upper, lower], 2)[0]
NotImplementedError: SVD gradient has not been implemented for input with unknown inner matrix shape.
It seems that, for some reason, TensorFlow loses the static shape information of the tensors in gradient mode. Note that I get the same error when I make the tensors tf.Variables instead of watching them manually. Any help would be highly appreciated!
Thanks
System information
OS Platform and Distribution: macOS v12.0.1
TensorFlow version: v2.6.0-rc2-32-g919f693420e 2.6.0
Python version: 3.8.9
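To illustrate what I think is happening (a minimal sketch of my own, separate from the code above): in graph mode, slicing with a tensor-valued size erases the static shape that the SVD gradient needs.
@tf.function
def dynamic_slice(x, k):
    y = tf.slice(x, [0, 0], [tf.shape(x)[0], k])
    print("static shape while tracing:", y.shape)  # the sliced dimension is unknown (None)
    return y

_ = dynamic_slice(tf.eye(4, dtype=tf.float64), tf.constant(2))
# The SVD gradient checks exactly this static inner matrix shape and raises
# NotImplementedError when it is not fully defined.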

I found a temporary workaround that does not cover every aspect of the previous svd function, but it works. TensorFlow needs the object shapes to be statically known after slicing or other manipulation (this might not hold for every case, but it does for mine). I therefore modified the svd function accordingly:
from functools import reduce

def svd(tensor, pivot, max_singular_values=None):
    multip = lambda x, y: x * y
    left_dims = tensor.get_shape()[:pivot]    # was: tf.shape(tensor)[:pivot]
    right_dims = tensor.get_shape()[pivot:]   # was: tf.shape(tensor)[pivot:]
    tensor = tf.reshape(tensor, (reduce(multip, left_dims), reduce(multip, right_dims)))
    s, u, v = tf.linalg.svd(tensor)
    if max_singular_values is None:
        max_singular_values = s.shape[0]
    num_sing_vals_keep = min(s.shape[0], max_singular_values)
    s = tf.slice(s, [0], [num_sing_vals_keep])
    s = tf.ensure_shape(s, tf.TensorShape([num_sing_vals_keep]))
    u = tf.slice(u, [0, 0], [tf.shape(u)[0], num_sing_vals_keep])
    v = tf.slice(v, [0, 0], [tf.shape(v)[0], num_sing_vals_keep])
    vh = tf.linalg.adjoint(v)
    dim_s = s.shape[0]  # was: tf.shape(s)[0]
    u = tf.reshape(u, left_dims + (dim_s,))
    vh = tf.reshape(vh, (dim_s,) + right_dims)
    u = tf.ensure_shape(u, tf.TensorShape(left_dims + (dim_s,)))
    vh = tf.ensure_shape(vh, tf.TensorShape((dim_s,) + right_dims))
    s = tf.ensure_shape(s, tf.TensorShape((dim_s,)))
    return u, tf.linalg.diag(s), vh
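As a quick sanity check (my own example, not part of the original workflow), gradients do flow through this version:
x = tf.random.uniform((6, 4), dtype=tf.float64)
with tf.GradientTape() as tape:
    tape.watch(x)
    u, s, vh = svd(x, 1, max_singular_values=3)
    loss = tf.reduce_sum(s)
print(tape.gradient(loss, x).shape)  # (6, 4)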
This seems to work with tf.GradientTape(), but I get the following warning when I run it inside tf.vectorized_map:
u,s,v = tf.vectorized_map(lambda vec: svd(vec, 1, 10), tf.random.uniform((10, 5, 5)))
WARNING:tensorflow:Using a while_loop for converting Svd
so if anyone has a better solution, it would be highly appreciated.

Related

fastest way to calculate edges (derivatives) of a big torch tensor

Given a tensor with shape (b,c,h,w), I want to extract edges of the spatial data, that is, compute the x- and y-direction derivatives over (h,w) and calculate the magnitude I = sqrt(|x_amplitude|^2 + |y_amplitude|^2).
My current implementation is as follows:
import numpy as np
import torch
from scipy import ndimage

row_mat = np.asarray([[0, 0, 0], [1, 0, -1], [0, 0, 0]])
col_mat = row_mat.T
row_mat = row_mat[None, None, :, :]  # expand dims to convolve with tensor (batch, channel, height, width)
col_mat = col_mat[None, None, :, :]  # expand dims to convolve with tensor (batch, channel, height, width)

def derivative(batch: torch.Tensor) -> torch.Tensor:
    """
    uses convolution to perform x and y derivatives
    :param batch: input tensor batch
    :return: image derivative magnitudes
    """
    x_amplitude = ndimage.convolve(batch, row_mat)
    y_amplitude = ndimage.convolve(batch, col_mat)
    magnitude = np.sqrt(np.abs(x_amplitude) ** 2 + np.abs(y_amplitude) ** 2)
    return torch.tensor(magnitude)
I was wondering if there's a faster way, as this approach actually convolves using the definition of a derivative, so there might be downsides to that.
PS: to test this you can use the tensor torch.randn(1000, 128, 28, 28), as these are the dimensions I'm dealing with.
For this specific operation you might be able to speed things up a bit by doing it "manually":
import torch
import torch.nn.functional as nnf

def derivative(batch: torch.Tensor) -> torch.Tensor:
    # pad batch by one pixel on each side of the spatial dims
    x = nnf.pad(batch, (1, 1, 1, 1), mode='reflect')
    dx2 = (x[..., 1:-1, :-2] - x[..., 1:-1, 2:])**2
    dy2 = (x[..., :-2, 1:-1] - x[..., 2:, 1:-1])**2
    mag = torch.sqrt(dx2 + dy2)
    return mag
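A quick usage sketch (mine, not part of the original answer), using the tensor size suggested in the question:
x = torch.randn(1000, 128, 28, 28)
out = derivative(x)   # the "manual" version defined above
print(out.shape)      # torch.Size([1000, 128, 28, 28])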

Expectation Maximization Algorithm (EM) for Gaussian Mixture Models (GMMs)

I'm trying to apply the Expectation Maximization Algorithm (EM) to a Gaussian Mixture Model (GMM) using Python and NumPy. The PDF document I am basing my implementation on can be found here.
Below are the equations:
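(These are the standard EM updates for a K-component Gaussian mixture; the code below uses K = 2.)

E-step (responsibilities):
$$ w_{ik} = \frac{\pi_k\,\mathcal{N}(x_i \mid \mu_k, \Sigma_k)}{\sum_{j=1}^{K} \pi_j\,\mathcal{N}(x_i \mid \mu_j, \Sigma_j)} $$

M-step:
$$ \pi_k = \frac{1}{n}\sum_{i=1}^{n} w_{ik}, \qquad \mu_k = \frac{\sum_{i=1}^{n} w_{ik}\,x_i}{\sum_{i=1}^{n} w_{ik}}, \qquad \Sigma_k = \frac{\sum_{i=1}^{n} w_{ik}\,(x_i-\mu_k)(x_i-\mu_k)^{\top}}{\sum_{i=1}^{n} w_{ik}} $$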
When applying the algorithm, I get the means of the first and second clusters both equal to:
array([[2.50832195],
[2.51546208]])
When the actual vector means for the first and second cluster are, respectively:
array([[0],
[0]])
and:
array([[5],
[5]])
The same thing happens with the covariance matrices; I get:
array([[7.05168736, 6.17098629],
[6.17098629, 7.23009494]])
When it should be:
array([[1, 0],
[0, 1]])
for both clusters.
Here is the code:
import numpy as np

np.random.seed(1)
# first cluster
X_11 = np.random.normal(0, 1, 1000)
X_21 = np.random.normal(0, 1, 1000)
# second cluster
X_12 = np.random.normal(5, 1, 1000)
X_22 = np.random.normal(5, 1, 1000)
X_1 = np.concatenate((X_11,X_12), axis=None)
X_2 = np.concatenate((X_21,X_22), axis=None)
# data matrix of k x n dimensions (2 x 2000 dimensions)
X = np.concatenate((np.array([X_1]),np.array([X_2])), axis=0)
# multivariate normal distribution function gives n x 1 vector (2000 x 1 vector)
def normal_distribution(x, mu, sigma):
    mvnd = []
    for i in range(np.shape(x)[1]):
        gd = (2*np.pi)**(-2/2) * np.linalg.det(sigma)**(-1/2) * np.exp((-1/2) * np.dot(np.dot((x[:,i:i+1]-mu).T, np.linalg.inv(sigma)), (x[:,i:i+1]-mu)))
        mvnd.append(gd)
    return np.reshape(np.array(mvnd), (np.shape(x)[1], 1))
# Initialized parameters
sigma_1 = np.array([[10, 0],
[0, 10]])
sigma_2 = np.array([[10, 0],
[0, 10]])
mu_1 = np.array([[10],
[10]])
mu_2 = np.array([[10],
[10]])
pi_1 = 0.5
pi_2 = 0.5
Sigma_1 = np.empty([2000, 2, 2])
Sigma_2 = np.empty([2000, 2, 2])
for i in range(10):
    # E-step:
    w_i1 = (pi_1*normal_distribution(X, mu_1, sigma_1))/(pi_1*normal_distribution(X, mu_1, sigma_1) + pi_2*normal_distribution(X, mu_2, sigma_2))
    w_i2 = (pi_2*normal_distribution(X, mu_2, sigma_2))/(pi_1*normal_distribution(X, mu_1, sigma_1) + pi_2*normal_distribution(X, mu_2, sigma_2))
    # M-step:
    pi_1 = np.sum(w_i1)/2000
    pi_2 = np.sum(w_i2)/2000
    mu_1 = np.array([(1/(np.sum(w_i1)))*np.sum(w_i1.T*X, axis=1)]).T
    mu_2 = np.array([(1/(np.sum(w_i2)))*np.sum(w_i2.T*X, axis=1)]).T
    for i in range(2000):
        Sigma_1[i:i+1, :, :] = w_i1[i:i+1,:]*np.dot((X[:,i:i+1]-mu_1), (X[:,i:i+1]-mu_1).T)
        Sigma_2[i:i+1, :, :] = w_i2[i:i+1,:]*np.dot((X[:,i:i+1]-mu_2), (X[:,i:i+1]-mu_2).T)
    sigma_1 = (1/(np.sum(w_i1)))*np.sum(Sigma_1, axis=0)
    sigma_2 = (1/(np.sum(w_i2)))*np.sum(Sigma_2, axis=0)
I would really appreciate it if someone could point out the mistake in my code or in my understanding of the algorithm.

'Lossy' cumsum in numpy

I have an array a of length N and need to implement the following operation:
$$ b_n = \sum_{i=0}^{n} p^{\,n-i}\, a_i, \qquad p \in [0, 1] $$
This is a lossy sum, where the first indices in the sum are weighted by a greater loss (p^{n-i}) than the last ones. The last index (i = n) is always weighted by 1. If p = 1, the operation is a simple cumsum.
b = np.cumsum(a)
If p != 1, I can implement this operation in a CPU-inefficient way:
b = np.empty(np.shape(a))
# I'm using the (-1,-1,-1) idiom for reversed ranges
p_vec = np.power(p, np.arange(N-1, 0-1, -1))
# p_vec[0] = p^{N-1}, p_vec[-1] = 1
for n in range(N):
    b[n] = np.sum(a[:n+1]*p_vec[-(n+1):])
Or in a memory-inefficient but vectorized way (IMO it is CPU-inefficient too, since a lot of work is wasted):
a_idx = np.reshape(np.arange(N+1), (1, N+1)) - np.reshape(np.arange(N-1, 0-1, -1), (N, 1))
a_idx = np.maximum(0, a_idx)
# For N=4, a_idx looks like this:
# [[0, 0, 0, 0, 1],
# [0, 0, 0, 1, 2],
# [0, 0, 1, 2, 3],
# [0, 1, 2, 3, 4]]
a_ext = np.concatenate(([0], a,), axis=0) # len(a_ext) = N + 1
p_vec = np.power(p, np.arange(N, 0-1, -1)) # len(p_vec) = N + 1
b = np.dot(a_ext[a_idx], p_vec)
Is there a better way to achieve this 'lossy' cumsum?
What you want is an IIR filter; you can use scipy.signal.lfilter(). Here is the code:
Your code:
import numpy as np
N = 10
p = 0.8
np.random.seed(0)
x = np.random.randn(N)
y = np.empty_like(x)
p_vec = np.power(p, np.arange(N-1, 0-1, -1))
for n in range(N):
    y[n] = np.sum(x[:n+1]*p_vec[-(n+1):])
y
the output:
array([1.76405235, 1.81139909, 2.42785725, 4.183179 , 5.21410119,
3.19400307, 3.50529088, 2.65287549, 2.01908154, 2.02586374])
By using lfilter():
from scipy import signal
y = signal.lfilter([1], [1, -p], x)
print(y)
the output:
array([1.76405235, 1.81139909, 2.42785725, 4.183179 , 5.21410119,
3.19400307, 3.50529088, 2.65287549, 2.01908154, 2.02586374])
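For reference, lfilter([1], [1, -p], x) implements the recurrence (SciPy's difference equation with b = [1] and a = [1, -p]):
$$ y[n] = x[n] + p\, y[n-1] $$
which, starting from rest, unrolls to exactly the lossy cumsum $y[n] = \sum_{i=0}^{n} p^{\,n-i} x[i]$ above.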

Custom loss with loops in tensorflow

I have a function in my data preprocessing which performs a blockwise DCT on 3D numpy arrays in YCbCr mode.
import numpy as np
from scipy.fftpack import dct

def perform_blockwise_dct(img, ratio):
    imsize = img.shape
    dct_blocks = np.zeros(imsize)
    for i in np.r_[:imsize[0]:8]:
        for j in np.r_[:imsize[1]:8]:
            dct_blocks[i:(i+8), j:(j+8), 0] = dct(dct(img[i:(i+8), j:(j+8), 0].T, norm='ortho').T, norm='ortho')
            dct_blocks[i:(i+8), j:(j+8), 1] = dct(dct(img[i:(i+8), j:(j+8), 1].T, norm='ortho').T, norm='ortho')
            dct_blocks[i:(i+8), j:(j+8), 2] = dct(dct(img[i:(i+8), j:(j+8), 2].T, norm='ortho').T, norm='ortho')
    return dct_blocks
To be able to implement a custom mean squared error loss I would like to reverse this function. The problem is that inside the loss function everything is a TensorFlow tensor. There is an inverse DCT function (tf.spectral.idct) to use; however, I do not know how to perform an equivalent double for-loop to apply it block-wise. Currently it is done on the entire image, like this:
def mse_custom_loss(a, b):
    # K is the Keras backend
    y = tf.spectral.idct(a[:, :, 0], norm='ortho')
    cb = tf.spectral.idct(a[:, :, 1], norm='ortho')
    cr = tf.spectral.idct(a[:, :, 2], norm='ortho')
    a = K.stack([y, cb, cr], axis=-1)
    y = tf.spectral.idct(b[:, :, 0], norm='ortho')
    cb = tf.spectral.idct(b[:, :, 1], norm='ortho')
    cr = tf.spectral.idct(b[:, :, 2], norm='ortho')
    b = K.stack([y, cb, cr], axis=-1)
    return mean_square_error(a, b)
Any ideas on how to do it correctly? I assume lambda functions might be a possibility?
I think this is a TensorFlow equivalent to your NumPy/SciPy function:
import tensorflow as tf

def perform_blockwise_dct_tf(img):
    shape = tf.shape(img)
    x, y, c = shape[0], shape[1], shape[2]
    img_res = tf.reshape(img, [x // 8, 8, y // 8, 8, c])
    img_dct1 = tf.spectral.dct(tf.transpose(img_res, [0, 1, 2, 4, 3]), norm='ortho')
    img_dct2 = tf.spectral.dct(tf.transpose(img_dct1, [0, 2, 4, 3, 1]), norm='ortho')
    out = tf.reshape(tf.transpose(img_dct2, [0, 4, 1, 2, 3]), shape)
    return out
A small test:
import numpy as np
from scipy.fftpack import dct
def perform_blockwise_dct(img):
    imsize = img.shape
    dct_blocks = np.zeros(imsize, dtype=img.dtype)
    for i in np.r_[:imsize[0]:8]:
        for j in np.r_[:imsize[1]:8]:
            dct_blocks[i:(i+8), j:(j+8), 0] = dct(dct(img[i:(i+8), j:(j+8), 0].T, norm='ortho').T, norm='ortho')
            dct_blocks[i:(i+8), j:(j+8), 1] = dct(dct(img[i:(i+8), j:(j+8), 1].T, norm='ortho').T, norm='ortho')
            dct_blocks[i:(i+8), j:(j+8), 2] = dct(dct(img[i:(i+8), j:(j+8), 2].T, norm='ortho').T, norm='ortho')
    return dct_blocks
np.random.seed(100)
# DCT in TensorFlow only supports float32
img = np.random.rand(128, 256, 3).astype(np.float32)
out1 = perform_blockwise_dct(img)
with tf.Graph().as_default(), tf.Session() as sess:
    out2 = sess.run(perform_blockwise_dct_tf(img))
# There is a bit of error
print(np.allclose(out1, out2, rtol=1e-5, atol=1e-6))
# True
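Following the same reshape/transpose pattern, a block-wise inverse for use inside the loss could look roughly like this (an untested sketch, assuming tf.spectral.idct with norm='ortho' inverts tf.spectral.dct, as in your loss code):
def perform_blockwise_idct_tf(img):
    # same 8x8 blocking as above, with idct undoing the two 1-D DCTs
    shape = tf.shape(img)
    x, y, c = shape[0], shape[1], shape[2]
    img_res = tf.reshape(img, [x // 8, 8, y // 8, 8, c])
    img_idct1 = tf.spectral.idct(tf.transpose(img_res, [0, 1, 2, 4, 3]), norm='ortho')
    img_idct2 = tf.spectral.idct(tf.transpose(img_idct1, [0, 2, 4, 3, 1]), norm='ortho')
    out = tf.reshape(tf.transpose(img_idct2, [0, 4, 1, 2, 3]), shape)
    return out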

tensorflow tf.pad shape of output

I have this function:
def resize_image(input_layer, counter, width):
    shape = input_layer.get_shape().as_list()
    H = tf.cast((width * shape[2] / shape[1]), tf.int32)
    print(H)
    resized_images = tf.image.resize_images(input_layer, [width, H], tf.image.ResizeMethod.BICUBIC)
    print(resized_images)
    pad_diff = width - H
    padd_images = tf.pad(resized_images, [[0, 0], [0, pad_diff], [0, 0], [0, 0]], 'CONSTANT')
    return padd_images, counter
When I run this :
sess = tf.InteractiveSession()
I = tf.random_uniform([15, 15, 13, 5], minval = -5, maxval = 10, dtype = tf.float32)
padd_images, counter = resize_image(I, 1, 5)
print (I)
print(padd_images)
sess.run(padd_images)
I get this:
Tensor("Cast/x:0", shape=(), dtype=int32)
Tensor("ResizeBicubic:0", shape=(15, 5, 4, 5), dtype=float32)
Tensor("random_uniform:0", shape=(15, 15, 13, 5), dtype=float32)
Tensor("Pad:0", shape=(?, ?, ?, ?), dtype=float32)
Why are there ?s in the shape of padd_images? Is there a way to know its shape?
The problem is the line
H = tf.cast((width * shape[2] / shape[1]), tf.int32)
Here you're defining a tensor. Thus when you compute:
pad_diff = width - H
you're adding an operation to the graph. Thus you don't know at graph-construction ("compile") time what the pad_diff value is; you'll only know it at runtime.
Since you don't need H to be a tensor, just use the regular Python cast, replacing H with
H = int(width * shape[2] / shape[1])
In this way, the next operations that use H are executed within the Python environment, and thus the values are known at "compile time".
After that you'll see:
Tensor("Pad:0", shape=(15, 6, 4, 5), dtype=float32)
