Gradient not defined Tensorflow - python

I had asked a question and was implementing the solution when I found out that the operation tf.math.count_nonzero does not have a gradient defined. So I tried the following roundabout method:
# Threshold below which a singular value is treated as zero.
eps = 1e-6
a = tf.ones((4, 4, 2, 2), tf.float32)

# Only the singular values are requested (compute_uv=False); values below
# eps are replaced by exact zeros.
h = tf.linalg.svd(a, full_matrices=False, compute_uv=False)
cond = h < eps
h = tf.where(cond, tf.zeros_like(h), h)

# i: sum of the singular values of each 2x2 matrix, j: first singular value.
i = tf.reduce_sum(h, axis=-1)
j = h[:, :, 0]

# Start from 2 everywhere, use 1 where the sum equals the first singular
# value, and 0 where the sum is zero.
rank_mat = 2. * tf.ones((4, 4))
cond = tf.not_equal(i, j)
rank_mat = tf.where(cond, rank_mat, tf.ones_like(rank_mat))
cond = tf.equal(i, tf.zeros_like(i))
rank_mat = tf.where(cond, tf.zeros_like(rank_mat), rank_mat)
min_rank = tf.reduce_min(rank_mat)
Still the same error persists. I partly understand why this is happening, but is there a differentiable way of implementing this? Thanks.


NotImplementedError: Cannot convert a symbolic tf.Tensor (Log_2:0) to a numpy array

I have the following code, which is based on an existing implementation. The original code was written for TF v1 and I am in the process of migrating it to TF v2. However, I am facing some issues while trying to perform a NumPy operation on a tensor. I am running the code on Google Colab.
The reproducible code is as below, sorry it is quite long:
import tensorflow as tf
import numpy as np
import scipy as sp
from datetime import datetime
from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture
# 4x4 coefficient matrix; real_hermite_interp converts it to an array and
# right-multiplies the stacked knot coefficients by it.
HERMITE = [[1, 0, -3, 2], [0, 0, 3, -2], [0, 1, -2, 1], [0, 0, -1, 1]]
# dtype name passed to ``ndarray.astype`` for HERMITE and the initial filter
# coefficients.
FORMAT = 'float32'
def real_hermite_interp(xi, x, m, p):
    """Evaluate piecewise cubic polynomials on the grid ``xi``.

    The indentation of this function had been lost, which made it
    unparsable; the statements themselves are unchanged.

    Parameters
    ----------
    xi
        Points at which to interpolate. It is expanded twice on axis 0, so
        it is presumably 1-D -- TODO confirm against the caller.
    x
        Knot positions, indexed as ``x[:, :-1]`` / ``x[:, 1:]`` (2-D, the
        last axis running over knots).
    m, p
        Per-knot coefficient vectors, sliced as ``[:-1]`` / ``[1:]`` to pair
        each knot with its successor.

    Returns
    -------
    Tensor produced by ``einsum('rf,srtf->st', ...)``: one row per leading
    entry of ``x`` and one column per entry of ``xi``.
    """
    # Hermite polynomial coefficients
    h = tf.Variable(np.array(HERMITE).astype(FORMAT), trainable=False)

    # Pair every knot with its successor: xx[..., 0] is the left end of an
    # interval and xx[..., 1] the right end.
    xx = tf.stack([x[:, :-1], x[:, 1:]], axis=2)

    # The concatenated coefficients are of shape (n_knots - 1, 2)
    mm = tf.stack([m[:-1], m[1:]], axis=1)
    pp = tf.stack([p[:-1], p[1:]], axis=1)
    y = tf.concat([mm, pp], axis=1)

    # Extract Hermite polynomial coefficients from y (n_knots - 1, 4)
    yh = tf.matmul(y, h)

    # Normalise the evaluation points to each interval: xn is in [0, 1)
    # exactly when the point lies inside that interval.
    xi_ = tf.expand_dims(tf.expand_dims(xi, 0), 0)
    x0_ = tf.expand_dims(xx[:, :, 0], 2)
    x1_ = tf.expand_dims(xx[:, :, 1], 2)
    xn = (xi_ - x0_) / (x1_ - x0_)

    # Calculate powers of normalized interpolation vector
    mask = tf.logical_and(tf.greater_equal(xn, 0.), tf.less(xn, 1.))
    mask = tf.cast(mask, tf.float32)
    xp = tf.pow(tf.expand_dims(xn, -1), [0, 1, 2, 3])

    # Interpolate: the mask zeroes the contribution of every interval that
    # does not contain the evaluation point.
    return tf.einsum('rf,srtf->st', yh, xp * tf.expand_dims(mask, -1))
class Scattering:
"""Learnable scattering network layer."""
# NOTE(review): indentation was lost in this excerpt and two statements
# below are truncated; the code lines are kept exactly as found.
def __init__(self, x, j=None, q=None, k=None, pooling_type='average',
decimation=2, pooling=2, index=0, **filters_kw):
"""Scattering network layer.
Computes the convolution modulus and scattering coefficients of the
input signal.
x: :class:`~tensorflow.Tensor()`
Input data of shape ``(batch_size, channels, patch_shape)``.
j, q, k: int or list
When a list is given, the entry at position ``index`` is used.
pooling_type: str
Not referenced in the visible body; average pooling is always used.
decimation: int
Stride of the 1-D convolution.
pooling: int
Divided by ``decimation ** (index + 1)`` to obtain the pooling size and
stride; pooling is only set up when ``pooling > 1``.
index: int
Position of this layer; selects the entries of list-valued ``j``, ``q``,
``k``.
filters_kw:
Forwarded unchanged to :meth:`init_filters`.
"""
# Filter bank properties
self.shape_input = x.get_shape().as_list()
self.j = j = j[index] if type(j) is list else j
self.q = q = q[index] if type(q) is list else q
self.k = k = k[index] if type(k) is list else k
filters = self.init_filters(j, q, k, **filters_kw)
n_filters, kernel_size = filters.get_shape().as_list()
# Real and imaginary parts are stacked on axis 0, so the convolution below
# yields 2 * n_filters channels.
filters_concat = tf.concat([tf.math.real(filters), tf.math.imag(filters)], 0)
filters_kernel = tf.expand_dims(tf.transpose(filters_concat), 1)
# Pad input in the time dimension before convolution with half the size
# of filters temporal dimension (kernel_size).
# NOTE(review): the next statement is truncated in this excerpt (unbalanced
# brackets); presumably it flattens all leading axes of shape_input into
# one -- TODO restore from the original source.
shape_fast = [[:-1]), 1, self.shape_input[-1]]
paddings = [0, 0], [0, 0], [kernel_size // 2 - 1, kernel_size // 2 + 1]
x_reshape = tf.reshape(x, shape_fast)
x_pad = tf.pad(x_reshape, paddings=paddings, mode='SYMMETRIC')
# Differentiate the case of one input channel or multiple
# which needs reshaping in order to treat them independently
# The "NCW" format stores data as batch_shape + [in_channels, in_width]
x_conv = tf.nn.conv1d(x_pad, filters_kernel, stride=decimation,
padding='VALID', data_format='NCW')
# Modulus of the complex response: the first n_filters channels hold the
# real part, the remaining ones the imaginary part.
u = tf.sqrt(tf.square(x_conv[:, :n_filters]) +
tf.square(x_conv[:, n_filters:]))
self.u = tf.reshape(u, (*self.shape_input[:-1], n_filters, -1))
pool = tf.keras.layers.AveragePooling1D
# Pooling for the scattering coefficients
# NOTE(review): `pooled` is only assigned under `pooling > 1`; with the
# indentation lost it is unclear which lines belong to this branch, and no
# fallback for `pooling <= 1` is visible -- TODO confirm.
if pooling > 1:
pooled = pool(
pooling // (decimation ** (index + 1)),
pooling // (decimation ** (index + 1)),
padding='valid', data_format='channels_first')
pooled = pooled(u)
self.s = tf.reshape(pooled, self.shape_input[:-1] + [j * q] + [-1])
self.output = self.s
# Gradient of x_conv with respect to x, weighted by x_conv itself.
inverse = tf.gradients(x_conv, x, x_conv)[0]
# NOTE(review): the divisor of the following expression is missing in this
# excerpt (the line ends with `/`) -- TODO restore from the original source.
self.reconstruction_loss = tf.nn.l2_loss(
inverse - tf.stop_gradient(x)) /
# Build the filter bank for this layer and return it as a complex tensor.
# Sets self.filter_samples, self.scales, self.knots, self.m, self.p and
# self.parameters as side effects.
#   j, q, k        -- integers; j * q scales are created and k knots per filter.
#   learn_scales   -- makes the additive scale offsets trainable and adds one
#                     extra octave to the time support.
#   learn_knots    -- makes the knot spacing variable trainable.
#   learn_filters  -- makes the coefficient variables m and p trainable.
#   hilbert        -- see the NOTE(review) on the `if hilbert is True` line.
# NOTE(review): indentation was lost and two statements are truncated in this
# excerpt; the code lines are kept exactly as found.
def init_filters(self, j, q, k, learn_scales=False, learn_knots=False,
learn_filters=True, hilbert=False):
extra_octave = 1 if learn_scales else 0
self.filter_samples = k * 2 ** (j + extra_octave)
time_max = np.float32(k * 2**(j - 1 + extra_octave))
time_grid = tf.linspace(-time_max, time_max, self.filter_samples)
# Geometric progression of scales: 2 ** (n / q) for n = 0 .. j * q - 1.
scales_base = 2**(tf.range(j * q, dtype=tf.float32) / np.float32(q))
scales_delta = tf.Variable(
tf.zeros(j * q), trainable=learn_scales, name='scales')
scales = scales_base + scales_delta
# Raise the first scale to at least 1; the correction is excluded from the
# gradient by stop_gradient.
nyquist_offset = scales + \
tf.stop_gradient(tf.one_hot(0, j * q) * tf.nn.relu(1 - scales[0]))
# NOTE(review): the next two statements are truncated in this excerpt (the
# opening of the concat list and the right-hand side added to
# nyquist_offset are missing) -- TODO restore from the original source.
scales_correction = tf.concat(
tf.nn.relu(nyquist_offset[:-1] - nyquist_offset[1:])], 0)
self.scales = nyquist_offset + \
knots_base = tf.Variable(
tf.ones(k), trainable=learn_knots, name='knots')
# NOTE(review): the call that takes the bounds `1, self.filter_samples - k`
# is missing its opening in this excerpt -- TODO restore.
knots_sum = tf.cumsum(
tf.expand_dims(knots_base, 0) * tf.expand_dims(self.scales, 1),
1, self.filter_samples - k), exclusive=True, axis=1)
# Shift the knots by (k // 2) scaled steps.
self.knots = knots_sum - (k // 2) * tf.expand_dims(self.scales, 1)
# NOTE(review): `m` and `p` are only assigned under this condition in the
# visible code; any other branch is not shown here -- TODO confirm.
if hilbert is True:
m = (np.cos(np.arange(k) * np.pi) * np.hamming(k)).astype(FORMAT)
p = (np.zeros(k)).astype(FORMAT)
self.m = tf.Variable(m, name='m', trainable=learn_filters)
self.p = tf.Variable(p, name='p', trainable=learn_filters)
# Boundary Conditions and centering
mask = np.ones(k, dtype=np.float32)
mask[0], mask[-1] = 0, 0
m_null = self.m - tf.reduce_mean(self.m[1:-1])
filters = real_hermite_interp(
time_grid, self.knots, m_null * mask, self.p * mask)
# Renorm and set filter-bank
filters_renorm = filters / tf.reduce_max(filters, 1, keepdims=True)
# The real FFT is padded with an equal number of zeros before the inverse
# transform, which yields a complex-valued filter.
filters_fft = tf.signal.rfft(filters_renorm) # was spectral.rfft
filters = tf.signal.ifft(
tf.concat([filters_fft, tf.zeros_like(filters_fft)], 1))
# Define the parameters for saving
self.parameters = self.m, self.p, self.scales, self.knots
return filters
def renorm(self, parent, epsilon=1e-3):
    """Return this layer's coefficients, optionally normalised by the parent's.

    Parameters
    ----------
    parent
        Object exposing an ``s`` tensor; it is expanded on axis ``-2`` so
        that it broadcasts against ``self.s``.
    epsilon : float
        When positive, ``self.s`` is divided by ``parent.s + epsilon``.
        Otherwise ``self.s`` is returned without normalisation.

    Returns
    -------
    Tensor reshaped to ``[batch_size, -1, samples]``, where ``batch_size``
    and ``samples`` are the first and last static dimensions.
    """
    s = self.s
    if epsilon > 0:
        s = s / (tf.expand_dims(parent.s, -2) + epsilon)

    # The shapes are extracted for both cases. Previously they were only
    # assigned inside the ``epsilon > 0`` branch, so the fallback return
    # raised a NameError.
    batch_size, *_, samples = s.get_shape().as_list()
    return tf.reshape(s, [batch_size, -1, samples])
# testing
# Reproduction script: builds a three-layer scattering network on random data
# and post-processes the coefficients.
# NOTE(review): indentation was lost in this excerpt, so the loop nesting
# cannot be read off the text; the code lines are kept exactly as found.
data = tf.random.uniform((4,3,16800), dtype=tf.float32)
batch_size = 4
args = {'layers': {'j': [4, 6, 8], 'q': [8, 2, 1], 'k': 7, 'pooling_type': 'average', 'decimation': 4, 'pooling': 1024, 'learn_scales': False, 'learn_knots': False, 'learn_filters': True, 'hilbert': True}, 'eps_norm': 0.001, 'eps_log': 0.0001, 'learning': {'epochs': 3, 'rate': 0.001}, 'pca': {'n_components': 5}, 'gmm': {'gmm_type': 'natural', 'trainable': False}, 'gmm_init': {'n_components': 10, 'max_iter': 1000, 'covariance_type': 'full', 'warm_start': True}}
# Run over batches
epochs = args['learning']['epochs']
learning_rate = args['learning']['rate']
for epoch in range(epochs):
# Gradually decrease learning rate over epochs
if epoch == epochs // 2:
learning_rate /= 5
if epoch == 3 * epochs // 4:
learning_rate /= 5
# Calculate scattering coefficients for all batches
scat_all = list()
n_batches = data.shape[0] // batch_size
for b in range(n_batches):
# The first layer consumes the raw data; each further layer consumes the
# modulus output `u` of the previous one.
layers = [Scattering(data, index=0, **args['layers'])]
# NOTE(review): no `layers.append(layer)` is visible after the next
# statement, yet `layers[i]` is indexed for i = 1, 2 further down --
# presumably dropped from this excerpt, TODO confirm.
for i in range(1, 3):
layer = Scattering(layers[-1].u, index=i, **args['layers'])
# Extract parameters.
net = [layer.parameters for layer in layers]
# Get reconstruction losses.
rl = tf.add_n([a.reconstruction_loss for a in layers])
# Renormalize coefficients.
r = list()
for i in range(1, 3):
r.append(layers[i].renorm(layers[i - 1], args['eps_norm']))
# Concatenate.
sx = tf.transpose(tf.concat(r, axis=1), [1, 0, 2])
sx = tf.reshape(sx, [sx.get_shape().as_list()[0], -1])
sx = tf.transpose(sx)
sx = tf.math.log(sx + args['eps_log'])
# NOTE(review): the two statements below pass the tf.Tensor `sx` to NumPy
# functions and then assign into it; the first one is the line that the
# traceback in the question points at. A tensor-only formulation would be
# needed here, e.g. selecting values with tf.where -- TODO confirm the
# intended replacement.
sx[np.isnan(sx)] = np.log(args['eps_log'])
sx[np.isinf(sx)] = np.log(args['eps_log'])
The issue is from the line 'sx[np.isnan(sx)] = np.log(args['eps_log'])'. The full error is shown below:
NotImplementedError Traceback (most recent call last)
Cell In [6], line 34
31 print("sx:", sx)
32 print("sx shape: ", sx.shape)
---> 34 sx[np.isnan(sx)] = np.log(args['eps_log'])
35 sx[np.isinf(sx)] = np.log(args['eps_log'])
36 scat_all.append(sx)
File c:\Python310\lib\site-packages\tensorflow\python\framework\, in Tensor.__array__(***failed resolving arguments***)
920 def __array__(self, dtype=None):
921 del dtype
--> 922 raise NotImplementedError(
923 f"Cannot convert a symbolic tf.Tensor ({}) to a numpy array."
924 f" This error may indicate that you're trying to pass a Tensor to"
925 f" a NumPy call, which is not supported.")
NotImplementedError: Cannot convert a symbolic tf.Tensor (Log_2:0) to a numpy array. This error may indicate that you're trying to pass a Tensor to a NumPy call, which is not supported.
Based on solutions on previous stackoverflow posts, I have tried to upgrade my tensorflow (2.11.0) and numpy versions (1.23.5) but that did not solve the problem. I saw some suggestions on downgrading numpy but because of other dependencies that did not work. My Python version is 3.8.16. Any suggestions on how to proceed? Thanks in advance.

Solving Sylvester equations in PyTorch

I'm trying to solve a Sylvester matrix equation of the form
AX + XB = C
From what I've seen, these equations are usually solved with the Bartels-Stewart algorithm, taking successive Schur decompositions. I'm aware scipy.linalg already has a solve_sylvester function, but I'm integrating the solution to the Sylvester equation into a neural network, so I need a way to calculate gradients to make A, B, and C learnable. Currently, I'm just solving a linear system with torch.linalg.solve using the Kronecker product and vectorization trick, but this has terrible runtime complexity. I haven't found any PyTorch support for Sylvester equations, let alone Schur decompositions, but before I try to implement Bartels-Stewart on the GPU, is there a simpler way to find the gradients?
Initially I wrote a solution, based on the Bartels-Stewart algorithm for the m=n case, that would give a complex X. I had some problems because the eigenvector matrix is not accurate enough. Also, the real part gives the real solution, and the imaginary part must be a solution of AX - XB = 0. Note that the code below solves AX - XB = C; for the equation AX + XB = C from the question, pass -B instead of B.
import torch
def sylvester(A, B, C, X=None):
    """Solve ``A @ X - X @ B = C`` for ``X`` via eigendecomposition.

    ``A`` and ``B`` are diagonalised as ``A = U diag(R) U^-1`` and
    ``B = V diag(S) V^-1``. In those bases the equation decouples into
    ``(R[i] - S[j]) * Y[i, j] = F[i, j]`` with ``F = U^-1 C V``, and the
    solution is ``X = U Y V^-1``. To solve ``A X + X B = C`` pass ``-B``.

    Parameters
    ----------
    A : tensor of shape (..., n, n)
    B : tensor of shape (..., m, m)
    C : tensor of shape (..., n, m)
    X : ignored
        Kept only so existing call sites keep working; it is never read.

    Returns
    -------
    Tensor of shape (..., n, m). It is real when none of ``A``, ``B``, ``C``
    has a complex dtype, and complex otherwise.

    Notes
    -----
    The result is only meaningful when ``A`` and ``B`` are diagonalisable
    and share no eigenvalue, because ``R[i] - S[j]`` is used as a divisor.
    """
    n = A.shape[-1]
    m = B.shape[-1]
    R, U = torch.linalg.eig(A)
    S, V = torch.linalg.eig(B)

    # F = U^-1 (C V); ``C + 0j`` promotes C to the complex dtype of U and V.
    F = torch.linalg.solve(U, (C + 0j) @ V)

    # Pairwise eigenvalue differences, broadcast to the shape of F.
    W = R[..., :, None] - S[..., None, :]
    Y = F / W

    X = U[..., :n, :n] @ Y[..., :n, :m] @ torch.linalg.inv(V)[..., :m, :m]

    # Decide on the dtype rather than on the value of the first element: a
    # complex input whose first entry happens to be real must not have the
    # imaginary part of its solution discarded.
    if any(torch.is_complex(t) for t in (A, B, C)):
        return X
    return X.real
As can be verified on the GPU with
# Try different dimensions
# Residual check for ``sylvester`` on batched random problems, including
# rectangular ones (A is N x N, B is M x M, X is N x M).
# ``device`` is not defined in this snippet and must be set beforehand,
# e.g. to a CUDA device.
for batch_size, M, N in [(1, 4, 4), (20, 16, 16), (6, 13, 17), (11, 29, 23)]:
    print(batch_size, (M, N))
    A = torch.randn((batch_size, N, N), dtype=torch.float64,
                    device=device, requires_grad=True)
    B = torch.randn((batch_size, M, M), dtype=torch.float64,
                    device=device, requires_grad=True)
    X = torch.randn((batch_size, N, M), dtype=torch.float64,
                    device=device, requires_grad=True)
    # Build a right-hand side with a known solution X ...
    C = A @ X - X @ B
    X_ = sylvester(A, B, C)
    # ... and print the largest residual of the recovered solution.
    C_ = (A) @ X_ - X_ @ (B)
    print(torch.max(abs(C - C_)))
A faster algorithm, but inaccurate in the current pytorch version is
def sylvester_of_the_future(A, B, C):
    """Solve ``A @ X - X @ B = C`` using conjugate transposes as inverses.

    Same eigendecomposition scheme as a solve-based implementation, but the
    inverses of the eigenvector matrices ``U`` and ``V`` are replaced by
    their conjugate transposes. That substitution is exact only when ``U``
    and ``V`` are unitary (for instance for symmetric or Hermitian ``A`` and
    ``B`` with distinct eigenvalues); for general matrices the result is
    inaccurate.

    Parameters
    ----------
    A : tensor of shape (..., n, n)
    B : tensor of shape (..., m, m)
    C : tensor of shape (..., n, m)

    Returns
    -------
    Tensor of shape (..., n, m). It is real when none of ``A``, ``B``, ``C``
    has a complex dtype, and complex otherwise.
    """
    def h(V):
        # Conjugate (Hermitian) transpose over the last two axes.
        return V.transpose(-1, -2).conj()

    n = A.shape[-1]
    m = B.shape[-1]
    R, U = torch.linalg.eig(A)
    S, V = torch.linalg.eig(B)

    # F = U^H C V; ``C + 0j`` promotes C to the complex dtype of U and V.
    F = h(U) @ (C + 0j) @ V

    # Pairwise eigenvalue differences, broadcast to the shape of F.
    W = R[..., :, None] - S[..., None, :]
    Y = F / W

    X = U[..., :n, :n] @ Y[..., :n, :m] @ h(V)[..., :m, :m]

    # Decide on the dtype rather than on the value of the first element of
    # each input.
    if any(torch.is_complex(t) for t in (A, B, C)):
        return X
    return X.real
I will leave it here maybe in the future it will work properly.

My code gives a different result from the one my machine learning assignment expects for the same input?

# Regularised logistic-regression cost and gradient, as posted in the
# question. Returns the tuple (J, grad).
# NOTE(review): indentation was lost in this excerpt, and the right-hand side
# of `hx =` and part of the first `J =` expression appear to be missing; the
# code lines are kept exactly as found.
def lrCostFunction(theta, X, y, lambda_):
m = y.size
if y.dtype == bool:
y = y.astype(int)
# NOTE: `tempt` is another name for the same array object as `theta`, so the
# assignment to tempt[0] below also overwrites theta[0] -- both inside this
# function and in the caller's array.
tempt = theta
tempt[0] = 0
J = 0
grad = np.zeros(theta.shape)
hx =
h = sigmoid(hx)
J = (1/m) * np.sum( - (1-y).dot(np.log(1-h)))
# Penalty term; tempt[0] was zeroed above, so the first parameter is not
# penalised.
J = J + (lambda_/(2*m)) * np.sum(np.square(tempt))
grad = ((1/m) * (h - y) .dot(X)) + (lambda_/m) * tempt
return J, grad
# rand_indices = np.random.choice(m, 100, replace=False)
# sel = X[rand_indices, :]\
# Test inputs: four parameters and five examples, whose design matrix has a
# leading column of ones.
theta_t = np.array([-2, -1, 1, 2], dtype=float)
X_t = np.concatenate([np.ones((5, 1)), np.arange(1, 16).reshape(5, 3, order='F')/10.0], axis=1)
y_t = np.array([1, 0, 1, 0, 1])
lambda_t = 3
cost, gradient = lrCostFunction(theta_t, X_t, y_t, lambda_t)
print("J= ", cost, "\nGrad= ", gradient)
J= 3.0857279966152817
Grad= [ 0.35537648 -0.49170896 0.88597928 1.66366752]
where as the assignment asks for these results from the same input:
# Checking code quoted from the assignment: prints the computed cost and
# gradient (`J` and `grad`, which are defined outside this snippet) next to
# the expected reference values.
print('Cost : {:.6f}'.format(J))
print('Expected cost: 2.534819')
print(' [{:.6f}, {:.6f}, {:.6f}, {:.6f}]'.format(*grad))
print('Expected gradients:')
print(' [0.146561, -0.548558, 0.724722, 1.398003]');
I even searched the internet for answers; everyone had the same code as me, and they stated that their result was the same as the expected one. I went as far as copying their code into my PyCharm IDE, but I got the same answer again.
The inputs are the same too. If you want to read the question, it is "Vectorizing regularized logistic regression".
This happened to me on one part of the last assignment as well. It's really frustrating, so I am reaching out for help.
Your code is correct. The problem is that when you change the value tempt[0] you are also changing theta[0]. Doing a copy of theta ensures that the initial vector is not changed.
def lrCostFunction(theta, X, y, lambda_):
    """Return the regularised logistic-regression cost and its gradient.

    Parameters
    ----------
    theta : ndarray of shape (n,)
        Model parameters. The array is not modified.
    X : ndarray of shape (m, n)
        Design matrix, one example per row.
    y : ndarray of shape (m,)
        Labels; a boolean array is converted to float.
    lambda_ : float
        Regularisation strength. The first parameter, ``theta[0]``, is
        excluded from the penalty.

    Returns
    -------
    J : float
        Cross-entropy cost plus the L2 penalty.
    grad : ndarray of shape (n,)
        Gradient of ``J`` with respect to ``theta``.
    """
    m = y.size
    if y.dtype == bool:
        y = y.astype(float)

    # The hypothesis is computed from the unmodified parameters.
    hx = X.dot(theta)
    h = sigmoid(hx)

    # Work on a copy: zeroing the first entry of ``theta`` itself would also
    # change the caller's array and the hypothesis computed from it.
    tempt = np.copy(theta)  # Copy of theta
    tempt[0] = 0

    J = (1/m) * np.sum(-y.dot(np.log(h)) - (1-y).dot(np.log(1-h)))
    J = J + (lambda_/(2*m)) * np.sum(np.square(tempt))
    grad = ((1/m) * (h - y).dot(X)) + (lambda_/m) * tempt

    # Demonstration output: shows that theta keeps its first entry while the
    # copy does not.
    print(theta, tempt)
    return J, grad
# Re-run the check with the corrected function on the same test inputs.
cost, gradient = lrCostFunction(theta_t, X_t, y_t, lambda_t)
print("J= ", cost, "\nGrad= ", gradient)
# Output:
# [-2. -1. 1. 2.] [ 0. -1. 1. 2.]
# J= 2.534819396109744
# Grad= [ 0.14656137 -0.54855841 0.72472227 1.39800296]

Python CNN im2col function doesn't make sense

In CNN Convolution learning, im2col function code is not understood.
def im2col(input_data, filter_h, filter_w, stride=1, pad=0):
    """Unfold every filter window of an image batch into one matrix row.

    After this transformation a convolution can be computed as a single
    matrix product between the returned matrix and the flattened filters.

    Parameters
    ----------
    input_data : ndarray of shape (N, C, H, W)
        Batch of N images with C channels.
    filter_h, filter_w : int
        Height and width of the filter window.
    stride : int
        Step between neighbouring windows.
    pad : int
        Number of zeros added on each side of the height and width axes.

    Returns
    -------
    col : ndarray of shape (N * out_h * out_w, C * filter_h * filter_w)
        One row per window position, holding that window's values in
        (channel, filter row, filter column) order.
    """
    N, C, H, W = input_data.shape
    out_h = (H + 2*pad - filter_h)//stride + 1
    out_w = (W + 2*pad - filter_w)//stride + 1

    # Zero-pad only the two spatial axes.
    img = np.pad(input_data, [(0, 0), (0, 0), (pad, pad), (pad, pad)], 'constant')

    # Six axes: for every image and channel, and for every offset (y, x)
    # inside the filter, store the out_h x out_w grid of pixels that this
    # filter element touches while the window slides over the image.
    col = np.zeros((N, C, filter_h, filter_w, out_h, out_w))

    for y in range(filter_h):
        y_max = y + stride*out_h
        for x in range(filter_w):
            x_max = x + stride*out_w
            # Strided slice: the pixel under filter element (y, x) for all
            # window positions at once.
            col[:, :, y, x, :, :] = img[:, :, y:y_max:stride, x:x_max:stride]

    # Reorder to (N, out_h, out_w, C, filter_h, filter_w) so that each window
    # position becomes one contiguous row, then flatten to two dimensions.
    col = col.transpose(0, 4, 5, 1, 2, 3).reshape(N*out_h*out_w, -1)
    return col
Q1. I don't know why the input data(img) is converted to six dimensions(col). Why are you converting like that?
Q2. Python syntax is not familiar. So,I don't understand the syntax of lines 9-13.
Can you explain it in C / C ++ / JAVA?
I tried a lot to understand. Help me.
I'm sorry about the weird grammar. :(
Have a nice day and thank you all!

Implementing LeCun Local Contrast Normalization with Theano

I'm trying to use the code that I found to implement the LeCun Local Contrast Normalization but I get incorrect result. The code is in Python and uses the theano library.
# NOTE(review): indentation was lost and both conv2d calls are truncated in
# this excerpt (the `filters=` / `filter_shape=` arguments and the closing
# parenthesis are presumably missing); the code lines are kept as found.
def lecun_lcn(input, img_shape, kernel_shape, threshold=1e-4):
"""
Yann LeCun's local contrast normalization
Original code in Theano by: Guillaume Desjardins
"""
# The input is reshaped to 4-D with a singleton second axis:
# (input.shape[0], 1, img_shape[0], img_shape[1]).
input = input.reshape(input.shape[0], 1, img_shape[0], img_shape[1])
X = T.matrix(dtype=theano.config.floatX)
X = X.reshape(input.shape)
filter_shape = (1, 1, kernel_shape, kernel_shape)
filters = gaussian_filter(kernel_shape).reshape(filter_shape)
convout = conv.conv2d(input=X,
image_shape=(input.shape[0], 1, img_shape[0], img_shape[1]),
# For each pixel, remove mean of 9x9 neighborhood
mid = int(np.floor(kernel_shape / 2.))
centered_X = X - convout[:, :, mid:-mid, mid:-mid]
# Scale down norm of 9x9 patch if norm is bigger than 1
sum_sqr_XX = conv.conv2d(input=centered_X ** 2,
image_shape=(input.shape[0], 1, img_shape[0], img_shape[1]),
denom = T.sqrt(sum_sqr_XX[:, :, mid:-mid, mid:-mid])
# NOTE(review): given the 4-D reshape above, axes 1 and 2 are the singleton
# axis and the image height, so this mean is not taken over the two image
# axes; axes [2, 3] with dimshuffle(0, 1, 'x', 'x') look like the intended
# per-image mean -- TODO confirm.
per_img_mean = denom.mean(axis=[1, 2])
divisor = T.largest(per_img_mean.dimshuffle(0, 'x', 'x', 1), denom)
# The threshold is a lower bound on the divisor.
divisor = T.maximum(divisor, threshold)
new_X = centered_X / divisor
new_X = new_X.dimshuffle(0, 2, 3, 1)
new_X = new_X.flatten(ndim=3)
# Compile the symbolic graph and evaluate it on the reshaped input.
f = theano.function([X], new_X)
return f(input)
Here is the testing code:
# Test script: normalises each of the three colour channels of the image
# separately and shows original, result and reference side by side.
# NOTE(review): indentation was lost in this excerpt; presumably only the
# assignment directly after the `for` line belongs to the loop body.
x_img_origin = plt.imread("..//data//Lenna.png")
x_img = plt.imread("..//data//Lenna.png")
x_img_real_result = plt.imread("..//data//Lenna_Processed.png")
# Add a leading axis of length 1 in front of the image's three axes.
x_img = x_img.reshape(1, x_img.shape[0], x_img.shape[1], x_img.shape[2])
for d in range(3):
x_img[:, :, :, d] = tools.lecun_lcn(x_img[:, :, :, d], (x_img.shape[1], x_img.shape[2]), 9)
x_img = x_img[0]
pylab.subplot(1, 3, 1); pylab.axis('off'); pylab.imshow(x_img_origin)
pylab.subplot(1, 3, 2); pylab.axis('off'); pylab.imshow(x_img)
pylab.subplot(1, 3, 3); pylab.axis('off'); pylab.imshow(x_img_real_result)
Here is the result:
(left to right: origin, my result, the expected result)
Could someone tell me what I did wrong with the code?
Here is how I implemented local contrast normalization as reported in Jarrett et al. You can use it as a separate layer.
I tested it on the code from the LeNet tutorial of theano in which I applied LCN to the input and to each convolutional layer which yields slightly better results.
You can find the full code here:
# Local contrast normalisation as a layer: the symbolic result is stored in
# `self.output`.
# NOTE(review): indentation was lost in this excerpt, the arguments of both
# conv2d calls are missing, and no `else:` is visible before the line
# `new_X = centered_X`; the code lines are kept exactly as found.
class LecunLCN(object):
def __init__(self, X, image_shape, threshold=1e-4, radius=9, use_divisor=True):
"""
Allocate an LCN.
:type X: theano.tensor.dtensor4
:param X: symbolic image tensor, of shape image_shape
:type image_shape: tuple or list of length 4
:param image_shape: (batch size, num input feature maps,
image height, image width)
:type threshold: double
:param threshold: the threshold will be used to avoid division by zeros
:type radius: int
:param radius: determines size of Gaussian filter patch (default 9x9)
:type use_divisor: Boolean
:param use_divisor: whether or not to apply divisive normalization
"""
# Get Gaussian filter
filter_shape = (1, image_shape[1], radius, radius)
self.filters = theano.shared(self.gaussian_filter(filter_shape), borrow=True)
# Compute the Guassian weighted average by means of convolution
convout = conv.conv2d(
# Subtractive step
mid = int(numpy.floor(filter_shape[2] / 2.))
# Make filter dimension broadcastable and subtract
centered_X = X - T.addbroadcast(convout[:, :, mid:-mid, mid:-mid], 1)
# Boolean marks whether or not to perform divisive step
if use_divisor:
# Note that the local variances can be computed by using the centered_X
# tensor. If we convolve this with the mean filter, that should give us
# the variance at each point. We simply take the square root to get our
# denominator
# Compute variances
sum_sqr_XX = conv.conv2d(
# Take square root to get local standard deviation
denom = T.sqrt(sum_sqr_XX[:,:,mid:-mid,mid:-mid])
# Mean over the two image axes, re-expanded to broadcast against denom.
per_img_mean = denom.mean(axis=[2,3])
divisor = T.largest(per_img_mean.dimshuffle(0, 1, 'x', 'x'), denom)
# Divisise step
new_X = centered_X / T.maximum(T.addbroadcast(divisor, 1), threshold)
# NOTE(review): presumably the `else:` branch of `if use_divisor` -- as
# printed here this assignment would always overwrite the division result.
new_X = centered_X
self.output = new_X
# Build a Gaussian kernel of shape `kernel_shape` (4-D; only index 0 of the
# first axis is filled), centred on the patch and normalised to sum to 1.
def gaussian_filter(self, kernel_shape):
x = numpy.zeros(kernel_shape, dtype=theano.config.floatX)
def gauss(x, y, sigma=2.0):
Z = 2 * numpy.pi * sigma ** 2
return 1. / Z * numpy.exp(-(x ** 2 + y ** 2) / (2. * sigma ** 2))
mid = numpy.floor(kernel_shape[-1] / 2.)
# NOTE: `xrange` exists only in Python 2.
for kernel_idx in xrange(0, kernel_shape[1]):
for i in xrange(0, kernel_shape[2]):
for j in xrange(0, kernel_shape[3]):
x[0, kernel_idx, i, j] = gauss(i - mid, j - mid)
return x / numpy.sum(x)
I think these two lines may have some mistakes on the matrix axes:
per_img_mean = denom.mean(axis=[1, 2])
divisor = T.largest(per_img_mean.dimshuffle(0, 'x', 'x', 1), denom)
and it should be rewritten as:
per_img_mean = denom.mean(axis=[2, 3])
divisor = T.largest(per_img_mean.dimshuffle(0, 1, 'x', 'x'), denom)
