Which TensorFlow operations do not have GPU implementations?

The code below is trying to do linear interpolation, similar to numpy.interp(). But it is quite slow, and I think the reason is that some operations in the code do not have GPU implementations, though I don't know which ones. Could anyone tell me which they are and suggest some solutions?
def tf_interp(b, x, y):
    xaxis_pad = tf.concat([[tf.minimum(b, tf.gather(x, 0))], x,
                           [tf.maximum(b, tf.gather(x, x.get_shape()[0] - 1))]], axis=0)
    yaxis_pad = tf.concat([[0.0], y, [0.0]], axis=0)
    cmp = tf.cast(b >= xaxis_pad, dtype=tf.float32)
    diff = cmp[1:] - cmp[:-1]
    idx = tf.argmin(diff)
    # Interpolate
    alpha = (b - xaxis_pad[idx]) / (xaxis_pad[idx + 1] - xaxis_pad[idx])
    res = alpha * yaxis_pad[idx + 1] + (1 - alpha) * yaxis_pad[idx]
    def f1(): return 0.0
    def f2(): return alpha * yaxis_pad[idx + 1] + (1 - alpha) * yaxis_pad[idx]
    res = tf.cond(pred=tf.is_nan(res), true_fn=f1, false_fn=f2)
    return res

def tf_interpolation(t, x, y):
    t = tf.cast(t, tf.float32)
    x = tf.cast(x, tf.float32)
    y = tf.cast(y, tf.float32)
    t1 = tf.reshape(t, [-1, ])
    t_return = tf.map_fn(lambda b: tf_interp(b, x, y), t1)
    t_return = tf.reshape(t_return, [t.get_shape()[0], t.get_shape()[1]])
    return t_return
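One way to find out empirically which ops fall back to the CPU (a sketch, assuming TF 1.x graph execution, which the use of tf.cond and tf.is_nan suggests) is to enable device placement logging; every op that has no GPU kernel is then reported on .../device:CPU:0:

import tensorflow as tf

# Sketch: log where each op is placed (TF 1.x graph mode assumed).
# `result` is a hypothetical name for the tensor returned by tf_interpolation.
config = tf.ConfigProto(log_device_placement=True)
with tf.Session(config=config) as sess:
    sess.run(result)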

Related

Learning parameters with gradient descent

I just started an ML course and I'm trying to run gradient descent in Python. The functions below work fine, but when I move on to the bigger chunk where the actual learning happens, I can't get the expected output or learn the right parameters, as you can tell from the decision boundary I plotted afterwards, and I'm trying to figure out why.
(plot of the decision boundary)
def sigmoid(z):
    sigma = 1/(1+np.exp(-z))
    return sigma

def compute_cost(X, y, w, b):
    y_hat = sigmoid((X * np.expand_dims(w, axis=0)).sum(axis=1) + b)
    total_cost = (-y * np.log(y_hat) - (1-y) * np.log(1-y_hat)).mean()
    return total_cost

def compute_gradient(X, y, w, b):
    z = w * X + b
    yhat = sigmoid(z)
    y1 = np.expand_dims(y, axis=1)
    error = yhat - y1
    db = error.mean()
    dw_j1 = (X * error)
    dw_j = np.mean(dw_j1, axis=0)
    return dw_j, db
Before building this gradient descent function, I tested all of the above with my training data, and they all work and output the correct numbers. I'd really appreciate it if you can spot my mistakes.
Learning parameters with gradient descent
def gradient_descent(X, y, w, b, alpha, num_iters):
    m = len(X)
    J_history = []
    wb_history = []
    for i in range(num_iters):
        cost = compute_cost(X, y, w, b)
        dw_j, db = compute_gradient(X, y, w, b)
        w = w - alpha * dw_j
        b = b - alpha * db
        wb_history.append((w, b))
        J_history.append(cost)
        if i % math.ceil(num_iters/10) == 0 or i == (num_iters-1):
            print(f"Iteration {i:4}: Cost {float(J_history[-1]):8.2f}")
    return w, b, J_history, wb_history

np.random.seed(1)
initial_w = 0.01 * (np.random.rand(2) - 0.5)
initial_b = -8
iterations = 10000
alpha = 0.001
w, b, J_history, _ = gradient_descent(X_train, y_train, initial_w, initial_b, alpha, iterations)
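For comparison, here is a minimal sketch of a compute_gradient that uses the same linear term as compute_cost above, i.e. z = X·w + b rather than w * X + b (assuming X has shape (m, 2), w shape (2,) and y shape (m,)); it is only an illustration of the vectorized shapes, not necessarily the fix:

def compute_gradient_vectorized(X, y, w, b):
    # same linear term as compute_cost: z_i = x_i . w + b
    z = np.dot(X, w) + b                              # shape (m,)
    error = sigmoid(z) - y                            # shape (m,)
    db = error.mean()
    dw_j = (X * error[:, np.newaxis]).mean(axis=0)    # shape (num_features,)
    return dw_j, db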

Calculate covariance of torch tensor (2d feature map)

I have a torch tensor with shape (batch_size, number_maps, x_val, y_val). The tensor is normalized with a sigmoid function, so within range [0, 1]. I want to find the covariance for each map, so I want to have a tensor with shape (batch_size, number_maps, 2, 2). As far as I know, there is no torch.cov() function as in numpy. How can I efficiently calculate the covariance without converting it to numpy?
Edit: I tried the following, but I'm pretty sure it's not correct.
def get_covariance(tensor):
    bn, nk, w, h = tensor.shape
    tensor_reshape = tensor.reshape(bn, nk, 2, -1)
    x = tensor_reshape[:, :, 0, :]
    y = tensor_reshape[:, :, 1, :]
    mean_x = torch.mean(x, dim=2).unsqueeze(-1)
    mean_y = torch.mean(y, dim=2).unsqueeze(-1)
    xx = torch.sum((x - mean_x) * (x - mean_x), dim=2).unsqueeze(-1) / (h*w - 1)
    xy = torch.sum((x - mean_x) * (y - mean_y), dim=2).unsqueeze(-1) / (h*w - 1)
    yx = xy
    yy = torch.sum((y - mean_y) * (y - mean_y), dim=2).unsqueeze(-1) / (h*w - 1)
    cov = torch.cat((xx, xy, yx, yy), dim=2)
    cov = cov.reshape(bn, nk, 2, 2)
    return cov
You could try the function suggested on GitHub:
def cov(x, rowvar=False, bias=False, ddof=None, aweights=None):
    """Estimates covariance matrix like numpy.cov"""
    # ensure at least 2D
    if x.dim() == 1:
        x = x.view(-1, 1)
    # treat each column as a data point, each row as a variable
    if rowvar and x.shape[0] != 1:
        x = x.t()
    if ddof is None:
        if bias == 0:
            ddof = 1
        else:
            ddof = 0
    w = aweights
    if w is not None:
        if not torch.is_tensor(w):
            w = torch.tensor(w, dtype=torch.float)
        w_sum = torch.sum(w)
        avg = torch.sum(x * (w/w_sum)[:, None], 0)
    else:
        avg = torch.mean(x, 0)
    # Determine the normalization
    if w is None:
        fact = x.shape[0] - ddof
    elif ddof == 0:
        fact = w_sum
    elif aweights is None:
        fact = w_sum - ddof
    else:
        fact = w_sum - ddof * torch.sum(w * w) / w_sum
    xm = x.sub(avg.expand_as(x))
    if w is None:
        X_T = xm.t()
    else:
        X_T = torch.mm(torch.diag(w), xm).t()
    c = torch.mm(X_T, xm)
    c = c / fact
    return c.squeeze()
https://github.com/pytorch/pytorch/issues/19037
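As a rough usage sketch for the per-map case (this is an assumption about the intended result: each normalized map is treated as a weight over its pixel coordinates, using the aweights argument of cov above; it is memory-heavy because of torch.diag, so take it as an illustration only):

# Hypothetical usage: weighted 2x2 covariance of pixel coordinates per map.
# `tensor` has shape (batch_size, number_maps, x_val, y_val); its values act as weights.
bn, nk, h, w_dim = tensor.shape
ys, xs = torch.meshgrid(torch.arange(h, dtype=torch.float32),
                        torch.arange(w_dim, dtype=torch.float32))
coords = torch.stack([xs.reshape(-1), ys.reshape(-1)], dim=1)   # (h*w, 2)
covs = torch.zeros(bn, nk, 2, 2)
for b in range(bn):
    for k in range(nk):
        covs[b, k] = cov(coords, aweights=tensor[b, k].reshape(-1))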

How to Implement Mixup in Keras Sequence?

I have a multi-class target, and I'm trying to implement the mixup technique in a Keras Sequence.
Without it, I get above 70% accuracy, but I get below 5% if I include the code below.
Could someone let me know what I'm doing wrong?
Thanks!
def __getitem__(self, index):
    ...
    x_batch, y_batch = self.mixup(x_batch, y_batch)
    ...
    return x_batch, y_batch

def mixup(self, x, y):
    n = x.shape[0]
    l = np.random.beta(self.alpha, self.alpha, n)
    x_l = l.reshape(n, 1, 1, 1)
    y_l = l.reshape(n, 1)
    x1 = x
    x2 = np.flip(x, axis=0)
    x = x1 * x_l + x2 * (1 - x_l)
    y1 = y
    y2 = np.flip(y, axis=0)
    y = y1 * y_l + y2 * (1 - y_l)
    return x, y
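Not a diagnosis, but one cheap thing to try: with a per-sample lambda drawn from Beta(alpha, alpha), roughly half of each batch ends up dominated by the flipped sample. A variant used in some mixup implementations clips lambda so the original sample always dominates; a sketch, assuming the rest of the Sequence is unchanged:

def mixup(self, x, y):
    n = x.shape[0]
    l = np.random.beta(self.alpha, self.alpha, n)
    # Variant: keep the original sample dominant in every mixed pair.
    l = np.maximum(l, 1.0 - l)
    x_l = l.reshape(n, 1, 1, 1)
    y_l = l.reshape(n, 1)
    x = x * x_l + np.flip(x, axis=0) * (1 - x_l)
    y = y * y_l + np.flip(y, axis=0) * (1 - y_l)
    return x, y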

Trying to understand Gradient Checking error in 3-layer Neural Network

I am building a basic 3-layer Neural Network in Python. After writing a gradient function, I ran gradient checking against a numerical gradient. Since the relative difference came out large, I unrolled both gradients for the weight matrices and compared them side by side.
Function Gradient Numerical Gradient
-0.000968788380809 0.0
0.0153540197907 0.0153540197889
-0.00584391679274 -0.00584391679048
-0.00490359558077 -0.00490359558514
-0.00171892592537 -0.0017189259216
0.00913024106334 0.00913024106319
-0.0182154767069 -0.0182154767092
0.0152611324409 0.01526113244
-0.00373505297372 -0.00373505297135
-0.00513225994728 -0.00513225994814
-0.00531954399401 -0.00531954399641
-0.0185748801227 -0.0185748801163
0.00745186105851 0.00745186105267
0.0134566626927 0.0134566626908
0.0251548691426 0.0251548691388
0.00609388350562 0.00609388350226
-0.00471176815719 -0.00471176815564
0.0113580721225 0.0113580721228
0.00465172663488 0.00465172663944
-0.0221326283708 -0.02213262837
0.300007655583 -0.300007655583 <-diverges, corresponding to theta2
0.155638694282 -0.15345321819
0.147747817305 -0.149026829224
0.150703152382 -0.172330417252
0.156307235611 -0.116975643856
0.136898763375 -0.170081036297
0.0621121242042 -0.0621121242372
0.0442762464937 -0.0187338352431
0.0489123689979 -0.00938236375481
0.0244392582651 -0.0465061209964
0.0237741996575 -0.028319115235
0.0313594790974 -0.0330473942922
0.106306327946 -0.106306327941
0.0348751481828 -0.0704775747806
0.0303373211657 -0.0756744476749
0.0633094699759 -0.0461971224763
0.0524239030728 -0.0477244101571
0.0633274024777 -0.0397657392082
Relative Difference:
6.61473694017
The first 20 elements in each list correspond to the gradient for the first weight matrix, and the remaining 18 correspond to the gradient for the second weight matrix. From what I can see, it appears as though the error occurs in the last 18 elements (and thus the theta2 matrix gradient) in the list, where the function gradient begins to differ from the "correct" numerical gradient. This also causes scipy.optimize.fmin_cg to give me the following:
Warning: Desired error not necessarily achieved due to precision loss.
Any help would be greatly appreciated! Here is the relevant code:
def sigmoid(z):
    return 1 / (1+np.exp(z))

def sigmoid_gradient(z):
    return sigmoid(z)*(1-sigmoid(z))

def randInitializeWeights(layer_in, layer_out):
    matrix = np.zeros((layer_out, 1 + layer_in))
    epsilon_init = 0.12
    matrix = np.random.rand(layer_out, 1+layer_in) * 2 * epsilon_init - epsilon_init
    return matrix
def gradient(theta, *args):
    X, y, num_inputs, num_hidden_units, num_labels, lamb = args
    m = len(X)
    theta1 = np.reshape(theta[0:(num_hidden_units*(num_inputs+1))], (num_hidden_units, (num_inputs+1)))
    theta2 = np.reshape(theta[(num_hidden_units*(num_inputs+1)):], (num_labels, num_hidden_units+1))
    theta1_grad = np.zeros(theta1.shape)
    theta2_grad = np.zeros(theta2.shape)
    delta1 = np.zeros(theta1.shape)
    delta2 = np.zeros(theta2.shape)
    for t in range(0, m):
        vec_y = np.zeros(num_labels)
        vec_y[y[t]] = 1
        vec_y = vec_y[:, np.newaxis]
        #feedforward to compute all the neuron activations
        a_1 = np.r_[[1], X[t]]
        a_1 = a_1[:, np.newaxis]
        z_2 = np.dot(theta1, a_1)
        a_2 = np.vstack([1, sigmoid(z_2)])
        z_3 = np.dot(theta2, a_2)
        a_3 = sigmoid(z_3)
        #error for output nodes
        del3 = a_3 - vec_y
        #error for hidden nodes
        del2 = np.multiply(np.dot(theta2.T, del3), sigmoid_gradient(np.vstack([1, z_2])))
        #remove bias unit
        del2 = del2[1:]
        #accumulate gradient
        delta1 = delta1 + del2*a_1.T
        delta2 = delta2 + del3*a_2.T
    #no need to regularize the first column
    theta1_grad[:, 0] = (1/m)*delta1[:, 0]
    theta2_grad[:, 0] = (1/m)*delta2[:, 0]
    #regularize the rest
    theta1_grad[:, 1:] = ((1/m) * delta1[:, 1:]) + (lamb/m)*theta1[:, 1:]
    theta2_grad[:, 1:] = ((1/m) * delta2[:, 1:]) + (lamb/m)*theta2[:, 1:]
    #unroll
    grad = np.hstack([theta1_grad.ravel(), theta2_grad.ravel()])
    return grad
def gradientChecking(lamb):
    input_layer_size = 3
    hidden_layer_size = 5
    num_labels = 3
    m = 5
    theta1 = randInitializeWeights(input_layer_size, hidden_layer_size)
    theta2 = randInitializeWeights(hidden_layer_size, num_labels)
    X = np.random.rand(m, input_layer_size)
    y = np.array([1, 2, 0, 1, 2])
    nn_params = np.hstack([theta1.ravel(), theta2.ravel()])
    #calculate gradient with function
    grad = gradient(nn_params, X, y, input_layer_size, hidden_layer_size, num_labels, lamb)
    #calculate numerical gradient
    num_grad = computeNumericalGradient(lambda theta: computeCost(theta, X, y, input_layer_size, hidden_layer_size, num_labels, lamb), nn_params)
    print('Function Gradient', 'Numerical Gradient')
    for i in range(len(grad)):
        print(grad[i], num_grad[i])
    diff = np.linalg.norm(num_grad-grad)/np.linalg.norm(num_grad+grad)
    print('Relative Difference: ')
    print(diff)
def computeNumericalGradient(J, theta):
    numgrad = np.zeros(theta.shape)
    perturb = np.zeros(theta.shape)
    e = 0.0001
    for p in range(1, np.size(theta)):
        perturb[p] = e
        loss1 = J(theta - perturb)
        loss2 = J(theta + perturb)
        numgrad[p] = (loss2 - loss1) / (2*e)
        perturb[p] = 0
    return numgrad
You have an error in your sigmoid function. It should be like this:
def sigmoid(z):
    return 1 / (1+np.exp(-z))
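A quick way to see the effect of the missing minus sign: 1/(1+exp(z)) equals sigmoid(-z), so every activation (and therefore every delta in backpropagation) gets flipped. For example:

import numpy as np
z = np.array([-2.0, 0.0, 2.0])
print(1/(1+np.exp(z)))    # wrong version: roughly [0.881 0.5 0.119], i.e. sigmoid(-z)
print(1/(1+np.exp(-z)))   # correct sigmoid: roughly [0.119 0.5 0.881]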
I'm also a little confused by the implementation of the backpropagation algorithm; I would do it without the for loop.
You did not post your computeCost, so I wrote one myself and checked the gradients. In my case both columns agree:
('Function Gradient', 'Numerical Gradient')
(-0.0087363416123043425, 0.0)
(0.017468375248392107, 0.0174683752529603)
(-0.0016267134050363559, -0.0016267134039793518)
(0.0018882373947080224, 0.0018882373997719526)
(-0.0063531428795779391, -0.0063531428762253483)
(0.0029882213493977773, 0.0029882213481435826)
(0.014295787205089885, 0.014295787205131916)
(-0.026668095974979808, -0.026668095973736428)
(0.0043373799514851595, 0.0043373799440971084)
(0.0063740837472641377, 0.0063740837497050506)
(0.0027102260448642525, 0.0027102260435896142)
(0.0067009063282609839, 0.0067009063298151261)
(-0.0029645476578591843, -0.0029645476562478734)
(-0.012000477453137556, -0.012000477451756808)
(-0.020065071389262716, -0.020065071393293721)
(0.010308693441913186, 0.010308693438876304)
(-0.0015996484140612609, -0.0015996484115099463)
(-0.0086037766244218914, -0.0086037766244828617)
(-0.0099431361329477934, -0.0099431361344493041)
(0.0062574996404342166, 0.0062574996406716821)
(0.30213488769328123, 0.3021348876908192)
(0.14900524972537924, 0.14900524972549789)
(0.13305168538400619, 0.13305168538479961)
(0.16730920742910549, 0.16730920743279754)
(0.14245586995768528, 0.14245586995365045)
(0.15465244296463604, 0.15465244296519742)
(0.10813908901043021, 0.10813908900342284)
(0.040844058224880242, 0.04084405822446513)
(0.040566215206120269, 0.040566215204762557)
(0.036451467449020114, 0.036451467448905817)
(0.065664340475228455, 0.065664340476168093)
(0.070753692265581092, 0.07075369226283712)
(0.088651862157018618, 0.088651862166777562)
(0.028272897964677978, 0.028272897965031518)
(0.026876928049457398, 0.026876928049812676)
(0.056512225949437798, 0.056512225949933992)
(0.051775047342360533, 0.051775047342772496)
(0.025689087137289929, 0.025689087135294386)
Relative Difference:
0.00878484310135
Here is my code:
import numpy as np

def sigmoid(z):
    return 1 / (1+np.exp(-z))

def sigmoid_gradient(z):
    return sigmoid(z)*(1-sigmoid(z))

def randInitializeWeights(layer_in, layer_out):
    matrix = np.zeros((layer_out, 1 + layer_in))
    epsilon_init = 0.12
    matrix = np.random.rand(layer_out, 1+layer_in) * 2 * epsilon_init - epsilon_init
    return matrix
def gradient(theta, *args):
    X, y, num_inputs, num_hidden_units, num_labels, lamb = args
    m = len(X)
    y_bin = np.zeros((m, num_labels))
    for i in range(m):
        y_bin[i, y[i]] = 1
    theta1 = np.reshape(theta[0:(num_hidden_units*(num_inputs+1))], (num_hidden_units, (num_inputs+1)))  #5x4
    theta2 = np.reshape(theta[(num_hidden_units*(num_inputs+1)):], (num_labels, num_hidden_units+1))  #3x6
    theta1_grad = np.zeros(theta1.shape)
    theta2_grad = np.zeros(theta2.shape)
    delta1 = np.zeros(theta1.shape)
    delta2 = np.zeros(theta2.shape)
    #forward
    a_1 = np.hstack((np.ones((m, 1)), X))  #5x4
    z_2 = np.dot(a_1, theta1.transpose())  #5x5
    a_2 = sigmoid(z_2)  #5x5
    a_2 = np.hstack((np.ones((m, 1)), a_2))  #5x6
    z_3 = np.dot(a_2, theta2.transpose())  #5x3
    h = sigmoid(z_3)  #5x3
    #backward
    delta3 = h - y_bin  #5x3
    delta2 = np.dot(delta3, theta2[:, 1:num_hidden_units+1]) * sigmoid_gradient(z_2)  #5x5
    D1 = np.dot(delta2.transpose(), a_1)  #5x4
    D2 = np.dot(delta3.transpose(), a_2)  #3x6
    theta1_grad = D1/m  #5x4
    theta2_grad = D2/m  #3x6
    #regularization
    theta1_grad[:, 1:num_inputs+1] = theta1_grad[:, 1:num_inputs+1] + lamb/m * theta1[:, 1:num_inputs+1]
    theta2_grad[:, 1:num_hidden_units+1] = theta2_grad[:, 1:num_hidden_units+1] + lamb/m * theta2[:, 1:num_hidden_units+1]
    #unroll
    grad = np.hstack([theta1_grad.ravel(), theta2_grad.ravel()])
    return grad
def gradientChecking(lamb):
    input_layer_size = 3
    hidden_layer_size = 5
    num_labels = 3
    m = 5
    theta1 = randInitializeWeights(input_layer_size, hidden_layer_size)
    theta2 = randInitializeWeights(hidden_layer_size, num_labels)
    X = np.random.rand(m, input_layer_size)
    y = np.array([1, 2, 0, 1, 2])
    nn_params = np.hstack([theta1.ravel(), theta2.ravel()])
    #calculate gradient with function
    grad = gradient(nn_params, X, y, input_layer_size, hidden_layer_size, num_labels, lamb)
    #calculate numerical gradient
    num_grad = computeNumericalGradient(lambda theta: computeCost(theta, X, y, input_layer_size, hidden_layer_size, num_labels, lamb), nn_params)
    print('Function Gradient', 'Numerical Gradient')
    for i in range(len(grad)):
        print(grad[i], num_grad[i])
    diff = np.linalg.norm(num_grad-grad)/np.linalg.norm(num_grad+grad)
    print('Relative Difference: ')
    print(diff)
def computeCost(theta, X, y, num_inputs, num_hidden_units, num_labels, lamb):
    m = len(X)
    y_bin = np.zeros((m, num_labels))
    for i in range(m):
        y_bin[i, y[i]] = 1
    theta1 = np.reshape(theta[0:(num_hidden_units*(num_inputs+1))], (num_hidden_units, (num_inputs+1)))  #5x4
    theta2 = np.reshape(theta[(num_hidden_units*(num_inputs+1)):], (num_labels, num_hidden_units+1))  #3x6
    a_1 = np.hstack((np.ones((m, 1)), X))  #5x4
    z_2 = np.dot(a_1, theta1.transpose())  #5x5
    a_2 = sigmoid(z_2)  #5x5
    a_2 = np.hstack((np.ones((m, 1)), a_2))  #5x6
    z_3 = np.dot(a_2, theta2.transpose())  #5x3
    h = sigmoid(z_3)
    cost = np.sum(-y_bin * np.log(h) - (1-y_bin) * np.log(1-h))/m
    #regularization
    theta1_sq = theta1[:, 1:num_inputs+1] * theta1[:, 1:num_inputs+1]
    theta2_sq = theta2[:, 1:num_hidden_units+1] * theta2[:, 1:num_hidden_units+1]
    cost = cost + lamb/(2.0*m)*(np.sum(theta1_sq) + np.sum(theta2_sq))
    return cost
def computeNumericalGradient(J, theta):
    numgrad = np.zeros(theta.shape)
    perturb = np.zeros(theta.shape)
    e = 0.0001
    for p in range(1, np.size(theta)):
        perturb[p] = e
        loss1 = J(theta - perturb)
        loss2 = J(theta + perturb)
        numgrad[p] = (loss2 - loss1) / (2*e)
        perturb[p] = 0
    return numgrad

gradientChecking(1.0)

How to perform Numpy optimisation for this code?

I have the following code snippet:
def func1(self, X, y):
    # X.shape = (455, 13)
    # y.shape = (455,)
    num_examples, num_features = np.shape(X)
    self.weights = np.random.uniform(-1 / (2 * num_examples), 1 / (2 * num_examples), num_features)
    while condition:
        new_weights = np.zeros(num_features)
        K = (np.dot(X, self.weights) - y)
        for j in range(num_features):
            summ = 0
            for i in range(num_examples):
                summ += K[i] * X[i][j]
            new_weights[j] = self.weights[j] - ((self.alpha / num_examples) * summ)
        self.weights = new_weights
This code runs too slowly. Is there any optimization I can do?
You can use np.einsum() efficiently here. See a test version below:
def func2(X, y):
    num_examples, num_features = np.shape(X)
    weights = np.random.uniform(-1./(2*num_examples), 1./(2*num_examples), num_features)
    K = (np.dot(X, weights) - y)
    return weights - alpha/num_examples*np.einsum('i,ij->j', K, X)
You can get new_weights directly using matrix multiplication with np.dot, like so:
new_weights = self.weights- ((self.alpha / num_examples) * np.dot(K[None],X))
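For reference, a fully vectorized version of the whole update loop might look roughly like this (a sketch only; `condition` and `self.alpha` are kept from the original and assumed to be defined elsewhere):

def func1(self, X, y):
    num_examples, num_features = np.shape(X)
    self.weights = np.random.uniform(-1 / (2 * num_examples), 1 / (2 * num_examples), num_features)
    while condition:
        K = np.dot(X, self.weights) - y
        # X.T @ K replaces the two nested Python loops over examples and features
        self.weights = self.weights - (self.alpha / num_examples) * np.dot(X.T, K)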
