import numpy as np

x = np.array([1, 2, 3, 4, 5, 6, 7])
f1 = np.array([1, 2, 3, 4, 5, 6, 7])
f2 = np.array([1, 2, 3, 4, 5, 6, 7])

def func(w1, w2, x, f1, f2):
    w1 = 1 - w2
    return np.std(x / (w1 * f1 + w2 * f2))
I need my code to minimize func(w1, w2, x, f1, f2) by changing w1 and w2, and then give me the w1 and w2 values. w1 + w2 should be equal to 1.
Something like this might be what you need:
import numpy as np
import scipy.optimize

x = np.random.randint(1, 10, 7)
f1 = np.random.randint(1, 10, 7)
f2 = np.random.randint(1, 10, 7)

def func(w, x, f1, f2):  # no need to pass w1 and w2 separately
    return np.std(x / (w[0] * f1 + (1 - w[0]) * f2))

res = scipy.optimize.minimize(func, x0=[0.5], args=(x, f1, f2), bounds=[(0, 1)])
w1 = res.x[0]
w2 = 1 - w1
print("Optimal weights are", w1, w2)
Loss function:
L = (XW - t)**2
t: the target value
W: the weights, [w1, w2]
X: the input, [x1, x2]
Therefore, L = (-t + w1*x1 + w2*x2)**2
I am trying to compute its Hessian, which others have proved should be positive definite. But I get the following result:
(-t + w1*x1 + w2*x2)**2
first grad:
2*x1*(-t + w1*x1 + w2*x2)
2*x2*(-t + w1*x1 + w2*x2)
Hessian:
[2*x1**2, 2*x1*x2]
[2*x1*x2, 2*x2**2]
Value: 0
Does anyone know why the value is 0?
Here is the code:
from sympy import symbols, diff, expand, Matrix

x1, x2, w1, w2, t = symbols("x1 x2 w1 w2 t")
# inp = Matrix(1, 2, [u, i])
# w = Matrix(2, 1, [w1, w2])
e1 = x1*w1 + x2*w2
# e2 = u*w3 + i*w4
# e3 = e1*w5 + e2*w6
L = (e1 - t)**2
print("loss function: ", L)

all_symbols = [w1, w2]
first_diff = []
second_diff = []
for symbol in all_symbols:
    # First derivative with respect to this symbol ...
    first_diff.append(diff(L, symbol))
    # ... then one row of second derivatives
    temp_diff = []
    for symbol2 in all_symbols:
        temp_diff.append(expand(diff(first_diff[-1], symbol2)))
    second_diff.append(temp_diff)

print("\nfirst grad:")
for function in first_diff:
    print(function)

print("\nHessian: ")
for elem in second_diff:
    print(elem)

Hessian = Matrix(second_diff)
# print(Hessian)
print("\nValue: ", Hessian.det())
I'm writing a multi-layer perceptron from scratch and I think it's way slower than it should be. The culprit seems to be my compute_gradients function, which according to my investigation accounts for most of the execution time. It looks like this:
def compute_gradients(X, Y, S1, H, P, W1, W2, lamb):
    # Y must be one-hot
    # Y and P must be (10, n)
    # X and H must be (3072, n)
    # P is the softmax layer
    if not (Y.shape[0] == 10 and P.shape[0] == 10 and Y.shape == P.shape):
        raise ValueError("Y and P must have shape (10, k). Y: {}, P: {}".format(Y.shape, P.shape))
    if not X.shape[0] == n_input:
        raise ValueError("X must have shape ({}, k), has {}".format(n_input, X.shape))
    if not H.shape[0] == n_hidden:
        raise ValueError("H must have shape ({}, k)".format(n_hidden))
    grad_W1 = np.zeros([n_hidden, n_input])
    grad_W2 = np.zeros([10, n_hidden])
    grad_b1 = np.zeros([n_hidden, 1])
    grad_b2 = np.zeros([10, 1])
    # Transpose so that iterating yields one sample at a time
    X, Y, S1, H, P = X.T, Y.T, S1.T, H.T, P.T
    for x, y, s1, h, p in zip(X, Y, S1, H, P):
        h = np.reshape(h, [1, n_hidden])
        y = np.reshape(y, [10, 1])
        p = np.reshape(p, [10, 1])
        # Second layer
        g = -(y - p).T
        grad_b2 += g.T
        grad_W2 += np.matmul(g.T, h)
        # First layer
        g = np.matmul(g, W2)
        # Indicator for the ReLU derivative at s1
        ind = np.zeros(h.shape[1])
        for i, val in enumerate(s1):
            if val > 0:
                ind[i] = 1
        diag = np.diag(ind)
        g = np.matmul(g, diag)
        grad_b1 += g.T
        grad_W1 += np.matmul(g.T, np.reshape(x, [1, n_input]))
    # Divide by batch size
    grad_b1 /= X.shape[0]
    grad_b2 /= X.shape[0]
    grad_W1 /= X.shape[0]
    grad_W2 /= X.shape[0]
    # Add regularization term
    grad_W1 += 2*lamb*W1
    grad_W2 += 2*lamb*W2
    return grad_W1, grad_W2, grad_b1, grad_b2
If X, Y, H, and P each hold 10 samples (n = 10), the computation takes about one second. That is far too slow compared to friends of mine doing the same task, but I can't see any obvious inefficiencies in my code. What can I do to speed the computations up?
EDIT: The input data is the CIFAR-10 dataset. Load it like this:
def one_hot(Y):
    # assume Y = [1, 4, 9, 0, ...]
    result = [None]*len(Y)
    for i, cls in enumerate(Y):
        onehot = {
            0: lambda: [1,0,0,0,0,0,0,0,0,0],
            1: lambda: [0,1,0,0,0,0,0,0,0,0],
            2: lambda: [0,0,1,0,0,0,0,0,0,0],
            3: lambda: [0,0,0,1,0,0,0,0,0,0],
            4: lambda: [0,0,0,0,1,0,0,0,0,0],
            5: lambda: [0,0,0,0,0,1,0,0,0,0],
            6: lambda: [0,0,0,0,0,0,1,0,0,0],
            7: lambda: [0,0,0,0,0,0,0,1,0,0],
            8: lambda: [0,0,0,0,0,0,0,0,1,0],
            9: lambda: [0,0,0,0,0,0,0,0,0,1],
        }[cls]()
        result[i] = onehot
    result = np.array(result).T
    return result
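As an aside, the lookup table above can be replaced by indexing into an identity matrix. A one-line equivalent (my own sketch, keeping the same (10, n) output orientation; the int cast handles labels that np.append has turned into floats):

def one_hot(Y):
    # Row i of np.eye(10) is the one-hot vector for class i
    return np.eye(10)[np.asarray(Y, dtype=int)].T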
import os

def unpickle(file):
    import pickle
    with open(file, "rb") as fo:
        d = pickle.load(fo, encoding="bytes")
    return d

names = ["data_batch_1",
         "data_batch_2",
         "data_batch_3",
         "data_batch_4",
         "data_batch_5",
         ]

# All data sets (path is the directory holding the extracted batch files)
dataset_large = {"data": np.zeros([0, 3072]), "labels": np.array([])}
validation_large = {}

## All data batches
for name in names[0:4]:
    raw = unpickle(os.path.join(path, name))
    dataset_large["data"] = np.append(dataset_large["data"], raw[b"data"], axis=0)
    dataset_large["labels"] = np.append(dataset_large["labels"], raw[b"labels"], axis=0)
# Batch 5: all but the last 1000 samples go to training, the rest to validation
raw = unpickle(os.path.join(path, names[4]))
dataset_large["data"] = np.append(dataset_large["data"], raw[b"data"][0:-1000], axis=0)
dataset_large["labels"] = np.append(dataset_large["labels"], raw[b"labels"][0:-1000], axis=0)
validation_large["data"] = raw[b"data"][-1000:]
validation_large["labels"] = raw[b"labels"][-1000:]

# Make one-hot
dataset_large["labels"] = one_hot(dataset_large["labels"]).T
validation_large["labels"] = one_hot(validation_large["labels"]).T

# Normalize
dataset_large["data"] = dataset_large["data"]/255
validation_large["data"] = validation_large["data"]/255
The dataset can be found at https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz . Then run it like this:
# n_input = 3072 and n_hidden are module-level constants;
# layer_1 (ReLU hidden layer) and layer_2 (softmax output) are defined elsewhere
def evaluate_classifier(X, W1, W2, b1, b2):
    if not X.shape[0] == n_input:
        raise ValueError("Wrong shape of X: {}".format(X.shape))
    if not len(X.shape) == 2:
        raise ValueError("Wrong shape of X: {}".format(X.shape))
    if not W1.shape == (n_hidden, n_input):
        raise ValueError("Wrong shape of W1: {}".format(W1.shape))
    if not b1.shape == (n_hidden, 1):
        raise ValueError("Wrong shape of b1: {}".format(b1.shape))
    if not W2.shape == (10, n_hidden):
        raise ValueError("Wrong shape of W2: {}".format(W2.shape))
    if not b2.shape == (10, 1):
        raise ValueError("Wrong shape of b2: {}".format(b2.shape))
    s1, h = layer_1(X, W1, b1)
    p = layer_2(h, W2, b2)
    return s1, h, p

W1 = np.random.normal(0, 0.01, [n_hidden, n_input])
W2 = np.random.normal(0, 0.01, [10, n_hidden])
b1 = np.random.normal(0, 0.1, [n_hidden, 1])
b2 = np.random.normal(0, 0.1, [10, 1])
X = dataset_large["data"][0:10].T      # (3072, 10): one column per sample
Y = dataset_large["labels"][0:10].T    # (10, 10)
S1, H, P = evaluate_classifier(X, W1, W2, b1, b2)
lamb = 0
compute_gradients(X, Y, S1, H, P, W1, W2, lamb)
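For comparison, here is a sketch of how the per-sample loop could be vectorized (my own rewrite under the same shape conventions as compute_gradients above, not the original code). The Python-level loop and the np.diag matrix product are the expensive parts; batched matrix products and a boolean ReLU mask compute the same gradients in a handful of NumPy calls:

def compute_gradients_vectorized(X, Y, S1, H, P, W1, W2, lamb):
    # X: (3072, n); Y, P: (10, n); S1, H: (n_hidden, n)
    n = X.shape[1]
    G = -(Y - P)                                   # (10, n)
    grad_b2 = G.sum(axis=1, keepdims=True) / n
    grad_W2 = np.matmul(G, H.T) / n + 2 * lamb * W2
    G = np.matmul(W2.T, G) * (S1 > 0)              # boolean mask instead of np.diag
    grad_b1 = G.sum(axis=1, keepdims=True) / n
    grad_W1 = np.matmul(G, X.T) / n + 2 * lamb * W1
    return grad_W1, grad_W2, grad_b1, grad_b2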
I want to define a custom loss function in Keras with the TensorFlow backend which uses only the predicted y values, regardless of the true ones. The graph compiles successfully, but at the start of training it raises an exception: InvalidArgumentError (see above for traceback): Self-adjoint eigen decomposition was not successful. The input might not be valid. I have tried replacing my data with random dummy data, but it produces the same exception. My full loss definition can be found below. Why is the input to tf.self_adjoint_eig not valid?
import tensorflow as tf
from keras import backend as K

def model_correlation_loss(representation_size, k_singular_values):
    global batch_size
    def keras_loss(y_true, y_pred):
        global batch_size
        regularization_constant_1 = regularization_constant_2 = 1e-4
        epsilon = 1e-12
        # Split the prediction into the two views
        o1 = o2 = int(y_pred.shape[1] // 2)
        h_1 = y_pred[:, 0:o1]
        h_2 = y_pred[:, o1:o1+o2]
        h_1 = tf.transpose(h_1)
        h_2 = tf.transpose(h_2)
        m = tf.shape(h_1)[1]
        # Center each view by subtracting the per-row mean
        centered_h_1 = h_1 - tf.cast(tf.divide(1, m), tf.float32) * tf.matmul(h_1, tf.ones(shape=(m, m)))
        centered_h_2 = h_2 - tf.cast(tf.divide(1, m), tf.float32) * tf.matmul(h_2, tf.ones(shape=(m, m)))
        # Regularized covariance estimates
        sigma_hat_12 = tf.cast(tf.divide(1, m - 1), tf.float32) * tf.matmul(centered_h_1, tf.transpose(centered_h_2))
        sigma_hat_11 = tf.cast(tf.divide(1, m - 1), tf.float32) * tf.matmul(centered_h_1, tf.transpose(centered_h_1)) + regularization_constant_1 * tf.eye(num_rows=o1)
        sigma_hat_22 = tf.cast(tf.divide(1, m - 1), tf.float32) * tf.matmul(centered_h_2, tf.transpose(centered_h_2)) + regularization_constant_2 * tf.eye(num_rows=o2)
        w_1, v_1 = tf.self_adjoint_eig(sigma_hat_11)
        w_2, v_2 = tf.self_adjoint_eig(sigma_hat_22)
        # Keep only eigenpairs with eigenvalues above epsilon (eigenvectors are columns)
        idx_pos_entries_1 = tf.where(tf.greater(w_1, epsilon))[:, 0]
        w_1 = tf.gather(w_1, idx_pos_entries_1)
        v_1 = tf.gather(v_1, idx_pos_entries_1, axis=1)
        idx_pos_entries_2 = tf.where(tf.greater(w_2, epsilon))[:, 0]
        w_2 = tf.gather(w_2, idx_pos_entries_2)
        v_2 = tf.gather(v_2, idx_pos_entries_2, axis=1)
        # Inverse square roots of the covariance matrices
        sigma_hat_rootinvert_11 = tf.matmul(tf.matmul(v_1, tf.diag(tf.reciprocal(tf.sqrt(w_1)))), tf.transpose(v_1))
        sigma_hat_rootinvert_22 = tf.matmul(tf.matmul(v_2, tf.diag(tf.reciprocal(tf.sqrt(w_2)))), tf.transpose(v_2))
        t_matrix = tf.matmul(tf.matmul(sigma_hat_rootinvert_11, sigma_hat_12), sigma_hat_rootinvert_22)
        if k_singular_values == representation_size:  # use all singular values
            correlation = tf.sqrt(tf.trace(tf.matmul(K.transpose(t_matrix), t_matrix)))
            return correlation
    return keras_loss
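One thing worth checking (a debugging sketch of mine, not part of the original code): tf.self_adjoint_eig raises this error when its input contains NaN or Inf values, and the 1/(m - 1) factor above divides by zero whenever a batch ends up with a single sample. Wrapping the covariance matrices in tf.check_numerics reports where non-finite values first appear:

# Insert after sigma_hat_11 / sigma_hat_22 are built inside keras_loss
sigma_hat_11 = tf.check_numerics(sigma_hat_11, "sigma_hat_11 contains NaN/Inf")
sigma_hat_22 = tf.check_numerics(sigma_hat_22, "sigma_hat_22 contains NaN/Inf")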
Here is the TensorFlow code provided by Wang on his website for computing the loss function:
def CCA_loss(H1, H2, N, d1, d2, dim, rcov1, rcov2):
    # Remove mean.
    m1 = tf.reduce_mean(H1, axis=0, keep_dims=True)
    H1 = tf.subtract(H1, m1)
    m2 = tf.reduce_mean(H2, axis=0, keep_dims=True)
    H2 = tf.subtract(H2, m2)
    S11 = tf.matmul(tf.transpose(H1), H1) / (N-1) + rcov1 * tf.eye(d1)
    S22 = tf.matmul(tf.transpose(H2), H2) / (N-1) + rcov2 * tf.eye(d2)
    S12 = tf.matmul(tf.transpose(H1), H2) / (N-1)
    E1, V1 = tf.self_adjoint_eig(S11)
    E2, V2 = tf.self_adjoint_eig(S22)
    # For numerical stability (eps_eig is a small threshold defined elsewhere).
    idx1 = tf.where(E1 > eps_eig)[:, 0]
    E1 = tf.gather(E1, idx1)
    V1 = tf.gather(V1, idx1, axis=1)
    idx2 = tf.where(E2 > eps_eig)[:, 0]
    E2 = tf.gather(E2, idx2)
    V2 = tf.gather(V2, idx2, axis=1)
    K11 = tf.matmul(tf.matmul(V1, tf.diag(tf.reciprocal(tf.sqrt(E1)))), tf.transpose(V1))
    K22 = tf.matmul(tf.matmul(V2, tf.diag(tf.reciprocal(tf.sqrt(E2)))), tf.transpose(V2))
    T = tf.matmul(tf.matmul(K11, S12), K22)
    # Eigenvalues are sorted in increasing order.
    E2, U = tf.self_adjoint_eig(tf.matmul(T, tf.transpose(T)))
    return tf.reduce_sum(tf.sqrt(E2[-dim:]))