I want to replicate the torch.gather() function in TensorFlow 2.X.
I have a Tensor A (shape: [2, 3, 3]) and a corresponding Index-Tensor I (shape: [2, 2, 3]).
Using torch.gather() yields the following:
A = torch.tensor([[[10,20,30], [100,200,300], [1000,2000,3000]],
[[50,60,70], [500,600,700], [5000,6000,7000]]])
I = torch.tensor([[[0,1,0], [1,2,1]],
[[2,1,2], [1,0,1]]])
torch.gather(A, 1, I)
>
tensor([[[10, 200, 30], [100, 2000, 300]],
[[5000, 600, 7000], [500, 60, 700]]])
I have tried using tf.gather(), but this did not yield pytorch-like results. I also tried to play around with tf.gather_nd(), but I could not find a suitable solution.
I found this StackOverflow post, but this seems not to work for me.
Edit:
When using tf.gather_nd(A, I), I get the following result:
tf.gather_nd(A, I)
>
[[100, 6000],
[ 0, 60]]
The result for tf.gather(A, I) is rather lengthy. It has the shape of [2, 2, 3, 4, 3]
torch.gather and tf.gather_nd work differently and will therefore yield different results when using the same indices tensor (in some cases an error will also be returned). This is what the indices tensor would have to look like to get the same results:
import tensorflow as tf
A = tf.constant([[
[10,20,30], [100,200,300], [1000,2000,3000]],
[[50,60,70], [500,600,700], [5000,6000,7000]]])
I = tf.constant([[[
[0,0,0],
[0,1,1],
[0,0,2],
],[
[0,1,0],
[0,2,1],
[0,1,2],
]],
[[
[1,2,0],
[1,1,1],
[1,2,2],
],
[
[1,1,0],
[1,0,1],
[1,1,2],
]]])
print(tf.gather_nd(A, I))
tf.Tensor(
[[[ 10 200 30]
[ 100 2000 300]]
[[5000 600 7000]
[ 500 60 700]]], shape=(2, 2, 3), dtype=int32)
So, the question is actually how are you calculating your indices or are they always hard-coded? Also, check out this post on the differences of the two operations.
As for the post you linked that didn't work for you, you just need to cast the indices and everything should be fine:
def torch_gather(x, indices, gather_axis):
    """Gather values from ``x`` along one axis, mimicking ``torch.gather``.

    For a 3-D input and ``gather_axis=1`` the result satisfies
    ``out[i][j][k] = x[i][indices[i][j][k]][k]``; other axes are analogous.

    Args:
        x: Tensor to gather from.
        indices: Integer tensor with a fully defined static shape; its rank
            determines how many coordinates are built per element.
        gather_axis: Axis along which ``indices`` selects. Negative values
            count from the last axis.

    Returns:
        A tensor with the same shape as ``indices``.

    Raises:
        ValueError: If ``gather_axis`` is out of range for the rank of
            ``indices``.
    """
    rank = len(indices.shape)
    # Normalise negative axes; otherwise the comparison in the loop below
    # would never match and the gather would silently ignore `indices`.
    axis_to_gather = gather_axis + rank if gather_axis < 0 else gather_axis
    if not 0 <= axis_to_gather < rank:
        raise ValueError(
            "gather_axis %d is out of range for indices of rank %d"
            % (gather_axis, rank))

    # Coordinates of every position of `indices`, in row-major order.
    # tf.where returns int64, so the gathered locations are cast to match.
    all_indices = tf.where(tf.fill(indices.shape, True))
    gather_locations = tf.cast(tf.reshape(indices, [-1]), dtype=tf.int64)

    # Keep each position's own coordinate on every axis except the gather
    # axis, where the user-supplied index is substituted.
    gather_indices = tf.stack(
        [
            gather_locations if axis == axis_to_gather else all_indices[:, axis]
            for axis in range(rank)
        ],
        axis=-1,
    )
    gathered = tf.gather_nd(x, gather_indices)
    return tf.reshape(gathered, indices.shape)
I = tf.constant([[[0,1,0], [1,2,1]],
[[2,1,2], [1,0,1]]])
A = tf.constant([[
[10,20,30], [100,200,300], [1000,2000,3000]],
[[50,60,70], [500,600,700], [5000,6000,7000]]])
print(torch_gather(A, I, 1))
tf.Tensor(
[[[ 10 200 30]
[ 100 2000 300]]
[[5000 600 7000]
[ 500 60 700]]], shape=(2, 2, 3), dtype=int32)
You could also try this as an equivalent to torch.gather:
import random
import numpy as np
import tensorflow as tf
import torch
# torch.gather equivalent
def tf_gather(x: tf.Tensor, indices: tf.Tensor, axis: int) -> tf.Tensor:
    """Gather values from ``x`` along ``axis`` like ``torch.gather``.

    The multi-dimensional coordinates are converted to positions in the
    flattened ``x`` with NumPy, so the inputs must be convertible to NumPy
    arrays (e.g. eager tensors).

    Args:
        x: Tensor to gather from.
        indices: Integer tensor of indices into ``x`` along ``axis``.
        axis: Axis along which to gather.

    Returns:
        A tensor with the same shape as ``indices``.
    """
    index_shape = tuple(indices.shape)
    # One coordinate row per dimension, enumerating every position of
    # `indices` in row-major order. Unlike masking with `indices > -1`,
    # this does not drop entries when an index happens to be negative.
    coords = np.indices(index_shape).reshape(len(index_shape), -1)
    # Along the gather axis, use the requested index instead of the position.
    coords[axis] = np.asarray(indices).reshape(-1)
    flat_positions = np.ravel_multi_index(tuple(coords), tuple(x.shape))
    gathered = tf.gather(tf.reshape(x, [-1]), flat_positions)
    return tf.reshape(gathered, index_shape)
# ======= test program ========
if __name__ == '__main__':
    # Compare max-along-axis computed three ways: NumPy directly, torch.gather
    # with argmax indices, and the TensorFlow re-implementation tf_gather.
    a = np.random.rand(2, 5, 3, 4)
    dim = 2  # must satisfy 0 <= dim < len(a.shape)
    ind = np.expand_dims(np.argmax(a, axis=dim), axis=dim)

    # ========== np: groundtruth ==========
    np_max = np.expand_dims(np.max(a, axis=dim), axis=dim)

    # ========= torch: gather =========
    torch_max = torch.gather(torch.tensor(a), dim=dim, index=torch.tensor(ind))

    # ========= tensorflow: torch-like gather =========
    tf_max = tf_gather(tf.convert_to_tensor(a), axis=dim, indices=tf.convert_to_tensor(ind))

    keepdim = False
    if not keepdim:
        # Drop the singleton axis that the gather left behind.
        np_max = np.squeeze(np_max, axis=dim)
        torch_max = torch.squeeze(torch_max, dim=dim)
        tf_max = tf.squeeze(tf_max, axis=dim)

    # print('np_max:\n', np_max)
    # print('torch_max:\n', torch_max)
    # print('tf_max:\n', tf_max)

    assert np.allclose(np_max, torch_max.numpy()), '\33[1m\33[31mError with torch\33[0m'
    assert np.allclose(np_max, tf_max.numpy()), '\33[1m\33[31mError with tensorflow\33[0m'
    print('\33[1m\33[32mSuccess!\33[0m')
Related
I have a Markov chain function implemented in JAX that advances the chain from state s -> s' based on some training data (X_train).
def step(state: dict, key, X_train) -> dict:
    """Advance the Markov chain by one transition (s -> s').

    Args:
        state: Current chain state, a nested dict of arrays.
        key: Passed through unchanged to ``advance``.
        X_train: Training data, passed through unchanged to ``advance``.

    Returns:
        The new state returned by ``advance``.
    """
    return advance(state, key, X_train)
Here, state is a fairly complicated tree-structured dict of array's that was generated by Haiku. For example,
state = {
'layer1': {
'weights': array(...),
'bias': array(...),
},
'layer2': {
'weights': array(...),
'bias': array(...),
},
}
I would like to run multiple Markov chains, with different states, in parallel. At first glance, jax.vmap function looks like a good candidate. However, state is not an array but a (tree-structured) dict.
What is the best way to approach this?
Thanks!
Yes, you could use vmap for any pytree. But this is how you should construct it:
states = {'layer1':{'weights':jnp.array([[1, -2, 3],
[4, 5, 6]])},
'layer2':{'weights':jnp.array([[1, .2, 3],
[.4, 5, 6]])}}
So in your first run, your weights will be [1, -2, 3] and [1, .2, 3] for layer1 and layer2 respectively (second run will be [4, 5, 6] and [.4, 5, 6]). But markov chain should be handled by jax.lax.scan. And you could use jit compilation to speed things up. Here is a trivial example. In each step chain calculates the following:
import jax
import jax.numpy as jnp
from functools import partial
@jax.jit
def step(carry, k):
    """Run a single step of the chain; shaped for use as a ``jax.lax.scan`` body.

    One step computes::

        X_new = log((relu(w1 @ X_old) + 1)[:, None] @ w2) + e,   e ~ Normal(0, 1)

    Args:
        carry: ``[state, X_train, rng]`` -- the parameter pytree (read at
            ``state['layer1']['weights']`` and ``state['layer2']['weights']``),
            the current chain value and the PRNG key.
        k: Scan input for this step; unused.

    Returns:
        ``([state, X_new, rng], e)``: the updated carry and the noise sample
        drawn in this step.
    """
    state, X_train, rng = carry
    rng, rng_input = jax.random.split(rng)
    # NOTE(review): the noise is drawn from the carried key `rng`, while the
    # freshly split `rng_input` is left unused. JAX recommends not reusing a
    # key that is split again later; kept as-is so the published output of
    # this example stays reproducible.
    e = jax.random.normal(rng)  # scalar standard-normal sample
    w1 = state['layer1']['weights']  # expected to be a 1-D vector per chain
    w2 = state['layer2']['weights'][None, :]  # add a leading axis -> row vector
    hidden = jax.nn.relu(w1 @ X_train)[:, None] + 1  # column vector
    X_train = jnp.log(hidden @ w2)  # outer product, then element-wise log
    X_train = X_train + e
    return [state, X_train, rng], e
@partial(jax.jit, static_argnums=3)
def fi(state, X_train, key, number_of_steps):
    """Run one chain for ``number_of_steps`` steps and return its final value.

    Args:
        state: Parameter pytree handed to ``step`` through the scan carry.
        X_train: Initial chain value.
        key: Integer seed used to build the PRNG key.
        number_of_steps: Number of scan iterations. Marked static for
            ``jax.jit`` (argument index 3), so it must be a hashable Python
            value such as an int.

    Returns:
        The chain value stored in the carry after the last step.
    """
    rng = jax.random.PRNGKey(key)
    initial_carry = [state, X_train, rng]
    # The per-step noise samples stacked by scan are not needed here.
    final_carry, _ = jax.lax.scan(step, initial_carry, xs=jnp.arange(number_of_steps))
    _, X_final, _ = final_carry
    return X_final
# Initial chain value, shared by every chain.
X_train = jnp.array([
    [1., -1., 0.5],
    [1., 1, 2.],
    [4, 2, 0.1],
])
# Every leaf carries a leading axis of size 2: one slice per chain.
states = {
    'layer1': {'weights': jnp.array([[1, -2, 3], [4, 5, 6]])},
    'layer2': {'weights': jnp.array([[1, .2, 3], [.4, 5, 6]])},
}
# Map only over axis 0 of the first argument; the rest is broadcast as-is.
vmap_fi = jax.vmap(fi, in_axes=(0, None, None, None))
key = 42  # random seed
number_of_steps = 100  # chain runs 100 steps
last_states = vmap_fi(states, X_train, key, number_of_steps)
print(last_states)
Output:
[[[ 1.8478627 0.23842478 2.946475 ]
[ 1.3278859 -0.28155205 2.4264982 ]
[ 2.0921988 0.48276085 3.1908112 ]]
[[ 2.9374144 5.4631433 5.645465 ]
[ 3.4333894 5.959118 6.1414394 ]
[ 3.4612248 5.9869533 6.169275 ]]]
In this example, you could make states dictionaries more complicated. You just need to parallelize on their 0th axis.
The problem I'm trying to solve is the one in the picture. Given a text sentence with word embeddings, and a fixed set of indexes for each sentence pointing to the words I want to keep, how do I slice the embeddings of interest?
Note: I cannot do it as a preprocess step because the embeddings are the result of several layers.
As a toy example, say that I have 2 input datasets, one containing the data itself as 2D tensors, and another one containing the indices of the words that I'm interested in. So for instance
NUM_SENTENCES=2
NUM_ENTITIES_PER_REL=3
LEN_SENTENCE=5
NUM_H_T=2
DIM_EMBEDDING=2
indices = tf.constant([
[1, 3],
[0, 4]
])
data = tf.constant(np.reshape(np.arange(NUM_SENTENCES*LEN_SENTENCE*DIM_EMBEDDING), [NUM_SENTENCES, LEN_SENTENCE, DIM_EMBEDDING]))
With the index as stated, I want to retrieve elements 1 and 3 from first element, and 0 and 4 from second to result in
array([[[ 2, 3],
[ 6, 7]],
[[10, 11],
[18, 19]]])
I can obtain desired result if I do:
selector = [[[idx, elem]
for elem in arr]
for idx, arr in enumerate(indices)]
tf.gather_nd(data, selector)
but this doesn't work within a model. Here it is my code:
input_text = keras.Input(shape=(LEN_SENTENCE, DIM_EMBEDDING), name="input_sentence")
input_ent = keras.Input(shape=(NUM_ENTITIES_PER_REL, 2), dtype=tf.int32, name="entities_to_classify")
class Selector(layers.Layer):
    """Layer that selects entries of a tensor with ``tf.gather_nd``.

    ``call`` expects ``inputs = [indexes, h_s]``: the index tensor first,
    then the tensor to select from.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def call(self, inputs):
        """Build ``[row, elem]`` index pairs from ``indexes`` and gather from ``h_s``."""
        indexes, h_s = inputs[0], inputs[1]
        # NOTE(review): `.numpy()` is only available on eager tensors. When
        # the layer is called on symbolic `keras.Input` tensors this line
        # raises AttributeError, as reported below; build the index tensor
        # with TensorFlow ops (tf.range / tf.tile / tf.stack) instead.
        idxs = indexes.numpy()
        # Pair each element with the position of its row along the first axis.
        selector = [[[idx, elem] for elem in arr] for idx, arr in enumerate(idxs)]
        return tf.gather_nd(h_s, selector)
x = Selector(name="selector")([input_ent, input_text])
model = keras.Model(inputs=[input_ent, input_text], outputs=x, name='language_model')
keras.utils.plot_model(model, '/tmp/model.jpg', show_shapes=True)
and the result of executing it (I'm using tensorflow==2.0.0-beta1).
AttributeError: 'Tensor' object has no attribute 'numpy'
and I don't know how to solve this chicken-egg problem. Any ideas?
You can do that like this:
import tensorflow as tf
import numpy as np
NUM_SENTENCES = 2
NUM_ENTITIES_PER_REL = 3
LEN_SENTENCE = 5
NUM_H_T = 2
DIM_EMBEDDING = 2
# tf.compat.v1.Session is used because plain tf.Session no longer exists in
# TensorFlow 2.x (the question targets 2.0.0-beta1).
with tf.Graph().as_default(), tf.compat.v1.Session() as sess:
    indices = tf.constant([
        [1, 3],
        [0, 4],
    ])
    data = tf.constant(np.reshape(np.arange(NUM_SENTENCES * LEN_SENTENCE * DIM_EMBEDDING),
                                  [NUM_SENTENCES, LEN_SENTENCE, DIM_EMBEDDING]))
    # Make first dimension indices: row i is filled with the value i, so
    # every entry of `indices` is paired with the sentence it belongs to.
    s = tf.shape(indices)
    idx0 = tf.tile(tf.expand_dims(tf.range(s[0]), 1), [1, s[1]])
    # Make full index: the last axis holds (sentence, word) pairs.
    idx_gather = tf.stack([idx0, indices], axis=-1)
    # Gather result
    result = tf.gather_nd(data, idx_gather)
    print(sess.run(result))
    # [[[ 2  3]
    #   [ 6  7]]
    #
    #  [[10 11]
    #   [18 19]]]
A Tensor can't be cast to NumPy; use the data instead:
idxs = indexes[0].numpy()
For example, there is an 3D tensor like this:
a = tf.constant([[[1,2,3],
[4,5,6],
[7,8,9]],
[[9,8,7],
[6,5,4],
[3,2,1]],
[[0,8,0],
[1,5,4],
[3,1,1]]])
I want to delete the different rows from the three elements with indices as:
idx = [[1],
[0],
[2]]
The result would be like this:
re = [[[1,2,3],
[7,8,9]],
[[6,5,4],
[3,2,1]],
[[0,8,0],
[1,5,4]]]
How to do it?
First approach: using tf.one_hot and tf.boolean_mask:
# shape = (?,1,3)
mask_idx = 1- tf.one_hot(idx,a.shape[1])
# shape = (?,3)
result = tf.boolean_mask(a,mask_idx[:,0,:])
# shape = (?,2,3)
result = tf.reshape(result,shape=(-1,a.shape[1]-1,a.shape[2]))
Second approach: using tf.map_fn:
result = tf.map_fn(lambda x: tf.boolean_mask(x[0],1 - tf.one_hot(tf.squeeze(x[1]),a.shape[1]))
, [a,idx]
, dtype=tf.int32)
An example:
import tensorflow as tf
a = tf.constant([[[1,2,3],[4,5,6],[7,8,9]],
[[9,8,7],[6,5,4],[3,2,1]],
[[0,8,0],[1,5,4],[3,1,1]]],dtype=tf.int32)
idx = tf.constant([[1],[0],[2]],dtype=tf.int32)
# First approach: build a keep-mask (0 at the row to delete, 1 elsewhere),
# mask the rows out, then restore the batch dimension.
# shape = (?,1,3)
mask_idx = 1 - tf.one_hot(idx, a.shape[1])
# shape = (?,3)
result = tf.boolean_mask(a, mask_idx[:, 0, :])
# shape = (?,2,3)
result = tf.reshape(result, shape=(-1, a.shape[1] - 1, a.shape[2]))
# Second approach: apply the same masking to each batch element separately.
result = tf.map_fn(
    lambda x: tf.boolean_mask(x[0], 1 - tf.one_hot(tf.squeeze(x[1]), a.shape[1])),
    [a, idx],
    dtype=tf.int32)
with tf.Session() as sess:
    print(sess.run(result))
# print
[[[1 2 3]
[7 8 9]]
[[6 5 4]
[3 2 1]]
[[0 8 0]
[1 5 4]]]
You can use numpy (assuming a and idx as numpy.ndarray):
import numpy as np
# One index per batch element: the entry along axis 1 that should be removed.
columns_to_delete = idx.flatten()
# Use the builtin `bool`; the `np.bool` alias was removed in NumPy 1.24.
mask = np.ones_like(a, dtype=bool)
# For batch element i, blank out the whole slice a[i, columns_to_delete[i], :].
mask[np.arange(a.shape[0]), columns_to_delete, :] = False
# Boolean indexing flattens the result, so restore the (batch, rows - 1, cols) shape.
re = a[mask].reshape(a.shape[0], a.shape[1] - 1, a.shape[2])
and then convert re to tensor using tf.convert_to_tensor
import tensorflow as tf
with tf.Session() as sess:
    with tf.variable_scope('masssdsms'):
        # Two batches of 128-dimensional vectors, drawn from N(0, 0.1^2).
        a = tf.get_variable('a', [1000, 24, 128], dtype=tf.float32,
                            initializer=tf.random_normal_initializer(stddev=0.1))
        b = tf.get_variable('b', [1000, 15, 128], dtype=tf.float32,
                            initializer=tf.random_normal_initializer(stddev=0.1))
I want to get a new tensor named c from a and b.
1000 is the batch size, and c's shape should be (1000,20, 10, 1). For every instance from a and b: ai and bi, they are both two dimensional tensors.
The new instance ci is computed from ai and bi and has 20 * 10 = 200 elements, where each element is the dot product of one 128-dimensional row of ai with one 128-dimensional row of bi. So there are 200 dot products in total. ci is more like a 2-D image.
How can I initialize this operation?
Modified:
When I use this code in practice, the dot product should be replaceable by some other function, such as Gaussian distance or cosine distance (denoted contact_func in the graph below).
So I need a general method to do this.
Here is my design, but I am not sure whether it is an efficient way to do this:
with tf.Session() as sess:
    with tf.variable_scope('masssdsms'):
        a = tf.get_variable('a', [1000, 24, 128], dtype=tf.float32,
                            initializer=tf.random_normal_initializer(stddev=0.1))
        b = tf.get_variable('b', [1000, 15, 128], dtype=tf.float32,
                            initializer=tf.random_normal_initializer(stddev=0.1))
    # Process a single batch element; the full version would loop over all of them.
    i = 999  # for i in range(1000):
    ai = tf.slice(a, [i, 0, 0], [1, -1, -1])  # (1,24,128)
    bi = tf.slice(b, [i, 0, 0], [1, -1, -1])  # (1,15,128)
    ci = contact_func(ai, bi)  # (1,24,15)
You can achieve that with clever application of broadcasting. Try this:
a = tf.ones([1000, 20, 128])
b = tf.ones([1000, 10, 128])
a = tf.expand_dims(a, axis=1) # [1000, 1, 20, 128]
b = tf.expand_dims(b, axis=2) # [1000, 10, 1, 128]
products = a * b # [1000, 10, 20, 128]
reduced = tf.reduce_sum(products, axis=-1) # [1000, 10, 20]
The products contains all pairwise multiplications of all items in a and b. And the reduced aggregates the sum over the last axis.
Doing a matmul of the matrix a with the transpose of the dimension-1 of b should give the desired result:
c = tf.matmul(a, tf.transpose(b, [0, 2, 1])) # [1000, 20, 10]
# to get (1000, 20, 10, 1) you do
tf.expand_dims(c, 3)
EDIT:
For the contact_func operation, you may need to manually do the broadcasting using tile operator. Here is the code for gaussian distance:
# use tile to repeat the rows: every row a_i appears b.shape[1] times in a row,
# so position i * b.shape[1] + j holds a_i
d = tf.reshape(tf.tile(a, [1, 1, b.shape[1]]), (-1, a.shape[1] * b.shape[1], a.shape[2]))
# [1000, 360, 128]
# repeat the columns: the whole of b is repeated a.shape[1] times,
# so position i * b.shape[1] + j holds b_j
e = tf.tile(b, [1, a.shape[1], 1])
# [1000, 360, 128]
# exp(-d_i_j), where d_i_j is the Euclidean distance between a_i and b_j
# (the norm is taken over the feature axis; a plain sum of differences
# would not be a distance)
c = tf.reshape(tf.exp(-tf.norm(d - e, axis=2)), (-1, a.shape[1], b.shape[1]))
# [1000, 24, 15]
I have a tensor X whose shape is (None, 56, 300, 1), and another tensor y whose shape is (None, 15), the first dimension of these tensors is batch_size, I wanna use y as index to get a tensor z, the shape of z is (None, 15, 300, 1). Is there any decent way to do this?
I wrote some simple code to test this. I find it difficult because in practice I don't know the batch_size (the first dimension of these tensors is None).
Here is my test code:
import numpy as np
import tensorflow as tf
# In this test code , batch_size is 4.
# params' shape is (4, 3, 2 ,1), in practice is (None, 56, 300, 1),
params = [
[[['a0'], ['b0']], [['d0'], ['e0']], [['f0'], ['g0']]],
[[['a1'], ['b1']], [['d1'], ['e1']], [['f1'], ['g1']]],
[[['a2'], ['b2']], [['d2'], ['e2']], [['f2'], ['g2']]],
[[['a3'], ['b3']], [['d3'], ['e3']], [['f3'], ['g3']]],
]
# ind's shape is (4, 2) (In practice is (None, 15)),
# so I wanna get output whose's shape is (4, 2, 2, 1), (In practice is (None, 15, 300, 1))
ind = [[1, 0], [0, 2], [2, 0], [2, 1]]
#ouput = [
# [[['d0'], ['e0']], [['a0'], ['b0']]],
# [[['a1'], ['b1']], [['f1'], ['g1']]],
# [[['f2'], ['g2']], [['a2'], ['b2']]],
# [[['f3'], ['g3']], [['d3'], ['e3']]]
#]
with tf.variable_scope('gather') as scope:
    tf_par = tf.constant(params)
    tf_ind = tf.constant(ind)
    # NOTE(review): gather_nd treats each row of tf_ind as a full coordinate
    # into tf_par, which is why this attempt does not give the desired output.
    res = tf.gather_nd(tf_par, tf_ind)
with tf.Session() as sess:
    init = tf.global_variables_initializer()
    # print() with one argument behaves the same on Python 2 and Python 3.
    print(sess.run(res))
    print(res)
To slice x along the second dimension with ind, that is, to slice
tensor x of shape (d0, d1, d2,...), d0 being possibly None,
with a tensor of indices ind of shape (d0, n1),
to obtain a tensor y of shape (d0, n1, d2, ...),
you could use tf.gather_nd along with tf.shape to get the shape at run time:
ind_shape = tf.shape(ind)
ndind = tf.stack([tf.tile(tf.range(ind_shape[0])[:, None], [1, ind_shape[1]]),
ind], axis=-1)
y = tf.gather_nd(x, ndind)
To get the result you expect, you should use:
ind = [[0, 1], [0, 0], [1, 0], [1, 2], [2, 2], [2, 0], [3, 2], [3, 1]]
Update
You can use this code to get what you want with the current input:
with tf.variable_scope('gather') as scope:
    tf_par = tf.constant(params)
    tf_ind = tf.constant(ind)
    tf_par_shape = tf.shape(tf_par)
    tf_ind_shape = tf.shape(tf_ind)
    # Batch index of every flattened entry of tf_ind: 0,...,0, 1,...,1, ...
    # `//` replaces the deprecated tf.div; both floor-divide integer tensors.
    tf_r = tf.range(0, tf_ind_shape[0] * tf_ind_shape[1]) // tf_ind_shape[1]
    tf_r = tf.expand_dims(tf_r, 1)
    # Pair each flattened index with its batch index -> rows of (batch, index).
    tf_ind = tf.expand_dims(tf.reshape(tf_ind, shape=[-1]), 1)
    tf_ind = tf.concat([tf_r, tf_ind], axis=1)
    res = tf.gather_nd(tf_par, tf_ind)
    # Restore the (batch, n_indices, d2, d3) layout.
    res = tf.reshape(res, shape=(-1, tf_ind_shape[1], tf_par_shape[2], tf_par_shape[3]))