I am trying to apply shap.DeepExplainer to explain my model's output.
My model class is as follows:
import tensorflow as tf
from tensorflow.keras import layers

NEG_INF = -1e9  # assumed: a large negative constant used to mask out padded timesteps

class MyModel(tf.keras.Model):
    def __init__(self,
                 input_dim,
                 emb_dim=128,
                 alpha_hidden_dim_size=128,
                 beta_hidden_dim_size=128,
                 keep_prob_emb=0.25,
                 keep_prob_context=0.25,
                 num_class=1):
        super(MyModel, self).__init__()
        self.embedding = layers.Dense(emb_dim,
                                      use_bias=False,
                                      input_shape=(input_dim, ))
        self.emb_drp = layers.Dropout(keep_prob_emb)
        self.enroll = layers.Dense(emb_dim, activation='tanh')
        self.gru_alpha = layers.Bidirectional(
            layers.LSTM(alpha_hidden_dim_size, return_sequences=True))
        self.gru_beta = layers.Bidirectional(
            layers.LSTM(beta_hidden_dim_size, return_sequences=True))
        self.alpha = layers.Dense(1)
        self.beta = layers.Dense(emb_dim, activation='tanh')
        self.context_drp = layers.Dropout(keep_prob_context)
        self.out = layers.Dense(num_class)

    def call(self, visits, enroll, lengths, **kwargs):
        max_len = lengths[tf.argmax(lengths)]
        visits = visits[:, :max_len]
        emb = self.embedding(visits)
        emb = self.emb_drp(emb, training=kwargs.get('training', False))
        enroll = self.enroll(enroll)
        mask = tf.sequence_mask(lengths)
        h_a = self.gru_alpha(emb, mask=mask)
        h_b = self.gru_beta(emb, mask=mask)
        preAlpha = self.alpha(h_a)
        preAlpha = tf.keras.backend.squeeze(preAlpha, axis=2)
        mask_norm = (1 - tf.cast(mask, tf.float32)) * NEG_INF
        alpha = tf.nn.softmax(preAlpha + mask_norm, axis=1)
        beta = self.beta(h_b)
        c_t = tf.math.reduce_sum(alpha[:, :, None] * beta * emb, axis=1)
        c_t = layers.add([c_t, enroll])
        c_t = self.context_drp(c_t, training=kwargs.get('training', False))
        preY = self.out(c_t)
        return preY, alpha, beta
I instantiate the model as:
model = MyModel(**flags)
and the model loads successfully:
print(model)
<__main__.MyModel object at 0x7f51db414400>
Then I try to build the explainer:
background = X.loc[10:20]
e = shap.DeepExplainer((model.layers[0].input, model.layers[-1].output), background)
but then it gives me the error:

AttributeError: Layer dense is not connected, no input to return.

Traceback (most recent call last):
  File "/home/ANANT/codes/test/env/lib/python3.6/site-packages/tensorflow/python/keras/engine/base_layer.py", line 1808, in input
    raise AttributeError('Layer ' + self.name + ' is not connected, no input to return.')
model.layers[-1].output cannot give a proper output either:

AttributeError: Layer dense_4 has no inbound nodes.

Traceback (most recent call last):
  File "/home/ANANT/test/env/lib/python3.6/site-packages/tensorflow/python/keras/engine/base_layer.py", line 1827, in output
    raise AttributeError('Layer ' + self.name + ' has no inbound nodes.')
My package versions are: keras==2.3.1, tensorflow==1.15.3, shap==0.35.0
I have been stuck on this question for a few days. I tried shap.KernelExplainer as well, and it gives me a different error:
shap.KernelExplainer(model, df_fis, link="logit")
And the error is as follows:
TypeError: call() missing 2 required positional arguments: 'enroll' and 'lengths'

Traceback (most recent call last):
  File "/home/ANANT/test/env/lib/python3.6/site-packages/shap/explainers/kernel.py", line 97, in __init__
    model_null = match_model_to_data(self.model, self.data)
  File "/home/ANANT/test/env/lib/python3.6/site-packages/shap/common.py", line 89, in match_model_to_data
    out_val = model.f(data.data)
  File "/home/ANANT/test/env/lib/python3.6/site-packages/tensorflow/python/keras/engine/base_layer.py", line 968, in __call__
    outputs = self.call(cast_inputs, *args, **kwargs)
Please help, thanks in advance!
I think you missed the softmax part.

PyTorch version:

self.softmax = LogSoftmax(dim=1)

Keras version:

layers.Dense(num_classes, activation="softmax")

Add the above line at the end of your __init__ method and see if it works.
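For reference, a minimal sketch of that change inside the question's __init__ (it reuses the num_class argument from the question; note that with the default num_class=1, a sigmoid activation would be the usual choice rather than softmax):

self.out = layers.Dense(num_class, activation="softmax")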
I am building a model using the fine-tuning method, and the model is VGG-16. But I get the following error: 'Sequential' object has no attribute 'in_features'. I used classifier, so I changed classifier into fc, but then I get the error 'Sequential' object has no attribute 'fc'. Can somebody guide me on what I am doing wrong? I have attached the tracebacks of both errors as well.
**ERROR: 'Sequential' object has no attribute 'in_features'**
Traceback (most recent call last):
  File "ct_pretrained.py", line 186, in <module>
    model = build_model().cuda()
  File "ct_pretrained.py", line 42, in build_model
    return models.VGG(is_emr=is_emr)
  File "/data/torch/models/vgg.py", line 19, in __init__
    num_ftrs = self.axial_model.classifier.in_features
  File "/root/miniconda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 778, in __getattr__
    raise ModuleAttributeError("'{}' object has no attribute '{}'".format(
torch.nn.modules.module.ModuleAttributeError: 'Sequential' object has no attribute 'in_features'
**ERROR: 'VGG' object has no attribute 'fc'**
Traceback (most recent call last):
  File "ct_pretrained.py", line 186, in <module>
    model = build_model().cuda()
  File "ct_pretrained.py", line 42, in build_model
    return models.VGG(is_emr=is_emr)
  File "/data/torch/models/vgg.py", line 19, in __init__
    num_ftrs = self.axial_model.fc.in_features
  File "/root/miniconda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 778, in __getattr__
    raise ModuleAttributeError("'{}' object has no attribute '{}'".format(
torch.nn.modules.module.ModuleAttributeError: 'VGG' object has no attribute 'fc'
import torch
import torch.nn as nn
from torchvision import models

__all__ = ['VGG']

class VGG(nn.Module):
    def __init__(self, is_emr=False, mode='sum'):
        super().__init__()
        self.is_emr = is_emr
        self.mode = mode

        in_dim = 45

        self.axial_model = models.vgg16(pretrained=True)
        out_channels = self.axial_model.features[0].out_channels
        self.axial_model.features[0] = nn.Conv2d(1, out_channels, kernel_size=7, stride=1, padding=0, bias=False)
        self.axial_model.features[3] = nn.MaxPool2d(1)
        num_ftrs = self.axial_model.classifier.in_features  # error in this line of code
        self.axial_model.classifier = nn.Linear(num_ftrs, 15)

        self.sa_co_model = models.vgg16(pretrained=True)
        self.sa_co_model.features[0] = nn.Conv2d(1, out_channels, kernel_size=7, stride=1, padding=(3,0), bias=False)
        self.sa_co_model.features[3] = nn.MaxPool2d(1)
        self.sa_co_model.classifier = nn.Linear(num_ftrs, 15)

        if self.is_emr:
            self.emr_model = EMRModel()
            if self.mode == 'concat': in_dim = 90

        self.classifier = Classifier(in_dim)

    def forward(self, axial, sagittal, coronal, emr):
        axial = axial[:,:,:-3,:-3]
        sagittal = sagittal[:,:,:,:-3]
        coronal = coronal[:,:,:,:-3]

        axial_feature = self.axial_model(axial)
        sagittal_feature = self.sa_co_model(sagittal)
        coronal_feature = self.sa_co_model(coronal)
        out = torch.cat([axial_feature, sagittal_feature, coronal_feature], dim=1)

        if self.is_emr:
            emr_feature = self.emr_model(emr)
            if self.mode == 'concat':
                out = torch.cat([out, emr_feature], dim=1)
            elif self.mode == 'sum':
                out += emr_feature

        out = self.classifier(out)
        return out
The classifier sequential object does not have a variable called in_features. If you want to determine it dynamically, you need to access a layer inside the classifier rather than the classifier as a whole: num_ftrs = self.axial_model.classifier[0].in_features. This accesses the first layer of the sequential object, namely the one that determines how many features go into the entire sequential object.
Alternatively, you can replace the classifier layer outright by working out the necessary number of input features by hand: looking at the PyTorch source code for VGG16, you can see that the classifier takes 512 * 7 * 7 features as input.
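A minimal sketch of both options against torchvision's stock VGG16 (the 15-class output size is taken from the question):

import torch.nn as nn
from torchvision import models

m = models.vgg16(pretrained=True)

# Option 1: read the input size of the classifier's first Linear layer dynamically.
num_ftrs = m.classifier[0].in_features  # 25088 == 512 * 7 * 7
m.classifier = nn.Linear(num_ftrs, 15)

# Option 2: hard-code the value read off the VGG16 source.
m.classifier = nn.Linear(512 * 7 * 7, 15)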
I am trying to implement a custom multi-input, multi-output model that uses the learning algorithm proposed in this paper. The model itself works fine without the custom learning algorithm, which I use as a baseline. The problem I encounter is that the code gets stuck in the train_step function of the DebiasModel class at this line:
mc_pred = self.main_classifier([xu, xs], training=True)
It did not return an error. After it ran for an hour, I interrupted the kernel, and it returned an error message saying:
InvalidArgumentError: Operation 'while' has no attr named '_XlaCompile'.
During handling of the above exception, another exception occurred:
InvalidArgumentError: Operation 'gradients/while_grad/Placeholder_28' has no attr named '_read_only_resource_inputs'.
I am not sure what the issue is. I also tried persistent=True on a single tf.GradientTape instead of declaring two GradientTapes in one with statement, but exactly the same error occurs.
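For reference, the persistent-tape variant I tried looks roughly like this (a sketch; the variable names are the ones from train_step below, and the rest of the method is unchanged):

with tf.GradientTape(persistent=True) as tape:
    mc_pred = self.main_classifier([xu, xs], training=True)
    mc_loss = self.mc_loss(y_mc, mc_pred, sample_weight=mainClass_weights)
mc_grads = tape.gradient(mc_loss, mc_trainable_vars)                   # classifier-head variables
mc_debias_grads = tape.gradient(mc_loss, me_mc_debias_trainable_vars)  # shared ME variables
del tape  # a persistent tape holds resources until deleted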
Does anyone have any idea what this issue is? And how it can be solved?
I am using TensorFlow v2.3.0 and Keras v2.4.0.
Source Code
# Assumed imports for this snippet; num_tokens, embedding_matrix and
# max_length are defined elsewhere in the original code.
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, concatenate
from tensorflow.keras.models import Model

class model_components:
    def mitigation_expert():
        inputs = Input(shape=(300,), dtype=tf.int32, name="me_input")
        x = Embedding(num_tokens, 300, weights=[embedding_matrix], input_length=max_length, trainable=False, name="me_embedding")(inputs)
        x = LSTM(300, return_sequences=False, name="me_lstm")(x)
        model = Model(inputs, x)
        return model

    def control_expert():
        inputs = Input(shape=(22,), dtype=tf.int32, name="ce_input")
        y = Dense(19, activation='relu', name="ce_hidden")(inputs)
        model = Model(inputs, y)
        return model

    def main_classifier():
        # Expert components
        me = model_components.mitigation_expert()
        ce = model_components.control_expert()

        # Main classifier
        ensemble = concatenate([me.output, ce.output], name="pred_ensemble")
        pred_output = Dense(319, activation="relu", name="pred_hidden")(ensemble)
        pred_output = Dense(3, activation="softmax", name="pred_output")(pred_output)
        model = Model(inputs=[me.input, ce.input], outputs=pred_output, name="main_classifier")
        return model

    def adversary_classifier():
        # Mitigation Expert component
        me = model_components.mitigation_expert()

        # Adversary classifier
        adv_output = Dense(300, activation='relu', name="adv_hidden")(me.output)
        adv_output = Dense(1, activation='sigmoid', name="adv_output")(adv_output)
        model = Model(inputs=me.input, outputs=adv_output, name="adversary_classifier")
        return model

def tf_normalize(x):
    return x / (tf.norm(x) + np.finfo(np.float32).tiny)
class DebiasModel(keras.Model):
    def __init__(self, main_classifier, adversary_classifier):
        super(DebiasModel, self).__init__()
        self.main_classifier = main_classifier
        self.adversary_classifier = adversary_classifier

    def compile(self, mc_optimizer, adv_optimizer, mc_loss, adv_loss, debias_param):
        super(DebiasModel, self).compile()
        self.mc_optimizer = mc_optimizer
        self.adv_optimizer = adv_optimizer
        self.mc_loss = mc_loss
        self.adv_loss = adv_loss
        self.debias_param = debias_param

    def train_step(self, data):
        # Unpack data from model.fit()
        x, y, sample_weight = data

        # Unpack input and output features
        xu, xs = x
        y_mc = y['pred_output']
        z_adv = y['adv_output']

        # Unpack sample_weights
        mainClass_weights = sample_weight["pred_output"]
        protectClass_weights = sample_weight["adv_output"]

        # Generate prediction and compute loss for Main_Classifier
        with tf.GradientTape() as mc_tape, tf.GradientTape() as me_mc_tape:
            mc_pred = self.main_classifier([xu, xs], training=True)
            mc_loss = self.mc_loss(y_mc, mc_pred, sample_weight=mainClass_weights)

        # Compute and apply gradients for CE & main classifier
        mc_trainable_vars = self.main_classifier.trainable_weights[3:]
        mc_grads = mc_tape.gradient(mc_loss, mc_trainable_vars)
        self.mc_optimizer.apply_gradients(zip(mc_grads, mc_trainable_vars))

        # Generate prediction and compute loss for Adversary_Classifier
        with tf.GradientTape() as adv_tape, tf.GradientTape() as me_adv_tape:
            adv_pred = self.adversary_classifier(xu)
            adv_loss = self.adv_loss(z_adv, adv_pred, sample_weight=protectClass_weights)

        # Compute and apply gradients for the adversary classifier
        adv_trainable_vars = self.adversary_classifier.trainable_weights[3:]
        adv_grads = adv_tape.gradient(adv_loss, adv_trainable_vars)
        self.adv_optimizer.apply_gradients(zip(adv_grads, adv_trainable_vars))

        # Compute and apply gradients to debias ME
        me_adv_debias_trainable_vars = self.adversary_classifier.trainable_weights[:3]
        adv_debias_grads = me_adv_tape.gradient(adv_loss, me_adv_debias_trainable_vars)
        adv_debias_dict = tf.lookup.StaticHashTable(
            tf.lookup.KeyValueTensorInitializer(me_adv_debias_trainable_vars, adv_debias_grads), 0)

        me_mc_debias_trainable_vars = self.main_classifier.trainable_weights[:3]
        mc_debias_grads = me_mc_tape.gradient(mc_loss, me_mc_debias_trainable_vars)

        me_grads = []
        for g, v in zip(mc_debias_grads, me_mc_debias_trainable_vars):
            unit_adv = tf_normalize(adv_debias_dict.lookup(v))
            g -= tf.math.reduce_sum(g * unit_adv) * unit_adv
            g -= self.debias_param * adv_debias_dict.lookup(v)
            me_grads.append(zip(g, v))
        self.mc_optimizer.apply_gradients(me_grads)

        return {"pred_loss": mc_loss, "adv_loss": adv_loss}
model = DebiasModel(model_components.main_classifier(),
                    model_components.adversary_classifier())

model.compile(mc_optimizer=tf.keras.optimizers.Adam(),
              adv_optimizer=tf.keras.optimizers.Adam(),
              mc_loss=tf.keras.losses.CategoricalCrossentropy(),
              adv_loss=tf.keras.losses.BinaryCrossentropy(),
              debias_param=1)

epoch = 5
sample_weights = {
    "pred_output": mainClass_weight,
    "adv_output": protectClass_weight,
}

model.fit(x=[xu_train, xs_train],
          y={"pred_output": y_train, "adv_output": z_train},
          validation_data=([xu_val, xs_val], {"pred_output": y_val, "adv_output": z_val}),
          sample_weight=sample_weights, epochs=epoch, batch_size=256, verbose=1)
Error Traceback
---------------------------------------------------------------------------
InvalidArgumentError                      Traceback (most recent call last)
/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/ops.py in get_attr(self, name)
   2485         with c_api_util.tf_buffer() as buf:
-> 2486           pywrap_tf_session.TF_OperationGetAttrValueProto(self._c_op, name, buf)
   2487           data = pywrap_tf_session.TF_GetBuffer(buf)

InvalidArgumentError: Operation 'while' has no attr named '_XlaCompile'.

During handling of the above exception, another exception occurred:

ValueError                                Traceback (most recent call last)
51 frames

ValueError: Operation 'while' has no attr named '_XlaCompile'.

During handling of the above exception, another exception occurred:

InvalidArgumentError                      Traceback (most recent call last)

InvalidArgumentError: Operation 'gradients/while_grad/Placeholder_28' has no attr named '_read_only_resource_inputs'.
Note: I have not added the full traceback, but if needed I can provide it. Many thanks in advance!
I have a TensorFlow model that I need to export as a SavedModel. Below is simplified code for the model I am trying to export.
import tensorflow as tf

def foo(x):
    return tf.reduce_sum(x)

inputs = tf.keras.layers.Input(shape=(128,128,3))
y = tf.keras.layers.Conv2D(filters=32, kernel_size=3, padding='SAME')(inputs)
y = tf.keras.layers.ReLU()(y)
outputs = tf.map_fn(foo, y, dtype=(tf.float32))
model = tf.keras.models.Model(inputs=inputs, outputs=outputs)
model.save('./export', save_format='tf')
but while exporting the model I am getting the following error:

/Users/bruce/.venv/bin/python /Users/bruce/test_project/mymodel/test.py
Traceback (most recent call last):
  File "/Users/bruce/test_project/mymodel/test.py", line 12, in <module>
    outputs = tf.map_fn(foo, y, dtype=(tf.float32))
  File "/Users/bruce/.venv/lib/python3.6/site-packages/tensorflow_core/python/ops/map_fn.py", line 228, in map_fn
    for elem in elems_flat]
  File "/Users/bruce/.venv/lib/python3.6/site-packages/tensorflow_core/python/ops/map_fn.py", line 228, in <listcomp>
    for elem in elems_flat]
  File "/Users/bruce/.venv/lib/python3.6/site-packages/tensorflow_core/python/ops/tensor_array_ops.py", line 1078, in __init__
    name=name)
  File "/Users/bruce/.venv/lib/python3.6/site-packages/tensorflow_core/python/ops/tensor_array_ops.py", line 716, in __init__
    self._tensor_array = [None for _ in range(size)]
TypeError: 'Tensor' object cannot be interpreted as an integer
I cannot remove the tf.map_fn part; it does essential processing that I need in the SavedModel when deploying it.
You need to use a custom layer:

class MyMapLayer(tf.keras.layers.Layer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def foo(self, x):
        return tf.reduce_sum(x)

    def call(self, inputs, **kwargs):
        return tf.map_fn(self.foo, inputs, dtype=(tf.float32))
Then, in your model:
inputs = tf.keras.layers.Input(shape=(128,128,3))
y = tf.keras.layers.Conv2D(filters=32, kernel_size=3, padding='SAME')(inputs)
y = tf.keras.layers.ReLU()(y)
outputs = MyMapLayer()(y)
model = tf.keras.models.Model(inputs=inputs, outputs=outputs)
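With the map wrapped in a layer, the save call from the question should then go through unchanged:

model.save('./export', save_format='tf')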
I get this error when I use my own layer:
Traceback (most recent call last):
  File "E:/fffan/try.py", line 40, in <module>
    run(input, 5000)
  File "E:/fffan/try.py", line 36, in run
    out = SelfAttention(nclass)(output, state)
TypeError: __call__() takes 2 positional arguments but 3 were given
Here is my code; I hope someone can tell me how to fix it.
from keras.engine.topology import Layer
from keras.layers.core import Dense
from keras import backend as K
from keras.layers import Input, CuDNNGRU
from keras.activations import softmax

class SelfAttention(Layer):  # works poorly on long sequences
    def __init__(self, units, **kwargs):
        self.W1 = Dense(units)
        self.W2 = Dense(units)
        self.V = Dense(1)
        super(SelfAttention, self).__init__(**kwargs)

    def call(self, features, hidden):
        hidden_with_time_axis = K.expand_dims(hidden, 1)
        score = self.V(K.tanh(self.W1(features) + self.W2(hidden_with_time_axis)))
        attention_weights = softmax(score, axis=1)
        context_vector = attention_weights * features
        return context_vector

def GRU(units):
    return CuDNNGRU(units, return_sequences=True,
                    return_state=True,
                    recurrent_initializer='glorot_uniform')

def run(input, nclass):
    output, state = GRU(nclass)(input)
    out = SelfAttention(nclass)(output, state)

if __name__ == '__main__':
    input = Input(shape=(35, 512), name='the_input')
    run(input, 5000)
My TensorFlow version is 1.14.0 and my Keras version is 2.1.5.
Does somebody know anything about this issue?
It should be:

def __call__(self, features, hidden):

instead of:

def call(self, features, hidden):

Keras invokes your layer through Layer.__call__, which in Keras 2.1.5 accepts only a single positional inputs argument; calling SelfAttention(nclass)(output, state) therefore supplies three positional arguments and raises the TypeError. Defining __call__ yourself sidesteps that wrapper.
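Applied to the class from the question, only the method name changes (a sketch; the body stays as in the original):

    def __call__(self, features, hidden):
        hidden_with_time_axis = K.expand_dims(hidden, 1)
        score = self.V(K.tanh(self.W1(features) + self.W2(hidden_with_time_axis)))
        attention_weights = softmax(score, axis=1)
        context_vector = attention_weights * features
        return context_vector

Note that overriding __call__ bypasses Keras's own Layer.__call__ machinery (build, masking, node bookkeeping), so this is a workaround rather than idiomatic Keras.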
I am trying to implement a graph convolution layer using a Keras custom layer, as described in the following paper: GCNN.
When I try to train my model, it gives me the following error:
Traceback (most recent call last):
  File "main.py", line 35, in <module>
    model.fit(train_images, train_labels, validation_data=(test_images, test_labels), epochs=50, batch_size=32)
  File "/usr/local/lib/python2.7/dist-packages/keras/engine/training.py", line 1010, in fit
    self._make_train_function()
  File "/usr/local/lib/python2.7/dist-packages/keras/engine/training.py", line 509, in _make_train_function
    loss=self.total_loss)
  File "/usr/local/lib/python2.7/dist-packages/keras/legacy/interfaces.py", line 91, in wrapper
    return func(*args, **kwargs)
  File "/usr/local/lib/python2.7/dist-packages/keras/optimizers.py", line 256, in get_updates
    grads = self.get_gradients(loss, params)
  File "/usr/local/lib/python2.7/dist-packages/keras/optimizers.py", line 91, in get_gradients
    raise ValueError('An operation has `None` for gradient. '
ValueError: An operation has `None` for gradient. Please make sure that all of your ops have a gradient defined (i.e. are differentiable). Common ops without gradient: K.argmax, K.round, K.eval.
I don't know how to get rid of this problem. Can someone explain briefly what I should do?
I have gone through the Keras official documentation about writing a custom layer, but it doesn't cover this. Link
Following is the code for my custom layer.
class GraphConvolutionalLayer(Layer):
    def __init__(self, A, num_input_features, num_output_features, **kwargs):
        self.A = A
        self.num_input_features = num_input_features
        self.num_output_features = num_output_features
        self.num_vertices = A.get_shape().as_list()[0]
        self.input_spec = (self.num_vertices, num_input_features)
        super(GraphConvolutionalLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        self.k0 = self.add_weight(name='k0',
                                  shape=(self.num_output_features, self.num_input_features),
                                  initializer='uniform',
                                  trainable=True)
        self.k1 = self.add_weight(name='k1',
                                  shape=(self.num_output_features, self.num_input_features),
                                  initializer='uniform',
                                  trainable=True)
        self.H = tf.einsum('ab,cd->abcd', tf.convert_to_tensor(self.k0, dtype=tf.float32), tf.eye(self.num_vertices))
        self.built = True

    def call(self, Vin):
        Vin2 = tf.reshape(tf.transpose(Vin, [0, 2, 1]), [Vin.get_shape().as_list()[1] * Vin.get_shape().as_list()[2], -1])
        H_tmp = tf.reshape(tf.transpose(self.H, [0, 2, 1, 3]), [self.num_output_features, self.num_vertices, self.num_vertices * self.num_input_features])
        Vout = tf.transpose(K.dot(H_tmp, Vin2), [2, 1, 0])
        return Vout

    def compute_output_shape(self, input_shape):
        return (self.num_vertices, self.num_output_features)
Following is the code for the main file.
main_input = Input(shape=train_images[0].shape)
Vout1 = GraphConvolutionalLayer(A, 1, 4)(main_input)
Vout2 = GraphConvolutionalLayer(A, 4, 8)(Vout1)
Vout3 = Flatten()(Vout2)
Vout4 = Dense(10, activation='sigmoid')(Vout3)
print(train_images.shape, train_labels.shape)
model = Model(inputs=main_input, outputs=Vout4)
print(model.summary())
model.compile(optimizer='rmsprop', loss='binary_crossentropy')
model.fit(train_images, train_labels, validation_data=(test_images, test_labels), epochs=50, batch_size=32)
Here I used uniform as the initializer. When I changed it, I didn't get any error. I don't know why this happened, but I was able to solve my error just by changing that line.
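For illustration, this is the kind of change meant here; the replacement initializer is hypothetical, since the question does not say which one was used (any standard Keras initializer, e.g. glorot_uniform, can be substituted for 'uniform'):

self.k0 = self.add_weight(name='k0',
                          shape=(self.num_output_features, self.num_input_features),
                          initializer='glorot_uniform',  # hypothetical replacement for 'uniform'
                          trainable=True)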
As the error states, some of your operations are non-differentiable. It is not easy to say exactly why this happens. For example, take a look at:

List of Differentiable Ops in Tensorflow
How to make sure your computation graph is differentiable
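As a minimal illustration of the failure mode the message describes, here is a sketch in TF1-style graph mode (matching the question's environment); tf.round is one of the ops the error message itself lists as having no gradient:

import tensorflow as tf

x = tf.placeholder(tf.float32, shape=(4,))
y = tf.round(x)            # Round has no registered gradient
print(tf.gradients(y, x))  # [None] -> Keras raises "An operation has `None` for gradient"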
Edit: Consider an example where I use standard cifar10 data.
class GraphConvolutionalLayer(layers.Layer):
    def __init__(self, A, num_input_features, num_output_features, **kwargs):
        #self.A = A
        self.num_input_features = num_input_features
        self.num_output_features = num_output_features
        self.num_vertices = A
        self.input_spec = (self.num_vertices, num_input_features)
        super(GraphConvolutionalLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        self.k0 = self.add_weight(name='k0',
                                  shape=(self.num_output_features, self.num_input_features),
                                  initializer='uniform',
                                  trainable=True)
        self.H = tf.einsum('ab,cd->abcd', tf.convert_to_tensor(self.k0, dtype=tf.float32), tf.eye(self.num_vertices))
        self.H = tf.reshape(self.H, [32*32, 3])
        self.built = True

    def call(self, Vin):
        Vin2 = tf.reshape(Vin, [Vin.get_shape().as_list()[1] * Vin.get_shape().as_list()[1], Vin.get_shape().as_list()[-1]])
        Vin2 = tf.transpose(Vin2)
        Vout = tf.matmul(self.H, Vin2)
        return Vout

def input_fn():
    train, test = tf.keras.datasets.cifar10.load_data()
    dataset = tf.data.Dataset.from_tensor_slices((train[0], train[1]))
    dataset = dataset.batch(1)
    return dataset

main_input = layers.Input(shape=[32, 32, 3])
Vout1 = GraphConvolutionalLayer(32, 3, 1)(main_input)
Vout3 = layers.Flatten()(Vout1)
Vout4 = layers.Dense(10, activation='sigmoid')(Vout3)

model = Model(inputs=main_input, outputs=Vout4)
model.compile(optimizer='rmsprop', loss='binary_crossentropy')
model.fit(input_fn(), epochs=50, steps_per_epoch=10)
In this case the gradients are computed, so the problem is clearly not in how you construct GraphConvolutionalLayer but in some internal operation that depends on the data. You need to check every op one by one against your data shapes.
P.S. You can try substituting einsum with matmul, since the former is essentially a syntactic wrapper for the latter.
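For the specific 'ab,cd->abcd' pattern used in build, the operation is an outer product, so it can also be written with plain broadcasting (a sketch; strictly this uses broadcasting rather than a literal matmul, but H gets the same shape and values):

eye = tf.eye(self.num_vertices)
# result[a, b, c, d] = k0[a, b] * eye[c, d], same as tf.einsum('ab,cd->abcd', k0, eye)
self.H = self.k0[:, :, None, None] * eye[None, None, :, :]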