Multi-layer LSTM vs. multiple individual LSTMs in pytorch - python

I am creating a two-layer LSTM in two different ways, as shown in the two code snippets below. Can anyone tell me why their outputs are not the same, and, if you have the experience, which one is better? Thanks so much! (Thanks for the suggestion of initializing both with the same weights and biases; I have added it to the original code. Despite the identical initial parameters, their outputs are still not the same...)
The first way, using num_layers:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
torch.manual_seed(1)
lstm = nn.LSTM(3, 3, 2)  # input dim is 3, hidden dim is 3, 2 stacked layers
inputs = [torch.randn(1, 3) for _ in range(5)] # make a sequence of length 5
weight_ih_0=None
weight_hh_0=None
# bias_ih_0=None
# bias_hh_0=None
weight_ih_1=None
weight_hh_1=None
# bias_ih_1=None
# bias_hh_1=None
for name, param in lstm.named_parameters():
    if 'bias' in name:
        # print(f'bias {name} before init: {param}')
        nn.init.constant_(param, 0.0)
        # print(f'bias {name} after init: {param}')
    elif 'weight' in name:
        # print(f'weight {name} before init: {param}')
        nn.init.xavier_normal_(param)
        print(f'weight {name} after init: {param}')

for name, param in lstm.named_parameters():
    if 'weight_ih_l0' in name:
        weight_ih_0 = param
    if 'weight_hh_l0' in name:
        weight_hh_0 = param
    if 'weight_ih_l1' in name:
        weight_ih_1 = param
    if 'weight_hh_l1' in name:
        weight_hh_1 = param

print(f'inputs: {inputs}')

# initialize the hidden state: (num_layers, batch, hidden_size) for both h and c
hidden = (torch.zeros(2, 1, 3),
          torch.zeros(2, 1, 3))

idx = 0
for i in inputs:
    print(f'idx: {idx}')
    # print(f'i: {i}')
    idx += 1
    # Step through the sequence one element at a time.
    # After each step, hidden contains the hidden and cell states.
    out, hidden = lstm(i.view(1, 1, -1), hidden)
    print(out)
    print("==========")
    # print(hidden)
The output is:
weight weight_ih_l0 after init: Parameter containing:
tensor([[ 0.6025, -0.1577, -0.0990],
[-0.5255, 0.4554, 0.4651],
[ 0.1428, 0.1414, -0.0291],
[ 0.1248, 0.3465, -0.5053],
[ 0.6295, -0.8635, -0.3394],
[ 0.1072, 0.0786, 0.3427],
[ 0.5352, -0.2032, 0.8816],
[ 0.3727, -0.1608, -0.6332],
[-0.3745, 0.1903, -0.1654],
[-0.0460, -0.2148, 0.7737],
[-0.1980, -0.8980, -0.3470],
[-0.1130, 0.6074, 0.1844]], requires_grad=True)
weight weight_hh_l0 after init: Parameter containing:
tensor([[-0.0719, -0.0122, 0.2626],
[ 0.3887, -0.3044, -0.4356],
[-0.8422, 0.2204, 0.1151],
[ 0.4171, 0.1116, -0.2114],
[ 0.2061, -0.3204, -0.0983],
[ 0.4791, -0.5683, -0.3928],
[-0.3196, -0.1726, -0.0732],
[-0.3058, -0.5667, -0.0211],
[-0.0832, -0.3168, 0.1241],
[-0.4197, 0.0525, 0.0741],
[ 0.3849, 0.0481, -0.3130],
[ 0.5788, 0.6312, -0.3627]], requires_grad=True)
weight weight_ih_l1 after init: Parameter containing:
tensor([[ 3.6955e-02, 7.1276e-02, -4.3073e-01],
[-5.2666e-01, 2.7323e-02, 1.2894e-01],
[ 3.7136e-01, 3.3969e-01, 1.9601e-01],
[ 3.5802e-01, -4.3600e-01, -1.7962e-01],
[ 8.3209e-01, 1.7189e-01, 2.2195e-01],
[-2.1302e-02, -1.6867e-01, -1.3460e-01],
[ 1.3446e-01, 1.7708e-01, -5.6676e-01],
[-2.3697e-01, -2.8254e-02, -2.2063e-01],
[-2.0928e-01, 3.4973e-01, 3.5858e-04],
[-5.0565e-01, -6.8619e-02, 3.7702e-01],
[-9.0796e-02, -1.7238e-01, 4.7868e-01],
[-1.1565e-01, -6.7956e-02, -2.1049e-01]], requires_grad=True)
weight weight_hh_l1 after init: Parameter containing:
tensor([[-0.3017, -0.0811, -0.6554],
[ 0.2665, -0.2052, -0.0577],
[ 0.5493, -0.5094, 0.2167],
[ 0.1210, -0.3868, -0.2293],
[-0.0991, 0.6744, -0.0114],
[-0.0343, -0.6136, 0.4856],
[ 0.0505, 0.3920, -0.1662],
[ 0.1163, -0.1296, 0.2505],
[-0.1373, -0.8803, -0.4666],
[-0.0230, -0.0346, -0.8451],
[ 0.2032, 0.1847, -0.0758],
[ 0.2533, 0.1532, 0.8224]], requires_grad=True)
inputs: [tensor([[1.5381, 1.4673, 1.5951]]), tensor([[-1.5279, 1.0156, -0.2020]]), tensor([[-1.2865, 0.8231, -0.6101]]), tensor([[-1.2960, -0.9434, 0.6684]]), tensor([[ 1.1628, -0.3229, 1.8782]])]
idx: 0
tensor([[[ 0.0374, -0.0085, -0.0240]]], grad_fn=<StackBackward>)
==========
idx: 1
tensor([[[ 0.0073, -0.0110, -0.0296]]], grad_fn=<StackBackward>)
==========
idx: 2
tensor([[[-0.0314, -0.0147, -0.0136]]], grad_fn=<StackBackward>)
==========
idx: 3
tensor([[[-0.0458, -0.0118, -0.0254]]], grad_fn=<StackBackward>)
==========
idx: 4
tensor([[[-0.0096, -0.0281, -0.0440]]], grad_fn=<StackBackward>)
==========
The second way, creating two individual LSTMCells:
import copy
torch.manual_seed(1)
lstm = nn.LSTMCell(3, 3) # Input dim is 3, output dim is 3
lstm2 = nn.LSTMCell(3, 3) # Input dim is 3, output dim is 3
inputs = [torch.randn(1, 3) for _ in range(5)] # make a sequence of length 5
for name, param in lstm.named_parameters():
    if 'bias' in name:
        # print(f'lstm bias {name} before init: {param}')
        nn.init.constant_(param, 0.0)
        # print(f'lstm bias {name} after init: {param}')
    elif 'weight' in name:
        # print(f'lstm weight {name} before init: {param}')
        if 'weight_ih' in name:
            param = copy.deepcopy(weight_ih_0)
            print(f'lstm {name} after init: {param}')
        if 'weight_hh' in name:
            param = copy.deepcopy(weight_hh_0)
            print(f'lstm {name} after init: {param}')

for name, param in lstm2.named_parameters():
    if 'bias' in name:
        # print(f'lstm2 bias {name} before init: {param}')
        nn.init.constant_(param, 0.0)
        # print(f'lstm2 bias {name} after init: {param}')
    elif 'weight' in name:
        # print(f'lstm2 weight {name} before init: {param}')
        if 'weight_ih' in name:
            param = copy.deepcopy(weight_ih_1)
            print(f'lstm2 {name} after init: {param}')
        if 'weight_hh' in name:
            param = copy.deepcopy(weight_hh_1)
            print(f'lstm2 {name} after init: {param}')

print(f'inputs: {inputs}')

# initialize the hidden and cell states (one pair, shared across both cells below).
hidden = torch.zeros(1, 3)
cell = torch.zeros(1, 3)

idx = 0
for i in inputs:
    print(f'idx: {idx}')
    idx += 1
    # Step through the sequence one element at a time.
    # After each step, hidden contains the hidden state.
    hidden, cell = lstm(i.view(1, -1), (hidden, cell))
    # print(hidden.shape)
    hidden, cell = lstm2(hidden, (hidden, cell))
    print(hidden)
    print("==========")
And the output is:
lstm weight_ih after init: Parameter containing:
tensor([[ 0.6025, -0.1577, -0.0990],
[-0.5255, 0.4554, 0.4651],
[ 0.1428, 0.1414, -0.0291],
[ 0.1248, 0.3465, -0.5053],
[ 0.6295, -0.8635, -0.3394],
[ 0.1072, 0.0786, 0.3427],
[ 0.5352, -0.2032, 0.8816],
[ 0.3727, -0.1608, -0.6332],
[-0.3745, 0.1903, -0.1654],
[-0.0460, -0.2148, 0.7737],
[-0.1980, -0.8980, -0.3470],
[-0.1130, 0.6074, 0.1844]], requires_grad=True)
lstm weight_hh after init: Parameter containing:
tensor([[-0.0719, -0.0122, 0.2626],
[ 0.3887, -0.3044, -0.4356],
[-0.8422, 0.2204, 0.1151],
[ 0.4171, 0.1116, -0.2114],
[ 0.2061, -0.3204, -0.0983],
[ 0.4791, -0.5683, -0.3928],
[-0.3196, -0.1726, -0.0732],
[-0.3058, -0.5667, -0.0211],
[-0.0832, -0.3168, 0.1241],
[-0.4197, 0.0525, 0.0741],
[ 0.3849, 0.0481, -0.3130],
[ 0.5788, 0.6312, -0.3627]], requires_grad=True)
lstm2 weight_ih after init: Parameter containing:
tensor([[ 3.6955e-02, 7.1276e-02, -4.3073e-01],
[-5.2666e-01, 2.7323e-02, 1.2894e-01],
[ 3.7136e-01, 3.3969e-01, 1.9601e-01],
[ 3.5802e-01, -4.3600e-01, -1.7962e-01],
[ 8.3209e-01, 1.7189e-01, 2.2195e-01],
[-2.1302e-02, -1.6867e-01, -1.3460e-01],
[ 1.3446e-01, 1.7708e-01, -5.6676e-01],
[-2.3697e-01, -2.8254e-02, -2.2063e-01],
[-2.0928e-01, 3.4973e-01, 3.5858e-04],
[-5.0565e-01, -6.8619e-02, 3.7702e-01],
[-9.0796e-02, -1.7238e-01, 4.7868e-01],
[-1.1565e-01, -6.7956e-02, -2.1049e-01]], requires_grad=True)
lstm2 weight_hh after init: Parameter containing:
tensor([[-0.3017, -0.0811, -0.6554],
[ 0.2665, -0.2052, -0.0577],
[ 0.5493, -0.5094, 0.2167],
[ 0.1210, -0.3868, -0.2293],
[-0.0991, 0.6744, -0.0114],
[-0.0343, -0.6136, 0.4856],
[ 0.0505, 0.3920, -0.1662],
[ 0.1163, -0.1296, 0.2505],
[-0.1373, -0.8803, -0.4666],
[-0.0230, -0.0346, -0.8451],
[ 0.2032, 0.1847, -0.0758],
[ 0.2533, 0.1532, 0.8224]], requires_grad=True)
inputs: [tensor([[1.5381, 1.4673, 1.5951]]), tensor([[-1.5279, 1.0156, -0.2020]]), tensor([[-1.2865, 0.8231, -0.6101]]), tensor([[-1.2960, -0.9434, 0.6684]]), tensor([[ 1.1628, -0.3229, 1.8782]])]
idx: 0
tensor([[-0.0152, -0.0344, 0.0368]], grad_fn=<MulBackward0>)
==========
idx: 1
tensor([[-0.0265, -0.0143, 0.0730]], grad_fn=<MulBackward0>)
==========
idx: 2
tensor([[-0.0210, -0.0033, 0.0529]], grad_fn=<MulBackward0>)
==========
idx: 3
tensor([[-0.0580, -0.0201, 0.1194]], grad_fn=<MulBackward0>)
==========
idx: 4
tensor([[-0.0672, -0.0801, 0.1165]], grad_fn=<MulBackward0>)
==========

Although you initialized two LSTMs, their initial weights are obviously different. You can verify this with the following code:
for p in lstm.parameters():
    print(p)
I would prefer the first method, because it does not require us to manually wire the layers together.
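If you want the two setups to start from literally the same parameters, one option (a minimal sketch, not from the original answer; stacked, cell0 and cell1 are illustrative names) is to copy the stacked LSTM's per-layer weights into two LSTMCells through their state_dicts:

import torch
import torch.nn as nn

torch.manual_seed(1)
stacked = nn.LSTM(3, 3, 2)   # two stacked layers
cell0 = nn.LSTMCell(3, 3)    # plays the role of layer 0
cell1 = nn.LSTMCell(3, 3)    # plays the role of layer 1

# map 'weight_ih_l0' -> 'weight_ih', etc., and load layer k's tensors into cell k
with torch.no_grad():
    sd = stacked.state_dict()
    cell0.load_state_dict({k.replace('_l0', ''): v for k, v in sd.items() if k.endswith('_l0')})
    cell1.load_state_dict({k.replace('_l1', ''): v for k, v in sd.items() if k.endswith('_l1')})

After this, iterating over the parameters of the cells and of the stacked LSTM prints identical tensors, so any remaining output difference must come from how the states are wired, not from the weights.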

I have the answer now. At the beginning, I was confusing the hidden state with the input of the second LSTM layer.
For the stacked LSTM with num_layers=2, we initialize the hidden states with a first dimension of 2, since each LSTM layer needs its own initial hidden state; the second layer then takes the output hidden state of the first layer as its input.
For the model built from individual LSTMs: since in the stacked model above each layer starts from zero hidden states, we should likewise initialize both individual LSTMs with zero hidden states.
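As a quick reference for the shapes involved (a small sketch, not part of the original post): nn.LSTM expects an (h_0, c_0) pair of shape (num_layers, batch, hidden_size), the returned h_n[k] / c_n[k] are the final states of layer k, and out contains the top layer's hidden states for every time step:

import torch
import torch.nn as nn

lstm = nn.LSTM(input_size=3, hidden_size=3, num_layers=2)
x = torch.randn(5, 1, 3)       # (seq_len, batch, input_size)
h0 = torch.zeros(2, 1, 3)      # (num_layers, batch, hidden_size)
c0 = torch.zeros(2, 1, 3)
out, (hn, cn) = lstm(x, (h0, c0))
print(out.shape)  # torch.Size([5, 1, 3]) -- top-layer hidden state per step
print(hn.shape)   # torch.Size([2, 1, 3]) -- final hidden state of each layer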
In addition, I made a mistake when copying over the weight and bias values.
To make the two methods produce the same outputs, I use the following code.
The first method:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
torch.manual_seed(1)
lstm = nn.LSTM(3, 3, 2)  # input dim is 3, hidden dim is 3, 2 stacked layers
inputs = [torch.randn(1, 3) for _ in range(5)] # make a sequence of length 5
weight_ih_0=None
weight_hh_0=None
# bias_ih_0=None
# bias_hh_0=None
weight_ih_1=None
weight_hh_1=None
# bias_ih_1=None
# bias_hh_1=None
for name, param in lstm.named_parameters():
    if 'bias' in name:
        # print(f'bias {name} before init: {param}')
        nn.init.constant_(param, 0.0)
        # print(f'bias {name} after init: {param}')
    elif 'weight' in name:
        # print(f'weight {name} before init: {param}')
        nn.init.xavier_normal_(param)
        print(f'weight {name} after init: {param}')

for name, param in lstm.named_parameters():
    if 'weight_ih_l0' in name:
        weight_ih_0 = param
    if 'weight_hh_l0' in name:
        weight_hh_0 = param
    if 'weight_ih_l1' in name:
        weight_ih_1 = param
    if 'weight_hh_l1' in name:
        weight_hh_1 = param

print(f'inputs: {inputs}')

# initialize the hidden state: (num_layers, batch, hidden_size) for both h and c
hidden = (torch.zeros(2, 1, 3),
          torch.zeros(2, 1, 3))

idx = 0
for i in inputs:
    print(f'idx: {idx}')
    # print(f'i: {i}')
    idx += 1
    # Step through the sequence one element at a time.
    # After each step, hidden contains the hidden and cell states.
    out, hidden = lstm(i.view(1, 1, -1), hidden)
    print(out)
    # print(hidden)
    print("==========")
And the output is:
weight weight_ih_l0 after init: Parameter containing:
tensor([[ 0.6025, -0.1577, -0.0990],
[-0.5255, 0.4554, 0.4651],
[ 0.1428, 0.1414, -0.0291],
[ 0.1248, 0.3465, -0.5053],
[ 0.6295, -0.8635, -0.3394],
[ 0.1072, 0.0786, 0.3427],
[ 0.5352, -0.2032, 0.8816],
[ 0.3727, -0.1608, -0.6332],
[-0.3745, 0.1903, -0.1654],
[-0.0460, -0.2148, 0.7737],
[-0.1980, -0.8980, -0.3470],
[-0.1130, 0.6074, 0.1844]], requires_grad=True)
weight weight_hh_l0 after init: Parameter containing:
tensor([[-0.0719, -0.0122, 0.2626],
[ 0.3887, -0.3044, -0.4356],
[-0.8422, 0.2204, 0.1151],
[ 0.4171, 0.1116, -0.2114],
[ 0.2061, -0.3204, -0.0983],
[ 0.4791, -0.5683, -0.3928],
[-0.3196, -0.1726, -0.0732],
[-0.3058, -0.5667, -0.0211],
[-0.0832, -0.3168, 0.1241],
[-0.4197, 0.0525, 0.0741],
[ 0.3849, 0.0481, -0.3130],
[ 0.5788, 0.6312, -0.3627]], requires_grad=True)
weight weight_ih_l1 after init: Parameter containing:
tensor([[ 3.6955e-02, 7.1276e-02, -4.3073e-01],
[-5.2666e-01, 2.7323e-02, 1.2894e-01],
[ 3.7136e-01, 3.3969e-01, 1.9601e-01],
[ 3.5802e-01, -4.3600e-01, -1.7962e-01],
[ 8.3209e-01, 1.7189e-01, 2.2195e-01],
[-2.1302e-02, -1.6867e-01, -1.3460e-01],
[ 1.3446e-01, 1.7708e-01, -5.6676e-01],
[-2.3697e-01, -2.8254e-02, -2.2063e-01],
[-2.0928e-01, 3.4973e-01, 3.5858e-04],
[-5.0565e-01, -6.8619e-02, 3.7702e-01],
[-9.0796e-02, -1.7238e-01, 4.7868e-01],
[-1.1565e-01, -6.7956e-02, -2.1049e-01]], requires_grad=True)
weight weight_hh_l1 after init: Parameter containing:
tensor([[-0.3017, -0.0811, -0.6554],
[ 0.2665, -0.2052, -0.0577],
[ 0.5493, -0.5094, 0.2167],
[ 0.1210, -0.3868, -0.2293],
[-0.0991, 0.6744, -0.0114],
[-0.0343, -0.6136, 0.4856],
[ 0.0505, 0.3920, -0.1662],
[ 0.1163, -0.1296, 0.2505],
[-0.1373, -0.8803, -0.4666],
[-0.0230, -0.0346, -0.8451],
[ 0.2032, 0.1847, -0.0758],
[ 0.2533, 0.1532, 0.8224]], requires_grad=True)
inputs: [tensor([[1.5381, 1.4673, 1.5951]]), tensor([[-1.5279, 1.0156, -0.2020]]), tensor([[-1.2865, 0.8231, -0.6101]]), tensor([[-1.2960, -0.9434, 0.6684]]), tensor([[ 1.1628, -0.3229, 1.8782]])]
idx: 0
tensor([[[ 0.0374, -0.0085, -0.0240]]], grad_fn=<StackBackward>)
==========
idx: 1
tensor([[[ 0.0073, -0.0110, -0.0296]]], grad_fn=<StackBackward>)
==========
idx: 2
tensor([[[-0.0314, -0.0147, -0.0136]]], grad_fn=<StackBackward>)
==========
idx: 3
tensor([[[-0.0458, -0.0118, -0.0254]]], grad_fn=<StackBackward>)
==========
idx: 4
tensor([[[-0.0096, -0.0281, -0.0440]]], grad_fn=<StackBackward>)
==========
The second method:
torch.manual_seed(1)
lstm = nn.LSTM(3, 3, 1)   # input dim is 3, hidden dim is 3, 1 layer
lstm2 = nn.LSTM(3, 3, 1)  # input dim is 3, hidden dim is 3, 1 layer
inputs = [torch.randn(1, 3) for _ in range(5)]  # make a sequence of length 5
print(f'inputs: {inputs}')

# initialize the hidden states (one (h, c) pair per single-layer LSTM).
hidden1 = (torch.zeros(1, 1, 3),
           torch.zeros(1, 1, 3))
hidden2 = (torch.zeros(1, 1, 3),
           torch.zeros(1, 1, 3))

for name, param in lstm.named_parameters():
    if 'bias' in name:
        # print(f'lstm bias {name} before init: {param}')
        nn.init.constant_(param, 0.0)
        # print(f'lstm bias {name} after init: {param}')
    elif 'weight' in name:
        # print(f'lstm weight {name} before init: {param}')
        if 'weight_ih' in name:
            lstm.weight_ih_l0.data = weight_ih_0
            print(f'lstm {name} after init: {param}')
        if 'weight_hh' in name:
            lstm.weight_hh_l0.data = weight_hh_0
            print(f'lstm {name} after init: {param}')

for name, param in lstm2.named_parameters():
    if 'bias' in name:
        # print(f'lstm2 bias {name} before init: {param}')
        nn.init.constant_(param, 0.0)
        # print(f'lstm2 bias {name} after init: {param}')
    elif 'weight' in name:
        # print(f'lstm2 weight {name} before init: {param}')
        if 'weight_ih' in name:
            lstm2.weight_ih_l0.data = weight_ih_1
            print(f'lstm2 {name} after init: {param}')
        if 'weight_hh' in name:
            lstm2.weight_hh_l0.data = weight_hh_1
            print(f'lstm2 {name} after init: {param}')

for name, param in lstm2.named_parameters():
    if 'weight' in name:
        # print(f'lstm2 weight {name} before init: {param}')
        print(f'lstm2 {name} after init: {param}')

idx = 0
for i in inputs:
    print(f'idx: {idx}')
    idx += 1
    # Step through the sequence one element at a time; the first LSTM's output
    # feeds the second LSTM, and each keeps its own (hidden, cell) state.
    out, hidden1 = lstm(i.view(1, 1, -1), hidden1)
    out, hidden2 = lstm2(out.view(1, 1, -1), hidden2)
    print(out)
    print("==========")
And the output is:
inputs: [tensor([[1.5381, 1.4673, 1.5951]]), tensor([[-1.5279, 1.0156, -0.2020]]), tensor([[-1.2865, 0.8231, -0.6101]]), tensor([[-1.2960, -0.9434, 0.6684]]), tensor([[ 1.1628, -0.3229, 1.8782]])]
lstm weight_ih_l0 after init: Parameter containing:
tensor([[ 0.6025, -0.1577, -0.0990],
[-0.5255, 0.4554, 0.4651],
[ 0.1428, 0.1414, -0.0291],
[ 0.1248, 0.3465, -0.5053],
[ 0.6295, -0.8635, -0.3394],
[ 0.1072, 0.0786, 0.3427],
[ 0.5352, -0.2032, 0.8816],
[ 0.3727, -0.1608, -0.6332],
[-0.3745, 0.1903, -0.1654],
[-0.0460, -0.2148, 0.7737],
[-0.1980, -0.8980, -0.3470],
[-0.1130, 0.6074, 0.1844]], requires_grad=True)
lstm weight_hh_l0 after init: Parameter containing:
tensor([[-0.0719, -0.0122, 0.2626],
[ 0.3887, -0.3044, -0.4356],
[-0.8422, 0.2204, 0.1151],
[ 0.4171, 0.1116, -0.2114],
[ 0.2061, -0.3204, -0.0983],
[ 0.4791, -0.5683, -0.3928],
[-0.3196, -0.1726, -0.0732],
[-0.3058, -0.5667, -0.0211],
[-0.0832, -0.3168, 0.1241],
[-0.4197, 0.0525, 0.0741],
[ 0.3849, 0.0481, -0.3130],
[ 0.5788, 0.6312, -0.3627]], requires_grad=True)
lstm2 weight_ih_l0 after init: Parameter containing:
tensor([[ 3.6955e-02, 7.1276e-02, -4.3073e-01],
[-5.2666e-01, 2.7323e-02, 1.2894e-01],
[ 3.7136e-01, 3.3969e-01, 1.9601e-01],
[ 3.5802e-01, -4.3600e-01, -1.7962e-01],
[ 8.3209e-01, 1.7189e-01, 2.2195e-01],
[-2.1302e-02, -1.6867e-01, -1.3460e-01],
[ 1.3446e-01, 1.7708e-01, -5.6676e-01],
[-2.3697e-01, -2.8254e-02, -2.2063e-01],
[-2.0928e-01, 3.4973e-01, 3.5858e-04],
[-5.0565e-01, -6.8619e-02, 3.7702e-01],
[-9.0796e-02, -1.7238e-01, 4.7868e-01],
[-1.1565e-01, -6.7956e-02, -2.1049e-01]], requires_grad=True)
lstm2 weight_hh_l0 after init: Parameter containing:
tensor([[-0.3017, -0.0811, -0.6554],
[ 0.2665, -0.2052, -0.0577],
[ 0.5493, -0.5094, 0.2167],
[ 0.1210, -0.3868, -0.2293],
[-0.0991, 0.6744, -0.0114],
[-0.0343, -0.6136, 0.4856],
[ 0.0505, 0.3920, -0.1662],
[ 0.1163, -0.1296, 0.2505],
[-0.1373, -0.8803, -0.4666],
[-0.0230, -0.0346, -0.8451],
[ 0.2032, 0.1847, -0.0758],
[ 0.2533, 0.1532, 0.8224]], requires_grad=True)
lstm2 weight_ih_l0 after init: Parameter containing:
tensor([[ 3.6955e-02, 7.1276e-02, -4.3073e-01],
[-5.2666e-01, 2.7323e-02, 1.2894e-01],
[ 3.7136e-01, 3.3969e-01, 1.9601e-01],
[ 3.5802e-01, -4.3600e-01, -1.7962e-01],
[ 8.3209e-01, 1.7189e-01, 2.2195e-01],
[-2.1302e-02, -1.6867e-01, -1.3460e-01],
[ 1.3446e-01, 1.7708e-01, -5.6676e-01],
[-2.3697e-01, -2.8254e-02, -2.2063e-01],
[-2.0928e-01, 3.4973e-01, 3.5858e-04],
[-5.0565e-01, -6.8619e-02, 3.7702e-01],
[-9.0796e-02, -1.7238e-01, 4.7868e-01],
[-1.1565e-01, -6.7956e-02, -2.1049e-01]], requires_grad=True)
lstm2 weight_hh_l0 after init: Parameter containing:
tensor([[-0.3017, -0.0811, -0.6554],
[ 0.2665, -0.2052, -0.0577],
[ 0.5493, -0.5094, 0.2167],
[ 0.1210, -0.3868, -0.2293],
[-0.0991, 0.6744, -0.0114],
[-0.0343, -0.6136, 0.4856],
[ 0.0505, 0.3920, -0.1662],
[ 0.1163, -0.1296, 0.2505],
[-0.1373, -0.8803, -0.4666],
[-0.0230, -0.0346, -0.8451],
[ 0.2032, 0.1847, -0.0758],
[ 0.2533, 0.1532, 0.8224]], requires_grad=True)
idx: 0
tensor([[[ 0.0374, -0.0085, -0.0240]]], grad_fn=<StackBackward>)
==========
idx: 1
tensor([[[ 0.0073, -0.0110, -0.0296]]], grad_fn=<StackBackward>)
==========
idx: 2
tensor([[[-0.0314, -0.0147, -0.0136]]], grad_fn=<StackBackward>)
==========
idx: 3
tensor([[[-0.0458, -0.0118, -0.0254]]], grad_fn=<StackBackward>)
==========
idx: 4
tensor([[[-0.0096, -0.0281, -0.0440]]], grad_fn=<StackBackward>)
==========
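To double-check the equivalence programmatically (a sketch; out_stacked and out_individual are hypothetical lists that collect the per-step outputs of the two versions above):

import torch

# compare the five per-step outputs of the two versions element-wise
for step, (a, b) in enumerate(zip(out_stacked, out_individual)):
    assert torch.allclose(a.view(-1), b.view(-1), atol=1e-6), f'mismatch at step {step}'
print('both versions produce identical outputs')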

Related

torch parameters become nan after the first optim.step() while doing a GPT downstream task

Foundation
I'm fine-tuning with GPTJForCausalLM (the pretrained model is 6B, fp16).
Environment
ubuntu 20.04 (nvidia-docker)
cuda(in docker) version is 11.4
RTX 3090 (24G VRAM)
torch.__version__ 1.12.1
transformers.__version__ 4.12.5
jupyter lab version : 3.5.0
python3 version : Python 3.7.13
pretrained model : (kakao-kogpt 6B 16fp) https://github.com/kakaobrain/kogpt
Problem
If I freeze some layers' parameters and run model.forward() -> loss.backward() -> optim.step(), the parameters that are not frozen become nan. A single step is enough to produce nan.
Question
Why am I getting this error? Many internet search results could not help me.
Bug reproduction Code
!git clone https://github.com/kakaobrain/kogpt.git
!pip install -r ./kogpt/requirements.txt
from transformers import GPTJForCausalLM
from transformers import AutoTokenizer, AutoModelForCausalLM
import transformers
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
import copy
import inspect
import torch.optim as optim
import types
import re
import numpy as np
# the model github's official code; it works well.
tokenizer = AutoTokenizer.from_pretrained(
    'kakaobrain/kogpt', revision='KoGPT6B-ryan1.5b-float16',  # or float32 version: revision=KoGPT6B-ryan1.5b
    bos_token='[BOS]', eos_token='[EOS]', unk_token='[UNK]', pad_token='[PAD]', mask_token='[MASK]'
)
model = AutoModelForCausalLM.from_pretrained(
    'kakaobrain/kogpt', revision='KoGPT6B-ryan1.5b-float16',  # or float32 version: revision=KoGPT6B-ryan1.5b
    pad_token_id=tokenizer.eos_token_id,
    torch_dtype='auto', low_cpu_mem_usage=True
)
# for this working test, I simply freeze all layers except the last one.
ls = list(model.modules())
for i, m in enumerate(ls):
    if i == len(ls) - 1:
        # unfreeze the last layer
        m.requires_grad_(True)
    else:
        # freeze the other layers
        m.requires_grad_(False)
optim = optim.AdamW(model.parameters(), lr=1e-5)
_ = model.cuda()
_ = model.train()
sample = ['Some text data for finetuning-1', 'Some text data for finetuning-2']
with torch.cuda.amp.autocast():  # this line does not change the result.
    # make batch
    batch = tokenizer(sample, padding=True, truncation=True, max_length=64, return_tensors='pt')
    batch = {k: v.cuda() for k, v in batch.items()}
    # forward
    out = model(**batch)
    # loss
    loss = F.cross_entropy(out.logits[:, :-1, :].flatten(0, -2),
                           batch['input_ids'][:, 1:].flatten(),
                           reduction='mean')
print(loss.grad)
#None <- result
print(loss.is_leaf)
#False <- result
print(loss)
#tensor(6.6850, device='cuda:0', grad_fn=<NllLossBackward0>) <- result
print(list(model.parameters())[-1])
#Parameter containing: <- result
#tensor([-0.0454, 0.0815, -0.0442, ..., -0.0566, -0.0557, -0.0552],
# device='cuda:0', dtype=torch.float16, requires_grad=True)
loss.backward()
print(loss.grad)
#None <- print result
print(loss.is_leaf)
#False <- print result
print(list(model.parameters())[-1])
#Parameter containing: <- result
#tensor([-0.0454, 0.0815, -0.0442, ..., -0.0566, -0.0557, -0.0552],
# device='cuda:0', dtype=torch.float16, requires_grad=True)
optim.step()
print(list(model.parameters())[-1])
#Parameter containing: <- result, this is the problem point.
#tensor([ nan, 0.0815, nan, ..., nan, nan, nan], device='cuda:0',
# dtype=torch.float16, requires_grad=True)
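For reference, a minimal check that could be added between loss.backward() and optim.step() (a sketch, not part of the original repro) to see whether the fp16 gradients already contain non-finite values before the optimizer touches the weights:

# inspect every trainable parameter's gradient for inf/nan
for name, p in model.named_parameters():
    if p.requires_grad and p.grad is not None:
        if not torch.isfinite(p.grad).all():
            print(f'non-finite gradient in {name}')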
More Information
The work I really want to do is a LoRA downstream task.
I first tried it with this code.
#(... same as section 5.)
#load model
model = AutoModelForCausalLM.from_pretrained(
    'kakaobrain/kogpt', revision='KoGPT6B-ryan1.5b-float16',  # or float32 version: revision=KoGPT6B-ryan1.5b
    pad_token_id=tokenizer.eos_token_id,
    torch_dtype='auto', low_cpu_mem_usage=True
)
# my lora adapter adder code (refer to https://github.com/huggingface/transformers/issues/14839)
def forward_linear_with_adapter(self, input: torch.Tensor) -> torch.Tensor:
    # replace nn.Linear's forward()
    out = F.linear(input, self.weight, self.bias)
    if self.lora_adapter:
        out += self.lora_adapter(input)
    return out

def AddLoRAtoLinear(layer, adapter_dim=16, _dtype=None):
    # add adapter
    dt = _dtype if _dtype else layer._parameters['weight'].dtype
    layer.lora_adapter = nn.Sequential(
        nn.Linear(layer.in_features, adapter_dim, bias=False, dtype=dt),
        nn.Linear(adapter_dim, layer.out_features, bias=False, dtype=dt)
    )
    # make trainable
    layer.lora_adapter.requires_grad_(True)
    nn.init.zeros_(layer.lora_adapter[1].weight)
    # bind forward with adapter
    layer.forward = types.MethodType(forward_linear_with_adapter, layer)

def forward_embedding_with_adapter(self, input: torch.Tensor) -> torch.Tensor:
    # replace nn.Embedding's forward()
    out = F.embedding(input, self.weight, self.padding_idx, self.max_norm, self.norm_type, self.scale_grad_by_freq, self.sparse)
    if self.lora_adapter:
        out += self.lora_adapter(input)
    return out

def AddLoRAtoEmbedding(layer, adapter_dim=16, _dtype=None):
    dt = _dtype if _dtype else layer._parameters['weight'].dtype
    # add adapter
    layer.lora_adapter = nn.Sequential(
        nn.Embedding(layer.num_embeddings, adapter_dim, dtype=dt),
        nn.Linear(adapter_dim, layer.embedding_dim, bias=False, dtype=dt)
    )
    # make trainable
    layer.lora_adapter.requires_grad_(True)
    nn.init.zeros_(layer.lora_adapter[1].weight)  # follow the LoRA paper's init
    # bind forward with adapter
    layer.forward = types.MethodType(forward_embedding_with_adapter, layer)

def MakeLoRA(model):
    # freeze all other parameters.
    for _ in model.parameters():
        _.requires_grad_(False)
    # apply LoRA only to embedding & linear layers.
    needchange = []
    for module in model.modules():
        if type(module) == nn.Linear or type(module) == nn.Embedding:
            needchange.append(module)
    for module in needchange:
        if type(module) == nn.Linear:
            # run the custom LoRA attach function on this layer
            AddLoRAtoLinear(module)
        elif type(module) == nn.Embedding:
            # run the custom LoRA attach function on this layer
            AddLoRAtoEmbedding(module)
    return model

if False:
    # instead of doing this (from the code in section 5), do MakeLoRA()
    ls = list(model.modules())
    for i, m in enumerate(ls):
        if i == len(ls) - 1:
            # unfreeze the last layer
            m.requires_grad_(True)
        else:
            # freeze the other layers
            m.requires_grad_(False)
else:
    # change the model to have LoRA
    model = MakeLoRA(model)
# (then do the same as in section 5... and get the same nan error)
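As a sanity check after MakeLoRA(model) (a small sketch built on the helpers above), one can confirm that only the adapter weights remain trainable:

trainable = [n for n, p in model.named_parameters() if p.requires_grad]
n_trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
n_total = sum(p.numel() for p in model.parameters())
print(f'{len(trainable)} trainable tensors, {n_trainable}/{n_total} trainable parameters')
print(trainable[:5])  # these should all be *.lora_adapter.* names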

Custom activation function in Tensorflow with trainable params

I am trying to implement a custom version of the PReLU activation function in TensorFlow. The custom thing about this activation is that the knee of the ReLU is smoothed. I got the equation from this paper.
Here is the code:
from keras import backend as K
import tensorflow as tf
def SMU_LeakyPRElu(x, alpha=2.5, u=1.0):
    return ((1 + alpha) * x) + ((1 - alpha) * x) * (tf.math.erf(u * (1 - alpha) * x))

from keras.layers import Layer

class SMU_LeakyPRElu(Layer):
    def __init__(self, alpha=2.5, u=1.0, trainable=False, **kwargs):
        super(SMU_LeakyPRElu, self).__init__(**kwargs)
        self.supports_masking = True
        self.alpha = alpha
        self.u = u
        self.trainable = trainable

    def build(self, input_shape):
        self.alpha_factor = K.variable(self.alpha,
                                       dtype=K.floatx(),
                                       name='alpha_factor')
        self.u_factor = K.variable(self.u,
                                   dtype=K.floatx(),
                                   name='u_factor')
        if self.trainable:
            self._trainable_weights.append(self.alpha_factor)
            self._trainable_weights.append(self.u_factor)
        super(SMU_LeakyPRElu, self).build(input_shape)

    def call(self, inputs, mask=None):
        return SMU_LeakyPRElu(inputs, self.alpha_factor, self.u_factor)

    def get_config(self):
        config = {'alpha': self.get_weights()[0] if self.trainable else self.alpha,
                  'u': self.get_weights()[1] if self.trainable else self.u,
                  'trainable': self.trainable}
        base_config = super(SMU_LeakyPRElu, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

    def compute_output_shape(self, input_shape):
        return input_shape
x = tf.random.normal((1,10,4))
print(x)
input_shape = (1,10,4)
input_layer = tf.keras.layers.Input(shape=input_shape[1:], name="input_layer")
layer_1 = tf.keras.layers.Conv1D(2, 1,padding = 'valid', input_shape=input_shape[:1])(input_layer)
layer_2 = SMU_LeakyPRElu(alpha=2.5,u=1.0,trainable=True)(layer_1)
model = tf.keras.models.Model(input_layer, layer_2, name="model")
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0005), loss="categorical_crossentropy", run_eagerly=True)
print(model.summary())
result = model.predict(x)
print(result)
print(result.shape)
I implemented this code using an example from this post at Data Science SE.
Error:
tf.Tensor(
[[[ 1.0467066 -1.1833347 1.5384735 2.078511 ]
[-1.6025988 -0.30846047 0.8019808 0.3113866 ]
[ 0.58313304 -0.90643036 -0.3926888 -0.6210553 ]
[ 0.16505387 -0.5930619 0.6983522 -0.12211661]
[ 0.06077941 -0.11117186 -1.2540722 -0.32234746]
[ 0.41838828 0.7090619 0.30999053 0.10459523]
[ 0.35603598 -0.2695868 -0.17901018 -0.09100233]
[ 1.2746769 0.8311447 0.02825974 -0.48021472]
[-1.536545 -0.24765234 -0.36437735 -1.1891246 ]
[ 0.7531206 -0.56109476 -0.65761757 0.19102335]]], shape=(1, 10, 4), dtype=float32)
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-50-c9d490dfd533> in <module>
5 input_layer = tf.keras.layers.Input(shape=input_shape[1:], name="input_layer")
6 layer_1 = tf.keras.layers.Conv1D(2, 1,padding = 'valid', input_shape=input_shape[:1])(input_layer)
----> 7 layer_2 = SMU_LeakyPRElu(alpha=2.5,u=1.0,trainable=True)(layer_1)
8
9 model = tf.keras.models.Model(input_layer, layer_2, name="model")
1 frames
/usr/local/lib/python3.7/dist-packages/tensorflow/python/framework/type_spec.py in type_spec_from_value(value)
888 3, "Failed to convert %r to tensor: %s" % (type(value).__name__, e))
889
--> 890 raise TypeError(f"Could not build a TypeSpec for {value} of "
891 f"unsupported type {type(value)}.")
892
TypeError: Could not build a TypeSpec for <__main__.SMU_LeakyPRElu object at 0x7fde698f7850> of unsupported type <class '__main__.SMU_LeakyPRElu'>.
I don't understand this error. How should I implement this function as a custom activation function with trainable parameters alpha and u?
The problem is that you have named your activation function and the custom layer you created the same thing. I refactored your code for you.
Code:
import tensorflow as tf
from typing import Optional
from tensorflow.keras import Model
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Layer
from tensorflow.keras.optimizers import Adam
class SMULeakyPReLU(Layer):
    """``SMULeakyPReLU``."""

    def __init__(self,
                 alpha: float = 2.5,
                 u: float = 1.,
                 trainable: bool = False,
                 **kwargs):
        super().__init__(**kwargs)
        self.alpha = alpha
        self.u = u
        self.trainable = trainable

    def build(self, input_shape: tf.TensorShape):
        super().build(input_shape)
        self.alpha_factor = tf.Variable(
            self.alpha,
            dtype=tf.float32,
            trainable=self.trainable,
            name="alpha_factor")
        self.u_factor = tf.Variable(
            self.u,
            dtype=tf.float32,
            name="u_factor")

    def call(self,
             inputs: tf.Tensor,
             mask: Optional[tf.Tensor] = None
             ) -> tf.Tensor:
        fst = (1. + self.alpha_factor) * inputs
        snd = (1. - self.alpha_factor) * inputs
        trd = tf.math.erf(self.u_factor * (1. - self.alpha_factor) * inputs)
        return fst * snd * trd

    def get_config(self):
        config = {
            "alpha": self.get_weights()[0] if self.trainable else self.alpha,
            "u": self.get_weights()[1] if self.trainable else self.u,
            "trainable": self.trainable
        }
        base_config = super().get_config()
        return dict(list(base_config.items()) + list(config.items()))
Test
# fake data
x = tf.random.normal((1, 10, 4))
# create network
input_layer = Input(shape=x.shape[1:], name="input_layer")
layer_1 = Conv1D(2, 1, padding="valid")(input_layer)
layer_2 = SMULeakyPReLU(alpha=2.5, u=1.0, trainable=True)(layer_1)
# create model
model = Model(input_layer, layer_2, name="model")
# compile model and summary
model.compile(
    optimizer=Adam(learning_rate=5e-4),
    loss="categorical_crossentropy",
    run_eagerly=True)
print(model.summary())
# forward pass
result = model.predict(x)
print(result)
print(result.shape)
# Model: "model"
# _________________________________________________________________
# Layer (type) Output Shape Param #
# =================================================================
# input_layer (InputLayer) [(None, 10, 4)] 0
#
# conv1d_1 (Conv1D) (None, 10, 2) 10
#
# smu_leaky_p_re_lu_1 (SMULea (None, 10, 2) 2
# kyPReLU)
#
# =================================================================
# Total params: 12
# Trainable params: 12
# Non-trainable params: 0
# _________________________________________________________________
# None
# 1/1 [==============================] - 0s 13ms/step
# [[[-1.6503611e+01 -3.5051659e+01]
# [ 4.0098205e-02 1.5923592e+00]
# [-1.4898951e+00 7.5487376e-05]
# [ 3.1900513e+01 2.8786476e+01]
# [ 1.9207695e+01 3.6511238e+01]
# [-6.8302655e-01 -4.7705490e-02]
# [ 9.6008554e-03 7.5611029e+00]
# [ 4.7136435e-01 2.5528276e+00]
# [ 2.6859209e-01 3.3496175e+00]
# [ 1.4372441e+01 3.4978668e+01]]]
# (1, 10, 2)
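A possible follow-up (not in the original answer): to confirm that alpha and u are actually being updated, you could recompile with a regression loss, take one training step on random targets, and inspect the layer's weights afterwards:

# recompile with mse just for this check, then run a single training step
model.compile(optimizer=Adam(learning_rate=1e-3), loss="mse", run_eagerly=True)
y = tf.random.normal((1, 10, 2))       # random targets matching the output shape
model.fit(x, y, epochs=1, verbose=0)
print(model.layers[-1].get_weights())  # [alpha_factor, u_factor] after one update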

TypeError: cannot cast array data from dtype('float64') to dtype('<U32') according to the rule 'safe'

class SigmoidNeuron:
    def __init__(self):
        self.w = None
        self.b = None

    def perceptron(self, x):
        return np.dot(x, self.w.T) + self.b

    def sigmoid(self, x):
        return 1.0 / (1.0 + np.exp(-x))

    def grad_w(self, x, y):
        y_pred = self.sigmoid(self.perceptron(x))
        return (y_pred - y) * y_pred * (1 - y_pred) * x

    def grad_b(self, x, y):
        y_pred = self.sigmoid(self.perceptron(x))
        return (y_pred - y) * y_pred * (1 - y_pred)

    def fit(self, x, y, epochs=1, learning_rate=1, initialise=True):
        # initialise w, b
        if initialise:
            self.w = np.random.randn(1, X.shape[1])
            self.b = 0
        for i in range(epochs):
            dw = 0
            db = 0
            for x, y in zip(X, Y):
                dw += self.grad_w(x, y)
                db += self.grad_b(x, y)
            self.w -= learning_rate * dw
            self.b -= learning_rate * db
I'm running this sigmoid neuron code and I get an error when running the class with my data:
X_scaled_train.astype(float)
array([[ 1.29929126, -0.90185206, 0.03173306, ..., -0.14142136,
-0.15523011, 0.21232515],
[-1.16225208, -0.86697607, 1.03451971, ..., -0.14142136,
-0.15523011, 0.21232515],
[ 1.77523922, 0.65594214, 0.03173306, ..., -0.14142136,
-0.15523011, 0.21232515],
...,
[ 1.44058831, -0.58796815, -0.66464655, ..., -0.14142136,
-0.15523011, 0.21232515],
[-1.42253612, 0.50481285, 1.54984063, ..., -0.14142136,
-0.15523011, 0.21232515],
[ 1.06875397, 0.6791928 , 0.97880934, ..., -0.14142136,
-0.15523011, 0.21232515]])
Y_scaled_train.astype(float)
array([[0.68],
[0.72],
[0.72],
[0.6 ],
[0.8 ],
[0.64],
[0.68],
This is the training data.
When I run this line:
sn.fit(X_scaled_train,Y_scaled_train,epochs=10,learning_rate=0.2)
I get the type error above. What should I do to fix it? The error shows:
TypeError Traceback (most recent call last)
<ipython-input-167-51016d58d1f5> in <module>()
----> 1 sn.fit(X_scaled_train,Y_scaled_train,epochs=10,learning_rate=0.2)
2 frames
<ipython-input-25-2e09637c6d09> in perceptron(self, x)
4 self.b=None
5 def perceptron(self,x):
----> 6 return np.dot(x,self.w.T)+self.b
7 def sigmoid(self,x):
8 return 1.0/(1.0+np.exp(-x))
<__array_function__ internals> in dot(*args, **kwargs)
TypeError: Cannot cast array data from dtype('float64') to dtype('<U32') according to the rule 'safe'
The error means your array still has a string dtype ('<U32'), so convert the data to floats first. Use:
np.array(your_list)
your_list.values.astype(np.float)
Or:
new_list = [float(i) for i in your_list]
Or, for huge arrays, I would strongly recommend using numpy:
np.array(your_list, dtype=np.float32)
np.array(your_list, dtype=float)
You don't have to pass a dtype if the values are already floats:
np.array(your_list)
Or:
new_list = float("{:.1f}".format(float(input())))
list.append(new_list)
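For illustration (a synthetic example, not from the original post), the same error appears whenever one operand of np.dot still holds strings, and astype(float) resolves it:

import numpy as np

X_str = np.array([['1.2', '3.4'], ['5.6', '7.8']])  # string dtype '<U3'
w = np.random.randn(1, 2)

# np.dot(X_str, w.T)         # raises: Cannot cast array data ... according to the rule 'safe'
X_num = X_str.astype(float)  # convert to float64 first
print(np.dot(X_num, w.T))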

Tensorflow saving subclass model which has multiple arguments to call() method

I am following the tensorflow neural machine translation tutorial:
https://www.tensorflow.org/tutorials/text/nmt_with_attention
I am trying to save the Encoder and Decoder models, which are subclasses of tf.keras.Model and work properly during training and inference. However, when I try to save them I get the following error:
TypeError: call() missing 1 required positional argument: 'initial_state'
Here is the code:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_matrix, n_units, batch_size):
        super(Encoder, self).__init__()
        self.n_units = n_units
        self.batch_size = batch_size
        self.embedding = Embedding(vocab_size, embedding_matrix.shape[1], weights=[embedding_matrix], trainable=True, mask_zero=True)
        self.lstm = LSTM(n_units, return_sequences=True, return_state=True, recurrent_initializer="glorot_uniform")

    def call(self, input_utterence, initial_state):
        input_embed = self.embedding(input_utterence)
        encoder_states, h1, c1 = self.lstm(input_embed, initial_state=initial_state)
        return encoder_states, h1, c1

    def create_initial_state(self):
        return tf.zeros((self.batch_size, self.n_units))

encoder = Encoder(vocab_size, embedding_matrix, LSTM_DIM, BATCH_SIZE)
# do some training...
tf.saved_model.save(encoder, "encoder_model")
I also tried to make the call method take a single list argument and unpack the variables I need inside the method, but then I get the following error when trying to save:
File "C:\Users\Fady\Documents\Machine Learning\chatbot\models\seq2seq_model.py", line 32, in call
input_utterence, initial_state = inputs
ValueError: too many values to unpack (expected 2)
You can export the model successfully if you package your inputs into a list. You also need to specify the input signature to export your model. Here is your code with slight modifications, which works:
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM
import numpy as np
print('TensorFlow: ', tf.__version__)
vocab_size = 10000
LSTM_DIM = 256
BATCH_SIZE = 16
embedding_matrix = np.random.randn(vocab_size, 300)
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_matrix, n_units, batch_size):
        super(Encoder, self).__init__()
        self.n_units = n_units
        self.batch_size = batch_size
        self.embedding = Embedding(vocab_size, embedding_matrix.shape[1], weights=[embedding_matrix], trainable=True, mask_zero=True)
        self.lstm = LSTM(n_units, return_sequences=True, return_state=True, recurrent_initializer="glorot_uniform")

    @tf.function
    def call(self, inputs):
        input_utterence, initial_state = inputs
        input_embed = self.embedding(input_utterence)
        encoder_states, h1, c1 = self.lstm(input_embed, initial_state=initial_state)
        return encoder_states, h1, c1

    def create_initial_state(self):
        return tf.zeros((self.batch_size, self.n_units))

random_input = tf.random.uniform(shape=[BATCH_SIZE, 3], maxval=vocab_size, dtype=tf.int32)
encoder = Encoder(vocab_size, embedding_matrix, LSTM_DIM, BATCH_SIZE)
initial_state = [encoder.create_initial_state(), encoder.create_initial_state()]
_ = encoder([random_input, initial_state])  # required so that encoder.build is triggered

tf.saved_model.save(encoder, "encoder_model", signatures=encoder.call.get_concrete_function(
    [
        tf.TensorSpec(shape=[None, None], dtype=tf.int32, name='input_utterence'),
        [
            tf.TensorSpec(shape=[None, LSTM_DIM], dtype=tf.float32, name='initial_h'),
            tf.TensorSpec(shape=[None, LSTM_DIM], dtype=tf.float32, name='initial_c')
        ]
    ]))
loaded_model = tf.saved_model.load('encoder_model')
loaded_model([random_input, initial_state])
output:
TensorFlow: 2.2.0-rc1
WARNING:tensorflow:From /home/dl_user/tf_stable/lib/python3.7/site-packages/tensorflow/python/ops/resource_variable_ops.py:1817: calling BaseResourceVariable.__init__ (from tensorflow.python.ops.resource_variable_ops) with constraint is deprecated and will be removed in a future version.
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Assets written to: encoder_model/assets
(<tf.Tensor: shape=(16, 3, 256), dtype=float32, numpy=
array([[[-0.06000457, 0.02422162, -0.05310762, ..., -0.01340707,
0.12212028, -0.02747637],
[ 0.13303193, 0.3119418 , -0.17995344, ..., -0.10185111,
0.09568192, 0.06919193],
[-0.08075664, -0.11490613, -0.20294832, ..., -0.14999194,
0.02177649, 0.05538464]],
[[-0.03792192, -0.08431012, 0.03687581, ..., -0.1768839 ,
-0.10469476, 0.08730042],
[-0.02956271, 0.43850696, -0.07400024, ..., 0.04097629,
0.209705 , 0.27194855],
[ 0.02529916, 0.18367583, -0.11409087, ..., 0.0458075 ,
0.2065246 , 0.22976378]],
[[ 0.04196627, 0.08302739, 0.02218204, ..., 0.07388053,
-0.05696848, -0.31895265],
[-0.00536443, 0.1566213 , -0.22412768, ..., 0.10560389,
0.20187919, -0.1896591 ],
[ 0.26364946, 0.13163888, 0.14586888, ..., 0.19517538,
0.17677066, -0.40476215]],
...,
[[ 0.10999472, 0.07398727, 0.23443945, ..., -0.1912791 ,
-0.0195728 , 0.11717851],
[ 0.03978832, 0.07587367, 0.16567066, ..., -0.29463592,
0.05950819, 0.0242265 ],
[ 0.2505787 , 0.15849623, 0.06635283, ..., -0.17969091,
0.12549783, -0.11459641]],
[[-0.20408148, 0.04629526, 0.00601436, ..., 0.21321473,
0.04952445, -0.0129672 ],
[-0.14671509, 0.2911171 , 0.13047697, ..., -0.03531414,
-0.16794083, 0.01575338],
[-0.08337164, 0.08723269, 0.16235027, ..., 0.07919721,
0.05701642, 0.15379705]],
[[-0.2747393 , 0.24351111, -0.05829309, ..., -0.00448833,
0.07568972, 0.03978251],
[-0.16282909, -0.04586324, -0.0054924 , ..., 0.11050001,
0.1312355 , 0.16555254],
[ 0.07759799, -0.07308074, -0.10038756, ..., 0.18139914,
0.07769153, 0.1375772 ]]], dtype=float32)>,
<tf.Tensor: shape=(16, 256), dtype=float32, numpy=
array([[-0.08075664, -0.11490613, -0.20294832, ..., -0.14999194,
0.02177649, 0.05538464],
[ 0.02529916, 0.18367583, -0.11409087, ..., 0.0458075 ,
0.2065246 , 0.22976378],
[ 0.26364946, 0.13163888, 0.14586888, ..., 0.19517538,
0.17677066, -0.40476215],
...,
[ 0.2505787 , 0.15849623, 0.06635283, ..., -0.17969091,
0.12549783, -0.11459641],
[-0.08337164, 0.08723269, 0.16235027, ..., 0.07919721,
0.05701642, 0.15379705],
[ 0.07759799, -0.07308074, -0.10038756, ..., 0.18139914,
0.07769153, 0.1375772 ]], dtype=float32)>,
<tf.Tensor: shape=(16, 256), dtype=float32, numpy=
array([[-0.32829475, -0.18770668, -0.2956414 , ..., -0.2427501 ,
0.03146099, 0.16033864],
[ 0.05112522, 0.6664379 , -0.19836858, ..., 0.10015503,
0.511694 , 0.51550364],
[ 0.3379809 , 0.7145362 , 0.22311993, ..., 0.372106 ,
0.25914627, -0.81374717],
...,
[ 0.36742535, 0.29009506, 0.13245934, ..., -0.4318537 ,
0.26666188, -0.20086129],
[-0.17384854, 0.22998339, 0.27335796, ..., 0.09973672,
0.10726923, 0.47339764],
[ 0.22148325, -0.11998752, -0.16339599, ..., 0.31903535,
0.20365229, 0.28087002]], dtype=float32)>)

Using to Caffe2 to create a model that uses dropout but getting an error related to dropout code

I'm trying to create a model in Caffe2 that uses dropout, but I'm getting an error that refers to my dropout code.
def someModel(model, data):
    conv1 = brew.conv(model, data, 'conv1', dim_in=1, dim_out=20, kernel=5)
    conv_relu_1 = model.net.Relu(conv1, 'relu1')
    conv2 = brew.conv(model, conv_relu_1, 'conv2', dim_in=1, dim_out=20, kernel=5)
    conv_relu_2 = model.net.Relu(conv2, 'relu2')
    pool1 = model.net.MaxPool(conv_relu_2, 'pool1', kernel=2, stride=2)
    drop1 = model.Dropout(pool1, 'drop1', ratio=0.5, is_test=0)
    # drop1 = model.Dropout(pool1, 'drop1', ratio=0.5)
    conv3 = brew.conv(model, drop1, 'conv3', dim_in=1, dim_out=50, kernel=3)
    conv_relu_3 = model.net.Relu(conv3, 'relu3')
    conv4 = brew.conv(model, conv_relu_3, 'conv4', dim_in=1, dim_out=20, kernel=5)
    conv_relu_4 = model.net.Relu(conv4, 'relu4')
    pool2 = model.net.MaxPool(conv_relu_4, 'pool1', kernel=2, stride=2)
    drop2 = model.Dropout(pool2, 'drop2', ratio=0.5)
    fc1 = brew.fc(model, drop2, 'fc1', dim_in=20 * 4 * 4, dim_out=50)
    fc_relu_1 = model.net.Relu(fc1, 'relu5')
    fc2 = brew.fc(model, fc_relu_1, 'fc2', dim_in=50 * 4 * 4, dim_out=10)
    pred = brew.fc(model, fc2, 'pred', 500, 10)
    softmax = model.net.Softmax(pred, 'softmax')
    return softmax
    return pred
Below is the error I'm getting.
Exception when creating gradient for [Dropout]:[enforce fail at operator_gradient.h:86] schema->Verify(def_). (GradientMaker) Operator def did not pass schema checking: input: "pool1" output: "drop2" name: "" type: "Dropout" arg { name: "ratio" f: 0.5 } . Op: input: "pool1" output: "drop2" name: "" type: "Dropout" arg {name: "ratio" f: 0.5}
Define the dropout layer as:
dropout1 = brew.dropout(model, pool1, 'dropout1', ratio=0.5, is_test=0)
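Applied to the model above (a sketch of how the suggestion would look; only the dropout lines change), both dropout ops go through brew so the gradient schema check passes:

# inside someModel(), replace both model.Dropout calls with brew.dropout
drop1 = brew.dropout(model, pool1, 'drop1', ratio=0.5, is_test=0)
drop2 = brew.dropout(model, pool2, 'drop2', ratio=0.5, is_test=0)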
