Multiple LSTM layers vs. multiple LSTMs in PyTorch - python
I am using two ways to create a two-layer LSTM, as shown in the two code snippets below. Can anyone tell me why the outputs are not the same, and, if you have the experience, which one is better? Thanks so much! (Thanks for the suggestion of initializing both models with the same weights and biases; I have added it to the original code. Despite the same initial parameters, their outputs are still not the same...)
The first way, using num_layers:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
torch.manual_seed(1)
lstm = nn.LSTM(3, 3, 2)  # input dim is 3, hidden dim is 3, num_layers is 2
inputs = [torch.randn(1, 3) for _ in range(5)]  # make a sequence of length 5

weight_ih_0 = None
weight_hh_0 = None
# bias_ih_0 = None
# bias_hh_0 = None
weight_ih_1 = None
weight_hh_1 = None
# bias_ih_1 = None
# bias_hh_1 = None

for name, param in lstm.named_parameters():
    if 'bias' in name:
        # print(f'bias {name} before init: {param}')
        nn.init.constant_(param, 0.0)
        # print(f'bias {name} after init: {param}')
    elif 'weight' in name:
        # print(f'weight {name} before init: {param}')
        nn.init.xavier_normal_(param)
        print(f'weight {name} after init: {param}')

# keep references to the initialized weights so the second model can reuse them
for name, param in lstm.named_parameters():
    if 'weight_ih_l0' in name:
        weight_ih_0 = param
    if 'weight_hh_l0' in name:
        weight_hh_0 = param
    if 'weight_ih_l1' in name:
        weight_ih_1 = param
    if 'weight_hh_l1' in name:
        weight_hh_1 = param

print(f'inputs: {inputs}')

# initialize the hidden and cell states: one slice per layer, shape (num_layers, batch, hidden_size)
hidden = (torch.zeros(2, 1, 3),
          torch.zeros(2, 1, 3))

idx = 0
for i in inputs:
    print(f'idx: {idx}')
    # print(f'i: {i}')
    idx += 1
    # Step through the sequence one element at a time.
    # After each step, hidden contains the hidden and cell states.
    out, hidden = lstm(i.view(1, 1, -1), hidden)
    print(out)
    print("==========")
    # print(hidden)
The output is:
weight weight_ih_l0 after init: Parameter containing:
tensor([[ 0.6025, -0.1577, -0.0990],
[-0.5255, 0.4554, 0.4651],
[ 0.1428, 0.1414, -0.0291],
[ 0.1248, 0.3465, -0.5053],
[ 0.6295, -0.8635, -0.3394],
[ 0.1072, 0.0786, 0.3427],
[ 0.5352, -0.2032, 0.8816],
[ 0.3727, -0.1608, -0.6332],
[-0.3745, 0.1903, -0.1654],
[-0.0460, -0.2148, 0.7737],
[-0.1980, -0.8980, -0.3470],
[-0.1130, 0.6074, 0.1844]], requires_grad=True)
weight weight_hh_l0 after init: Parameter containing:
tensor([[-0.0719, -0.0122, 0.2626],
[ 0.3887, -0.3044, -0.4356],
[-0.8422, 0.2204, 0.1151],
[ 0.4171, 0.1116, -0.2114],
[ 0.2061, -0.3204, -0.0983],
[ 0.4791, -0.5683, -0.3928],
[-0.3196, -0.1726, -0.0732],
[-0.3058, -0.5667, -0.0211],
[-0.0832, -0.3168, 0.1241],
[-0.4197, 0.0525, 0.0741],
[ 0.3849, 0.0481, -0.3130],
[ 0.5788, 0.6312, -0.3627]], requires_grad=True)
weight weight_ih_l1 after init: Parameter containing:
tensor([[ 3.6955e-02, 7.1276e-02, -4.3073e-01],
[-5.2666e-01, 2.7323e-02, 1.2894e-01],
[ 3.7136e-01, 3.3969e-01, 1.9601e-01],
[ 3.5802e-01, -4.3600e-01, -1.7962e-01],
[ 8.3209e-01, 1.7189e-01, 2.2195e-01],
[-2.1302e-02, -1.6867e-01, -1.3460e-01],
[ 1.3446e-01, 1.7708e-01, -5.6676e-01],
[-2.3697e-01, -2.8254e-02, -2.2063e-01],
[-2.0928e-01, 3.4973e-01, 3.5858e-04],
[-5.0565e-01, -6.8619e-02, 3.7702e-01],
[-9.0796e-02, -1.7238e-01, 4.7868e-01],
[-1.1565e-01, -6.7956e-02, -2.1049e-01]], requires_grad=True)
weight weight_hh_l1 after init: Parameter containing:
tensor([[-0.3017, -0.0811, -0.6554],
[ 0.2665, -0.2052, -0.0577],
[ 0.5493, -0.5094, 0.2167],
[ 0.1210, -0.3868, -0.2293],
[-0.0991, 0.6744, -0.0114],
[-0.0343, -0.6136, 0.4856],
[ 0.0505, 0.3920, -0.1662],
[ 0.1163, -0.1296, 0.2505],
[-0.1373, -0.8803, -0.4666],
[-0.0230, -0.0346, -0.8451],
[ 0.2032, 0.1847, -0.0758],
[ 0.2533, 0.1532, 0.8224]], requires_grad=True)
inputs: [tensor([[1.5381, 1.4673, 1.5951]]), tensor([[-1.5279, 1.0156, -0.2020]]), tensor([[-1.2865, 0.8231, -0.6101]]), tensor([[-1.2960, -0.9434, 0.6684]]), tensor([[ 1.1628, -0.3229, 1.8782]])]
idx: 0
tensor([[[ 0.0374, -0.0085, -0.0240]]], grad_fn=<StackBackward>)
==========
idx: 1
tensor([[[ 0.0073, -0.0110, -0.0296]]], grad_fn=<StackBackward>)
==========
idx: 2
tensor([[[-0.0314, -0.0147, -0.0136]]], grad_fn=<StackBackward>)
==========
idx: 3
tensor([[[-0.0458, -0.0118, -0.0254]]], grad_fn=<StackBackward>)
==========
idx: 4
tensor([[[-0.0096, -0.0281, -0.0440]]], grad_fn=<StackBackward>)
==========
The second way, creating two individual LSTM cells:
import copy

torch.manual_seed(1)
lstm = nn.LSTMCell(3, 3)   # input dim is 3, hidden dim is 3
lstm2 = nn.LSTMCell(3, 3)  # input dim is 3, hidden dim is 3
inputs = [torch.randn(1, 3) for _ in range(5)]  # make a sequence of length 5

for name, param in lstm.named_parameters():
    if 'bias' in name:
        # print(f'lstm bias {name} before init: {param}')
        nn.init.constant_(param, 0.0)
        # print(f'lstm bias {name} after init: {param}')
    elif 'weight' in name:
        # print(f'lstm weight {name} before init: {param}')
        if 'weight_ih' in name:
            param = copy.deepcopy(weight_ih_0)
            print(f'lstm {name} after init: {param}')
        if 'weight_hh' in name:
            param = copy.deepcopy(weight_hh_0)
            print(f'lstm {name} after init: {param}')

for name, param in lstm2.named_parameters():
    if 'bias' in name:
        # print(f'lstm2 bias {name} before init: {param}')
        nn.init.constant_(param, 0.0)
        # print(f'lstm2 bias {name} after init: {param}')
    elif 'weight' in name:
        # print(f'lstm2 weight {name} before init: {param}')
        if 'weight_ih' in name:
            param = copy.deepcopy(weight_ih_1)
            print(f'lstm2 {name} after init: {param}')
        if 'weight_hh' in name:
            param = copy.deepcopy(weight_hh_1)
            print(f'lstm2 {name} after init: {param}')

print(f'inputs: {inputs}')

# initialize the hidden and cell states.
hidden = torch.zeros(1, 3)
cell = torch.zeros(1, 3)

idx = 0
for i in inputs:
    print(f'idx: {idx}')
    idx += 1
    # Step through the sequence one element at a time.
    # After each step, hidden contains the hidden state.
    hidden, cell = lstm(i.view(1, -1), (hidden, cell))
    # print(hidden.shape)
    hidden, cell = lstm2(hidden, (hidden, cell))
    print(hidden)
    print("==========")
And the output is:
lstm weight_ih after init: Parameter containing:
tensor([[ 0.6025, -0.1577, -0.0990],
[-0.5255, 0.4554, 0.4651],
[ 0.1428, 0.1414, -0.0291],
[ 0.1248, 0.3465, -0.5053],
[ 0.6295, -0.8635, -0.3394],
[ 0.1072, 0.0786, 0.3427],
[ 0.5352, -0.2032, 0.8816],
[ 0.3727, -0.1608, -0.6332],
[-0.3745, 0.1903, -0.1654],
[-0.0460, -0.2148, 0.7737],
[-0.1980, -0.8980, -0.3470],
[-0.1130, 0.6074, 0.1844]], requires_grad=True)
lstm weight_hh after init: Parameter containing:
tensor([[-0.0719, -0.0122, 0.2626],
[ 0.3887, -0.3044, -0.4356],
[-0.8422, 0.2204, 0.1151],
[ 0.4171, 0.1116, -0.2114],
[ 0.2061, -0.3204, -0.0983],
[ 0.4791, -0.5683, -0.3928],
[-0.3196, -0.1726, -0.0732],
[-0.3058, -0.5667, -0.0211],
[-0.0832, -0.3168, 0.1241],
[-0.4197, 0.0525, 0.0741],
[ 0.3849, 0.0481, -0.3130],
[ 0.5788, 0.6312, -0.3627]], requires_grad=True)
lstm2 weight_ih after init: Parameter containing:
tensor([[ 3.6955e-02, 7.1276e-02, -4.3073e-01],
[-5.2666e-01, 2.7323e-02, 1.2894e-01],
[ 3.7136e-01, 3.3969e-01, 1.9601e-01],
[ 3.5802e-01, -4.3600e-01, -1.7962e-01],
[ 8.3209e-01, 1.7189e-01, 2.2195e-01],
[-2.1302e-02, -1.6867e-01, -1.3460e-01],
[ 1.3446e-01, 1.7708e-01, -5.6676e-01],
[-2.3697e-01, -2.8254e-02, -2.2063e-01],
[-2.0928e-01, 3.4973e-01, 3.5858e-04],
[-5.0565e-01, -6.8619e-02, 3.7702e-01],
[-9.0796e-02, -1.7238e-01, 4.7868e-01],
[-1.1565e-01, -6.7956e-02, -2.1049e-01]], requires_grad=True)
lstm2 weight_hh after init: Parameter containing:
tensor([[-0.3017, -0.0811, -0.6554],
[ 0.2665, -0.2052, -0.0577],
[ 0.5493, -0.5094, 0.2167],
[ 0.1210, -0.3868, -0.2293],
[-0.0991, 0.6744, -0.0114],
[-0.0343, -0.6136, 0.4856],
[ 0.0505, 0.3920, -0.1662],
[ 0.1163, -0.1296, 0.2505],
[-0.1373, -0.8803, -0.4666],
[-0.0230, -0.0346, -0.8451],
[ 0.2032, 0.1847, -0.0758],
[ 0.2533, 0.1532, 0.8224]], requires_grad=True)
inputs: [tensor([[1.5381, 1.4673, 1.5951]]), tensor([[-1.5279, 1.0156, -0.2020]]), tensor([[-1.2865, 0.8231, -0.6101]]), tensor([[-1.2960, -0.9434, 0.6684]]), tensor([[ 1.1628, -0.3229, 1.8782]])]
idx: 0
tensor([[-0.0152, -0.0344, 0.0368]], grad_fn=<MulBackward0>)
==========
idx: 1
tensor([[-0.0265, -0.0143, 0.0730]], grad_fn=<MulBackward0>)
==========
idx: 2
tensor([[-0.0210, -0.0033, 0.0529]], grad_fn=<MulBackward0>)
==========
idx: 3
tensor([[-0.0580, -0.0201, 0.1194]], grad_fn=<MulBackward0>)
==========
idx: 4
tensor([[-0.0672, -0.0801, 0.1165]], grad_fn=<MulBackward0>)
==========
Although you initialized the two LSTMs, their initial weights are obviously different. You can verify this with the following code:
for p in lstm.parameters():
    print(p)
I would prefer the first method, because it does not require us to manually wire multiple layers together.
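As a quick check (a minimal sketch, assuming the lstm and lstm2 cells from the question are still in scope), you can also compare the corresponding parameters of the two cells directly:
for (n1, p1), (n2, p2) in zip(lstm.named_parameters(), lstm2.named_parameters()):
    # the names line up (weight_ih, weight_hh, bias_ih, bias_hh), but the values generally will not
    print(n1, n2, torch.equal(p1, p2))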
I have the answer now. At the very beginning, I confused the hidden state of the second LSTM layer with its input.
For a stacked LSTM with num_layers=2, we initialize hidden states for two layers, since each LSTM layer needs its own initial hidden state, while the second layer takes the output hidden state of the first layer as its input.
For the model built from individual LSTMs, since each layer of the stacked model above starts with zero hidden states, we should likewise initialize both individual LSTMs with zero hidden states, as sketched below.
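To make the shapes concrete (a minimal sketch, not part of the original code): the stacked LSTM carries one (h, c) slice per layer, and each slice has exactly the shape a single-layer LSTM expects.
h0 = torch.zeros(2, 1, 3)  # (num_layers, batch, hidden_size) for the stacked LSTM
c0 = torch.zeros(2, 1, 3)
# one slice per layer; each slice has shape (1, 1, 3), i.e. what a num_layers=1 LSTM expects
h0_layer0, c0_layer0 = h0[0:1], c0[0:1]
h0_layer1, c0_layer1 = h0[1:2], c0[1:2]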
In addition, I made a mistake when initializing the weights and biases: assigning to the loop variable param only rebinds the local name and never touches the module's actual parameters.
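In other words (a minimal sketch of the pitfall, assuming a single-layer nn.LSTM as in the corrected code below), the assignment inside the loop has no effect on the module, whereas assigning through the module attribute does:
for name, param in lstm.named_parameters():
    if 'weight_ih' in name:
        param = copy.deepcopy(weight_ih_0)  # rebinds the local name only; lstm is unchanged
# assigning through the module attribute (as in the corrected code below) does update it
lstm.weight_ih_l0.data = weight_ih_0
# an alternative with the same effect:
with torch.no_grad():
    lstm.weight_ih_l0.copy_(weight_ih_0)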
As a result, to make the two methods produce the same outputs, I use the following code.
The first method:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
torch.manual_seed(1)
lstm = nn.LSTM(3, 3, 2)  # input dim is 3, hidden dim is 3, num_layers is 2
inputs = [torch.randn(1, 3) for _ in range(5)]  # make a sequence of length 5

weight_ih_0 = None
weight_hh_0 = None
# bias_ih_0 = None
# bias_hh_0 = None
weight_ih_1 = None
weight_hh_1 = None
# bias_ih_1 = None
# bias_hh_1 = None

for name, param in lstm.named_parameters():
    if 'bias' in name:
        # print(f'bias {name} before init: {param}')
        nn.init.constant_(param, 0.0)
        # print(f'bias {name} after init: {param}')
    elif 'weight' in name:
        # print(f'weight {name} before init: {param}')
        nn.init.xavier_normal_(param)
        print(f'weight {name} after init: {param}')

# keep references to the initialized weights so the second model can reuse them
for name, param in lstm.named_parameters():
    if 'weight_ih_l0' in name:
        weight_ih_0 = param
    if 'weight_hh_l0' in name:
        weight_hh_0 = param
    if 'weight_ih_l1' in name:
        weight_ih_1 = param
    if 'weight_hh_l1' in name:
        weight_hh_1 = param

print(f'inputs: {inputs}')

# initialize the hidden and cell states: one slice per layer, shape (num_layers, batch, hidden_size)
hidden = (torch.zeros(2, 1, 3),
          torch.zeros(2, 1, 3))

idx = 0
for i in inputs:
    print(f'idx: {idx}')
    # print(f'i: {i}')
    idx += 1
    # Step through the sequence one element at a time.
    # After each step, hidden contains the hidden and cell states.
    out, hidden = lstm(i.view(1, 1, -1), hidden)
    print(out)
    # print(hidden)
    print("==========")
And the output is:
weight weight_ih_l0 after init: Parameter containing:
tensor([[ 0.6025, -0.1577, -0.0990],
[-0.5255, 0.4554, 0.4651],
[ 0.1428, 0.1414, -0.0291],
[ 0.1248, 0.3465, -0.5053],
[ 0.6295, -0.8635, -0.3394],
[ 0.1072, 0.0786, 0.3427],
[ 0.5352, -0.2032, 0.8816],
[ 0.3727, -0.1608, -0.6332],
[-0.3745, 0.1903, -0.1654],
[-0.0460, -0.2148, 0.7737],
[-0.1980, -0.8980, -0.3470],
[-0.1130, 0.6074, 0.1844]], requires_grad=True)
weight weight_hh_l0 after init: Parameter containing:
tensor([[-0.0719, -0.0122, 0.2626],
[ 0.3887, -0.3044, -0.4356],
[-0.8422, 0.2204, 0.1151],
[ 0.4171, 0.1116, -0.2114],
[ 0.2061, -0.3204, -0.0983],
[ 0.4791, -0.5683, -0.3928],
[-0.3196, -0.1726, -0.0732],
[-0.3058, -0.5667, -0.0211],
[-0.0832, -0.3168, 0.1241],
[-0.4197, 0.0525, 0.0741],
[ 0.3849, 0.0481, -0.3130],
[ 0.5788, 0.6312, -0.3627]], requires_grad=True)
weight weight_ih_l1 after init: Parameter containing:
tensor([[ 3.6955e-02, 7.1276e-02, -4.3073e-01],
[-5.2666e-01, 2.7323e-02, 1.2894e-01],
[ 3.7136e-01, 3.3969e-01, 1.9601e-01],
[ 3.5802e-01, -4.3600e-01, -1.7962e-01],
[ 8.3209e-01, 1.7189e-01, 2.2195e-01],
[-2.1302e-02, -1.6867e-01, -1.3460e-01],
[ 1.3446e-01, 1.7708e-01, -5.6676e-01],
[-2.3697e-01, -2.8254e-02, -2.2063e-01],
[-2.0928e-01, 3.4973e-01, 3.5858e-04],
[-5.0565e-01, -6.8619e-02, 3.7702e-01],
[-9.0796e-02, -1.7238e-01, 4.7868e-01],
[-1.1565e-01, -6.7956e-02, -2.1049e-01]], requires_grad=True)
weight weight_hh_l1 after init: Parameter containing:
tensor([[-0.3017, -0.0811, -0.6554],
[ 0.2665, -0.2052, -0.0577],
[ 0.5493, -0.5094, 0.2167],
[ 0.1210, -0.3868, -0.2293],
[-0.0991, 0.6744, -0.0114],
[-0.0343, -0.6136, 0.4856],
[ 0.0505, 0.3920, -0.1662],
[ 0.1163, -0.1296, 0.2505],
[-0.1373, -0.8803, -0.4666],
[-0.0230, -0.0346, -0.8451],
[ 0.2032, 0.1847, -0.0758],
[ 0.2533, 0.1532, 0.8224]], requires_grad=True)
inputs: [tensor([[1.5381, 1.4673, 1.5951]]), tensor([[-1.5279, 1.0156, -0.2020]]), tensor([[-1.2865, 0.8231, -0.6101]]), tensor([[-1.2960, -0.9434, 0.6684]]), tensor([[ 1.1628, -0.3229, 1.8782]])]
idx: 0
tensor([[[ 0.0374, -0.0085, -0.0240]]], grad_fn=<StackBackward>)
==========
idx: 1
tensor([[[ 0.0073, -0.0110, -0.0296]]], grad_fn=<StackBackward>)
==========
idx: 2
tensor([[[-0.0314, -0.0147, -0.0136]]], grad_fn=<StackBackward>)
==========
idx: 3
tensor([[[-0.0458, -0.0118, -0.0254]]], grad_fn=<StackBackward>)
==========
idx: 4
tensor([[[-0.0096, -0.0281, -0.0440]]], grad_fn=<StackBackward>)
==========
The second method:
torch.manual_seed(1)
lstm = nn.LSTM(3, 3, 1)   # input dim is 3, hidden dim is 3, a single layer
lstm2 = nn.LSTM(3, 3, 1)  # input dim is 3, hidden dim is 3, a single layer
inputs = [torch.randn(1, 3) for _ in range(5)]  # make a sequence of length 5
print(f'inputs: {inputs}')

# initialize the hidden and cell states, one pair per single-layer LSTM.
hidden1 = (torch.zeros(1, 1, 3),
           torch.zeros(1, 1, 3))
hidden2 = (torch.zeros(1, 1, 3),
           torch.zeros(1, 1, 3))

for name, param in lstm.named_parameters():
    if 'bias' in name:
        # print(f'lstm bias {name} before init: {param}')
        nn.init.constant_(param, 0.0)
        # print(f'lstm bias {name} after init: {param}')
    elif 'weight' in name:
        # print(f'lstm weight {name} before init: {param}')
        if 'weight_ih' in name:
            lstm.weight_ih_l0.data = weight_ih_0
            print(f'lstm {name} after init: {param}')
        if 'weight_hh' in name:
            lstm.weight_hh_l0.data = weight_hh_0
            print(f'lstm {name} after init: {param}')

for name, param in lstm2.named_parameters():
    if 'bias' in name:
        # print(f'lstm2 bias {name} before init: {param}')
        nn.init.constant_(param, 0.0)
        # print(f'lstm2 bias {name} after init: {param}')
    elif 'weight' in name:
        # print(f'lstm2 weight {name} before init: {param}')
        if 'weight_ih' in name:
            lstm2.weight_ih_l0.data = weight_ih_1
            print(f'lstm2 {name} after init: {param}')
        if 'weight_hh' in name:
            lstm2.weight_hh_l0.data = weight_hh_1
            print(f'lstm2 {name} after init: {param}')

for name, param in lstm2.named_parameters():
    if 'weight' in name:
        # print(f'lstm2 weight {name} before init: {param}')
        print(f'lstm2 {name} after init: {param}')

idx = 0
for i in inputs:
    print(f'idx: {idx}')
    idx += 1
    # Step through the sequence one element at a time,
    # feeding the output of the first LSTM into the second one.
    out, hidden1 = lstm(i.view(1, 1, -1), hidden1)
    out, hidden2 = lstm2(out.view(1, 1, -1), hidden2)
    print(out)
    print("==========")
And the output is:
inputs: [tensor([[1.5381, 1.4673, 1.5951]]), tensor([[-1.5279, 1.0156, -0.2020]]), tensor([[-1.2865, 0.8231, -0.6101]]), tensor([[-1.2960, -0.9434, 0.6684]]), tensor([[ 1.1628, -0.3229, 1.8782]])]
lstm weight_ih_l0 after init: Parameter containing:
tensor([[ 0.6025, -0.1577, -0.0990],
[-0.5255, 0.4554, 0.4651],
[ 0.1428, 0.1414, -0.0291],
[ 0.1248, 0.3465, -0.5053],
[ 0.6295, -0.8635, -0.3394],
[ 0.1072, 0.0786, 0.3427],
[ 0.5352, -0.2032, 0.8816],
[ 0.3727, -0.1608, -0.6332],
[-0.3745, 0.1903, -0.1654],
[-0.0460, -0.2148, 0.7737],
[-0.1980, -0.8980, -0.3470],
[-0.1130, 0.6074, 0.1844]], requires_grad=True)
lstm weight_hh_l0 after init: Parameter containing:
tensor([[-0.0719, -0.0122, 0.2626],
[ 0.3887, -0.3044, -0.4356],
[-0.8422, 0.2204, 0.1151],
[ 0.4171, 0.1116, -0.2114],
[ 0.2061, -0.3204, -0.0983],
[ 0.4791, -0.5683, -0.3928],
[-0.3196, -0.1726, -0.0732],
[-0.3058, -0.5667, -0.0211],
[-0.0832, -0.3168, 0.1241],
[-0.4197, 0.0525, 0.0741],
[ 0.3849, 0.0481, -0.3130],
[ 0.5788, 0.6312, -0.3627]], requires_grad=True)
lstm2 weight_ih_l0 after init: Parameter containing:
tensor([[ 3.6955e-02, 7.1276e-02, -4.3073e-01],
[-5.2666e-01, 2.7323e-02, 1.2894e-01],
[ 3.7136e-01, 3.3969e-01, 1.9601e-01],
[ 3.5802e-01, -4.3600e-01, -1.7962e-01],
[ 8.3209e-01, 1.7189e-01, 2.2195e-01],
[-2.1302e-02, -1.6867e-01, -1.3460e-01],
[ 1.3446e-01, 1.7708e-01, -5.6676e-01],
[-2.3697e-01, -2.8254e-02, -2.2063e-01],
[-2.0928e-01, 3.4973e-01, 3.5858e-04],
[-5.0565e-01, -6.8619e-02, 3.7702e-01],
[-9.0796e-02, -1.7238e-01, 4.7868e-01],
[-1.1565e-01, -6.7956e-02, -2.1049e-01]], requires_grad=True)
lstm2 weight_hh_l0 after init: Parameter containing:
tensor([[-0.3017, -0.0811, -0.6554],
[ 0.2665, -0.2052, -0.0577],
[ 0.5493, -0.5094, 0.2167],
[ 0.1210, -0.3868, -0.2293],
[-0.0991, 0.6744, -0.0114],
[-0.0343, -0.6136, 0.4856],
[ 0.0505, 0.3920, -0.1662],
[ 0.1163, -0.1296, 0.2505],
[-0.1373, -0.8803, -0.4666],
[-0.0230, -0.0346, -0.8451],
[ 0.2032, 0.1847, -0.0758],
[ 0.2533, 0.1532, 0.8224]], requires_grad=True)
lstm2 weight_ih_l0 after init: Parameter containing:
tensor([[ 3.6955e-02, 7.1276e-02, -4.3073e-01],
[-5.2666e-01, 2.7323e-02, 1.2894e-01],
[ 3.7136e-01, 3.3969e-01, 1.9601e-01],
[ 3.5802e-01, -4.3600e-01, -1.7962e-01],
[ 8.3209e-01, 1.7189e-01, 2.2195e-01],
[-2.1302e-02, -1.6867e-01, -1.3460e-01],
[ 1.3446e-01, 1.7708e-01, -5.6676e-01],
[-2.3697e-01, -2.8254e-02, -2.2063e-01],
[-2.0928e-01, 3.4973e-01, 3.5858e-04],
[-5.0565e-01, -6.8619e-02, 3.7702e-01],
[-9.0796e-02, -1.7238e-01, 4.7868e-01],
[-1.1565e-01, -6.7956e-02, -2.1049e-01]], requires_grad=True)
lstm2 weight_hh_l0 after init: Parameter containing:
tensor([[-0.3017, -0.0811, -0.6554],
[ 0.2665, -0.2052, -0.0577],
[ 0.5493, -0.5094, 0.2167],
[ 0.1210, -0.3868, -0.2293],
[-0.0991, 0.6744, -0.0114],
[-0.0343, -0.6136, 0.4856],
[ 0.0505, 0.3920, -0.1662],
[ 0.1163, -0.1296, 0.2505],
[-0.1373, -0.8803, -0.4666],
[-0.0230, -0.0346, -0.8451],
[ 0.2032, 0.1847, -0.0758],
[ 0.2533, 0.1532, 0.8224]], requires_grad=True)
idx: 0
tensor([[[ 0.0374, -0.0085, -0.0240]]], grad_fn=<StackBackward>)
==========
idx: 1
tensor([[[ 0.0073, -0.0110, -0.0296]]], grad_fn=<StackBackward>)
==========
idx: 2
tensor([[[-0.0314, -0.0147, -0.0136]]], grad_fn=<StackBackward>)
==========
idx: 3
tensor([[[-0.0458, -0.0118, -0.0254]]], grad_fn=<StackBackward>)
==========
idx: 4
tensor([[[-0.0096, -0.0281, -0.0440]]], grad_fn=<StackBackward>)
==========