I am new to Pytorch and I have compiled the below code from different articles and code snippets. The code is basically taking in sequence of products and then predicting the next product in a sequence.
I am trying to find accuracy of this model but not sure how to do it. Any help or suggestion would be appreciated.
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
torch.manual_seed(50)
prod_list = ['AA105045091',
'C2106264154',
'B2106691381',
'AA105045091',
'B2106691381',
'X3106692282',
'V2106350393',
'C2106264154',
'V6104504285',
'A2106329636',
'M6M100936257',
'N2101433968',
'X2M200042701',
'V3M200052002',
'K5101434063',
'B1106334744',
'P1103790575',
'K1106031596',
'E3D227124S6',
'D1105834415',
'M4102794084',
'B4101250283',
'C2102794082',
'D1106816721',
'B5106788450',
'A3106805351',
'C2106788452',
'C2106805373',
'B2106788454',
'A1104146375']
prod_list
sequences = []
for i in range(3, len(prod_list)):
words = prod_list[i-3:i+1]
sequences.append(words)
# split the sequence to input list and output list
X = []
y= []
for i in sequences:
X.append(i[0:3])
y.append(i[3])
# create integer-to-token mapping
int2token = {}
cnt = 0
for w in set(" ".join(prod_list).split()):
int2token[cnt] = w
cnt+= 1
# create token-to-integer mapping
token2int = {t: i for i, t in int2token.items()}
def get_integer_seq_train(seq):
new_list = []
for i in seq:
new_list.append(token2int[i])
return new_list
# convert text sequences to integer sequences
x_int = [get_integer_seq_train(i) for i in X]
# convert lists to numpy arrays
x_int = np.array(x_int)
vocab_size = len(int2token)
vocab_size
def get_integer_seq_test(seq):
return [token2int[w] for w in seq.split()]
#return [token2int[w] for w in seq.split()]
# convert text sequences to integer sequences
y_int = [get_integer_seq_test(i) for i in y]
# convert lists to numpy arrays
y_int = np.array(y_int)
def get_batches(arr_x, arr_y, batch_size):
# iterate through the arrays
prv = 0
for n in range(batch_size, arr_x.shape[0], batch_size):
x = arr_x[prv:n,:]
y = arr_y[prv:n,:]
prv = n
yield x, y
class WordLSTM(nn.Module):
def __init__(self, n_hidden=256, n_layers=4, drop_prob=0.3, lr=0.001):
super().__init__()
self.drop_prob = drop_prob
self.n_layers = n_layers
self.n_hidden = n_hidden
self.lr = lr
self.emb_layer = nn.Embedding(vocab_size, 200)
## define the LSTM
self.lstm = nn.LSTM(200, n_hidden, n_layers,
dropout=drop_prob, batch_first=True)
## define a dropout layer
self.dropout = nn.Dropout(drop_prob)
## define the fully-connected layer
self.fc = nn.Linear(3*n_hidden, vocab_size)
torch.manual_seed(50)
def forward(self, x, hidden):
''' Forward pass through the network.
These inputs are x, and the hidden/cell state `hidden`. '''
## pass input through embedding layer
embedded = self.emb_layer(x)
## Get the outputs and the new hidden state from the lstm
lstm_output, hidden = self.lstm(embedded, hidden)
## pass through a dropout layer
out = self.dropout(lstm_output)
#out = out.contiguous().view(-1, self.n_hidden)
out = out.reshape(x.shape[0], -1)
## put "out" through the fully-connected layer
out = self.fc(out)
# return the final output and the hidden state
return out, hidden
def init_hidden(self, batch_size):
''' initializes hidden state '''
# Create two new tensors with sizes n_layers x batch_size x n_hidden,
# initialized to zero, for hidden state and cell state of LSTM
weight = next(self.parameters()).data
# if GPU is available
if (torch.cuda.is_available()):
hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda(),
weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda())
# if GPU is not available
else:
hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_(),
weight.new(self.n_layers, batch_size, self.n_hidden).zero_())
return hidden
net = WordLSTM()
def train(net, epochs=10, batch_size=32, lr=0.001, clip=1, print_every=32):
torch.manual_seed(50)
# optimizer
opt = torch.optim.Adam(net.parameters(), lr=lr)
# loss
criterion = nn.CrossEntropyLoss()
counter = 0
net.train()
for e in range(epochs):
# initialize hidden state
h = net.init_hidden(batch_size)
for x, y in get_batches(x_int, y_int, batch_size):
counter+= 1
# convert numpy arrays to PyTorch arrays
inputs, targets = torch.from_numpy(x), torch.from_numpy(y)
# detach hidden states
h = tuple([each.data for each in h])
# zero accumulated gradients
net.zero_grad()
# get the output from the model
output, h = net(inputs, h)
# calculate the loss and perform backprop
loss = criterion(output, targets.view(-1))
# back-propagate error
loss.backward()
# `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
nn.utils.clip_grad_norm_(net.parameters(), clip)
# update weigths
opt.step()
print("Epoch: {}/{}...".format(e+1, epochs),
"Step: {}...".format(counter),
"Loss: {}...".format(loss))
train(net, batch_size = 32, epochs=20, print_every=256)
def predict(net, tkn, h=None):
# tensor inputs
new_inp = []
for t1 in tkn:
x = np.array([token2int[t1]])
new_inp.append(x)
new_inp = np.asarray(new_inp).reshape(1,-1)
inputs = torch.from_numpy(new_inp)
# detach hidden state from history
h = tuple([each.data for each in h])
# get the output of the model
out, h = net(inputs, h)
# get the token probabilities
p = F.softmax(out, dim=1).data
p = p.cpu()
p = p.numpy()
p = p.reshape(p.shape[1],)
# get indices of top 3 values
top_n_idx = p.argsort()[-1:][::-1]
# randomly select one of the three indices
#sampled_token_index = top_n_idx[random.sample([0],1)[0]]
sampled_token_index = top_n_idx[0]
# return the encoded value of the predicted char and the hidden state
return int2token[sampled_token_index]
# function to generate text
def sample(net, prime):
net.eval()
# batch size is 1
h = net.init_hidden(1)
token = predict(net, prime, h)
return token
sample(net, prime=['AA105045091', 'C2106264154', 'B2106691381'])
Related
Given 5 features on a time series we want to predict the following values using an LSTM Recurrent Neural Network, using PyTorch. The problem is that the Loss Value starts very low (i.e. 0.04) and it increases a bit as the computation runs (it seems it converge to a slightly higher value, but it never decreases).
Moreover, the dataset is normalized, and we tried different values of learning rate, epochs, batch sizes etc.
An example of loss during training:
step : 0 loss : 0.0016425768844783306
step : 1 loss : 0.0028163508977741003
step : 2 loss : 0.009786984883248806
This is the class:
class MV_LSTM(torch.nn.Module):
def __init__(self,n_features,seq_length):
super(MV_LSTM, self).__init__()
self.n_features = n_features
self.seq_len = seq_length
self.n_hidden = 40 # number of hidden states
self.n_layers = 1 # number of LSTM layers (stacked)
self.l_lstm = torch.nn.LSTM(input_size = n_features,
hidden_size = self.n_hidden,
num_layers = self.n_layers,
batch_first = True)
# according to pytorch docs LSTM output is
# (batch_size,seq_len, num_directions * hidden_size)
# when considering batch_first = True
self.l_linear = torch.nn.Linear(self.n_hidden*self.seq_len, 5)
def init_hidden(self, batch_size):
hidden_state = torch.randn(self.n_layers,batch_size,self.n_hidden)
cell_state = torch.randn(self.n_layers,batch_size,self.n_hidden)
self.hidden = (hidden_state, cell_state)
def forward(self, x):
batch_size, seq_len, _ = x.size()
lstm_out, self.hidden = self.l_lstm(x,self.hidden)
x = lstm_out.contiguous().view(batch_size,-1)
return self.l_linear(x)
This is the main code:
n_features = 5 # this is number of parallel inputs
n_timesteps = 24 # this is number of timesteps
# convert dataset into input/output
X, y = split_sequences(dataset, n_timesteps)
print(X.shape, y.shape)
X
y
# create NN
mv_net = MV_LSTM(n_features,n_timesteps)
criterion = torch.nn.MSELoss() # reduction='sum' created huge loss value
optimizer = torch.optim.Adam(mv_net.parameters(), lr=1e-4)
train_episodes = 50
batch_size = 16
This is the training:
mv_net.train()
for t in range(train_episodes):
X, y = sklearn.utils.shuffle(X, y)
for b in range(0,len(X),batch_size):
inpt = X[b:b+batch_size,:,:]
target = y[b:b+batch_size,:]
x_batch = torch.tensor(inpt,dtype=torch.float32)
y_batch = torch.tensor(target,dtype=torch.float32)
mv_net.init_hidden(x_batch.size(0))
output = mv_net(x_batch)
loss = criterion(output.view(-1,5), y_batch)
loss.backward()
optimizer.step()
optimizer.zero_grad()
print('step : ' , t , 'loss : ' , loss.item())
Thank you for your time, and sorry for our unexperience (this is our first RNN).
I'm trying to complete a task and write simple RNN. Here's the class:
class RNNBaseline(nn.Module):
def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers,
bidirectional, dropout, pad_idx):
super().__init__()
self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
self.rnn = nn.GRU(input_size=embedding_dim, hidden_size=hidden_dim) #RNN(embedding_dim, hidden_dim)
self.fc = nn.Linear(hidden_dim, output_dim) # YOUR CODE GOES HERE
self.dropout = nn.Dropout(dropout)
def forward(self, text, text_lengths, hidden = None):
#text = [sent len, batch size]
embedded = self.embedding(text)
#embedded = [sent len, batch size, emb dim]
#pack sequence
packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths)
# cell arg for LSTM, remove for GRU
# packed_output, (hidden, cell) = self.rnn(packed_embedded)
# unpack sequence
# output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output)
#output = [sent len, batch size, hid dim * num directions]
#output over padding tokens are zero tensors
#hidden = [num layers * num directions, batch size, hid dim]
#cell = [num layers * num directions, batch size, hid dim]
#concat the final forward (hidden[-2,:,:]) and backward (hidden[-1,:,:]) hidden layers
#and apply dropout
output, hidden = self.rnn(packed_embedded, hidden)
#hidden = None # concatenate
#hidden = [batch size, hid dim * num directions] or [batch_size, hid dim * num directions]
return self.fc(hidden)
For now I'm not using LSTM or trying to do bidirectional RNN, I just want simple GRU to train without errors. This is the training function:
import numpy as np
min_loss = np.inf
cur_patience = 0
for epoch in range(1, max_epochs + 1):
train_loss = 0.0
model.train()
pbar = tqdm(enumerate(train_iter), total=len(train_iter), leave=False)
pbar.set_description(f"Epoch {epoch}")
for it, ((text, txt_len), label) in pbar:
#YOUR CODE GOES HERE
opt.zero_grad()
input = text.to(device)
labels = label.to(device)
output = model(input, txt_len.type(torch.int64).cpu())
train_loss = loss_func(output, labels)
train_loss.backward()
opt.step()
train_loss /= len(train_iter)
val_loss = 0.0
model.eval()
pbar = tqdm(enumerate(valid_iter), total=len(valid_iter), leave=False)
pbar.set_description(f"Epoch {epoch}")
for it, ((text, txt_len), label) in pbar:
# YOUR CODE GOES HERE
input = text.to(device)
labels = label.to(device)
output = model(input, txt_len.type(torch.int64).cpu())
val_loss = loss_func(output, labels)
val_loss /= len(valid_iter)
if val_loss < min_loss:
min_loss = val_loss
best_model = model.state_dict()
else:
cur_patience += 1
if cur_patience == patience:
cur_patience = 0
break
print('Epoch: {}, Training Loss: {}, Validation Loss: {}'.format(epoch, train_loss, val_loss))
model.load_state_dict(best_model)
And some variables:
vocab_size = len(TEXT.vocab)
emb_dim = 100
hidden_dim = 256
output_dim = 1
n_layers = 2
bidirectional = False
dropout = 0.2
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
patience=3
opt = torch.optim.Adam(model.parameters())
loss_func = nn.BCEWithLogitsLoss()
max_epochs = 1
But I get this error:
ValueError: Target size (torch.Size([64])) must be the same as input size (torch.Size([1, 64, 1]))
... in this line:
---> 18 train_loss = loss_func(output, labels)
What am I doing wrong?
nn.BCEWithLogitsLoss expects both outputs and targets (or in your case labels) to be of size [b,d] where b is the batch size and d is the number of classes (or dimension of whatever you are trying to predict). Currently, your outputs are of size [b,d,1] and your targets are of size [d]. Two fixes are necessary, and both are very simple:
Add a batch dimension to your targets (labels). This is a common error when using a dataset that returns data elements because it generally does not add a batch dimension. Encapsulating your dataset class within a pytorch dataloader, but if you don't want to do this simply add an unsqueeze() operation. Note that the unsqueeze operation only works with a batch size of 1, otherwise using dataloader is probably a better bet.
Your output has an empty 3rd dimension, which can easily be flattened with a squeeze() operation. Both unsqueeze and squeeze are differentiable so shouldn't present problems for backpropagation.
... code before here
for it, ((text, txt_len), label) in pbar:
# YOUR CODE GOES HERE
input = text.to(device)
labels = label.to(device).unsqueeze(0) # added unsqueeze operation
output = model(input, txt_len.type(torch.int64).cpu())
output = output.squeeze(-1) # added squeeze on last dim
val_loss = loss_func(output, labels)
... code after here
I modified the code from here. What I'm trying to do is combine the two matrices to predict the output matrix. The output matrix is built from the two input matrices. The problem seems to be associated to:
self.Combined_dense_1 = tf.keras.layers.Dense(units=32, activation="relu")
self.Combined_dense_2 = tf.keras.layers.Dense(units=16, activation="softmax")
The linked medium tutorial only predicting a single number based on the combined mixed input. I however am trying to predict a whole matrix but don't know how to structure the combined layer (if this is even the problem).
The error: "ValueError: Shape mismatch: The shape of labels (received (40,)) should equal the shape of logits except for the last dimension (received (10, 16))."
The code:
import warnings
import sys
if not sys.warnoptions:
warnings.simplefilter("ignore")
import numpy as np
import os
import random
import tensorflow as tf
from tensorflow import keras
from IPython.display import clear_output
class model(keras.Model):
def __init__(self):
super().__init__()
# The layers to process our image
self.Conv2D_1 = tf.keras.layers.Conv2D(filters=32,
kernel_size=(1, 1),
strides=(1, 1)
)
self.Conv2D_2 = tf.keras.layers.Conv2D(filters=32,
kernel_size=(3, 3),
strides=(1, 1)
)
# our combined layers
self.Combined_dense_1 = tf.keras.layers.Dense(units=32, activation="relu")
self.Combined_dense_2 = tf.keras.layers.Dense(units=16, activation="softmax")
def call(self, input_image_one, input_image_two):
# Image model
I = self.Conv2D_1(input_image_one)
I = self.Conv2D_2(I)
# Flatten I so we can merge our data.
I = tf.keras.layers.Flatten()(I)
N = self.Conv2D_1(input_image_two)
N = self.Conv2D_2(N)
N = tf.keras.layers.Flatten()(N)
# Combined model
x = tf.concat([N, I], 1) # Concatenate through axis #1
x = self.Combined_dense_1(x)
x = self.Combined_dense_2(x)
return x
network = model()
optimizer = tf.keras.optimizers.Adam()
loss_function = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
def train_step(model, optimizer, loss_function,
images_one_batch, images_two_batch,
labels):
with tf.GradientTape() as tape:
model_output = model(images_one_batch, images_two_batch)
print(model_output)
loss = loss_function(labels, model_output) # our labels vs our predictions
grads = tape.gradient(loss, model.trainable_variables)
optimizer.apply_gradients(zip(grads, model.trainable_variables))
return loss
def train(model, optimizer, loss_function, epochs,
images_one_batch, images_two_batch,
labels):
loss_array = []
for epoch in range(epochs):
loss = train_step(model, optimizer, loss_function, images_one_batch, images_two_batch, labels)
loss_array.append(loss)
if ((epoch + 1) % 20 == 0):
# Calculating accuracy
network_output = network(images_one_batch, images_two_batch)
preds = np.argmax(network_output, axis=1)
acc = 0
for i in range(len(images_one_batch)):
if (preds[i] == labels[i]):
acc += 1
print(" loss:", loss, " Accuracy: ", acc / len(images_one_batch) * 100, "%")
clear_output(wait=True)
NumberofVars = 2;
width= NumberofVars; height = NumberofVars
NumberOfComputationSets = 10
CM_MatrixArr1 = []
CM_MatrixArr2 = []
for j in range(NumberOfComputationSets):
Theta1 = list(np.reshape(np.random.randint(2, size=4), (1,4))[0])
Theta1 = list(np.float_(Theta1))
CM_MatrixArr1.append(Theta1)
Theta2 = list(np.reshape(np.random.randint(2, size=4), (1,4))[0])
Theta2 = list(np.float_(Theta2))
CM_MatrixArr2.append(Theta2)
combinedCM_MatrixArr = []
combinedCM_toIntArr = []
for x,y in zip(CM_MatrixArr1, CM_MatrixArr2):
combinedCM = []
combinedCM_toInt = 0
for a,b in zip(x,y):
LogVal = (a == b)
combinedCM.append(float(LogVal == True))
combinedCM_MatrixArr.append(combinedCM)
combinedCM_MatrixArr = np.array(combinedCM_MatrixArr)
combinedCM_MatrixArr = combinedCM_MatrixArr.reshape(NumberOfComputationSets,2,2)
CM_MatrixArr1 = np.array(CM_MatrixArr1)
CM_MatrixArr1 = CM_MatrixArr1.reshape(NumberOfComputationSets,2,2)
CM_MatrixArr1 = CM_MatrixArr1.reshape(NumberOfComputationSets, 2,2,1)
CM_MatrixArr2 = np.array(CM_MatrixArr2)
CM_MatrixArr2 = CM_MatrixArr2.reshape(NumberOfComputationSets,2,2)
CM_MatrixArr2 = CM_MatrixArr2.reshape(NumberOfComputationSets, 2,2,1)
train(network,optimizer,loss_function,300,CM_MatrixArr1,CM_MatrixArr2,combinedCM_MatrixArr)
I recently shifted to pytorch from keras and I am still trying to understand how all this work. Below is the code I have implemented to classify mnist dataset using a simple MLP. Just like I used to do in keras I have flattend each of 28x28 image into a vector of 784 , and I have also created a one-hot representation for my labels.
In the model I was hoping that given a vector of 784 the model would output a one-hot vector with probabilities,but as soon as my code reaches to compute the loss I get the following error :
RuntimeError: 1D target tensor expected, multi-target not supported
Below is my code :
import numpy as np
import matplotlib.pyplot as plt
import torch
import time
from torch import nn, optim
from keras.datasets import mnist
from torch.utils.data import Dataset, DataLoader
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
# ----------------------------------------------------
class MnistDataset(Dataset):
def __init__(self, data_size=0):
(x, y), (_, _) = mnist.load_data()
x = [i.flatten() for i in x]
x = np.array(x, dtype=np.float32)
if data_size < 0 or data_size > len(y):
assert ("Data size should be between 0 to number of files in the dataset")
if data_size == 0:
data_size = len(y)
self.data_size = data_size
# picking 'data_size' random samples
self.x = x[:data_size]
self.y = y[:data_size]
# scaling between 0-1
self.x = (self.x / 255)
# Creating one-hot representation of target
y_encoded = []
for label in y:
encoded = np.zeros(10)
encoded[label] = 1
y_encoded.append(encoded)
self.y = np.array(y_encoded)
def __len__(self):
return self.data_size
def __getitem__(self, index):
x_sample = self.x[index]
label = self.y[index]
return x_sample, label
# ----------------------------------------------------
num_train_samples = 10000
num_test_samples = 2000
# Each generator returns a single
# sample & its label on each iteration.
mnist_train = MnistDataset(data_size=num_train_samples)
mnist_test = MnistDataset(data_size=num_test_samples)
# Each generator returns a batch of samples on each iteration.
train_loader = DataLoader(mnist_train, batch_size=128, shuffle=True) # 79 batches
test_loader = DataLoader(mnist_test, batch_size=128, shuffle=True) # 16 batches
# ----------------------------------------------------
# Defining the Model Architecture
class MLP(nn.Module):
def __init__(self):
super().__init__()
self.fc1 = nn.Linear(28 * 28, 100)
self.act1 = nn.ReLU()
self.fc2 = nn.Linear(100, 50)
self.act2 = nn.ReLU()
self.fc3 = nn.Linear(50, 10)
self.act3 = nn.Sigmoid()
def forward(self, x):
x = self.act1(self.fc1(x))
x = self.act2(self.fc2(x))
output = self.act3(self.fc3(x))
return output
# ----------------------------------------------------
model = MLP()
# Defining optimizer and loss function
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
# ----------------------------------------------------
# Training the model
epochs = 10
print("Training Started...")
for epoch in range(epochs):
for batch_index, (inputs, targets) in enumerate(train_loader):
optimizer.zero_grad() # Zero the gradients
outputs = model(inputs) # Forward pass
loss = criterion(outputs, targets) # Compute the Loss
loss.backward() # Compute the Gradients
optimizer.step() # Update the parameters
# Evaluating the model
total = 0
correct = 0
with torch.no_grad():
for batch_idx, (inputs, targets) in enumerate(test_loader):
outputs = model(inputs)
_, predicted = torch.max(outputs.data, 1)
total += targets.size(0)
correct += predicted.eq(targets.data).cpu().sum()
print('Epoch : {} Test Acc : {}'.format(epoch, (100. * correct / total)))
print("Training Completed Sucessfully")
# ----------------------------------------------------
I also read some other posts related to the same problem & most of them said that the CrossEntropy loss the target has to be a single number ,which totally gets over my head.Can someone please explain a solution.Thank you.
For nn.CrossEntropyLoss, you don't need one-hot representation of the label, you just need to pass the prediction's logit, which shape is (batch_size, n_class), and a target vector (batch_size,)
So just pass in the label index vector y instead of one-hot vector.
Fixed of your code:
class MnistDataset(Dataset):
def __init__(self, data_size=0):
(x, y), (_, _) = mnist.load_data()
x = [i.flatten() for i in x]
x = np.array(x, dtype=np.float32)
if data_size < 0 or data_size > len(y):
assert ("Data size should be between 0 to number of files in the dataset")
if data_size == 0:
data_size = len(y)
self.data_size = data_size
# picking 'data_size' random samples
self.x = x[:data_size]
self.y = y[:data_size]
# scaling between 0-1
self.x = (self.x / 255)
self.y = y # <--
def __len__(self):
return self.data_size
def __getitem__(self, index):
x_sample = self.x[index]
label = self.y[index]
return x_sample, label
Take a look at Pytorch example for more detail:
https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html
I got the idea of implementing my version of deep feature selection is from the paper here,http://link.springer.com/chapter/10.1007%2F978-3-319-16706-0_20
The basic idea of deep feature selection according to this paper is to add a one to one mapping layer before any full connected hidden layer, then by adding a regularization term (whether lasso or elastic net) to produce zeros in the input layer weights.
My question is, even though it seems I have implemented the deep feature selection framework well, while testing on the random data generated by numpy.rand.random(1000,50) fails to give me any zeros on the initial weight. Is is a common thing for lasso like regularization? Am I going to adjust the parameters I used for this framework (even larger epochs)? Or did I do something wrong with my code.
class DeepFeatureSelectionMLP:
def __init__(self, X, Y, hidden_dims=[100], epochs=1000,
lambda1=0.001, lambda2=1.0, alpha1=0.001, alpha2=0.0, learning_rate=0.1):
# Initiate the input layer
# Get the dimension of the input X
n_sample, n_feat = X.shape
n_classes = len(np.unique(Y))
# One hot Y
one_hot_Y = np.zeros((len(Y), n_classes))
for i,j in enumerate(Y):
one_hot_Y[i][j] = 1
self.epochs = epochs
Y = one_hot_Y
# Store up original value
self.X = X
self.Y = Y
# Two variables with undetermined length is created
self.var_X = tf.placeholder(dtype=tf.float32, shape=[None, n_feat], name='x')
self.var_Y = tf.placeholder(dtype=tf.float32, shape=[None, n_classes], name='y')
self.input_layer = One2OneInputLayer(self.var_X)
self.hidden_layers = []
layer_input = self.input_layer.output
# Create hidden layers
for dim in hidden_dims:
self.hidden_layers.append(DenseLayer(layer_input, dim))
layer_input = self.hidden_layers[-1].output
# Final classification layer, variable Y is passed
self.softmax_layer = SoftmaxLayer(self.hidden_layers[-1].output, n_classes, self.var_Y)
n_hidden = len(hidden_dims)
# regularization terms on coefficients of input layer
self.L1_input = tf.reduce_sum(tf.abs(self.input_layer.w))
self.L2_input = tf.nn.l2_loss(self.input_layer.w)
# regularization terms on weights of hidden layers
L1s = []
L2_sqrs = []
for i in xrange(n_hidden):
L1s.append(tf.reduce_sum(tf.abs(self.hidden_layers[i].w)))
L2_sqrs.append(tf.nn.l2_loss(self.hidden_layers[i].w))
L1s.append(tf.reduce_sum(tf.abs(self.softmax_layer.w)))
L2_sqrs.append(tf.nn.l2_loss(self.softmax_layer.w))
self.L1 = tf.add_n(L1s)
self.L2_sqr = tf.add_n(L2_sqrs)
# Cost with two regularization terms
self.cost = self.softmax_layer.cost \
+ lambda1*(1.0-lambda2)*0.5*self.L2_input + lambda1*lambda2*self.L1_input \
+ alpha1*(1.0-alpha2)*0.5 * self.L2_sqr + alpha1*alpha2*self.L1
self.optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(self.cost)
self.y = self.softmax_layer.y
def train(self, batch_size=100):
sess = tf.Session()
sess.run(tf.initialize_all_variables())
for i in xrange(self.epochs):
x_batch, y_batch = get_batch(self.X, self.Y, batch_size)
sess.run(self.optimizer, feed_dict={self.var_X: x_batch, self.var_Y: y_batch})
if (i + 1) % 50 == 0:
l = sess.run(self.cost, feed_dict={self.var_X: x_batch, self.var_Y: y_batch})
print('epoch {0}: global loss = {1}'.format(i, l))
self.selected_w = sess.run(self.input_layer.w)
print(self.selected_w)
class One2OneInputLayer(object):
# One to One Mapping!
def __init__(self, input):
"""
The second dimension of the input,
for each input, each row is a sample
and each column is a feature, since
this is one to one mapping, n_in equals
the number of features
"""
n_in = input.get_shape()[1].value
self.input = input
# Initiate the weight for the input layer
w = tf.Variable(tf.zeros([n_in,]), name='w')
self.w = w
self.output = self.w * self.input
self.params = [w]
class DenseLayer(object):
# Canonical dense layer
def __init__(self, input, n_out, activation='sigmoid'):
"""
The second dimension of the input,
for each input, each row is a sample
and each column is a feature, since
this is one to one mapping, n_in equals
the number of features
n_out defines how many nodes are there in the
hidden layer
"""
n_in = input.get_shape()[1].value
self.input = input
# Initiate the weight for the input layer
w = tf.Variable(tf.ones([n_in, n_out]), name='w')
b = tf.Variable(tf.ones([n_out]), name='b')
output = tf.add(tf.matmul(input, w), b)
output = activate(output, activation)
self.w = w
self.b = b
self.output = output
self.params = [w]
class SoftmaxLayer(object):
def __init__(self, input, n_out, y):
"""
The second dimension of the input,
for each input, each row is a sample
and each column is a feature, since
this is one to one mapping, n_in equals
the number of features
n_out defines how many nodes are there in the
hidden layer
"""
n_in = input.get_shape()[1].value
self.input = input
# Initiate the weight and biases for this layer
w = tf.Variable(tf.random_normal([n_in, n_out]), name='w')
b = tf.Variable(tf.random_normal([n_out]), name='b')
pred = tf.add(tf.matmul(input, w), b)
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(pred, y))
self.y = y
self.w = w
self.b = b
self.cost = cost
self.params= [w]
Gradient descent algorithms such as Adam do not give exact zeros when using l1 regularization. Instead, something like ftrl or proximal adagrad can give you exact zeros.