Why is my LSTM in tensorflow learning so slowly and badly? - python

This program reads a text file RNNtext.txt, creates a one-hot vector representation of all the data, trains the LSTM on that data and displays a batch of sampled characters every now and then. However, even a look at the cost vs. iterations graph shows that it's learning very inefficiently. Honestly, the raw numpy code I have for the LSTM does a MUCH better job: it's not only faster, it also produces mostly meaningful words, whereas this produces only gibberish. Where is my mistake? I'm out of ideas and I can't seem to find where it is logically wrong.
import numpy as np
import random
import tensorflow as tf
import os
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
# Reading RNNtext.txt file
direc = os.path.dirname(os.path.realpath(__file__))
data = open(direc + "/RNNtext.txt", "r").read()
# Array of unique characters
chars = list(set(data))
num_hidden = 80
iterations = 1000
display_iteration = 100 # Sample when iteration % display_iteration == 0
sample_size = 250
batch_size = 120 # batch size or the number of time steps to unroll RNN
alpha = 0.01 # Learning rate
#Vocabulary and text file sizes
vocab_size = len(chars)
data_size = len(data)
# Bijection from a unique character to an index
char_to_ix = {}
# Bijection from an index to a unique character
ix_to_char = {}
for j in range(vocab_size):
    char_to_ix[chars[j]] = j
    ix_to_char[j] = chars[j]
# Transforming all characters to indices
data_ix = [char_to_ix[ch] for ch in data]
train_data = [] # This will contain one-hot vectors
for k in range(data_size):
    # Representing each index/character by a one-hot vector
    hot1 = np.zeros((vocab_size, 1))
    hot1[data_ix[k]] = 1
    train_data.append(hot1)
X = tf.placeholder(tf.float32, [None, vocab_size, 1]) #Number of examples, number of input, dimension of each input
target = tf.placeholder(tf.float32, [None, vocab_size])
cell = tf.contrib.rnn.LSTMCell(num_hidden,state_is_tuple=True)
output, _ = tf.nn.dynamic_rnn(cell, X, dtype = tf.float32)
output = tf.transpose(output, [1, 0, 2])
weight = tf.Variable(tf.random_normal([num_hidden, vocab_size]))
bias = tf.Variable(tf.constant(0.0, shape=[vocab_size]))
prediction = tf.matmul(output[-1], weight) + bias
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=target))
optimizer = tf.train.ProximalGradientDescentOptimizer(alpha)
minimize = optimizer.minimize(cost)
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)
ARR = [i for i in range(vocab_size)] # for extracting index by probabilities in np.random.choice()
ITER = []
COST = []
p = 0 # p will be iterated by batch_size steps
for i in range(iterations):
    if p + batch_size >= data_size:
        p = 0
    # sweeping through data one-hot vectors
    inp, out = train_data[p:p+batch_size], train_data[p+1:p+batch_size+1]
    out = np.reshape(out, [-1, vocab_size])
    c = sess.run(cost, {X: inp, target: out}) # calculating cost for plotting later
    COST.append(c)
    ITER.append(i)
    sess.run(minimize, {X: inp, target: out})
    # displaying sample_size number of characters with random seed
    # doesn't affect training
    if i % display_iteration == 0:
        seed = np.random.randint(0, vocab_size)
        CHARS = []
        for j in range(sample_size):
            x = np.zeros((vocab_size, 1))
            x[seed] = 1
            x = [x]
            pred = sess.run(prediction, {X: x})[0]
            pred = np.exp(pred) / np.sum(np.exp(pred))
            pred = pred.ravel()
            seed = np.random.choice(ARR, 1, p = pred)[0]
            ch = ix_to_char[seed]
            CHARS.append(ch)
        TXT = ''.join(CHARS)
        print("-------------------------------------------------")
        print(TXT)
        print("Iteration: ", str(i))
    p += batch_size
sess.close()
plt.plot(ITER, COST)
plt.show()
EDIT: Added numpy code for comparison
import numpy as np
import matplotlib.pyplot as plt
import os
plt.style.use('fivethirtyeight')
direc = os.path.dirname(os.path.realpath(__file__))
readFile = open(direc + "\RNNtext.txt", 'r')
data = readFile.read()
readFile.close()
chars = list(set(data))
data_size, vocab_size = len(data), len(chars)
print(chars)
print("Vocabulary size: " + str(vocab_size))
char_to_ix = {}
ix_to_char = {}
for j in range(len(chars)):
    char_to_ix[chars[j]] = j
    ix_to_char[j] = chars[j]
hidden_size = 80
batch_size = 120
alpha = 0.1
sample_size = 250
iterations = 1000
display_iteration = 100
Wxh = np.random.randn(hidden_size, vocab_size)*0.01 # input to hidden
Whh = np.random.randn(hidden_size, hidden_size)*0.01 # hidden to hidden
Why = np.random.randn(vocab_size, hidden_size)*0.01 # hidden to output
bh = np.zeros((hidden_size, 1)) # hidden bias
by = np.zeros((vocab_size, 1)) # output bias
def sample(hid, seed, weights, sample_size):
    X = np.zeros((vocab_size, 1))
    X[seed] = 1
    CHARS = []
    ARR = [i for i in range(vocab_size)]
    for t in range(sample_size):
        hid = np.tanh(np.dot(Wxh, X) + np.dot(Whh, hid) + bh)
        y = np.dot(Why, hid) + by
        prob = np.exp(y) / np.sum(np.exp(y))
        prob = prob.ravel()
        ix = np.random.choice(ARR, 1, p=prob)[0]
        CHARS.append(ix_to_char[ix])
        X = np.zeros((vocab_size, 1))
        X[ix] = 1
    TXT = ''.join(CHARS)
    return TXT
LOSS = []
ITER = []
p = 0
mWxh, mWhh, mWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
mbh, mby = np.zeros_like(bh), np.zeros_like(by) # memory variables for Adagrad
smooth_loss = -np.log(1.0/vocab_size)*batch_size # loss at iteration 0
hprev = np.zeros((hidden_size,1))
for i in range(iterations): ## just time passing by
    dWxh = np.zeros_like(Wxh)
    dWhh = np.zeros_like(Whh)
    dWhy = np.zeros_like(Why)
    dbh = np.zeros_like(bh)
    dby = np.zeros_like(by)
    if p+batch_size >= len(data) or i == 0:
        hprev = np.zeros((hidden_size,1))
        p = 0
    inputs = [char_to_ix[ch] for ch in data[p:p+batch_size]]
    targets = [char_to_ix[ch] for ch in data[p+1:p+batch_size+1]]
    HID = {}
    X = {}
    Y = {}
    P = {}
    HID[-1] = np.copy(hprev)
    loss = 0
    ##======FORWARD======##
    for t in range(len(inputs)):
        X[t] = np.zeros((vocab_size,1))
        X[t][inputs[t]] = 1
        HID[t] = np.tanh(np.dot(Wxh, X[t]) + np.dot(Whh, HID[t-1]) + bh) # inp -> X
        Y[t] = np.dot(Why, HID[t]) + by # tanh
        P[t] = np.exp(Y[t]) / np.sum(np.exp(Y[t]))
        loss += -np.log(P[t][targets[t]][0])
    dhnext = np.zeros_like(HID[0])
    ##======BACKPROP======##
    for t in reversed(range(len(inputs))):
        dy = np.copy(P[t])
        dy[targets[t]] -= 1
        dh = (np.dot(Why.T, dy) + dhnext)*(1-HID[t]*HID[t])
        dx = np.dot(Why.T, dy)*(1 - HID[t]**2)
        dWhy += np.dot(dy, HID[t].T)
        dWhh += np.dot(dh, HID[t-1].T)
        dWxh += np.dot(dh, X[t].T)
        dby += dy
        dbh += dh
        dhnext = np.dot(Whh.T, dh)
    ##=====================##
    hprev = HID[-1]
    smooth_loss = smooth_loss * 0.999 + loss * 0.001
    for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
        np.clip(dparam, -1, 1, out=dparam) # clip to mitigate exploding gradients
    for param, dparam, mem in zip([Wxh, Whh, Why, bh, by],
                                  [dWxh, dWhh, dWhy, dbh, dby],
                                  [mWxh, mWhh, mWhy, mbh, mby]):
        mem += dparam * dparam
        param += -alpha * dparam / np.sqrt(mem + 1e-8) # Adagrad
    if i % display_iteration == 0:
        print(str(i))
        weights = [Wxh,Whh,Why,bh,by]
        seed = inputs[np.random.randint(0,len(inputs))]
        TXT = sample(HID[-1], seed, weights, sample_size)
        print("-----------------------------------------------")
        print(TXT)
        print("-----------------------------------------------")
        with open(direc + "\RNNout.txt", 'w') as writeFile:
            writeFile.write(TXT)
    ITER.append(i)
    LOSS.append(loss)
    p += batch_size
best_text = sample(HID[-1], inputs[0], weights, sample_size)
plt.plot(ITER, LOSS, linewidth = 1)
plt.show()
writeFile.close()

Well, doh... it looks like you are not re-using the state! How is an LSTM (a state machine) supposed to work properly if you are not maintaining the state?
To me this looks like a red flag:
output, _ = tf.nn.dynamic_rnn(cell, X, dtype = tf.float32)
The second output of tf.nn.dynamic_rnn is the latest state after the given sequence has been processed. It looks like you are explicitly ignoring it and never feeding it back into the following training iterations in sess.run(...) (and hence your dynamic_rnn call has no initial_state parameter).
I would highly recommend changing that part of your code before looking any further.
Also, I don't know what your data looks like, but your feeding and batching strategy needs to make sense of this whole state-passing exercise. Otherwise, once again, it will just produce gibberish.
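For what it's worth, here is a minimal sketch (TF 1.x style) of one way to carry the state across sess.run calls. It is not a drop-in patch for the code above: c_in, h_in and batch_dim are illustrative names I'm assuming, not taken from the question.
# Sketch only: placeholder names and shapes are assumptions
c_in = tf.placeholder(tf.float32, [None, num_hidden])
h_in = tf.placeholder(tf.float32, [None, num_hidden])
init_state = tf.contrib.rnn.LSTMStateTuple(c_in, h_in)
output, final_state = tf.nn.dynamic_rnn(cell, X, initial_state=init_state)
# ... build prediction / cost / minimize exactly as before ...
state = (np.zeros((batch_dim, num_hidden)),   # batch_dim = leading dim of what you feed to X
         np.zeros((batch_dim, num_hidden)))
for i in range(iterations):
    _, state = sess.run([minimize, final_state],
                        {X: inp, target: out, c_in: state[0], h_in: state[1]})
    # reset `state` to zeros whenever the sweep through the data wraps around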

With the information provided, I would suggest these two initial steps to try to improve the model.
Increase the number of iterations: recurrent neural networks work differently from other deep architectures and may need an extra order of magnitude of iterations to settle.
Play with the seeds: in my experience, whether you get meaningful sequences can depend on the quality of the seeds used (see the small sketch below).
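As a tiny illustration of the second point, using the names from the question's code, the sampling seed could be drawn from the training text itself rather than from a uniformly random character index:
# Seed sampling from an actual character of the corpus instead of a random index
seed_char = data[np.random.randint(0, data_size)]
seed = char_to_ix[seed_char]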

Related

Simple convolutional neural network

import numpy as np
from keras.datasets import mnist
import time
# Functions
def sigmoid(x):
    return 1.0/(1.0 + np.exp(-x))
def sigmoid_derivative(x):
    return sigmoid(x)*(1-sigmoid(x))
def relu(x):
    return np.maximum(0,x)
def relu_derivative(x):
    return np.greater(x, 0).astype(int)
def softmax(x):
    exps = np.exp(x - x.max())
    return exps / np.sum(exps)
# Import and Create Dataset
(x_train, y_train), (x_test, y_test) = mnist.load_data()
# Setup
np.random.seed(seed=12345)
alpha = 0.05
# Initialize Network Values
# Layers
input_layer = np.zeros(shape=(28,28))
convolutional_layer = np.zeros(shape=(10,24,24))
pooling_layer = np.zeros(shape=(10,12,12))
flattened_layer = np.reshape(pooling_layer,newshape=(1440,1))
dense_layer = np.zeros(shape=(100,1))
output_layer = np.zeros(shape=(10,1))
# Filters and Weights
convolution_filters = np.random.rand(10,5,5)
weights1 = np.random.rand(100,1440)
weights2 = np.random.rand(10,100)
# Bias
dense_layer_bias = np.ones(shape=(100,1))
output_layer_bias = np.ones(shape=(10,1))
convolution_bias = np.ones(shape=(10,5,5))
for epoch in range(1):
    print(np.mean(weights1),np.mean(weights2),np.mean(convolution_filters))
    for sample in range(20):
        # Get Input Data
        input = x_train[sample]
        # Target Data
        target = np.zeros((10,1))
        target[y_train[sample]][0] = 1
        # Feed Forward Input to Convolution Layer
        i=j=k=0
        for i in range(10):
            for j in range(24):
                for k in range(24):
                    minimatrix = input[j:j+5, k:k+5]
                    convolutional_layer[i][j][k] = np.sum(minimatrix * convolution_filters[i] + convolution_bias[i])
        # Pooling Layer
        i=j=k=0
        for i in range(10):
            for j in range(12):
                for k in range(12):
                    minimatrix = convolutional_layer[i,j*2:j*2+2,k*2:k*2+2]
                    pooling_layer[i][j][k] = relu(minimatrix.max())
        # Flattening Layer
        flattened_layer = np.reshape(pooling_layer,newshape=(1440,1))
        # Feed Forward - DENSE_LAYER
        dense_layer = relu(np.dot(weights1,flattened_layer) + dense_layer_bias)
        # Feed Forward - OUTPUT_LAYER
        output_layer = softmax(np.dot(weights2,dense_layer) + output_layer_bias)
        # Backpropagation - OUTPUT_LAYER
        delta = output_layer - target
        weights2gradient = np.dot(delta,dense_layer.T)
        output_layer_bias_gradient = delta
        # Backpropagation - DENSE_LAYER
        delta = np.dot(weights2.T,delta) * relu_derivative(dense_layer)
        weights1gradient = np.dot(delta,flattened_layer.T)
        dense_layer_bias_gradient = delta
        # Backpropagation - POOLING_LAYER
        delta = np.reshape(np.dot(weights1.T,delta),newshape=(10,12,12)) * relu_derivative(pooling_layer) # find delta at pooling layer
        # Backpropagation - TRANSPOSE FOR CALCULATIONS
        delta = np.array([delta[i].T for i in range(len(delta))]) # Math says this has to happen
        # Gradient For Backward Pass
        pooling_backward_pass = np.zeros(shape=(10,24,24)) # matrix for passing adjusted cost
        # BACKWARD POOLING GRADIENT PASS
        i=j=k=0
        for i in range(10):
            for j in range(12):
                for k in range(12):
                    minimatrix = convolutional_layer[i,j*2:j*2+2,k*2:k*2+2]
                    maxvalindex = np.argmax(minimatrix)
                    pooling_backward_pass[i, j*2+(maxvalindex // 2), k+(maxvalindex % 2)] += delta[i,j,k]
        # Backpropagation - CONVOLUTION LAYER
        convolution_gradient = np.zeros(shape=(10,5,5))
        convolution_bias_gradient = np.zeros(shape=(10,5,5))
        i=j=k=0
        for i in range(10):
            for j in range(24):
                for k in range(24):
                    minimatrix = input[j:j+5, k:k+5]
                    convolution_gradient[i] += pooling_backward_pass[i,j,k] * minimatrix
                    convolution_bias_gradient[i] += pooling_backward_pass[i,j,k]
        # Weight and Filter Adjustments
        weights2 -= weights2gradient * alpha
        weights1 -= weights1gradient * alpha
        convolution_filters -= convolution_gradient * alpha
        # Bias Adjustments
        dense_layer_bias -= dense_layer_bias_gradient * alpha
        output_layer_bias -= output_layer_bias_gradient * alpha
        convolution_bias -= convolution_gradient * alpha
    print(np.mean(weights1),np.mean(weights2),np.mean(convolution_filters))
I have been working on this code for a while now and I am (almost) certain the basic functionality should work, but I am getting no changes to the weights of the network. I am specifically trying to understand neural networks without the abstraction offered by the actual frameworks. Is there a Python scope issue that is keeping the weights from updating?
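As a side note on diagnosing this: the printed means can stay nearly constant even when individual weights do move, so an element-wise comparison against a snapshot is a more sensitive check. A small standalone illustration with made-up arrays (not the question's actual data):
import numpy as np
# Made-up arrays standing in for weights1 before and after one update
before = np.random.rand(100, 1440)
after = before - 1e-6 * np.random.randn(100, 1440)  # a tiny simulated update
print(np.mean(before), np.mean(after))   # the means look essentially identical
print(np.abs(after - before).max())      # but element-wise the weights did change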

Scipy fails to minimize cost function

Currently I'm taking Andrew Ng's Coursera course, "Machine Learning". In exercise 5, we built a model that can predict digits, trained on the MNIST dataset. I completed this task successfully in Matlab, but I wanted to migrate the code to Python, just to see how different things are and maybe to keep playing around with the model.
I managed to implement the cost function and the backpropagation algorithm correctly. I know that because I compared the metrics with my working Matlab model and it emits the same numbers.
Now, because in the course we train the model using fmincg, I tried to do the same using SciPy's fmin_cg function.
My problem is that the cost decreases in extremely small steps and fails to converge.
Here is my code for the network:
import numpy as np
import utils
import scipy.optimize as op
class Network:
    def __init__(self, layers):
        self.layers = layers
        self.weights = self.generate_params()
    # Function for generating theta multidimensional matrix
    def generate_params(self):
        theta = []
        epsilon = 0.12
        for i in range(len(self.layers) - 1):
            current_layer_units = self.layers[i]
            next_layer_units = self.layers[i + 1]
            theta_i = np.multiply(
                np.random.rand(next_layer_units, current_layer_units + 1),
                2 * epsilon - epsilon
            )
            # Appending the params to the theta matrix
            theta.append(theta_i)
        return theta
    # Function to append bias row/column to matrix X
    def append_bias(self, X, d):
        m = X.shape[0]
        n = 1 if len(X.shape) == 1 else X.shape[1]
        if (d == 'column'):
            ones = np.ones((m, n + 1))
            ones[:, 1:] = X.reshape((m, n))
        elif (d == 'row'):
            ones = np.ones((m + 1, n))
            ones[1:, :] = X.reshape((m, n))
        return ones
    # Function for computing the gradient for 1 training example
    def back_prop(self, y, feed, theta):
        activations = feed["activations"]
        weighted_layers = feed["weighted_layers"]
        delta_output = activations[-1] - y.reshape(len(y), 1)
        current_delta = delta_output
        # Initializing gradients
        gradients = []
        for i, theta_i in enumerate(theta):
            gradients.append(np.zeros(theta_i.shape))
        # Performing delta calculations.
        # Here, we continue to propagate the delta values backwards
        # until we arrive at the second layer.
        for i in reversed(range(len(theta))):
            theta_i = theta[i]
            if (i > 0):
                i_weighted_inputs = self.append_bias(weighted_layers[i - 1], 'row')
                t_theta_i = np.transpose(theta_i)
                delta_i = np.multiply(np.dot(t_theta_i, current_delta), utils.sigmoidGradient(i_weighted_inputs))
                delta_i = delta_i[1:]
                gradients[i] = current_delta * np.transpose(activations[i])
                # Setting current delta for the next layer
                current_delta = delta_i
            else:
                gradients[i] = current_delta * np.transpose(activations[i])
        return gradients
    # Function for computing the cost and the derivatives
    def compute_cost(self, theta, X, y, r12n = 0):
        m = len(X)
        num_labels = self.layers[-1]
        costs = np.zeros(m)
        # Initializing gradients
        gradients = []
        for i, theta_i in enumerate(theta):
            gradients.append(np.zeros(theta_i.shape))
        # Iterating over the training set
        for i in range(m):
            inputs = X[i]
            observed = utils.create_output_vector(y[i], num_labels)
            feed = self.feed_forward(inputs)
            predicted = feed["activations"][-1]
            total_cost = 0
            for k, o in enumerate(observed):
                if (o == 1):
                    total_cost += np.log(predicted[k])
                else:
                    total_cost += np.log(1 - predicted[k])
            cost = -1 * total_cost
            # Storing the cost for the i-th training example
            costs[i] = cost
            # Calculating the gradient for this training example
            # using the back propagation algorithm
            gradients_i = self.back_prop(observed, feed, theta)
            for i, gradient in enumerate(gradients_i):
                gradients[i] += gradient
        # Calculating the avg regularization term for the cost
        sum_of_theta = 0
        for i, theta_i in enumerate(theta):
            squared_theta = np.power(theta_i[:, 1:], 2)
            sum_of_theta += np.sum(squared_theta)
        r12n_avg = r12n * sum_of_theta / (2 * m)
        total_cost = np.sum(costs) / m + r12n_avg
        # Applying regularization terms to the gradients
        for i, theta_i in enumerate(theta):
            lambda_i = np.copy(theta_i)
            lambda_i[:, 0] = 0
            lambda_i = np.multiply((r12n / m), lambda_i)
            # Adding the r12n matrix to the gradient
            gradients[i] = gradients[i] / m + lambda_i
        return total_cost, gradients
    # Function for training the neural network using the conjugate gradient algorithm
    def train_cg(self, X, y, r12n = 0, iterations = 50):
        weights = self.weights
        def Cost(theta, X, y):
            theta = utils.roll_theta(theta, self.layers)
            cost, _ = self.compute_cost(theta, X, y, r12n)
            print(cost)
            return cost
        def Gradient(theta, X, y):
            theta = utils.roll_theta(theta, self.layers)
            _, gradient = self.compute_cost(theta, X, y, r12n)
            return utils.unroll_theta(gradient)
        unrolled_theta = utils.unroll_theta(weights)
        result = op.fmin_cg(f = Cost,
                            x0 = unrolled_theta,
                            args=(X, y),
                            fprime=Gradient,
                            maxiter = iterations)
        self.weights = utils.roll_theta(result, self.layers)
    # Function for feeding forward the network
    def feed_forward(self, X):
        # Useful variables
        activations = []
        weighted_layers = []
        weights = self.weights
        currentActivations = self.append_bias(X, 'row')
        activations.append(currentActivations)
        for i in range(len(self.layers) - 1):
            layer_weights = weights[i]
            weighted_inputs = np.dot(layer_weights, currentActivations)
            # Storing the weighted inputs
            weighted_layers.append(weighted_inputs)
            activation_nodes = []
            # If the next layer is not the output layer, we'd like to add a bias unit to it
            # (excluding the input and the output layer)
            if (i < len(self.layers) - 2):
                activation_nodes = self.append_bias(utils.sigmoid(weighted_inputs), 'row')
            else:
                activation_nodes = utils.sigmoid(weighted_inputs)
            # Appending the layer of nodes to the activations array
            activations.append(activation_nodes)
            currentActivations = activation_nodes
        data = {
            "activations": activations,
            "weighted_layers": weighted_layers
        }
        return data
    def predict(self, X):
        data = self.feed_forward(X)
        output = data["activations"][-1]
        # Finding the max index in the output layer
        return np.argmax(output, axis=0)
Here is the invocation of the code:
import numpy as np
from network import Network
# %% Load data
X = np.genfromtxt('data/mnist_data.csv', delimiter=',')
y = np.genfromtxt('data/mnist_outputs.csv', delimiter=',').astype(int)
# %% Create network
num_labels = 10
input_layer = 400
hidden_layer = 25
output_layer = num_labels
layers = [input_layer, hidden_layer, output_layer]
# Create a new neural network
network = Network(layers)
# %% Train the network and save the weights
network.train_cg(X, y, r12n = 1, iterations = 20)
This is what the code emits after each iteration:
15.441233231650283
15.441116436313076
15.441192262452514
15.44122384651483
15.441231216030646
15.441232804294314
15.441233141284435
15.44123321255294
15.441233227614855
As you can see, the changes to the cost are very small.
I checked the shapes of the vectors and the gradient and they both seem fine, just like in my Matlab implementation. I'm not sure what I'm doing wrong here.
If you guys could help me, that'd be great :)

Cuda error: device side assert triggered - only after certain number of batches

I am trying to put a dataset through a neural network. It is running on a Google Cloud virtual machine using a Tesla V100 GPU. However, before I can finish training a single epoch, I get an error message: "Cuda error: device side assert triggered". I think the problem may be in my data, but I have no idea where and I'm not sure what the problem is exactly (but I tested the code with a different dataset and it ran fine).
The thing that is odd is that the network actually runs for some time before triggering the error. I had it print every time it finished a batch and sometimes it finishes 60+ batches, sometimes 80+, I've even gotten it to finish as many as 140 batches (given the size of my data and my batches, there are 200 batches in each epoch). No matter how many it finishes, it eventually triggers this error and has not completed an epoch.
I tried setting CUDA_LAUNCH_BLOCKING = 1 and did not get any better error message. I of course made sure the neural network has the right number of input and output parameters (this is a given because it works for the first however many batches). I also standardized the inputs. Some were really large and some were close to zero, so I normalized them to all fall in the range [-1,1]. Certainly the network should be able to handle that, but it still causes a problem.
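(As an aside: CUDA_LAUNCH_BLOCKING is read as an environment variable, so a plain Python assignment like the one at the top of the training loop below has no effect. Setting it would look more like the sketch here, before any CUDA work starts.)
import os
# Must be set in the environment before CUDA is initialized for it to take effect
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"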
Here is my training loop which WORKS with a different data set. It is always the line "loss.backward()" that eventually triggers the error message.
CUDA_LAUNCH_BLOCKING = 1
start = time.time()
for epoch in range(1,6):
    # Decrease learning rate at epoch 3 and 5
    if epoch == 3 or epoch == 5:
        lr = lr/3
    # Setup optimizer
    optimizer = optim.SGD(net.parameters(), lr=lr)
    # Initialize stats to zeros to track network's progress
    running_loss = 0
    running_error = 0
    num_batches = 0
    # Shuffle indices to train randomly
    shuffled_indices = torch.randperm(50000)
    for count in range(0, 50000, bs):
        # Clear gradient before each iteration
        optimizer.zero_grad()
        # Setup indices for minibatch
        if (count + bs > 50000):
            indices_list = shuffled_indices[count : ].tolist() + shuffled_indices[ : (count + bs) - 50000].tolist()
            indices = torch.Tensor(indices_list)
        else:
            indices = shuffled_indices[count : count + bs]
        # Create minibatch
        minibatch_data = train_data[indices]
        minibatch_label = train_label[indices]
        # Send minibatch to gpu for training
        minibatch_data = minibatch_data.to(device)
        minibatch_label = minibatch_label.to(device)
        temp = minibatch_data - mean
        # Standardize entries with mean and std
        inputs = ((minibatch_data - mean) / std).view(bs, 33)
        # Begin tracking changes
        inputs.requires_grad_()
        # Forward inputs through the network
        scores = net(inputs)
        print(scores[:2])
        print(minibatch_label)
        # Compute loss
        loss = criterion(scores, minibatch_label)
        # Backpropagate through the neural network
        loss.backward()
        # Do one step of stochastic gradient descent
        optimizer.step()
        # Update summary statistics
        with torch.no_grad():
            num_batches += 1
            error = get_error(scores, minibatch_label)
            running_error += error
            running_loss += loss.item()
        print("success: ", num_batches)
    # At the end of each epoch, compute and print summary statistics
    total_error = running_error / num_batches
    avg_loss = running_loss / num_batches
    print('Epoch: ', epoch)
    print('Time: ', time.time(), '\t Loss: ', avg_loss, '\t Error (%): ', total_error * 100)
Here is my dataset formatting and normalizing:
train_list_updated = []
train_label_list = []
for entry in train_list[1:]:
    entry[0] = string_to_int(entry[0])
    entry[1] = handedness[entry[1]]
    entry[2] = string_to_int(entry[2])
    entry[3] = handedness[entry[3]]
    entry[4] = string_to_int(entry[4])
    entry[5] = string_to_int(entry[5])
    entry[6] = string_to_int(entry[6])
    entry[17] = entry[17].replace(':','')
    entry[-3] = pitch_types[entry[-3]]
    entry[-2] = pitch_outcomes[entry[-2]]
    train_label_list.append(entry[-2])
    del entry[-1]
    del entry[-1]
    del entry[-3]
    train_list_updated.append(entry)
final_train_list = []
for entry in train_list_updated:
    for index in range(len(entry)):
        try:
            entry[index] = float(entry[index])
        except:
            entry[index] = 0.
    final_train_list.append(entry)
# Do the same for the test data
test_list_updated = []
for entry in test_list[1:]:
    entry[0] = string_to_int(entry[0])
    entry[1] = handedness[entry[1]]
    entry[2] = string_to_int(entry[2])
    entry[3] = handedness[entry[3]]
    entry[4] = string_to_int(entry[4])
    entry[5] = string_to_int(entry[5])
    entry[6] = string_to_int(entry[6])
    entry[17] = entry[17].replace(':','')
    entry[-3] = pitch_types[entry[-3]]
    del entry[-1]
    del entry[-1]
    del entry[-3]
    test_list_updated.append(entry)
final_test_list = []
for entry in test_list_updated:
    for index in range(len(entry)):
        try:
            entry[index] = float(entry[index])
        except:
            entry[index] = 0.
    final_test_list.append(entry)
# Create tensors of test and train data
train_data = torch.tensor(final_train_list)
train_label = torch.tensor(train_label_list)
test_data = torch.tensor(final_test_list)
And normalizing:
max_indices = torch.argmax(train_data, dim = 0)
min_indices = torch.argmin(train_data, dim = 0)
max_values = []
min_values = []
for i in range(33):
    max_idx = max_indices[i].item()
    min_idx = min_indices[i].item()
    max_val = train_data[max_idx][i]
    min_val = train_data[min_idx][i]
    max_values.append(max_val)
    min_values.append(min_val)
max_values = torch.Tensor(max_values)
min_values = torch.Tensor(min_values)
ranges = max_values - min_values
min_values = min_values.view(1, 33)
min_values = torch.repeat_interleave(min_values, 582205, dim = 0)
ranges = ranges.view(1, 33)
ranges = torch.repeat_interleave(ranges, 582205, dim = 0)
train_data = train_data - min_values
train_data = 2 * (train_data / ranges)
train_data = train_data - 1
And here's my net (a lot is commented out since I thought maybe there was an issue with the gradient zeroing or something. A five layer neural network should definitely not cause a problem though):
"""
DEFINING A NEURAL NETWORK
"""
# Define a fifteen layer artificial neural network
class fifteen_layer_net(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear1 = nn.Linear(33, 200)
        self.linear2 = nn.Linear(200, 250)
        self.linear3 = nn.Linear(250, 300)
        self.linear4 = nn.Linear(300, 350)
        self.linear5 = nn.Linear(350, 7)
        # self.linear6 = nn.Linear(400, 450)
        # self.linear7 = nn.Linear(450, 500)
        # self.linear8 = nn.Linear(500, 450)
        # self.linear9 = nn.Linear(450, 400)
        # self.linear10 = nn.Linear(400, 350)
        # self.linear11 = nn.Linear(350, 300)
        # self.linear12 = nn.Linear(300, 250)
        # self.linear13 = nn.Linear(250, 200)
        # self.linear14 = nn.Linear(200, 150)
        # self.linear15 = nn.Linear(150, 7)
    def forward(self, x):
        x = self.linear1(x)
        x = F.relu(x)
        x = self.linear2(x)
        x = F.relu(x)
        x = self.linear3(x)
        x = F.relu(x)
        x = self.linear4(x)
        x = F.relu(x)
        scores = self.linear5(x)
        # x = F.relu(x)
        # x = self.linear6(x)
        # x = F.relu(x)
        # x = self.linear7(x)
        # x = F.relu(x)
        # x = self.linear8(x)
        # x = F.relu(x)
        # x = self.linear9(x)
        # x = F.relu(x)
        # x = self.linear10(x)
        # x = F.relu(x)
        # x = self.linear11(x)
        # x = F.relu(x)
        # x = self.linear12(x)
        # x = F.relu(x)
        # x = self.linear13(x)
        # x = F.relu(x)
        # x = self.linear14(x)
        # x = F.relu(x)
        # scores = self.linear15(x)
        return scores
The network should output scores, compute a loss using the cross entropy loss criterion, and then do one step of stochastic gradient descent. This works for a while and then mysteriously breaks. I have no idea why.
Any help is greatly appreciated.
Thanks in advance.
I was also facing the same issue. You can try a few things (a quick check for both points is sketched below):
Make sure there are no NaN or inf values in your dataset.
Set your batch size so that number of samples % batch_size == 0.
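For instance, a quick sanity check along those lines (the names train_data and bs are taken from the question) might look like this:
import torch
# True here means there are NaN / inf entries that need cleaning up
print(torch.isnan(train_data).any().item())
print(torch.isinf(train_data).any().item())
# 0 here means the batch size divides the number of samples evenly
print(train_data.shape[0] % bs)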

For the deep network, assuming that each batch consists of two inputs (S and I), how to calculate whether each sample Si and all I match?

For a deep network where each batch consists of two inputs (S and I), how can I calculate whether each single sample Si matches all of the I (the batch examples)? I wrote the following program, which uses a TensorArray to compute the attention of each Si over all I; S and I are related through the tan function. Ideally, each Si would pay the most attention to its corresponding I, but at convergence every Si pays the same attention to each I. Any advice?
import tensorflow as tf
from tensorflow.python.ops import tensor_array_ops
import tensorflow.contrib.layers as layers
import numpy as np
batch_szie = 64
word_len = 16
word_emb_dim = 512
feature_dim = 512
x = tf.placeholder(dtype=tf.float32,shape =[None,1024],name="S")
sentence = tf.placeholder(dtype=tf.float32,shape =[None,1024],name="I")
# target = tf.placeholder(dtype=tf.float32,shape=[None,64],name = "target")
batch_size = tf.shape(sentence)[0]
labels = tf.eye(batch_size)
loss_array = tensor_array_ops.TensorArray(dtype=tf.float32, size=64,dynamic_size=False, infer_shape=True)
attention_array = tensor_array_ops.TensorArray(dtype=tf.float32, size=64,dynamic_size=False, infer_shape=True)
x_pre = layers.fully_connected(
    x,
    num_outputs=1024,
    # activation_fn=tf.nn.relu,
    scope="pre",
    reuse=tf.AUTO_REUSE)
sentence_tp = layers.fully_connected(
    sentence,
    num_outputs=1024,
    # activation_fn=tf.nn.relu,
    scope="s_pre",
    reuse=tf.AUTO_REUSE)
def body(i, loss_array, attention_array):
    res = tf.tile(tf.expand_dims(tf.expand_dims(x_pre[i], 1), 0), [batch_size, 1, 1])
    res = tf.matmul(tf.expand_dims(sentence_tp, 1), res)
    res = tf.reshape(res, [batch_size])
    attention = tf.reduce_sum(labels[i] * tf.nn.softmax(res, 0), 0)
    tp_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=labels[i], logits=res))
    loss_array = loss_array.write(i, tp_loss)
    attention_array = attention_array.write(i, attention)
    return i+1, loss_array, attention_array
_, loss_res, attention_res = tf.while_loop(cond=lambda i, _1, _2: i < 64,
                                           body=body,
                                           loop_vars=[tf.constant(0), loss_array, attention_array])
loss = tf.reduce_mean(loss_res.stack())
attention_all = attention_res.stack()
vars = tf.trainable_variables()
dis_optimizer = tf.train.AdamOptimizer(learning_rate=0.001, beta1=0.5, beta2=0.9)
dis_grads = tf.gradients(loss, vars)
dis_grads_and_vars = list(zip(dis_grads, vars))
for grad, var in dis_grads_and_vars:
    print("var:", var, " ", grad)
dis_train_op = dis_optimizer.apply_gradients(grads_and_vars=dis_grads_and_vars)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for i in range(1000000):
        org = np.random.uniform(low=0.0, high=100, size=(64, 1024))
        image = np.sin(org)
        s = np.cos(org)
        feed_dict = {x: image, sentence: s}
        a, _, lr = sess.run([attention_all, dis_train_op, loss], feed_dict)
        print(lr)
        for j in range(1):
            print("attention:", a)
It converges with the attention on each example equal to 1/batch_size.

Training TensorFlow to predict a sum

The examples provided with TensorFlow are a little complicated for getting started, so I am trying to teach TensorFlow to train a neural network to predict the sum of three binary digits. The network gets two of them as inputs; the third one is unknown. So an "optimal" network would guess that the sum will be the sum of the two known bits, plus 1/2 for the unknown bit. Let's say that the "loss" function is the square of the difference between the value predicted by the network and the actual value.
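Just to make that target concrete, here is a tiny standalone check of the "optimal" guess: predicting b1 + b2 + 0.5 gives an expected squared error of 0.25 (the variance of the unknown bit), so a trained model doing about that well is near the best possible.
import itertools
# Enumerate all 8 bit combinations and average the squared error of the
# "known bits + 0.5" guess; the result is 0.25.
errors = [((b1 + b2 + 0.5) - (b1 + b2 + b3)) ** 2
          for b1, b2, b3 in itertools.product([0, 1], repeat=3)]
print(sum(errors) / len(errors))  # 0.25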
I have written code to generate the trials:
import tensorflow as tf
import numpy as np
from random import randint
flags = tf.app.flags
FLAGS = flags.FLAGS
flags.DEFINE_integer('batch_size', 5, 'Batch size. ')
flags.DEFINE_float('learning_rate', 0.01, 'Initial learning rate.')
flags.DEFINE_integer('dim1', 3, 'layer size')
flags.DEFINE_integer('training_epochs', 10, 'Number of passes through the main training loop')
def ezString(list):
    #debugging code so I can see what is going on
    listLength = len(list)
    r = ''
    for i in range(listLength):
        value = list[i]
        valueString = str(value)
        r = r + ' '
        r = r + valueString
    return r
def generateTrial():
    inputs = np.zeros(2, dtype=np.int)
    for i in range(2):
        inputs[i] = randint(0,1)
    unknownInput = randint(0,1)
    sum = 0
    for j in range(2):
        sum = sum + inputs[j]
    sum = sum + unknownInput
    inputTensor = tf.pack(inputs)
    print 'inputs' + ezString(inputs)
    print 'unknown ' + str(unknownInput)
    print 'sum ' + str(sum)
    print ''
    return inputTensor, sum
def printTensor(tensor):
    sh = tensor.get_shape()
    print(sh)
def placeholder_inputs(size):
    output_placeholder = tf.placeholder(tf.int32, shape=(size))
    input_placeholder = tf.placeholder(tf.int32, shape=(size, 2))
    return input_placeholder, output_placeholder
def fill_feed_dict(inputs_pl, output_pl):
    print ('Filling feed dict')
    inputs_placeholder, output_placeholder = placeholder_inputs(FLAGS.batch_size)
    inputs = []
    outputs = []
    for i in range(FLAGS.batch_size):
        input, output = generateTrial()
        inputTensor = tf.pack(input)
        inputs.append(input)
        outputs.append(output)
    inputs_placeholder = tf.pack(inputs)
    outputs_placeholder = tf.pack(outputs)
def run_training():
    input_placeholder, output_placeholder = placeholder_inputs(FLAGS.batch_size)
    fill_feed_dict(input_placeholder, output_placeholder)
    printTensor(input_placeholder)
    printTensor(output_placeholder)
run_training()
The output suggests that this much is working:
Filling feed dict
inputs 1 0
unknown 0
sum 1
inputs 1 0
unknown 1
sum 2
inputs 0 1
unknown 1
sum 2
inputs 0 1
unknown 0
sum 1
inputs 0 0
unknown 0
sum 0
(5, 2)
(5,)
But I'm unclear on how I would finish it up. In particular, I need to define a loss function, and I also need to hook things up so that the outputs from my network get used to generate guesses for further training steps. Can anyone help?
I'm not sure whether this code is what you wanted, but I hope you find it useful anyway. The mean squared error actually decreases over the iterations, though I haven't tested the model for making predictions, so that part is up to you!
import tensorflow as tf
import numpy as np
from random import randint
flags = tf.app.flags
FLAGS = flags.FLAGS
flags.DEFINE_integer('batch_size', 50, 'Batch size.')
flags.DEFINE_float('learning_rate', 0.01, 'Initial learning rate.')
flags.DEFINE_integer('dim1', 3, 'layer size')
flags.DEFINE_integer('training_epochs', 10, 'Number of passes through the main training loop')
flags.DEFINE_integer('num_iters', 100, 'Number of iterations')
def ezString(list):
    #debugging code so I can see what is going on
    listLength = len(list)
    r = ''
    for i in range(listLength):
        value = list[i]
        valueString = str(value)
        r = r + ' '
        r = r + valueString
    return r
def generateTrial():
    inputs = np.zeros(2, dtype = np.float)
    for i in range(2):
        inputs[i] = randint(0, 1)
    unknownInput = randint(0, 1)
    sum = 0
    for j in range(2):
        sum = sum + inputs[j]
    sum = sum + unknownInput
    inputTensor = np.asarray(inputs)
    return inputTensor, sum
def printTensor(tensor):
    sh = tensor.get_shape()
    print(sh)
def placeholder_inputs(size):
    output_placeholder = tf.placeholder(tf.float32, shape=(size))
    input_placeholder = tf.placeholder(tf.float32, shape=(size, 2))
    return input_placeholder, output_placeholder
def fill_feed_dict(inputs_pl, output_pl):
    inputs = []
    outputs = []
    for i in range(FLAGS.batch_size):
        input, output = generateTrial()
        inputs.append(input)
        outputs.append(output)
    return {inputs_pl: inputs, output_pl: outputs}
def loss(y, pred):
    return tf.reduce_mean(tf.pow(y - pred, 2))
def NN(x, y, W1, b1, W2, b2):
    layer1 = tf.add(tf.matmul(x, W1), b1)
    layer1 = tf.nn.relu(layer1)
    output = tf.add(tf.matmul(layer1, W2), b2)
    return output, loss(y, output)
def get_params(dim_hidden):
    with tf.variable_scope('nn_params'):
        return tf.Variable(tf.truncated_normal([2, dim_hidden], stddev = 0.05)), tf.Variable(0.0, (dim_hidden)),\
               tf.Variable(tf.truncated_normal([dim_hidden, 1], stddev = 0.05)), tf.Variable(0.0, 1)
def run_training():
    input_placeholder, output_placeholder = placeholder_inputs(FLAGS.batch_size)
    W1, b1, W2, b2 = get_params(FLAGS.dim1)
    pred, loss = NN(input_placeholder, output_placeholder, W1, b1, W2, b2)
    optm = tf.train.AdamOptimizer(FLAGS.learning_rate).minimize(loss)
    init = tf.initialize_all_variables()
    sess = tf.Session()
    sess.run(init)
    for iters in range(FLAGS.num_iters):
        l, _ = sess.run([loss, optm], feed_dict = fill_feed_dict(input_placeholder, output_placeholder))
        print l, iters + 1
