Convolutional neural network with backpropagation and sparsity in Python

I am trying to modify the network3.py code from the neural-networks-and-deep-learning repository on GitHub. This code constructs a convolutional neural network and trains it on the MNIST data set.
What I am trying to do is add backpropagation and sparsity to this code. The part of the code I added is outlined between the two lines of #. I get a TypeError: make node requires 4D tensor of kernels.
I understand that the tensor should be 4D, of shape (1, 1, 28, 28), but I am not sure where and how to make this modification.
class ConvPoolLayer(object):
def __init__(self, filter_shape, image_shape, poolsize=(2, 2),
activation_fn=sigmoid):
self.filter_shape = filter_shape
self.image_shape = image_shape
self.poolsize = poolsize
self.activation_fn=activation_fn
# initialize weights and biases
n_out = (filter_shape[0]*np.prod(filter_shape[2:])/np.prod(poolsize))
self.w = theano.shared(
np.asarray(
np.random.normal(loc=0, scale=np.sqrt(1.0/n_out), size=filter_shape),
dtype=theano.config.floatX),
borrow=True)
#print self.w.eval()
self.b = theano.shared(
np.asarray(
np.random.normal(loc=0, scale=1.0, size=(filter_shape[0],)),
dtype=theano.config.floatX),
borrow=True)
#print filter_shape[0]
#print self.b.eval()
self.params = [self.w, self.b]
def sigmoid(self, x):
return (1 / (1 + T.exp(-x)))
def set_inpt(self, inpt, inpt_dropout, mini_batch_size):
self.inpt = inpt.reshape(self.image_shape)
############################################################
learning_rate = 0.0001
learning_rate_s = 0.0001
gamma = 1
alpha = 1 - learning_rate
v1 = T.dot(self.w, self.inpt) + self.b
y1 = self.sigmoid(v1)
diff1 = self.inpt - T.dot(T.transpose(self.w), y1)
d1 = T.dot(self.w, diff1)
d1 = T.dot(d1, (1.0 - T.dot(v1,v1)))
delta_w1_bp = learning_rate * T.dot(d1 , T.transpose(self.inpt))
delta_b1_bp = T.sum(learning_rate * d1, axis=1)
delta_w1_s = learning_rate_s * T.dot(self.sigmoid(y1),T.transpose(self.inpt))
delta_b1_s = T.sum(learning_rate_s * self.sigmoid(y1), axis=1)
total_w1 = gamma * delta_w1_bp + (1 - gamma) * delta_w1_s
total_b1 = gamma * delta_b1_bp + (1 - gamma) * delta_b1_s
self.w = (alpha * self.w) + total_w1
self.b = (alpha * self.b) + total_b1
##################################################################
conv_out = conv.conv2d(
input=self.inpt, filters=self.w, filter_shape=self.filter_shape,
image_shape=self.image_shape)
pooled_out = downsample.max_pool_2d(
input=conv_out, ds=self.poolsize, ignore_border=True)
self.output = self.activation_fn(
pooled_out + self.b.dimshuffle('x', 0, 'x', 'x'))
self.output_dropout = self.output # no dropout in the convolution layers
Does anyone know how to fix this?
The main code I run to call the above script is
import network3
from network3 import Network
from network3 import ConvPoolLayer, FullyConnectedLayer, SoftmaxLayer
training_data, validation_data, test_data = network3.load_data_shared()
mini_batch_size = 10
net = Network([
ConvPoolLayer(image_shape=(mini_batch_size, 1, 28, 28),
filter_shape=(20, 1, 5, 5),
poolsize=(2, 2)),
FullyConnectedLayer(n_in=20*12*12, n_out=100),
SoftmaxLayer(n_in=100, n_out=10)], mini_batch_size)
net.SGD(training_data, 60, mini_batch_size, 0.1,
validation_data, test_data)

Related

What is hp_metric in TensorBoard and how to get rid of it?

I am new to TensorBoard.
I am using fairly simple code to run an experiment, and the output includes an hp_metric graph.
I don't remember asking for an hp_metric graph, yet here it is.
What is it, and how do I get rid of it?
Full code to reproduce, using PyTorch Lightning (not that I think anyone should have to reproduce this to answer):
Please note that the ONLY line referencing TensorBoard is
self.logger.experiment.add_scalars("losses", {"train_loss": loss}, global_step=self.current_epoch)
import torch
from torch import nn
import torch.nn.functional as F
from typing import List, Optional
from pytorch_lightning.core.lightning import LightningModule
from Testing.Research.toy_datasets.ClustersDataset import ClustersDataset
from torch.utils.data import DataLoader
from Testing.Research.config.ConfigProvider import ConfigProvider
from pytorch_lightning import Trainer, seed_everything
from torch import optim
import os
from pytorch_lightning.loggers import TensorBoardLogger
class VAEFC(LightningModule):
# see https://towardsdatascience.com/understanding-variational-autoencoders-vaes-f70510919f73
# for possible upgrades, see https://arxiv.org/pdf/1602.02282.pdf
# https://stats.stackexchange.com/questions/332179/how-to-weight-kld-loss-vs-reconstruction-loss-in-variational-auto-encoder
def __init__(self, encoder_layer_sizes: List, decoder_layer_sizes: List, config):
super(VAEFC, self).__init__()
self._config = config
self.logger: Optional[TensorBoardLogger] = None
assert len(encoder_layer_sizes) >= 3, "must have at least 3 layers (2 hidden)"
# encoder layers
self._encoder_layers = nn.ModuleList()
for i in range(1, len(encoder_layer_sizes) - 1):
enc_layer = nn.Linear(encoder_layer_sizes[i - 1], encoder_layer_sizes[i])
self._encoder_layers.append(enc_layer)
# predict mean and covariance vectors
self._mean_layer = nn.Linear(encoder_layer_sizes[
len(encoder_layer_sizes) - 2],
encoder_layer_sizes[len(encoder_layer_sizes) - 1])
self._logvar_layer = nn.Linear(encoder_layer_sizes[
len(encoder_layer_sizes) - 2],
encoder_layer_sizes[len(encoder_layer_sizes) - 1])
# decoder layers
self._decoder_layers = nn.ModuleList()
for i in range(1, len(decoder_layer_sizes)):
dec_layer = nn.Linear(decoder_layer_sizes[i - 1], decoder_layer_sizes[i])
self._decoder_layers.append(dec_layer)
self._recon_function = nn.MSELoss(reduction='mean')
def _encode(self, x):
for i in range(len(self._encoder_layers)):
layer = self._encoder_layers[i]
x = F.relu(layer(x))
mean_output = self._mean_layer(x)
logvar_output = self._logvar_layer(x)
return mean_output, logvar_output
def _reparametrize(self, mu, logvar):
if not self.training:
return mu
std = logvar.mul(0.5).exp_()
if std.is_cuda:
eps = torch.cuda.FloatTensor(std.size()).normal_()
else:
eps = torch.FloatTensor(std.size()).normal_()
reparameterized = eps.mul(std).add_(mu)
return reparameterized
def _decode(self, z):
for i in range(len(self._decoder_layers) - 1):
layer = self._decoder_layers[i]
z = F.relu((layer(z)))
decoded = self._decoder_layers[len(self._decoder_layers) - 1](z)
# decoded = F.sigmoid(self._decoder_layers[len(self._decoder_layers)-1](z))
return decoded
def _loss_function(self, recon_x, x, mu, logvar, reconstruction_function):
"""
recon_x: generating images
x: origin images
mu: latent mean
logvar: latent log variance
"""
binary_cross_entropy = reconstruction_function(recon_x, x) # mse loss TODO see if mse or cross entropy
# loss = 0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2)
kld_element = mu.pow(2).add_(logvar.exp()).mul_(-1).add_(1).add_(logvar)
kld = torch.sum(kld_element).mul_(-0.5)
# KL divergence Kullback–Leibler divergence, regularization term for VAE
# It is a measure of how different two probability distributions are different from each other.
# We are trying to force the distributions closer while keeping the reconstruction loss low.
# see https://towardsdatascience.com/understanding-variational-autoencoders-vaes-f70510919f73
# read on weighting the regularization term here:
# https://stats.stackexchange.com/questions/332179/how-to-weight-kld-loss-vs-reconstruction-loss-in-variational
# -auto-encoder
return binary_cross_entropy + kld * self._config.regularization_factor
def training_step(self, batch, batch_index):
orig_batch, noisy_batch, _ = batch
noisy_batch = noisy_batch.view(noisy_batch.size(0), -1)
recon_batch, mu, logvar = self.forward(noisy_batch)
loss = self._loss_function(
recon_batch,
orig_batch, mu, logvar,
reconstruction_function=self._recon_function
)
# self.logger.experiment.add_scalars("losses", {"train_loss": loss})
self.logger.experiment.add_scalars("losses", {"train_loss": loss}, global_step=self.current_epoch)
# self.logger.experiment.add_scalar("train_loss", loss, self.current_epoch)
self.logger.experiment.flush()
return loss
def train_dataloader(self):
default_dataset, train_dataset, test_dataset = ClustersDataset.clusters_dataset_by_config()
train_dataloader = DataLoader(train_dataset, batch_size=self._config.batch_size, shuffle=True)
return train_dataloader
def test_dataloader(self):
default_dataset, train_dataset, test_dataset = ClustersDataset.clusters_dataset_by_config()
test_dataloader = DataLoader(test_dataset, batch_size=self._config.batch_size, shuffle=True)
return test_dataloader
def configure_optimizers(self):
optimizer = optim.Adam(model.parameters(), lr=self._config.learning_rate)
return optimizer
def forward(self, x):
mu, logvar = self._encode(x)
z = self._reparametrize(mu, logvar)
decoded = self._decode(z)
return decoded, mu, logvar
if __name__ == "__main__":
config = ConfigProvider.get_config()
seed_everything(config.random_seed)
latent_dim = config.latent_dim
enc_layer_sizes = config.enc_layer_sizes + [latent_dim]
dec_layer_sizes = [latent_dim] + config.dec_layer_sizes
model = VAEFC(config=config, encoder_layer_sizes=enc_layer_sizes, decoder_layer_sizes=dec_layer_sizes)
logger = TensorBoardLogger(save_dir='tb_logs', name='VAEFC')
logger.hparams = config # TODO only put here relevant stuff
# trainer = Trainer(gpus=1)
trainer = Trainer(deterministic=config.is_deterministic,
#auto_lr_find=config.auto_lr_find,
#log_gpu_memory='all',
# min_epochs=99999,
max_epochs=config.num_epochs,
default_root_dir=os.getcwd(),
logger=logger
)
# trainer.tune(model)
trainer.fit(model)
print("done training vae with lightning")
ClustersDataset.py
from torch.utils.data import Dataset
import matplotlib.pyplot as plt
import torch
import numpy as np
from Testing.Research.config.ConfigProvider import ConfigProvider
class ClustersDataset(Dataset):
__default_dataset = None
__default_dataset_train = None
__default_dataset_test = None
def __init__(self, cluster_size: int, noise_factor: float = 0, transform=None, n_clusters=2, centers_radius=4.0):
super(ClustersDataset, self).__init__()
self._cluster_size = cluster_size
self._noise_factor = noise_factor
self._n_clusters = n_clusters
self._centers_radius = centers_radius
# self._transform = transform
self._size = self._cluster_size * self._n_clusters
self._create_data_clusters()
self._combine_clusters_to_array()
self._normalize_data()
self._add_noise()
# self._plot()
pass
@staticmethod
def clusters_dataset_by_config():
if ClustersDataset.__default_dataset is not None:
return \
ClustersDataset.__default_dataset, \
ClustersDataset.__default_dataset_train, \
ClustersDataset.__default_dataset_test
config = ConfigProvider.get_config()
default_dataset = ClustersDataset(
cluster_size=config.cluster_size,
noise_factor=config.noise_factor,
transform=None,
n_clusters=config.n_clusters,
centers_radius=config.centers_radius
)
train_size = int(config.train_size * len(default_dataset))
test_size = len(default_dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(default_dataset, [train_size, test_size])
ClustersDataset.__default_dataset = default_dataset
ClustersDataset.__default_dataset_train = train_dataset
ClustersDataset.__default_dataset_test = test_dataset
return default_dataset, train_dataset, test_dataset
def _create_data_clusters(self):
self._clusters = [torch.zeros((self._cluster_size, 2)) for _ in range(self._n_clusters)]
centers_radius = self._centers_radius
for i, c in enumerate(self._clusters):
r, x, y = 3.0, centers_radius * np.cos(i * np.pi * 2 / self._n_clusters), centers_radius * np.sin(
i * np.pi * 2 / self._n_clusters)
cluster_length = 1.1
cluster_start = i * 2 * np.pi / self._n_clusters
cluster_end = cluster_length * (i + 1) * 2 * np.pi / self._n_clusters
cluster_inds = torch.linspace(start=cluster_start, end=cluster_end, steps=self._cluster_size,
dtype=torch.float)
c[:, 0] = r * torch.sin(cluster_inds) + y
c[:, 1] = r * torch.cos(cluster_inds) + x
def _plot(self):
plt.figure()
plt.scatter(self._noisy_values[:, 0], self._noisy_values[:, 1], s=1, color='b', label="noisy_values")
plt.scatter(self._values[:, 0], self._values[:, 1], s=1, color='r', label="values")
plt.legend(loc="upper left")
plt.show()
def _combine_clusters_to_array(self):
size = self._size
self._values = torch.zeros(size, 2)
self._labels = torch.zeros(size, dtype=torch.long)
for i, c in enumerate(self._clusters):
self._values[i * self._cluster_size: (i + 1) * self._cluster_size, :] = self._clusters[i]
self._labels[i * self._cluster_size: (i + 1) * self._cluster_size] = i
def _add_noise(self):
size = self._size
mean = torch.zeros(size, 2)
std = torch.ones(size, 2)
noise = torch.normal(mean, std)
self._noisy_values = torch.zeros(size, 2)
self._noisy_values[:] = self._values
self._noisy_values = self._noisy_values + noise * self._noise_factor
def _normalize_data(self):
values_min, values_max = torch.min(self._values), torch.max(self._values)
self._values = (self._values - values_min) / (values_max - values_min)
self._values = self._values * 2 - 1
def __len__(self):
return self._size # number of samples in the dataset
def __getitem__(self, index):
item = self._values[index, :]
noisy_item = self._noisy_values[index, :]
# if self._transform is not None:
# noisy_item = self._transform(item)
return item, noisy_item, self._labels[index]
@property
def values(self):
return self._values
@property
def noisy_values(self):
return self._noisy_values
Config values (ConfigProvider just returns those as an object)
num_epochs: 15
batch_size: 128
learning_rate: 0.0001
auto_lr_find: False
noise_factor: 0.1
regularization_factor: 0.0
cluster_size: 5000
n_clusters: 5
centers_radius: 4.0
train_size: 0.8
latent_dim: 8
enc_layer_sizes: [2, 200, 200, 200]
dec_layer_sizes: [200, 200, 200, 2]
retrain_vae: False
random_seed: 11
is_deterministic: True
It's the default setting of the TensorBoard logger in PyTorch Lightning. You can set default_hp_metric to False to get rid of this metric.
TensorBoardLogger(save_dir='tb_logs', name='VAEFC', default_hp_metric=False)
The hp_metric helps you track model performance across different hyperparameters. You can check it under hparams in your TensorBoard.
hp_metric (hyperparameter metric) is there to help you tune your hyperparameters.
You can set this metric to whatever you like, as documented in the official PyTorch docs.
Then, you can look through your hyperparameters and see which come out best according to whichever metric you choose.
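For example, here is a minimal sketch (the hyperparameter names and values are placeholders, not taken from the question) of attaching your own metric to the hparams tab instead of the default one:
from pytorch_lightning.loggers import TensorBoardLogger

logger = TensorBoardLogger(save_dir='tb_logs', name='VAEFC')
# Associate this run's hyperparameters with a metric of your choosing;
# it replaces the default hp_metric placeholder in the hparams tab.
logger.log_hyperparams(
    {"learning_rate": 1e-4, "batch_size": 128},  # placeholder hparams
    metrics={"hp_metric": 0.123},                # placeholder final metric value
)
# Inside a LightningModule you can achieve the same by logging a value
# under the name "hp_metric", e.g. self.log("hp_metric", val_loss).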
Alternatively, if you don't want it, you can disable it as suggested in @joe32140's answer:
You can set default_hp_metric to false to get rid of this metric.
TensorBoardLogger(save_dir='tb_logs', name='VAEFC', default_hp_metric=False)

Python code about classes or multi-line animation?

I am a novice at Python. The problem I am working on is about optimization: I want to compare two optimization algorithms, RMSprop and Adam, on the Beale function. I downloaded an Adam implementation online and added RMSprop to the original code. But the animation shows that the particle paths of the two algorithms are, surprisingly, the same. (The path flashes.) I am sure they should be different, and I have tried some drastic changes to the RMSprop class, but the result does not change. I am not sure which step is wrong: the animation step, or the step where the classes are called?
import sys
import matplotlib.pyplot as plt
import autograd.numpy as np
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.colors import LogNorm
from matplotlib import animation
from autograd import elementwise_grad,value_and_grad,grad
from scipy.optimize import minimize
from collections import defaultdict
from itertools import zip_longest
from functools import partial
f = lambda x,y: (1.5 - x + x*y)**2 + (2.25 - x + x*y**2)**2 + (2.625 - x
+x*y**3)**2
xmin, xmax, xstep = -4.5, 4.5, 0.2
ymin, ymax, ystep = -4.5, 4.5, 0.2
x, y = np.meshgrid(np.arange(xmin, xmax + xstep, xstep),np.arange(ymin, ymax
+ ystep, ystep))
z = f(x,y)
minima = np.array([3.0,0.5])
minima_ = minima.reshape(-1,1)
def target_func(weights):
x,y = weights
return f(x,y)
class Adam:
def __init__(self, loss, weights, lr=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
self.loss = loss
self.theta = weights
self.lr = lr # learning rate
self.beta1 = beta1
self.beta2 = beta2
self.epsilon = epsilon
self.get_gradient = grad(loss)
self.m = 0
self.v = 0
self.t = 0
def minimize_trace(self, path=[]):
self.t +=1
g = self.get_gradient(self.theta)
self.m = self.beta1 * self.m + (1 - self.beta1) * g
self.v = self.beta2 * self.v + (1 - self.beta2) * (g * g)
self.m_hat = self.m / (1 - self.beta1 ** self.t)
self.v_hat = self.v / (1 - self.beta2 ** self.t)
self.theta -= self.lr * self.m_hat / (self.v_hat ** 0.5 + self.epsilon)
path.append(np.copy(self.theta))
class RMSprop:
def __init__(self, loss, weights, lr=0.001, beta1=0.9, beta2=0.999,
epsilon=1e-8):
self.loss = loss
self.theta = weights
self.lr = lr # learning rate
self.beta1 = beta1
self.beta2 = beta2
self.epsilon = epsilon
self.get_gradient = grad(loss)
self.m = 0
self.v = 0
self.t = 0
def minimize_trace(self, path=[]):
self.t +=1
g = self.get_gradient(self.theta)
## self.m = self.beta1 * self.m + (1 - self.beta1) * g
self.v = self.beta2 * self.v + (1 - self.beta2) * (g * g)
self.m_hat = self.m / (1 - self.beta1 ** self.t)
self.v_hat = self.v / (1 - self.beta2 ** self.t)
self.theta -= self.lr * self.m_hat / (self.v_hat ** 0.5 +
self.epsilon)
path.append(np.copy(self.theta))
EPOCHS = 3000
SHOW_STEPS = 100
PRECISION = 1e-8
weights = np.array([1,1.5])
path_trace_adam = [np.copy(weights)]
path_trace_rmsprop = [np.copy(weights)]
adam = Adam(target_func, weights, lr = 0.01)
rmsprop = RMSprop(target_func, weights, lr = 0.01)
for i in range(EPOCHS):
adam.minimize_trace(path_trace_adam)
rmsprop.minimize_trace(path_trace_rmsprop)
print("\n final weights:{} loss:{}".format(adam.theta, adam.loss(adam.theta)))
print("\n final weights:{} loss:{}".format(rmsprop.theta, rmsprop.loss(rmsprop.theta)))
path_trace_adam = np.array(path_trace_adam).T
path_trace_rmsprop = np.array(path_trace_rmsprop).T
shape_adam = path_trace_adam.shape
shape_rmsprop = path_trace_rmsprop.shape
if shape_adam[1] > SHOW_STEPS:
show_step_adam = shape_adam[1] // SHOW_STEPS
path_trace_adam = np.array(path_trace_adam[:,::show_step_adam])
if shape_rmsprop[1] > SHOW_STEPS:
show_step_rmsprop = shape_rmsprop[1] // SHOW_STEPS
path_trace_rmsprop = np.array(path_trace_rmsprop[:,::show_step_rmsprop])
################## Visualize Convergence Trace
fig, ax = plt.subplots(figsize=(10,10))
ax.contour(x, y, z, levels=np.logspace(0, 5, 35), norm=LogNorm(), cmap=plt.cm.jet)
ax.plot(*minima_, 'r*', markersize=12)
line_adam, = ax.plot([], [], 'r', label='Adam Optimizer', lw=2)
line_rmsprop, = ax.plot([], [], 'k', label='RMSprop Optimizer', lw=2)
point_adam, = ax.plot([], [], 'ro')
point_rmsprop, = ax.plot([], [], 'ko')
ax.set_xlabel('$x$')
ax.set_ylabel('$y$')
ax.set_xlim((xmin, xmax))
ax.set_ylim((ymin, ymax))
ax.legend(loc='upper left')
################### animation
def init_adam():
line_adam.set_data([], [])
point_adam.set_data([], [])
return line_adam, point_adam
def init_rmsprop():
line_rmsprop.set_data([], [])
point_rmsprop.set_data([], [])
return line_rmsprop, point_rmsprop
def animate_adam(i):
line_adam.set_data(*path_trace_adam[::,:i])
point_adam.set_data(*path_trace_adam[::,i-1:i])
return line_adam, point_adam
def animate_rmsprop(i):
line_rmsprop.set_data(*path_trace_rmsprop[::,:i])
point_rmsprop.set_data(*path_trace_rmsprop[::,i-1:i])
return line_rmsprop, point_rmsprop
anim_adam = animation.FuncAnimation(fig, animate_adam, init_func=init_adam,
frames=path_trace_adam.shape[1], interval=60,
repeat_delay=None, repeat=True, blit=True)
anim_rmsprop = animation.FuncAnimation(fig, animate_rmsprop, init_func=init_rmsprop,
frames=path_trace_rmsprop.shape[1], interval=60,
repeat_delay=None, repeat=True, blit=True)
plt.show()
The bug is here:
weights = np.array([1,1.5])
# ... truncated for brevity
adam = Adam(target_func, weights, lr = 0.01)
rmsprop = RMSprop(target_func, weights, lr = 0.01)
Since the reference to weights is shared between the two optimizers, every time Adam.minimize_trace and RMSprop.minimize_trace run, they modify the same array. Since each path is derived from that array, the two paths become the same.
If you copy the array before passing it to the two constructors, it should work as expected.
adam = Adam(target_func, np.copy(weights), lr = 0.01)
rmsprop = RMSprop(target_func, np.copy(weights), lr = 0.01)
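To see why, the aliasing can be reproduced in isolation (a minimal sketch, independent of the optimizer classes):
import numpy as np

weights = np.array([1.0, 1.5])
a = weights            # both names refer to the same array object
b = weights
a -= 0.1               # an in-place update through one name...
print(b)               # ...is visible through the other: [0.9 1.4]

c = np.copy(weights)   # an independent copy
a -= 0.1
print(c)               # copy is unaffected: [0.9 1.4], while weights is now [0.8 1.3]
Both self.theta -= ... updates in the classes are exactly this kind of in-place modification of the shared array, which is why copying before constructing the optimizers fixes it.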

Pytorch: Custom Loss only works for batch_size == 1

I am currently trying to port my existing (working) Keras BNN code to PyTorch.
To this end, I have to write a custom NegativeLogLikelihood loss function. My unit test for this loss passes (i.e. for fixed network weights I get the same results and gradients as in my old, working Keras code), but in a simple dummy example (fitting a sinc function) my loss only gives okay results for batch_size == 1, and my network fails to fit sinc properly (for any number of training iterations) for larger values. Using nn.MSELoss instead works perfectly fine, so I am assuming an issue with my loss computation.
import matplotlib.pyplot as plt
from itertools import islice
try:
from tqdm import tqdm
except ImportError:
tqdm = lambda x, total: x
import numpy as np
import torch
from torch.utils import data as data_utils
import torch.nn as nn
class NLLLoss(torch.nn.modules.loss._Loss):
def __init__(self, parameters, num_datapoints, size_average=False, reduce=True):
super().__init__(size_average, reduce)
self.parameters = tuple(parameters)
self.num_datapoints = num_datapoints
def log_variance_prior(self, log_variance, mean=1e-6, variance=0.01):
return torch.mean(
torch.sum(
((-(log_variance - torch.log(torch.tensor(mean))) ** 2) /
((2. * variance))) - 0.5 * torch.log(torch.tensor(variance)),
dim=1
)
)
def weight_prior(self, parameters, wdecay=1.):
num_parameters = torch.sum(torch.tensor([
torch.prod(torch.tensor(parameter.size()))
for parameter in parameters
]))
log_likelihood = torch.sum(torch.tensor([
torch.sum(-wdecay * 0.5 * (parameter ** 2))
for parameter in parameters
]))
return log_likelihood / (num_parameters.float() + 1e-16)
def forward(self, input, target):
torch.nn.modules.loss._assert_no_grad(target)
batch_size, *_ = input.shape
prediction_mean = input[:, 0].view(-1, 1)
log_prediction_variance = input[:, 1].view(-1, 1)
prediction_variance_inverse = 1. / (torch.exp(log_prediction_variance) + 1e-16)
mean_squared_error = torch.pow(target - prediction_mean, 2)
log_likelihood = (
torch.sum(
torch.sum(
-mean_squared_error * 0.5 * prediction_variance_inverse -
0.5 * log_prediction_variance,
dim=1
)
)
)
log_likelihood /= batch_size
log_likelihood += (
self.log_variance_prior(log_prediction_variance) / self.num_datapoints
)
log_likelihood += self.weight_prior(self.parameters) / self.num_datapoints
return -log_likelihood
# Helper Functions {{{ #
def infinite_dataloader(dataloader):
while True:
yield from dataloader
def tanh_network(input_dimensionality: int):
class AppendLayer(nn.Module):
def __init__(self, bias=True, *args, **kwargs):
super().__init__(*args, **kwargs)
if bias:
self.bias = nn.Parameter(torch.Tensor(1, 1))
else:
self.register_parameter('bias', None)
def forward(self, x):
return torch.cat((x, self.bias * torch.ones_like(x)), dim=1)
def init_weights(module):
if type(module) == AppendLayer:
nn.init.constant_(module.bias, val=np.log(1e-3))
elif type(module) == nn.Linear:
nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="linear")
nn.init.constant_(module.bias, val=0.0)
return nn.Sequential(
nn.Linear(input_dimensionality, 50), nn.Tanh(),
nn.Linear(50, 50), nn.Tanh(),
nn.Linear(50, 50), nn.Tanh(),
nn.Linear(50, 1),
AppendLayer()
).apply(init_weights)
# }}} Helper Functions #
input_dimensionality, num_datapoints = 1, 100
num_train_steps = 13000
# Set up data
x_train = np.array([
np.random.uniform(np.zeros(1), np.ones(1), input_dimensionality)
for _ in range(num_datapoints)
])
y_train = np.sinc(x_train * 10 - 5).sum(axis=1)
# Data Normalization
x_train_, x_mean, x_std = (
np.true_divide(x_train - np.mean(x_train), np.std(x_train)), np.mean(x_train), np.std(x_train)
)
y_train_, y_mean, y_std = (
np.true_divide(y_train - np.mean(y_train), np.std(y_train)), np.mean(y_train), np.std(y_train)
)
model = tanh_network(input_dimensionality=input_dimensionality)
# TODO Why does setting batch_size to 1 work with NLL, but setting it to higher values fails?
batch_size = 20 # setting this to 1 gives okay results.
loss_function = NLLLoss(model.parameters(), num_datapoints=num_datapoints)
# NOTE: Using MSE like this also works:
# loss_function = lambda input, target: nn.MSELoss()(input=input[:, 0], target=target)
train_loader = infinite_dataloader(
data_utils.DataLoader(
data_utils.TensorDataset(
torch.from_numpy(x_train_).float(),
torch.from_numpy(y_train_).float()
), batch_size=batch_size
)
)
optimizer = torch.optim.Adam(model.parameters())
# Train loop
for epoch, (x_batch, y_batch) in tqdm(enumerate(islice(train_loader, num_train_steps)), total=num_train_steps):
optimizer.zero_grad()
y_pred = model(x_batch)
loss = loss_function(input=y_pred, target=y_batch)
loss.backward()
optimizer.step()
if epoch % 100 == 0:
mse_value = nn.MSELoss()(input=y_pred[:, 0], target=y_batch)
print("Epoch: {}, Loss: {}, MSE: {}".format(epoch, loss, mse_value))
x_test = np.linspace(0, 1, 100)[:, None]
y_test = np.sinc(x_test * 10 - 5).sum(axis=1)
# Data Normalization
x_test_ = np.true_divide(x_test - x_mean, x_std)
x_test_torch = torch.from_numpy(x_test_).float()
y_test_torch = torch.from_numpy(y_test).float()
# Unnormalize predictions
y_pred = model(x_test_torch).detach().numpy() * y_std + y_mean
plt.plot(x_test[:, 0], y_test, label="true", color="black")
plt.plot(x_train[:, 0], y_train, "ro")
plt.plot(x_test[:, 0], y_pred[:, 0], label="Adam", color="blue")
plt.legend()
plt.show()
Any help or suggestions on what I could be doing wrong are very much appreciated!
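I'm not certain this is the cause here, but one batch-size-dependent pitfall worth checking in this kind of loss is broadcasting: if target has shape (N,) while prediction_mean has shape (N, 1), then target - prediction_mean silently becomes an (N, N) matrix, which only coincides with the element-wise error when N == 1. A minimal sketch of the effect (shapes chosen to mirror the code above):
import torch

N = 20
target = torch.zeros(N)              # shape (N,), as yielded by the DataLoader
prediction_mean = torch.zeros(N, 1)  # shape (N, 1), like input[:, 0].view(-1, 1)

print((target - prediction_mean).shape)               # torch.Size([20, 20]) -- broadcast
print((target.view(-1, 1) - prediction_mean).shape)   # torch.Size([20, 1])  -- element-wise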

How to use TensorFlow in OOP style?

Specifically, when using TensorFlow to build my model in OOP style, where should I build the graph? Where should I start a session to run the graph? What's the best practice for this case?
In TensorFlow Mechanics 101, the MNIST example simply defines the inference, loss, and training functions in the module mnist.py and builds the graph in fully_connected_feed.py. But in my opinion, the graph is actually part of the model and should be built inside the model, maybe in its __init__ method.
I have seen many other models using TensorFlow in its model zoo, and each has its own practice, so I am a little confused here. Is there a best practice or any recommended programming paradigm for using TensorFlow?
Also check out a nice article about this topic:
https://danijar.com/structuring-your-tensorflow-models/
In this article, Danijar Hafner introduces the lazy property pattern:
class Model:
def __init__(self, data, target):
self.data = data
self.target = target
self.prediction
self.optimize
self.error
@lazy_property
def prediction(self):
data_size = int(self.data.get_shape()[1])
target_size = int(self.target.get_shape()[1])
weight = tf.Variable(tf.truncated_normal([data_size, target_size]))
bias = tf.Variable(tf.constant(0.1, shape=[target_size]))
incoming = tf.matmul(self.data, weight) + bias
return tf.nn.softmax(incoming)
@lazy_property
def optimize(self):
cross_entropy = -tf.reduce_sum(self.target * tf.log(self.prediction))
optimizer = tf.train.RMSPropOptimizer(0.03)
return optimizer.minimize(cross_entropy)
@lazy_property
def error(self):
mistakes = tf.not_equal(
tf.argmax(self.target, 1), tf.argmax(self.prediction, 1))
return tf.reduce_mean(tf.cast(mistakes, tf.float32))
See more in the article.
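For completeness, the lazy_property decorator used above is roughly the following (adapted from the same article):
import functools

def lazy_property(function):
    # Build the wrapped graph piece only once and cache it on the instance.
    attribute = '_cache_' + function.__name__

    @property
    @functools.wraps(function)
    def decorator(self):
        if not hasattr(self, attribute):
            setattr(self, attribute, function(self))
        return getattr(self, attribute)

    return decorator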
I usually build my graphs in __init__, but I sometimes create a separate compile function. I use a unique variable scope for the entire class, and the class provides save, restore, and init functions for its variables. I also provide functions to train and predict. I don't think there is really any standard practice, but this makes sense to me. Here is an example of how I build a generative model with image pyramids.
class PyramidGenerator:
def __init__(self,
session,
log2_input_size,
log2_output_size,
num_features,
convs_per_cell,
filter_size,
conv_activation,
num_attributes,
name = 'pyrgen'):
self.session = session
self.log2_input_size = log2_input_size
self.log2_output_size = log2_output_size
self.num_attributes = num_attributes
if not hasattr(num_features, '__iter__'):
num_features = [num_features] * (log2_output_size - log2_input_size)
if not hasattr(convs_per_cell, '__iter__'):
convs_per_cell = [convs_per_cell] * (log2_output_size - log2_input_size)
if not hasattr(filter_size, '__iter__'):
filter_size = [filter_size] * (log2_output_size - log2_input_size)
with tf.variable_scope(name) as scope:
self.training_images = tf.placeholder(tf.float32, (None, 2 ** log2_output_size, 2 ** log2_output_size, 3), 'training_images')
if num_attributes:
self.image_attributes = tf.placeholder(tf.float32, (None, num_attributes))
self.seed_images = tf.placeholder(tf.float32, (None, 2 ** log2_input_size, 2 ** log2_input_size, 3), 'seed_images')
self.learning_rate = tf.placeholder(tf.float32, (), 'learning_rate')
self.scope_name = scope.name
self.cost = 0
def _augment(img):
img = tf.image.random_flip_left_right(img)
return img
augmented = tf.map_fn(_augment, self.training_images)
training_scales = {s:tf.image.resize_area(augmented, (2 ** s, 2 ** s)) for s in range(log2_input_size, log2_output_size + 1)}
x_gen = self.seed_images
x_train = None
if num_attributes:
h_gen = h_train = tf.tile(tf.reshape(self.image_attributes, (-1, 1, 1, num_attributes)), (1, 2 ** log2_input_size, 2 ** log2_input_size, 1))
else:
h_gen = h_train = None
self.generator_outputs = []
for n_features, conv_size, n_convs, log2_size in zip(num_features, filter_size, convs_per_cell, range(log2_input_size, log2_output_size)):
size = 2 ** log2_size
with tf.variable_scope('level_%d' % size) as level_scope:
y_train = training_scales[log2_size + 1]
x_train = training_scales[log2_size]
x_train, h_train = ops.sharpen_cell(x_train, h_train, 2, n_features, conv_size, n_convs, conv_activation, 'upsampler')
self.cost += tf.reduce_mean((x_train - y_train) ** 2)
level_scope.reuse_variables()
x_gen, h_gen = ops.sharpen_cell(x_gen, h_gen, 2, n_features, conv_size, n_convs, conv_activation, 'upsampler')
self.generator_outputs.append(tf.clip_by_value(x_gen, -1, 1))
with tf.variable_scope('training'):
opt = tf.train.AdamOptimizer(self.learning_rate)
grads = opt.compute_gradients(self.cost)
grads = [(tf.clip_by_value(g, -1.0, 1.0), v) for g, v in grads]
self.train_step = opt.apply_gradients(grads)
self.variables = tf.get_collection(tf.GraphKeys.VARIABLES, self.scope_name)
self.init_vars = tf.initialize_variables(self.variables)
self.saver = tf.train.Saver(self.variables)
def save(self, fn):
self.saver.save(self.session, fn)
def restore(self, fn):
self.saver.restore(self.session, fn)
def initialize(self):
self.session.run(self.init_vars)
def train(self, training_images, validation_images = [], learning_rate = 1e-3, batch_size = 32):
with ThreadPoolExecutor(max(os.cpu_count(), batch_size)) as exc:
def _loadImage(fn):
img = cv2.imread(fn, cv2.IMREAD_COLOR)
img = cv2.resize(img, (2 ** self.log2_output_size, 2 ** self.log2_output_size))
return np.float32(img / 128.0 - 1.0)
def _loadBatch(b):
if self.num_attributes:
imgs, attrs = zip(*b)
else:
imgs = b
attrs = None
imgs = list(exc.map(_loadImage, imgs))
return imgs, attrs
total_cost = 0
batches = list(_batch(training_images, batch_size, False))
loader = exc.submit(_loadBatch, batches[0])
for i in range(len(batches)):
imgs, attrs = loader.result()
if i < len(batches) - 1:
loader = exc.submit(_loadBatch, batches[i + 1])
feed_dict = {self.training_images: imgs, self.learning_rate: learning_rate}
if self.num_attributes:
feed_dict.update({self.image_attributes: attrs})
total_cost += self.session.run((self.cost, self.train_step), feed_dict)[0]
print('Training Batch(%d/%d) Cost(%e)' % (i + 1, len(batches), total_cost / (i + 1)), end = '\r')
print()
return total_cost / (i + 1)
def generate_random(self):
img = np.clip(np.random.randn(1, 2 ** self.log2_input_size, 2 ** self.log2_input_size, 3), -1, 1)
if self.num_attributes:
attrs = np.random.choice((1.0, -1.0), size = (1, self.num_attributes))
feed = {self.seed_images: img, self.image_attributes: attrs}
else:
feed = {self.seed_images: img}
y = self.session.run(self.generator_outputs, feed)
return [img] + y
def generate_from(self, seed_image):
if self.num_attributes:
img, attrs = seed_image
else:
img = seed_image
img = cv2.imread(img, cv2.IMREAD_COLOR)
img = cv2.resize(img, (2 ** self.log2_input_size, 2 ** self.log2_input_size))
img = np.expand_dims(np.float32(img / 128.0 - 1.0), 0)
if self.num_attributes:
feed = {self.seed_images: img, self.image_attributes: [attrs]}
else:
feed = {self.seed_images: img}
y = self.session.run(self.generator_outputs, feed)
return [img] + y
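A rough usage sketch (the constructor values below are placeholders, and ops.sharpen_cell, _batch, etc. come from my own code base, so treat this only as the intended call pattern):
import tensorflow as tf

with tf.Session() as sess:
    generator = PyramidGenerator(session=sess,
                                 log2_input_size=3,    # 8x8 seed images
                                 log2_output_size=6,   # 64x64 outputs
                                 num_features=64,
                                 convs_per_cell=2,
                                 filter_size=3,
                                 conv_activation=tf.nn.relu,
                                 num_attributes=0)
    generator.initialize()
    # cost = generator.train(list_of_image_paths, learning_rate=1e-3, batch_size=16)
    # pyramid = generator.generate_random()
    # generator.save('pyrgen.ckpt')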

Why does this method throw an error in terms of number of arguments?

I have the following piece of code, and when I run the __theano_build__() method it throws an error saying
File "rnn_theano.py", line 28, in __init__
self.__theano_build__()
File "rnn_theano.py", line 45, in __theano_build__
non_sequences=[U, V, W1, W12, W2],
File "/usr/local/lib/python2.7/dist-packages/theano/scan_module/scan.py", line 745, in scan
condition, outputs, updates = scan_utils.get_updates_and_outputs(fn(*args))
TypeError: forward_prop_step() takes exactly 8 arguments (7 given)
The following is the code in Theano. It is basically a recurrent neural network with two hidden layers.
import numpy as np
import theano as theano
import theano.tensor as T
from utils import *
import operator
class RNNTheano:
def __init__(self, word_dim, hidden_dim=100, bptt_truncate=4):
# Assign instance variables
self.word_dim = word_dim
self.hidden_dim = hidden_dim
self.bptt_truncate = bptt_truncate
# Randomly initialize the network parameters
U = np.random.uniform(-np.sqrt(1./word_dim), np.sqrt(1./word_dim), (hidden_dim, word_dim))
V = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (word_dim, hidden_dim))
W1 = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (hidden_dim, hidden_dim))
W12 = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (hidden_dim, hidden_dim))
W2 = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (hidden_dim, hidden_dim))
# Theano: Created shared variables
self.U = theano.shared(name='U', value=U.astype(theano.config.floatX))
self.V = theano.shared(name='V', value=V.astype(theano.config.floatX))
self.W1 = theano.shared(name='W1', value=W1.astype(theano.config.floatX))
self.W12 = theano.shared(name='W12', value=W12.astype(theano.config.floatX))
self.W2 = theano.shared(name='W2', value=W2.astype(theano.config.floatX))
# We store the Theano graph here
self.theano = {}
self.__theano_build__()
def forward_prop_step(self, x_t, s_t1_prev, s_t2_prev, U, V, W1, W12, W2):
s_t1 = T.tanh(U[:,x_t] + W1.dot(s_t1_prev))
s_t2 = T.tanh(W12.dot(s_t1) + W2.dot(s_t2_prev))
o_t = T.nnet.softmax(V.dot(s_t2))
return [o_t[0], s_t1, s_t2]
def __theano_build__(self):
U, V, W1, W12, W2 = self.U, self.V, self.W1, self.W12, self.W2
x = T.ivector('x')
y = T.ivector('y')
[o,s1,s2], updates = theano.scan(
self.forward_prop_step,
sequences=x,
outputs_info=[None, dict(initial=T.zeros(self.hidden_dim)), dict(initial=T.zeros(self.hidden_dim))],
non_sequences=[U, V, W1, W12, W2],
truncate_gradient=self.bptt_truncate,
strict=False)
prediction = T.argmax(o, axis=1)
o_error = T.sum(T.nnet.categorical_crossentropy(o, y))
# Gradients
dU = T.grad(o_error, U)
dV = T.grad(o_error, V)
dW1 = T.grad(o_error, W1)
dW12 = T.grad(o_error, W12)
dW2 = T.grad(o_error, W2)
# Assign functions
self.forward_propagation = theano.function([x], o)
self.predict = theano.function([x], prediction)
self.ce_error = theano.function([x, y], o_error)
self.bptt = theano.function([x, y], [dU, dV, dW1, dW12, dW2])
# SGD
learning_rate = T.scalar('learning_rate')
self.sgd_step = theano.function([x,y,learning_rate], [],
updates=[(self.U, self.U - learning_rate * dU),
(self.V, self.V - learning_rate * dV),
(self.W1, self.W1 - learning_rate * dW1),
(self.W12, self.W12 - learning_rate * dW12),
(self.W2, self.W2 - learning_rate * dW2)])
def calculate_total_loss(self, X, Y):
return np.sum([self.ce_error(x,y) for x,y in zip(X,Y)])
def calculate_loss(self, X, Y):
# Divide calculate_loss by the number of words
num_words = np.sum([len(y) for y in Y])
return self.calculate_total_loss(X,Y)/float(num_words)
Try changing
return [o_t[0], s_t1, s_t2]
to
return o_t[0], s_t1, s_t2
I think the former causes the method to return something that Theano coerces into a single tensor, while the latter explicitly returns three objects, as indicated in outputs_info.
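For reference, scan hands the step function its arguments in a fixed order (sequence slices, then one previous value per entry of outputs_info, then non_sequences), so the argument count has to match exactly. A minimal sketch of that mapping, unrelated to the RNN above:
import numpy as np
import theano
import theano.tensor as T

W = theano.shared(np.eye(3, dtype=theano.config.floatX), name='W')
x = T.ivector('x')
s0 = T.zeros((3,), dtype=theano.config.floatX)

# 1 sequence + 1 outputs_info entry + 1 non_sequence => the step function takes 3 arguments.
def step(x_t, s_prev, W):
    return s_prev + W[x_t]

states, updates = theano.scan(step,
                              sequences=x,
                              outputs_info=[s0],
                              non_sequences=[W])
f = theano.function([x], states)
print(f(np.array([0, 1, 2], dtype='int32')))  # cumulative rows of the identity matrix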
