I am a Python novice working on an optimization problem: I want to compare two optimization algorithms, RMSprop and Adam, on the Beale function. I downloaded the Adam code online and added an RMSprop class to it, but the animated figure shows that the particle paths of the two algorithms are, surprisingly, identical (the path flashes). I am sure they should be different, and even drastic changes to the RMSprop class do not change the result. I am not sure which step is wrong: the animation step, or the step where I call the classes?
import sys
import matplotlib.pyplot as plt
import autograd.numpy as np
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.colors import LogNorm
from matplotlib import animation
from autograd import elementwise_grad,value_and_grad,grad
from scipy.optimize import minimize
from collections import defaultdict
from itertools import zip_longest
from functools import partial
f = lambda x, y: (1.5 - x + x*y)**2 + (2.25 - x + x*y**2)**2 + (2.625 - x + x*y**3)**2
xmin, xmax, xstep = -4.5, 4.5, 0.2
ymin, ymax, ystep = -4.5, 4.5, 0.2
x, y = np.meshgrid(np.arange(xmin, xmax + xstep, xstep), np.arange(ymin, ymax + ystep, ystep))
z = f(x,y)
minima = np.array([3.0,0.5])
minima_ = minima.reshape(-1,1)
def target_func(weights):
x,y = weights
return f(x,y)
class Adam:
def __init__(self, loss, weights, lr=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
self.loss = loss
self.theta = weights
self.lr = lr # learning rate
self.beta1 = beta1
self.beta2 = beta2
self.epsilon = epsilon
self.get_gradient = grad(loss)
self.m = 0
self.v = 0
self.t = 0
def minimize_trace(self, path=[]):
self.t +=1
g = self.get_gradient(self.theta)
self.m = self.beta1 * self.m + (1 - self.beta1) * g
self.v = self.beta2 * self.v + (1 - self.beta2) * (g * g)
self.m_hat = self.m / (1 - self.beta1 ** self.t)
self.v_hat = self.v / (1 - self.beta2 ** self.t)
self.theta -= self.lr * self.m_hat / (self.v_hat ** 0.5 + self.epsilon)
path.append(np.copy(self.theta))
class RMSprop:
    def __init__(self, loss, weights, lr=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
self.loss = loss
self.theta = weights
self.lr = lr # learning rate
self.beta1 = beta1
self.beta2 = beta2
self.epsilon = epsilon
self.get_gradient = grad(loss)
self.m = 0
self.v = 0
self.t = 0
def minimize_trace(self, path=[]):
self.t +=1
g = self.get_gradient(self.theta)
## self.m = self.beta1 * self.m + (1 - self.beta1) * g
self.v = self.beta2 * self.v + (1 - self.beta2) * (g * g)
self.m_hat = self.m / (1 - self.beta1 ** self.t)
self.v_hat = self.v / (1 - self.beta2 ** self.t)
        self.theta -= self.lr * self.m_hat / (self.v_hat ** 0.5 + self.epsilon)
path.append(np.copy(self.theta))
EPOCHS = 3000
SHOW_STEPS = 100
PRECISION = 1e-8
weights = np.array([1,1.5])
path_trace_adam = [np.copy(weights)]
path_trace_rmsprop = [np.copy(weights)]
adam = Adam(target_func, weights, lr = 0.01)
rmsprop = RMSprop(target_func, weights, lr = 0.01)
for i in range(EPOCHS):
adam.minimize_trace(path_trace_adam)
rmsprop.minimize_trace(path_trace_rmsprop)
print("\n final weights:{} loss:{}".format(adam.theta, adam.loss(adam.theta)))
print("\n final weights:{} loss:{}".format(rmsprop.theta, rmsprop.loss(rmsprop.theta)))
path_trace_adam = np.array(path_trace_adam).T
path_trace_rmsprop = np.array(path_trace_rmsprop).T
shape_adam = path_trace_adam.shape
shape_rmsprop = path_trace_rmsprop.shape
if shape_adam[1] > SHOW_STEPS:
show_step_adam = shape_adam[1] // SHOW_STEPS
path_trace_adam = np.array(path_trace_adam[:,::show_step_adam])
if shape_rmsprop[1] > SHOW_STEPS:
show_step_rmsprop = shape_rmsprop[1] // SHOW_STEPS
path_trace_rmsprop = np.array(path_trace_rmsprop[:,::show_step_rmsprop])
################## Visualize Convergence Trace
fig, ax = plt.subplots(figsize=(10,10))
ax.contour(x, y, z, levels=np.logspace(0, 5, 35), norm=LogNorm(), cmap=plt.cm.jet)
ax.plot(*minima_, 'r*', markersize=12)
line_adam, = ax.plot([], [], 'r', label='Adam Optimizer', lw=2)
line_rmsprop, = ax.plot([], [], 'k', label='RMSprop Optimizer', lw=2)
point_adam, = ax.plot([], [], 'ro')
point_rmsprop, = ax.plot([], [], 'ko')
ax.set_xlabel('$x$')
ax.set_ylabel('$y$')
ax.set_xlim((xmin, xmax))
ax.set_ylim((ymin, ymax))
ax.legend(loc='upper left')
################### animation
def init_adam():
line_adam.set_data([], [])
point_adam.set_data([], [])
return line_adam, point_adam
def init_rmsprop():
line_rmsprop.set_data([], [])
point_rmsprop.set_data([], [])
return line_rmsprop, point_rmsprop
def animate_adam(i):
line_adam.set_data(*path_trace_adam[::,:i])
point_adam.set_data(*path_trace_adam[::,i-1:i])
return line_adam, point_adam
def animate_rmsprop(i):
line_rmsprop.set_data(*path_trace_rmsprop[::,:i])
point_rmsprop.set_data(*path_trace_rmsprop[::,i-1:i])
return line_rmsprop, point_rmsprop
anim_adam = animation.FuncAnimation(fig, animate_adam, init_func=init_adam,
frames=path_trace_adam.shape[1], interval=60,
repeat_delay=None, repeat=True, blit=True)
anim_rmsprop = animation.FuncAnimation(fig, animate_rmsprop, init_func=init_rmsprop,
frames=path_trace_rmsprop.shape[1], interval=60,
repeat_delay=None, repeat=True, blit=True)
plt.show()
The bug is here:
weights = np.array([1,1.5])
# ... truncated for brevity
adam = Adam(target_func, weights, lr = 0.01)
rmsprop = RMSprop(target_func, weights, lr = 0.01)
Since the reference to weights is shared between the two optimizers, every call to Adam.minimize_trace and RMSprop.minimize_trace modifies the same array. Because both paths are derived from that same array, the two paths end up identical.
If you copy the array before passing it to the two constructors, it should work as expected.
adam = Adam(target_func, np.copy(weights), lr = 0.01)
rmsprop = RMSprop(target_func, np.copy(weights), lr = 0.01)
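Note, independently of the shared-array issue: with the first-moment update commented out, self.m stays 0, so m_hat is always 0 and the RMSprop class never moves the weights on its own. Below is a minimal sketch of a more conventional RMSprop step, reusing the attribute names and imports (autograd's grad, numpy) from the script above; this is an illustrative rewrite, not the original author's code:
class RMSprop:
    def __init__(self, loss, weights, lr=0.001, beta2=0.999, epsilon=1e-8):
        self.loss = loss
        self.theta = weights
        self.lr = lr
        self.beta2 = beta2            # decay rate of the squared-gradient average
        self.epsilon = epsilon
        self.get_gradient = grad(loss)
        self.v = 0                    # running average of squared gradients
    def minimize_trace(self, path=[]):
        g = self.get_gradient(self.theta)
        # exponential moving average of the squared gradient
        self.v = self.beta2 * self.v + (1 - self.beta2) * (g * g)
        # scale the raw gradient (not a zeroed first moment) by its root
        self.theta -= self.lr * g / (self.v ** 0.5 + self.epsilon)
        path.append(np.copy(self.theta))
With this version the two optimizers trace visibly different paths once each receives its own np.copy(weights).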
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import datasets
import matplotlib.pyplot as plt
class LinearRegression:
def __init__(self, alpha = 0.00001, n_iters=1000):
self.alpha = alpha
self.n_iters = n_iters
self.theta = None
self.b = None
def fit(self, X, y):
m, n = X.shape
self.theta = np.zeros(n)
self.b = 0
for _ in range(self.n_iters):
y_hat = np.dot(X, self.theta) + self.b
dw = (1/m) * np.dot(X.T, (y_hat-y))
db = (1/m) * np.sum(y_hat-y)
self.theta = self.theta - self.alpha * dw
self.b = self.b - self.alpha * db
def predict(self, X):
y_hat = np.dot(X, self.theta) + self.b
return y_hat
df = pd.read_csv(r".\archive\laptop_price.csv",encoding="latin1")
X = np.matrix(df['Inches'])
y = np.matrix(df['Price_euros'])
reg = LinearRegression(alpha=0.01)
reg.fit(X, y)
y_pred_line = reg.predict(X)
fig = plt.figure(figsize=(8,6))
plt.scatter(X[:, 0], y, color = "b", marker = "o", s = 30)
plt.plot(X, y_pred_line, color='black', linewidth=1, label='Prediction')
plt.show()
This code overflows with the CSV file but not with test data generated by scikit-learn. Why?
I have tried changing my learning rate, but that didn't help. When I generate the training data with sklearn, it gives me a pretty good line.
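A likely cause (an assumption, since the CSV contents are not shown here) is scale: sklearn's synthetic regression data is roughly standardized, while Inches and Price_euros are on very different scales, so plain gradient descent with alpha=0.01 diverges and overflows. Also note that np.matrix(df['Inches']) produces a 1×m row matrix rather than an m×1 column. A minimal sketch of reshaping and standardizing before fitting, using the LinearRegression class above:
X = df['Inches'].to_numpy().reshape(-1, 1)     # (m, 1) column instead of an np.matrix row
y = df['Price_euros'].to_numpy()
# standardize both sides so the gradient steps stay bounded
X_scaled = (X - X.mean(axis=0)) / X.std(axis=0)
y_scaled = (y - y.mean()) / y.std()
reg = LinearRegression(alpha=0.01)
reg.fit(X_scaled, y_scaled)
y_pred_line = reg.predict(X_scaled) * y.std() + y.mean()   # undo the target scaling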
I am new to Tensorboard.
I am using fairly simple code running an experiment, and this is the output:
I don't remember asking for a hp_metric graph, yet here it is.
What is it and how do I get rid of it?
Full code to reproduce, using Pytorch Lightning (not that I think anyone should have to reproduce this to answer):
Please note that the ONLY line referencing TensorBoard is
self.logger.experiment.add_scalars("losses", {"train_loss": loss}, global_step=self.current_epoch)
import torch
from torch import nn
import torch.nn.functional as F
from typing import List, Optional
from pytorch_lightning.core.lightning import LightningModule
from Testing.Research.toy_datasets.ClustersDataset import ClustersDataset
from torch.utils.data import DataLoader
from Testing.Research.config.ConfigProvider import ConfigProvider
from pytorch_lightning import Trainer, seed_everything
from torch import optim
import os
from pytorch_lightning.loggers import TensorBoardLogger
class VAEFC(LightningModule):
# see https://towardsdatascience.com/understanding-variational-autoencoders-vaes-f70510919f73
# for possible upgrades, see https://arxiv.org/pdf/1602.02282.pdf
# https://stats.stackexchange.com/questions/332179/how-to-weight-kld-loss-vs-reconstruction-loss-in-variational-auto-encoder
def __init__(self, encoder_layer_sizes: List, decoder_layer_sizes: List, config):
super(VAEFC, self).__init__()
self._config = config
self.logger: Optional[TensorBoardLogger] = None
assert len(encoder_layer_sizes) >= 3, "must have at least 3 layers (2 hidden)"
# encoder layers
self._encoder_layers = nn.ModuleList()
for i in range(1, len(encoder_layer_sizes) - 1):
enc_layer = nn.Linear(encoder_layer_sizes[i - 1], encoder_layer_sizes[i])
self._encoder_layers.append(enc_layer)
# predict mean and covariance vectors
self._mean_layer = nn.Linear(encoder_layer_sizes[
len(encoder_layer_sizes) - 2],
encoder_layer_sizes[len(encoder_layer_sizes) - 1])
self._logvar_layer = nn.Linear(encoder_layer_sizes[
len(encoder_layer_sizes) - 2],
encoder_layer_sizes[len(encoder_layer_sizes) - 1])
# decoder layers
self._decoder_layers = nn.ModuleList()
for i in range(1, len(decoder_layer_sizes)):
dec_layer = nn.Linear(decoder_layer_sizes[i - 1], decoder_layer_sizes[i])
self._decoder_layers.append(dec_layer)
self._recon_function = nn.MSELoss(reduction='mean')
def _encode(self, x):
for i in range(len(self._encoder_layers)):
layer = self._encoder_layers[i]
x = F.relu(layer(x))
mean_output = self._mean_layer(x)
logvar_output = self._logvar_layer(x)
return mean_output, logvar_output
def _reparametrize(self, mu, logvar):
if not self.training:
return mu
std = logvar.mul(0.5).exp_()
if std.is_cuda:
eps = torch.cuda.FloatTensor(std.size()).normal_()
else:
eps = torch.FloatTensor(std.size()).normal_()
reparameterized = eps.mul(std).add_(mu)
return reparameterized
def _decode(self, z):
for i in range(len(self._decoder_layers) - 1):
layer = self._decoder_layers[i]
z = F.relu((layer(z)))
decoded = self._decoder_layers[len(self._decoder_layers) - 1](z)
# decoded = F.sigmoid(self._decoder_layers[len(self._decoder_layers)-1](z))
return decoded
def _loss_function(self, recon_x, x, mu, logvar, reconstruction_function):
"""
recon_x: generating images
x: origin images
mu: latent mean
logvar: latent log variance
"""
binary_cross_entropy = reconstruction_function(recon_x, x) # mse loss TODO see if mse or cross entropy
# loss = 0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2)
kld_element = mu.pow(2).add_(logvar.exp()).mul_(-1).add_(1).add_(logvar)
kld = torch.sum(kld_element).mul_(-0.5)
# KL divergence Kullback–Leibler divergence, regularization term for VAE
        # It is a measure of how different two probability distributions are from each other.
# We are trying to force the distributions closer while keeping the reconstruction loss low.
# see https://towardsdatascience.com/understanding-variational-autoencoders-vaes-f70510919f73
# read on weighting the regularization term here:
# https://stats.stackexchange.com/questions/332179/how-to-weight-kld-loss-vs-reconstruction-loss-in-variational
# -auto-encoder
return binary_cross_entropy + kld * self._config.regularization_factor
def training_step(self, batch, batch_index):
orig_batch, noisy_batch, _ = batch
noisy_batch = noisy_batch.view(noisy_batch.size(0), -1)
recon_batch, mu, logvar = self.forward(noisy_batch)
loss = self._loss_function(
recon_batch,
orig_batch, mu, logvar,
reconstruction_function=self._recon_function
)
# self.logger.experiment.add_scalars("losses", {"train_loss": loss})
self.logger.experiment.add_scalars("losses", {"train_loss": loss}, global_step=self.current_epoch)
# self.logger.experiment.add_scalar("train_loss", loss, self.current_epoch)
self.logger.experiment.flush()
return loss
def train_dataloader(self):
default_dataset, train_dataset, test_dataset = ClustersDataset.clusters_dataset_by_config()
train_dataloader = DataLoader(train_dataset, batch_size=self._config.batch_size, shuffle=True)
return train_dataloader
def test_dataloader(self):
default_dataset, train_dataset, test_dataset = ClustersDataset.clusters_dataset_by_config()
test_dataloader = DataLoader(test_dataset, batch_size=self._config.batch_size, shuffle=True)
return test_dataloader
def configure_optimizers(self):
optimizer = optim.Adam(model.parameters(), lr=self._config.learning_rate)
return optimizer
def forward(self, x):
mu, logvar = self._encode(x)
z = self._reparametrize(mu, logvar)
decoded = self._decode(z)
return decoded, mu, logvar
if __name__ == "__main__":
config = ConfigProvider.get_config()
seed_everything(config.random_seed)
latent_dim = config.latent_dim
enc_layer_sizes = config.enc_layer_sizes + [latent_dim]
dec_layer_sizes = [latent_dim] + config.dec_layer_sizes
model = VAEFC(config=config, encoder_layer_sizes=enc_layer_sizes, decoder_layer_sizes=dec_layer_sizes)
logger = TensorBoardLogger(save_dir='tb_logs', name='VAEFC')
logger.hparams = config # TODO only put here relevant stuff
# trainer = Trainer(gpus=1)
trainer = Trainer(deterministic=config.is_deterministic,
#auto_lr_find=config.auto_lr_find,
#log_gpu_memory='all',
# min_epochs=99999,
max_epochs=config.num_epochs,
default_root_dir=os.getcwd(),
logger=logger
)
# trainer.tune(model)
trainer.fit(model)
print("done training vae with lightning")
ClustersDataset.py
from torch.utils.data import Dataset
import matplotlib.pyplot as plt
import torch
import numpy as np
from Testing.Research.config.ConfigProvider import ConfigProvider
class ClustersDataset(Dataset):
__default_dataset = None
__default_dataset_train = None
__default_dataset_test = None
def __init__(self, cluster_size: int, noise_factor: float = 0, transform=None, n_clusters=2, centers_radius=4.0):
super(ClustersDataset, self).__init__()
self._cluster_size = cluster_size
self._noise_factor = noise_factor
self._n_clusters = n_clusters
self._centers_radius = centers_radius
# self._transform = transform
self._size = self._cluster_size * self._n_clusters
self._create_data_clusters()
self._combine_clusters_to_array()
self._normalize_data()
self._add_noise()
# self._plot()
pass
    @staticmethod
def clusters_dataset_by_config():
if ClustersDataset.__default_dataset is not None:
return \
ClustersDataset.__default_dataset, \
ClustersDataset.__default_dataset_train, \
ClustersDataset.__default_dataset_test
config = ConfigProvider.get_config()
default_dataset = ClustersDataset(
cluster_size=config.cluster_size,
noise_factor=config.noise_factor,
transform=None,
n_clusters=config.n_clusters,
centers_radius=config.centers_radius
)
train_size = int(config.train_size * len(default_dataset))
test_size = len(default_dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(default_dataset, [train_size, test_size])
ClustersDataset.__default_dataset = default_dataset
ClustersDataset.__default_dataset_train = train_dataset
ClustersDataset.__default_dataset_test = test_dataset
return default_dataset, train_dataset, test_dataset
def _create_data_clusters(self):
self._clusters = [torch.zeros((self._cluster_size, 2)) for _ in range(self._n_clusters)]
centers_radius = self._centers_radius
for i, c in enumerate(self._clusters):
r, x, y = 3.0, centers_radius * np.cos(i * np.pi * 2 / self._n_clusters), centers_radius * np.sin(
i * np.pi * 2 / self._n_clusters)
cluster_length = 1.1
cluster_start = i * 2 * np.pi / self._n_clusters
cluster_end = cluster_length * (i + 1) * 2 * np.pi / self._n_clusters
cluster_inds = torch.linspace(start=cluster_start, end=cluster_end, steps=self._cluster_size,
dtype=torch.float)
c[:, 0] = r * torch.sin(cluster_inds) + y
c[:, 1] = r * torch.cos(cluster_inds) + x
def _plot(self):
plt.figure()
plt.scatter(self._noisy_values[:, 0], self._noisy_values[:, 1], s=1, color='b', label="noisy_values")
plt.scatter(self._values[:, 0], self._values[:, 1], s=1, color='r', label="values")
plt.legend(loc="upper left")
plt.show()
def _combine_clusters_to_array(self):
size = self._size
self._values = torch.zeros(size, 2)
self._labels = torch.zeros(size, dtype=torch.long)
for i, c in enumerate(self._clusters):
self._values[i * self._cluster_size: (i + 1) * self._cluster_size, :] = self._clusters[i]
self._labels[i * self._cluster_size: (i + 1) * self._cluster_size] = i
def _add_noise(self):
size = self._size
mean = torch.zeros(size, 2)
std = torch.ones(size, 2)
noise = torch.normal(mean, std)
self._noisy_values = torch.zeros(size, 2)
self._noisy_values[:] = self._values
self._noisy_values = self._noisy_values + noise * self._noise_factor
def _normalize_data(self):
values_min, values_max = torch.min(self._values), torch.max(self._values)
self._values = (self._values - values_min) / (values_max - values_min)
self._values = self._values * 2 - 1
def __len__(self):
return self._size # number of samples in the dataset
def __getitem__(self, index):
item = self._values[index, :]
noisy_item = self._noisy_values[index, :]
# if self._transform is not None:
# noisy_item = self._transform(item)
return item, noisy_item, self._labels[index]
    @property
def values(self):
return self._values
    @property
def noisy_values(self):
return self._noisy_values
Config values (ConfigProvider just returns those as an object)
num_epochs: 15
batch_size: 128
learning_rate: 0.0001
auto_lr_find: False
noise_factor: 0.1
regularization_factor: 0.0
cluster_size: 5000
n_clusters: 5
centers_radius: 4.0
train_size: 0.8
latent_dim: 8
enc_layer_sizes: [2, 200, 200, 200]
dec_layer_sizes: [200, 200, 200, 2]
retrain_vae: False
random_seed: 11
is_deterministic: True
It's the default setting of the TensorBoard logger in PyTorch Lightning. You can set default_hp_metric to False to get rid of this metric.
TensorBoardLogger(save_dir='tb_logs', name='VAEFC', default_hp_metric=False)
The hp_metric graph helps you track model performance across different hyperparameter settings. You can check it under the HPARAMS tab in TensorBoard.
hp_metric (hyperparameter metric) is there to help you tune your hyperparameters.
You can set this metric to whatever you like, as documented in the official PyTorch docs.
Then you can look through your hyperparameters and see which settings come out best according to whichever metric you chose.
Alternatively, if you don't want it, you can disable it as suggested in @joe32140's answer:
You can set default_hp_metric to false to get rid of this metric.
TensorBoardLogger(save_dir='tb_logs', name='VAEFC', default_hp_metric=False)
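If you would rather keep the graph and make it useful, you can overwrite the default -1 placeholder by logging a scalar under the name hp_metric yourself. A minimal sketch inside the VAEFC module above, assuming a Lightning version (>=1.0) that provides self.log and training_epoch_end; using the average training loss as the metric is just an illustrative choice:
    def training_epoch_end(self, outputs):
        # outputs is a list of what training_step returned (tensors or {"loss": ...} dicts)
        losses = [o["loss"] if isinstance(o, dict) else o for o in outputs]
        avg_loss = torch.stack(losses).mean()
        # "hp_metric" is the scalar the HPARAMS tab associates with this run
        self.log("hp_metric", avg_loss)
TensorBoard will then show this value next to the run's hyperparameters instead of the placeholder.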
After following this blog, I ran into an error while trying to change the connected neurons from [13,8,1] to [13,20,20,1].
I believe I changed everything that needed changing, and I even double-checked the matrix multiplication by hand. Please help, as I am a beginner.
Below are some of the changes I made to the original code (which you can find in the blog linked above).
class NeuralNet():
def __init__(self, layers=[13,20,20,1], learning_rate=0.001, iterations=100):
...
def init_weights(self):
np.random.seed(1)
self.params["W0"] = np.random.randn(self.layers[0], self.layers[1])
self.params['b0'] =np.random.randn(self.layers[1],)
self.params['W1'] = np.random.randn(self.layers[1],self.layers[2])
self.params['b1'] = np.random.randn(self.layers[2],)
self.params['W2'] = np.random.randn(self.layers[2],self.layers[3])
self.params['b2'] = np.random.randn(self.layers[3])
...
def forward_propagation(self):
Z0 = self.X.dot(self.params['W0']) + self.params['b0']
AL_2 = self.sigmoid(Z0)
Z1 = AL_2.dot(self.params['W1']) + self.params['b1']
AL_1 = self.sigmoid(Z1)
Z2 = AL_1.dot(self.params['W2']) + self.params['b2']
yhat = self.sigmoid(Z2)
loss = self.entropy_loss(self.y,yhat)
# save calculated parameters
self.params['Z0'] = Z0
self.params['Z1'] = Z1
self.params['Z2'] = Z2
self.params['AL_2'] = AL_2
self.params['AL_1'] = AL_1
return yhat,loss
...
def back_propagation(self,yhat):
dl_wrt_yhat = -(np.divide(self.y,yhat) - np.divide((1 - self.y),(1-yhat)))
dl_wrt_sig = yhat * (1-yhat)
dl_wrt_z2 = dl_wrt_yhat * dl_wrt_sig
dl_wrt_AL_1 = dl_wrt_z2.dot(self.params['W2'].T)
dl_wrt_w2 = self.params['AL_1'].T.dot(dl_wrt_z2)
dl_wrt_b2 = np.sum(dl_wrt_z2, axis=0)
dl_sig2 = self.params['Z1']*(1-self.params['Z1'])
dl_wrt_z1 = dl_wrt_AL_1 * dl_sig2
dl_wrt_w1 = self.params['AL_2'].T.dot(dl_wrt_z1)
dl_wrt_b1 = np.sum(dl_wrt_z1, axis=0)
dl_wrt_AL_2 = dl_wrt_z1.dot(self.params['W1'].T)
dl_sig3 = self.params['Z0']*(1-self.params['Z0'])
dl_wrt_z0 = dl_wrt_AL_2*dl_sig3
dl_wrt_b0 = np.sum(dl_wrt_z0,axis = 0)
dl_wrt_w0 = self.X.T.dot(dl_wrt_z1)
#update the weights and bias
self.params['W0'] = self.params['W0'] - self.learning_rate * dl_wrt_w0
self.params['W1'] = self.params['W1'] - self.learning_rate * dl_wrt_w1
self.params['W2'] = self.params['W2'] - self.learning_rate * dl_wrt_w2
self.params['b0'] = self.params['b0'] - self.learning_rate * dl_wrt_b0
self.params['b1'] = self.params['b1'] - self.learning_rate * dl_wrt_b1
self.params['b2'] = self.params['b2'] - self.learning_rate * dl_wrt_w2
...
def predict(self, X):
Z0 = X.dot(self.params['W0']) + self.params['b0']
AL_2 = self.sigmoid(Z0)
Z1 = AL_2.dot(self.params['W1']) + self.params['b1']
AL_1 = self.sigmoid(Z1)
Z2 = AL_1.dot(self.params['W2']) + self.params['b2']
pred = self.sigmoid(Z2)
return np.round(pred)
The exact error message is:
<ipython-input-68-d369d162f0f8> in forward_propagation(self)
63 AL_1 = self.sigmoid(Z1)
64
---> 65 Z2 = AL_1.dot(self.params['W2']) + self.params['b2']
66 yhat = self.sigmoid(Z2)
67
ValueError: operands could not be broadcast together with shapes (216,1) (20,1)
Note that I also changed ReLU to sigmoid.
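Since the error is a shape mismatch between (216, 1) and (20, 1), one quick way to narrow it down is to print every parameter's shape after each backprop step and watch for one that silently changes shape between iterations (for example a bias that becomes 2-D). This is only a debugging sketch; debug_param_shapes and the loop variables are hypothetical names built around the params dict above:
def debug_param_shapes(net, label=""):
    # print the shape of each learned parameter so an unexpectedly
    # reshaped weight or bias stands out immediately
    for name in ["W0", "b0", "W1", "b1", "W2", "b2"]:
        print(label, name, np.shape(net.params[name]))

# example use inside the training loop:
# yhat, loss = net.forward_propagation()
# net.back_propagation(yhat)
# debug_param_shapes(net, label="iter %d" % i)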
I am trying to modify the code provided by neural-networks-and-deep-learning on GitHub for network3.py. This code constructs a convolutional neural network and trains it on the MNIST data set.
What I am trying to do is add back-propagation and sparsity terms to this code. The part of the code that I added is outlined between the two lines of #. I get a TypeError: make node requires 4D tensor of kernels.
I understand that the size should be 4D, i.e. (1, 1, 28, 28), but I am not sure where and how to make this modification.
class ConvPoolLayer(object):
def __init__(self, filter_shape, image_shape, poolsize=(2, 2),
activation_fn=sigmoid):
self.filter_shape = filter_shape
self.image_shape = image_shape
self.poolsize = poolsize
self.activation_fn=activation_fn
# initialize weights and biases
n_out = (filter_shape[0]*np.prod(filter_shape[2:])/np.prod(poolsize))
self.w = theano.shared(
np.asarray(
np.random.normal(loc=0, scale=np.sqrt(1.0/n_out), size=filter_shape),
dtype=theano.config.floatX),
borrow=True)
#print self.w.eval()
self.b = theano.shared(
np.asarray(
np.random.normal(loc=0, scale=1.0, size=(filter_shape[0],)),
dtype=theano.config.floatX),
borrow=True)
#print filter_shape[0]
#print self.b.eval()
self.params = [self.w, self.b]
def sigmoid(self, x):
return (1 / (1 + T.exp(-x)))
def set_inpt(self, inpt, inpt_dropout, mini_batch_size):
self.inpt = inpt.reshape(self.image_shape)
############################################################
learning_rate = 0.0001
learning_rate_s = 0.0001
gamma = 1
alpha = 1 - learning_rate
v1 = T.dot(self.w, self.inpt) + self.b
y1 = self.sigmoid(v1)
diff1 = self.inpt - T.dot(T.transpose(self.w), y1)
d1 = T.dot(self.w, diff1)
d1 = T.dot(d1, (1.0 - T.dot(v1,v1)))
delta_w1_bp = learning_rate * T.dot(d1 , T.transpose(self.inpt))
delta_b1_bp = T.sum(learning_rate * d1, axis=1)
delta_w1_s = learning_rate_s * T.dot(self.sigmoid(y1),T.transpose(self.inpt))
delta_b1_s = T.sum(learning_rate_s * self.sigmoid(y1), axis=1)
total_w1 = gamma * delta_w1_bp + (1 - gamma) * delta_w1_s
total_b1 = gamma * delta_b1_bp + (1 - gamma) * delta_b1_s
self.w = (alpha * self.w) + total_w1
self.b = (alpha * self.b) + total_b1
##################################################################
conv_out = conv.conv2d(
input=self.inpt, filters=self.w, filter_shape=self.filter_shape,
image_shape=self.image_shape)
pooled_out = downsample.max_pool_2d(
input=conv_out, ds=self.poolsize, ignore_border=True)
self.output = self.activation_fn(
pooled_out + self.b.dimshuffle('x', 0, 'x', 'x'))
self.output_dropout = self.output # no dropout in the convolution layers
Does anyone know how to fix this?
The main code I run to call the above script is
import network3
from network3 import Network
from network3 import ConvPoolLayer, FullyConnectedLayer, SoftmaxLayer
training_data, validation_data, test_data = network3.load_data_shared()
mini_batch_size = 10
net = Network([
ConvPoolLayer(image_shape=(mini_batch_size, 1, 28, 28),
filter_shape=(20, 1, 5, 5),
poolsize=(2, 2)),
FullyConnectedLayer(n_in=20*12*12, n_out=100),
SoftmaxLayer(n_in=100, n_out=10)], mini_batch_size)
net.SGD(training_data, 60, mini_batch_size, 0.1,
validation_data, test_data)
I have the following piece of code, and when I run the __theano_build__() method it throws an error:
File "rnn_theano.py", line 28, in __init__
self.__theano_build__()
File "rnn_theano.py", line 45, in __theano_build__
non_sequences=[U, V, W1, W12, W2],
File "/usr/local/lib/python2.7/dist-packages/theano/scan_module/scan.py", line 745, in scan
condition, outputs, updates = scan_utils.get_updates_and_outputs(fn(*args))
TypeError: forward_prop_step() takes exactly 8 arguments (7 given)
The following is the code in Theano. It is basically a recurrent neural network with two hidden layers.
import numpy as np
import theano as theano
import theano.tensor as T
from utils import *
import operator
class RNNTheano:
def __init__(self, word_dim, hidden_dim=100, bptt_truncate=4):
# Assign instance variables
self.word_dim = word_dim
self.hidden_dim = hidden_dim
self.bptt_truncate = bptt_truncate
# Randomly initialize the network parameters
U = np.random.uniform(-np.sqrt(1./word_dim), np.sqrt(1./word_dim), (hidden_dim, word_dim))
V = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (word_dim, hidden_dim))
W1 = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (hidden_dim, hidden_dim))
W12 = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (hidden_dim, hidden_dim))
W2 = np.random.uniform(-np.sqrt(1./hidden_dim), np.sqrt(1./hidden_dim), (hidden_dim, hidden_dim))
# Theano: Created shared variables
self.U = theano.shared(name='U', value=U.astype(theano.config.floatX))
self.V = theano.shared(name='V', value=V.astype(theano.config.floatX))
self.W1 = theano.shared(name='W1', value=W1.astype(theano.config.floatX))
self.W12 = theano.shared(name='W12', value=W12.astype(theano.config.floatX))
self.W2 = theano.shared(name='W2', value=W2.astype(theano.config.floatX))
# We store the Theano graph here
self.theano = {}
self.__theano_build__()
def forward_prop_step(self, x_t, s_t1_prev, s_t2_prev, U, V, W1, W12, W2):
s_t1 = T.tanh(U[:,x_t] + W1.dot(s_t1_prev))
s_t2 = T.tanh(W12.dot(s_t1) + W2.dot(s_t2_prev))
o_t = T.nnet.softmax(V.dot(s_t2))
return [o_t[0], s_t1, s_t2]
def __theano_build__(self):
U, V, W1, W12, W2 = self.U, self.V, self.W1, self.W12, self.W2
x = T.ivector('x')
y = T.ivector('y')
[o,s1,s2], updates = theano.scan(
self.forward_prop_step,
sequences=x,
outputs_info=[None, dict(initial=T.zeros(self.hidden_dim)), dict(initial=T.zeros(self.hidden_dim))],
non_sequences=[U, V, W1, W12, W2],
truncate_gradient=self.bptt_truncate,
strict=False)
prediction = T.argmax(o, axis=1)
o_error = T.sum(T.nnet.categorical_crossentropy(o, y))
# Gradients
dU = T.grad(o_error, U)
dV = T.grad(o_error, V)
dW1 = T.grad(o_error, W1)
dW12 = T.grad(o_error, W12)
dW2 = T.grad(o_error, W2)
# Assign functions
self.forward_propagation = theano.function([x], o)
self.predict = theano.function([x], prediction)
self.ce_error = theano.function([x, y], o_error)
self.bptt = theano.function([x, y], [dU, dV, dW1, dW12, dW2])
# SGD
learning_rate = T.scalar('learning_rate')
self.sgd_step = theano.function([x,y,learning_rate], [],
updates=[(self.U, self.U - learning_rate * dU),
(self.V, self.V - learning_rate * dV),
                     (self.W1, self.W1 - learning_rate * dW1),
(self.W12, self.W12 - learning_rate * dW12),
(self.W2, self.W2 - learning_rate * dW2)])
def calculate_total_loss(self, X, Y):
return np.sum([self.ce_error(x,y) for x,y in zip(X,Y)])
def calculate_loss(self, X, Y):
# Divide calculate_loss by the number of words
num_words = np.sum([len(y) for y in Y])
return self.calculate_total_loss(X,Y)/float(num_words)
Try changing
return [o_t[0], s_t1, s_t2]
to
return o_t[0], s_t1, s_t2
I think the former causes the method to return something that Theano coerces into a single tensor, while the latter explicitly returns three objects, as indicated in outputs_info.