Style loss is always zero

I am trying to use feature reconstruction and style reconstruction losses with my model. For this, I followed the example code on the PyTorch website for “Neural Style Transfer”.
https://pytorch.org/tutorials/advanced/neural_style_tutorial.html
The feature loss is calculated without any problem, but the style loss is always zero, and I could not find the reason since everything looks fine in the implementation. The calculations follow the mathematical definitions of these loss functions. Besides, as you know, the style and feature losses are computed almost identically except for the Gram-matrix step in the style loss, and the feature loss works fine.
Could anyone help me with this situation?
class Feature_and_style_losses():
    def __init__(self):
        self.vgg_model = models.vgg19(pretrained=True).features.cuda().eval()
        self.content_layers = ['conv_16']
        self.style_layers = ['conv_5']

    def calculate_feature_and_style_losses(self, input_, target, feature_coefficient, style_coefficient):
        i = 0
        feature_losses = []
        style_losses = []
        for layer_ in self.vgg_model.children():
            if isinstance(layer_, nn.Conv2d):
                i += 1
                name = "conv_{}".format(i)
                if name in self.content_layers:
                    features_input = self.vgg_model(input_).detach()
                    features_target = self.vgg_model(target).detach()
                    feature_losses.append(self.feature_loss(features_input, features_target))
                if name in self.style_layers:
                    style_input = self.vgg_model(input_).detach()
                    style_target = self.vgg_model(target).detach()
                    style_losses.append(self.style_loss(style_input, style_target))

        feature_loss_value = (torch.mean(torch.from_numpy(np.array(feature_losses, dtype=np.float32)))) * feature_coefficient
        style_loss_value = (torch.mean(torch.from_numpy(np.array(style_losses, dtype=np.float32)))) * style_coefficient
        return feature_loss_value, style_loss_value

    def feature_loss(self, input_, target):
        target = target.detach()
        feature_reconstruction_loss = F.mse_loss(input_, target)
        return feature_reconstruction_loss

    def gram_matrix(self, input_):
        a, b, c, d = input_.size()  # ??? check size
        features = input_.view(a*b, c*d)
        #features_t = features.transpose(1, 2)
        #G = features.bmm(features_t) / (b*c*d)
        #print(features.shape)
        G = torch.mm(features, features.t())
        return G.div(a*b*c*d)
        return G

    def style_loss(self, input_, target):
        G_input = self.gram_matrix(input_)
        G_target = self.gram_matrix(target).detach()
        #style_reconstruction_loss = self.feature_loss(G_input, G_target)
        style_reconstruction_loss = F.mse_loss(G_input, G_target)
        return style_reconstruction_loss


feature_loss_ = Feature_and_style_losses()

...

for e in range(epochs):
    for i, batch in enumerate(dataloader):
        ...
        real_C = Variable(batch["C"].type(Tensor))
        fake_C = independent_decoder(features_all)

        f_loss, s_loss = feature_loss_.calculate_feature_and_style_losses(fake_C, real_C, 1, 10)
        loss_G_3 = loss_GAN_3 + lambda_pixel * (loss_pixel_3_object + loss_pixel_3_scene) * 0.5 + f_loss + s_loss
        loss_G_3.backward(retain_graph=True)
        optimizer_independent_decoder.step()
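A standalone check of the Gram-matrix style loss on two random feature maps (arbitrary example shapes, no VGG involved) would look like the sketch below; if it prints a small but non-zero value, the loss computation itself should be fine:

import torch
import torch.nn.functional as F

def gram(x):
    # Same normalisation as gram_matrix above.
    a, b, c, d = x.size()
    feats = x.view(a * b, c * d)
    return torch.mm(feats, feats.t()).div(a * b * c * d)

x = torch.rand(1, 64, 32, 32)
y = torch.rand(1, 64, 32, 32)
print(F.mse_loss(gram(x), gram(y)))  # expected: small but non-zero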
Best.

Related

Represent multiple normal priors with a single call in Bayesian logistic regression with Tensorflow

I am trying to properly represent multiple normal priors with a single function call in the joint distribution and run mcmc.sample_chain with a NUTS kernel. I used the excellent blog post here as a sketch, with the following MCMC implementation.
def trace_fn(_, pkr):
    return (
        pkr.inner_results.inner_results.target_log_prob,
        pkr.inner_results.inner_results.leapfrogs_taken,
        pkr.inner_results.inner_results.has_divergence,
        pkr.inner_results.inner_results.energy,
        pkr.inner_results.inner_results.log_accept_ratio
    )

def run_nuts(
        target_log_prob_fn,
        inits,
        trace_fn=trace_fn,
        bijectors_list=None,
        num_steps=5000,
        num_burnin=500,
        n_chains=n_chains):
    step_size = np.random.rand(n_chains, 1)*.5 + 1.
    if not isinstance(inits, list):
        inits = [inits]
    if bijectors_list is None:
        bijectors_list = [tfb.Identity()]*len(inits)

    kernel = tfp.mcmc.DualAveragingStepSizeAdaptation(
        tfp.mcmc.TransformedTransitionKernel(
            inner_kernel=tfp.mcmc.NoUTurnSampler(
                target_log_prob_fn,
                step_size=[step_size]*len(inits)
            ),
            bijector=bijectors_list
        ),
        target_accept_prob=.8,
        num_adaptation_steps=int(0.8*num_burnin),
        step_size_setter_fn=lambda pkr, new_step_size: pkr._replace(
            inner_results=pkr.inner_results._replace(step_size=new_step_size)
        ),
        step_size_getter_fn=lambda pkr: pkr.inner_results.step_size,
        log_accept_prob_getter_fn=lambda pkr: pkr.inner_results.log_accept_ratio,
    )

    res = tfp.mcmc.sample_chain(
        num_results=num_steps,
        num_burnin_steps=num_burnin,
        current_state=inits,
        kernel=kernel,
        trace_fn=trace_fn
    )
    return res
I can get the MCMC working when individually specifying the priors but not when declaring them as a batch.
This works
dtype = tf.float32

root = tfd.JointDistributionCoroutine.Root
def basic_logistic(data_df):
    def _generator():
        a = yield root(tfd.Sample(tfd.Normal(0, 10), 1, name='a'))
        b = yield root(tfd.Sample(tfd.Normal(0, 10), 1, name='b'))
        c = yield root(tfd.Sample(tfd.Normal(0, 10), 1, name='c'))

        l = a + tf.cast(data_df['x1'], dtype)*b + tf.cast(data_df['x2'], dtype)*c
        print(l)
        y = yield tfd.Independent(
            tfd.Bernoulli(
                logits=l,
                name='success'
            ),
            reinterpreted_batch_ndims=1
        )

    return tfd.JointDistributionCoroutine(_generator)

arm_0_test = basic_logistic(arm_0_test_df)
arm_0_log_prob = lambda *args: arm_0_test.log_prob(args + (tf.cast(arm_0_test_df['y'], dtype),))

n_chains = 3
arm0_res = run_nuts(arm_0_log_prob, [tf.ones((n_chains, 1)), tf.ones((n_chains, 1)), tf.ones((n_chains, 1))])
This does not
dtype = tf.float32

root = tfd.JointDistributionCoroutine.Root
def basic_logistic_multiple(X_df):
    X_df_copy = X_df.copy()
    n_features = X_df_copy.shape[1] + 1  # have to include intercept term

    prior_means = [0 for i in range(n_features)]  # list of prior means
    print(prior_means)
    prior_sds = [10 for i in range(n_features)]   # list of prior sds

    X_df_copy.insert(0, 'intercept', np.ones(X_df_copy.shape[0]))  # Add column of 1s for intercept
    X = tf.convert_to_tensor(X_df_copy, dtype=dtype)

    def _generator():
        beta = yield root(tfd.Sample(
            tfd.Normal(prior_means, prior_sds, name='beta')
        ))
        print(beta)
        l = tf.tensordot(X, beta, axes=1)
        # l = tf.reshape(l, (l.shape[0], ))
        print(l)
        y = yield tfd.Independent(
            tfd.Bernoulli(
                logits=l,
                name='success'
            ),
            reinterpreted_batch_ndims=1
        )

    return tfd.JointDistributionCoroutine(_generator)

arm_0_test = basic_logistic_multiple(arm_0_test_df)
arm_0_log_prob = lambda *args: arm_0_test.log_prob(args + (tf.cast(arm_0_test_df['y'], dtype),))

n_chains = 3
init_beta, _ = arm_0_test.sample(n_chains)
init_beta = tf.zeros_like(init_beta)

arm0_res = run_nuts(arm_0_log_prob, [init_beta,])
I get the following error
ValueError: Dimensions must be equal, but are 3 and 1000000 for '{{node mcmc_sample_chain/dual_averaging_step_size_adaptation___init__/_bootstrap_results/transformed_kernel_bootstrap_results/NoUTurnSampler/.bootstrap_results/process_args/maybe_call_fn_and_grads/value_and_gradients/value_and_gradient/JointDistributionCoroutine_CONSTRUCTED_AT_top_level/log_prob/add_1}} = AddV2[T=DT_FLOAT](mcmc_sample_chain/dual_averaging_step_size_adaptation___init__/_bootstrap_results/transformed_kernel_bootstrap_results/NoUTurnSampler/.bootstrap_results/process_args/maybe_call_fn_and_grads/value_and_gradients/value_and_gradient/JointDistributionCoroutine_CONSTRUCTED_AT_top_level/log_prob/add, mcmc_sample_chain/dual_averaging_step_size_adaptation___init__/_bootstrap_results/transformed_kernel_bootstrap_results/NoUTurnSampler/.bootstrap_results/process_args/maybe_call_fn_and_grads/value_and_gradients/value_and_gradient/JointDistributionCoroutine_CONSTRUCTED_AT_top_level/log_prob/Independentsuccess/log_prob/Sum)' with input shapes: [3,3], [1000000].
I can sample from both JointDistributions fine, so I believe something is clashing inside the sample_chain call, possibly my initial state declaration?
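For what it's worth, one variant I was considering is sketched below (untested; it assumes the clash comes from the batch/event shape of beta). It declares all coefficients as a single event via tfd.Sample and uses tf.linalg.matvec so the chain dimension broadcasts:

def basic_logistic_batched(X_df):
    X_df_copy = X_df.copy()
    X_df_copy.insert(0, 'intercept', np.ones(X_df_copy.shape[0]))
    n_features = X_df_copy.shape[1]
    X = tf.convert_to_tensor(X_df_copy, dtype=dtype)

    def _generator():
        # Sample(Normal(0, 10), n_features): batch_shape=[], event_shape=[n_features],
        # so log_prob sums over the coefficients instead of broadcasting them.
        beta = yield root(tfd.Sample(tfd.Normal(0., 10.), n_features, name='beta'))
        # X is [N, n_features], beta is [..., n_features] -> logits are [..., N].
        l = tf.linalg.matvec(X, beta)
        y = yield tfd.Independent(
            tfd.Bernoulli(logits=l, name='success'),
            reinterpreted_batch_ndims=1
        )

    return tfd.JointDistributionCoroutine(_generator)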
Thanks for any help!

My deep neural network with L layers is not working properly

Github link for entire code
I have coded the forward and backpropagation algorithms for a deep neural network from scratch. I am using gradient descent to reduce the cost, but the cost does not seem to decrease when I use 3 hidden layers. Any recommendations to improve my code? Is there a bug in it?
My entire GitHub code is linked here.
def backward_propagation(parameters, AL, cache, X, Y):
    m = X.shape[1]
    L = len(parameters)//2
    grads = {}
    cache['A' + str(0)] = X

    dZ = AL - Y
    A_prev = cache['A' + str(L-1)]
    dW = (1/m)*np.dot(dZ, A_prev.T)
    db = (1/m)*np.sum(dZ, axis=1, keepdims=True)

    grads["dW"+str(L)] = dW
    grads["db"+str(L)] = db
    grads["dZ"+str(L)] = dZ

    for l in range(L-1, 0, -1):  # from reverse
        A = cache['A' + str(l)]
        W = parameters['W' + str(l+1)]
        dZ_next = grads["dZ"+str(l+1)]
        A_prev = cache['A' + str(l-1)]

        temp = np.zeros(A.shape)
        temp[A < 0] = 0
        temp[A >= 0] = 1

        dZ = np.dot(W.T, dZ_next)*temp
        dW = (1/m)*np.dot(dZ, A_prev.T)
        db = (1/m)*np.sum(dZ, axis=1, keepdims=True)

        grads["dW"+str(l)] = dW
        grads["db"+str(l)] = db
        grads["dZ"+str(l)] = dZ

    return grads
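One way to narrow down whether the backward pass is wrong is a finite-difference gradient check, roughly like this sketch (forward_propagation and compute_cost are placeholders for the corresponding functions in my repository):

import numpy as np

def gradient_check(parameters, grads, X, Y, epsilon=1e-7):
    # Compare the analytic gradient of one weight entry per layer against a
    # two-sided numerical difference of the cost.
    for key in [k for k in parameters if k.startswith('W')]:
        W = parameters[key]
        idx = np.unravel_index(np.argmax(np.abs(grads['d' + key])), W.shape)
        original = W[idx]

        W[idx] = original + epsilon
        cost_plus = compute_cost(forward_propagation(X, parameters)[0], Y)
        W[idx] = original - epsilon
        cost_minus = compute_cost(forward_propagation(X, parameters)[0], Y)
        W[idx] = original

        numerical = (cost_plus - cost_minus) / (2 * epsilon)
        print(key, 'analytic:', grads['d' + key][idx], 'numerical:', numerical)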

Values for custom Tensorflow Optimization Algorithm not converging

I wanted to implement a swarm optimization algorithm for deep neural networks, namely the Fireworks Algorithm.
I was finally able to create a TensorFlow optimizer class that implements it, based on THIS repository.
But even after the implementation, my accuracy stays around ~11% during training (please refer to THIS collab notebook for the code).
To test out the code, SEE MY IMPLEMENTATION ON A COLLAB NOTEBOOK.
How can I resolve this issue?
Also, my main optimizer code is:
# https://github.com/cilatpku/firework-algorithm/blob/master/fwa/BBFWA.py
class Firework(optimizer.Optimizer):
    def __init__(self,
                 # params for prob
                 evaluator=None,
                 dim=2,
                 upper_bound=100,
                 lower_bound=-100,
                 max_iter=10000,
                 max_eval=20000,
                 # params for method
                 sp_size=200,
                 init_amp=200,
                 name="Firework", use_locking=False, **kwargs):
        super(Firework, self).__init__(use_locking, name)

        ## Parameters
        # params of method
        self.sp_size = sp_size      # total spark size
        self.init_amp = init_amp    # initial dynamic amplitude

        # load params
        self.evaluator = evaluator
        self.dim = dim
        self.upper_bound = upper_bound
        self.lower_bound = lower_bound
        self.max_iter = max_iter
        self.max_eval = max_eval

        ## States
        # private init states
        self._num_iter = 0
        self._num_eval = 0
        self._dyn_amp = init_amp

        # public states
        self.best_idv = None  # best individual found
        self.best_fit = None  # best fitness found
        self.trace = []       # trace of best individual in each generation

        ## Fireworks
        self.fireworks = np.random.uniform(self.lower_bound, self.upper_bound, [1, self.dim])
        self.fireworks = self.fireworks.tolist()
        self.fits = self.evaluator(self.fireworks)

        ## Tensor versions of the constructor arguments, created in _prepare().
        self.dim_t = None
        self.upper_bound_t = None
        self.lower_bound_t = None
        self.max_iter_t = None
        self.max_eval_t = None
        self.sp_size_t = None
        self.init_amp_t = None
        self.fireworks_t = None
        self.fits_t = None

    def _create_slots(self, var_list):
        """For each model variable, create the optimizer variable associated with it.
        TensorFlow calls these optimizer variables "slots"."""
        # Create slots for the fireworks and their fitness values.
        for v in var_list:
            self._zeros_slot(v, "fireworks", self._name)
        for v in var_list:
            self._zeros_slot(v, "fits", self._name)

    def _prepare(self):
        # self.evaluator_t = ops.convert_to_tensor(self.evaluator, name="evaluator")
        self.dim_t = ops.convert_to_tensor(self.dim, name="dimension")
        self.upper_bound_t = ops.convert_to_tensor(self.upper_bound, name="upper_bound")
        self.lower_bound_t = ops.convert_to_tensor(self.lower_bound, name="lower_bound")
        self.max_iter_t = ops.convert_to_tensor(self.max_iter, name="max_iterations")
        self.max_eval_t = ops.convert_to_tensor(self.max_eval, name="max_eval")
        self.sp_size_t = ops.convert_to_tensor(self.sp_size, name="sp_size")
        self.init_amp_t = ops.convert_to_tensor(self.init_amp, name="init_amp")
        self.fireworks_t = ops.convert_to_tensor(self.fireworks, name="fireworks")
        self.fits_t = ops.convert_to_tensor(self.fits, name="fits")
        print(self.fireworks_t)

    def _resource_apply_dense(self, grad, var):
        evaluator = self.evaluator
        dim_t = math_ops.cast(self.dim_t, var.dtype.base_dtype)
        upper_bound_t = math_ops.cast(self.upper_bound_t, var.dtype.base_dtype)
        lower_bound_t = math_ops.cast(self.lower_bound_t, var.dtype.base_dtype)
        max_iter_t = math_ops.cast(self.max_iter_t, var.dtype.base_dtype)
        max_eval_t = math_ops.cast(self.max_eval_t, var.dtype.base_dtype)
        sp_size_t = math_ops.cast(self.sp_size_t, var.dtype.base_dtype)
        init_amp_t = math_ops.cast(self.init_amp_t, var.dtype.base_dtype)

        fits = self.get_slot(grad, "fits")
        fireworks = self.get_slot(var, "fireworks")

        fireworks_update, fits_update = self.iter(self.fireworks, self.fits)
        self.fireworks = fireworks_update
        self.fits = fits_update

        fireworks_update_t = math_ops.cast(fireworks_update, var.dtype.base_dtype)
        fits_update_t = math_ops.cast(fits_update, var.dtype.base_dtype)
        self.fireworks_t = fireworks_update_t
        self.fits_t = fits_update_t

        print("fireworks_update : ", fireworks_update)
        print("fits_update : ", fits_update)

        # Create an op that groups multiple operations.
        # When this op finishes, all ops in input have finished.
        return control_flow_ops.group(*[fireworks_update_t, fits_update_t])

    ## Helper functions
    def iter(self, fireworks, fits):
        print("...\n")
        e_sparks, e_fits = self._explode(fireworks, fits)
        n_fireworks, n_fits = self._select(fireworks, fits, e_sparks, e_fits)

        # update states
        if n_fits[0] < fits[0]:
            self._dyn_amp *= 1.2
        else:
            self._dyn_amp *= 0.9
        self._num_iter += 1
        self._num_eval += len(e_sparks)
        self.best_idv = n_fireworks[0]
        self.best_fit = n_fits[0]
        self.trace.append([n_fireworks[0], n_fits[0], self._dyn_amp])

        fireworks = n_fireworks
        fits = n_fits
        return fireworks, fits

    def _explode(self, fireworks, fits):
        bias = np.random.uniform(-self._dyn_amp, self._dyn_amp, [self.sp_size, self.dim])
        rand_samples = np.random.uniform(self.lower_bound, self.upper_bound, [self.sp_size, self.dim])
        e_sparks = fireworks + bias
        in_bound = (e_sparks > self.lower_bound) * (e_sparks < self.upper_bound)
        e_sparks = in_bound * e_sparks + (1 - in_bound) * rand_samples
        e_sparks = e_sparks.tolist()
        e_fits = self.evaluator(e_sparks)
        return e_sparks, e_fits

    def _select(self, fireworks, fits, e_sparks, e_fits):
        idvs = fireworks + e_sparks
        fits = fits + e_fits
        idx = np.argmin(fits)
        return [idvs[idx]], [fits[idx]]

    ##################################################
    ##################################################

    def get_config(self):
        base_config = super().get_config()
        return {
            **base_config,
            "learning_rate": self._serialize_hyperparameter("learning_rate"),
            "decay": self._serialize_hyperparameter("decay"),
            "momentum": self._serialize_hyperparameter("momentum"),
        }

    def _apply_dense(self, grad, var):
        raise NotImplementedError("Dense gradient updates are not supported.")

    def _apply_sparse(self, grad, var):
        raise NotImplementedError("Sparse gradient updates are not supported.")

    def _resource_apply_sparse(self, grad, var):
        raise NotImplementedError("Sparse Resource gradient updates are not supported.")

Pytorch PPO implementation is not learning

This PPO implementation has a bug somewhere and I can't figure out what's wrong. The network returns a normal distribution and a value estimate from the critic. The last layer of the actor provides four F.tanh-ed action values, which are used as the mean of the distribution; nn.Parameter(torch.zeros(action_dim)) is the standard deviation.
The trajectories for 20 parallel agents are added to the same memory. The episode length is 1000, and memory.sample() returns an np.random.permutation of the 20k memory entries as tensors in batches of size 64. Before stacking the batch tensors, the values are stored as (1, -1) tensors in collections.deques. The returned tensors are detach()ed.
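For context, the policy roughly looks like the sketch below (hidden sizes are placeholders, and the zero-initialised parameter is treated here as a log standard deviation so the distribution stays valid; the actual architecture differs):

import torch
import torch.nn as nn

class ActorCritic(nn.Module):
    # Sketch only: tanh-squashed action means, a state-independent std parameter,
    # and a critic value head.
    def __init__(self, state_dim, action_dim, hidden=64):
        super().__init__()
        self.actor = nn.Sequential(nn.Linear(state_dim, hidden), nn.ReLU(),
                                   nn.Linear(hidden, action_dim))
        self.critic = nn.Sequential(nn.Linear(state_dim, hidden), nn.ReLU(),
                                    nn.Linear(hidden, 1))
        self.log_std = nn.Parameter(torch.zeros(action_dim))

    def forward(self, states):
        mean = torch.tanh(self.actor(states))
        dist = torch.distributions.Normal(mean, self.log_std.exp())
        value = self.critic(states)
        return dist, value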
environment
brain_name = envs.brain_names[0]
env_info = envs.reset(train_mode=True)[brain_name]
env_info = envs.step(actions.cpu().detach().numpy())[brain_name]
next_states = env_info.vector_observations
rewards = env_info.rewards
dones = env_info.local_done
update step
def clipped_surrogate_update(policy, memory, num_epochs=10, clip_param=0.2, gradient_clip=5, beta=0.001, value_loss_coeff=0.5):
    advantages_batch, states_batch, log_probs_old_batch, returns_batch, actions_batch = memory.sample()
    advantages_batch = (advantages_batch - advantages_batch.mean()) / advantages_batch.std()

    for _ in range(num_epochs):
        for i in range(len(advantages_batch)):
            advantages_sample = advantages_batch[i]
            states_sample = states_batch[i]
            log_probs_old_sample = log_probs_old_batch[i]
            returns_sample = returns_batch[i]
            actions_sample = actions_batch[i]

            dist, values = policy(states_sample)

            log_probs_new = dist.log_prob(actions_sample.to(device)).sum(-1).unsqueeze(-1)
            entropy = dist.entropy().sum(-1).unsqueeze(-1).mean()

            ratio = (log_probs_new - log_probs_old_sample).exp()
            clipped_ratio = torch.clamp(ratio, 1-clip_param, 1+clip_param)
            clipped_surrogate_loss = -torch.min(ratio*advantages_sample, clipped_ratio*advantages_sample).mean()
            value_function_loss = (returns_sample - values).pow(2).mean()

            Loss = clipped_surrogate_loss - beta * entropy + value_loss_coeff * value_function_loss

            optimizer_policy.zero_grad()
            Loss.backward()
            torch.nn.utils.clip_grad_norm_(policy.parameters(), gradient_clip)
            optimizer_policy.step()
            del Loss
data sampling
def collect_trajectories(envs, env_info, policy, memory, tmax=200, nrand=0, gae_tau=0.95, discount=0.995):
    next_episode = False

    states = env_info.vector_observations
    n_agents = len(env_info.agents)

    state_list = []
    reward_list = []
    prob_list = []
    action_list = []
    value_list = []

    if nrand > 0:
        # perform nrand random steps
        for _ in range(nrand):
            actions = np.random.randn(num_agents, action_size)
            actions = np.clip(actions, -1, 1)
            env_info = envs.step(actions)[brain_name]
            states = env_info.vector_observations

    for t in range(tmax):
        states = torch.FloatTensor(states).to(device)
        dist, values = policy(states)
        actions = dist.sample()
        probs = dist.log_prob(actions).sum(-1).unsqueeze(-1)

        env_info = envs.step(actions.cpu().detach().numpy())[brain_name]
        next_states = env_info.vector_observations
        rewards = env_info.rewards
        dones = env_info.local_done

        state_list.append(states)
        reward_list.append(rewards)
        prob_list.append(probs)
        action_list.append(actions)
        value_list.append(values)

        states = next_states

        if np.any(dones):
            next_episode = True
            break

    _, next_value = policy(torch.FloatTensor(states).to(device))

    reward_arr = np.array(reward_list)
    undiscounted_rewards = np.sum(reward_arr, axis=0)

    state_arr = torch.stack(state_list)
    prob_arr = torch.stack(prob_list)
    action_arr = torch.stack(action_list)
    value_arr = torch.stack(value_list)
    reward_arr = torch.FloatTensor(reward_arr[:, :, np.newaxis])

    advantage_list = []
    return_list = []

    returns = next_value.detach()
    advantages = torch.FloatTensor(np.zeros((n_agents, 1)))

    for i in reversed(range(state_arr.shape[0])):
        returns = reward_arr[i] + discount * returns
        td_error = reward_arr[i] + discount * next_value - value_arr[i]
        advantages = advantages * gae_tau * discount + td_error
        next_value = value_arr[i]

        advantage_list.append(advantages.detach())
        return_list.append(returns.detach())

    advantage_arr = torch.stack(advantage_list)
    return_arr = torch.stack(return_list)

    for i in range(state_arr.shape[0]):
        memory.add({'advantages': advantage_arr[i],
                    'states': state_arr[i],
                    'log_probs_old': prob_arr[i],
                    'returns': return_arr[i],
                    'actions': action_arr[i]})

    return undiscounted_rewards, next_episode
In the Generalized Advantage Estimation loop, the advantages and returns are computed while iterating in reverse, so they need to be inserted at the front of the lists rather than appended:
advantage_list.insert(0, advantages.detach())
return_list.insert(0, returns.detach())
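Putting that together, the tail of the GAE loop reads as follows (same variable names as in collect_trajectories above):

for i in reversed(range(state_arr.shape[0])):
    returns = reward_arr[i] + discount * returns
    td_error = reward_arr[i] + discount * next_value - value_arr[i]
    advantages = advantages * gae_tau * discount + td_error
    next_value = value_arr[i]
    # insert at the front so index i still lines up with state_arr[i]
    advantage_list.insert(0, advantages.detach())
    return_list.insert(0, returns.detach())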

Implementing the Rprop algorithm in Keras

I am trying to implement the resilient backpropagation optimizer for Keras (link), but the challenging part is performing the update on each individual parameter depending on whether its corresponding gradient is positive, negative, or zero. I wrote the code below as a start towards implementing the Rprop optimizer. However, I can't seem to find a way to access the parameters individually: looping over params (as in the code below) yields p, g, g_old, s, wChangeOld at each iteration, which are all matrices.
Is there a way to iterate over the individual parameters and update them? It would also work if I could index the parameter tensors based on the sign of their gradients.
class Rprop(Optimizer):
    def __init__(self, init_step=0.01, **kwargs):
        super(Rprop, self).__init__(**kwargs)
        self.init_step = K.variable(init_step, name='init_step')
        self.iterations = K.variable(0., name='iterations')

        self.posStep = 1.2
        self.negStep = 0.5
        self.minStep = 1e-6
        self.maxStep = 50.

    def get_updates(self, params, constraints, loss):
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]
        shapes = [K.get_variable_shape(p) for p in params]
        stepList = [K.ones(shape)*self.init_step for shape in shapes]
        wChangeOldList = [K.zeros(shape) for shape in shapes]
        grads_old = [K.zeros(shape) for shape in shapes]

        self.weights = stepList + grads_old + wChangeOldList
        self.updates = []

        for p, g, g_old, s, wChangeOld in zip(params, grads, grads_old,
                                              stepList, wChangeOldList):
            change = K.sign(g * g_old)

            if change > 0:
                s_new = K.minimum(s * self.posStep, self.maxStep)
                wChange = s_new * K.sign(g)
                g_new = g
            elif change < 0:
                s_new = K.maximum(s * self.posStep, self.maxStep)
                wChange = - wChangeOld
                g_new = 0
            else:
                s_new = s
                wChange = s_new * K.sign(g)
                g_new = p

            self.updates.append(K.update(g_old, g_new))
            self.updates.append(K.update(wChangeOld, wChange))
            self.updates.append(K.update(s, s_new))

            new_p = p - wChange

            # Apply constraints
            if p in constraints:
                c = constraints[p]
                new_p = c(new_p)
            self.updates.append(K.update(p, new_p))

        return self.updates

    def get_config(self):
        config = {'init_step': float(K.get_value(self.init_step))}
        base_config = super(Rprop, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))
I was looking for an RProp algorithm in Keras as well and found this question. I took the liberty of adapting your code to my purposes and am posting it back here now. So far it seems to work quite well, but I didn't test it extensively.
Disclaimer: I'm very new to Keras but have a lot of experience with Theano (and Blocks). Also, I tested this only with Theano as a backend, not TensorFlow.
class RProp(Optimizer):
    def __init__(self, init_alpha=1e-3, scale_up=1.2, scale_down=0.5, min_alpha=1e-6, max_alpha=50., **kwargs):
        super(RProp, self).__init__(**kwargs)
        self.init_alpha = K.variable(init_alpha, name='init_alpha')
        self.scale_up = K.variable(scale_up, name='scale_up')
        self.scale_down = K.variable(scale_down, name='scale_down')
        self.min_alpha = K.variable(min_alpha, name='min_alpha')
        self.max_alpha = K.variable(max_alpha, name='max_alpha')

    def get_updates(self, params, constraints, loss):
        grads = self.get_gradients(loss, params)
        shapes = [K.get_variable_shape(p) for p in params]
        alphas = [K.variable(numpy.ones(shape) * self.init_alpha) for shape in shapes]
        old_grads = [K.zeros(shape) for shape in shapes]
        self.weights = alphas + old_grads
        self.updates = []

        for param, grad, old_grad, alpha in zip(params, grads, old_grads, alphas):
            new_alpha = K.switch(
                K.greater(grad * old_grad, 0),
                K.minimum(alpha * self.scale_up, self.max_alpha),
                K.maximum(alpha * self.scale_down, self.min_alpha)
            )
            new_param = param - K.sign(grad) * new_alpha

            # Apply constraints
            if param in constraints:
                c = constraints[param]
                new_param = c(new_param)
            self.updates.append(K.update(param, new_param))
            self.updates.append(K.update(alpha, new_alpha))
            self.updates.append(K.update(old_grad, grad))

        return self.updates

    def get_config(self):
        config = {
            'init_alpha': float(K.get_value(self.init_alpha)),
            'scale_up': float(K.get_value(self.scale_up)),
            'scale_down': float(K.get_value(self.scale_down)),
            'min_alpha': float(K.get_value(self.min_alpha)),
            'max_alpha': float(K.get_value(self.max_alpha)),
        }
        base_config = super(RProp, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))
Important notes:
- RProp is often not included in machine learning libraries for a reason: it does not work at all unless you use full-batch learning, and full-batch learning is only useful if you have a small training set (see the usage sketch after these notes).
- Adam (the Keras builtin) outperforms this RProp algorithm. Maybe because that's just how it is, or maybe because I made a mistake :)
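Since it only makes sense with full-batch updates, usage would look roughly like this untested sketch (model, x_train, and y_train are placeholders for your own model and data, with the older Keras optimizer interface this class targets):

from keras.models import Sequential
from keras.layers import Dense

# Placeholder model and data; the point is batch_size=len(x_train) for full-batch updates.
model = Sequential([Dense(32, activation='relu', input_shape=(x_train.shape[1],)),
                    Dense(1, activation='sigmoid')])
model.compile(optimizer=RProp(), loss='binary_crossentropy')
model.fit(x_train, y_train, epochs=200, batch_size=len(x_train))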
A few comments about your code (referring to your original variable names):
- wChange is never used across iterations, so you don't need to store it in permanent variables.
- change > 0 does not do what you think it does, because change is a tensor variable. What you want here is an element-wise comparison; use K.switch() instead.
- You used maxStep twice; the second occurrence should have been minStep.
- The situation where change is zero is negligible, since that almost never happens in practice.
- g_new = 0 and g_new = p are both completely bogus and should be g_new = g, as in the first if branch.
I'm new to Keras and Python, but I modified the code above a bit for my purposes.
It is an incredibly fast and simple algorithm because it uses full-batch learning and only the signs of the partial derivatives. In my tests it outperformed all other backpropagation algorithms, including Adam. I tested it with TensorFlow and CNTK as backends.
Modified Rprop without Weight-Backtracking:
https://pdfs.semanticscholar.org/df9c/6a3843d54a28138a596acc85a96367a064c2.pdf
class iRprop_(Optimizer):
    def __init__(self, init_alpha=0.01, scale_up=1.2, scale_down=0.5, min_alpha=0.00001, max_alpha=50., **kwargs):
        super(iRprop_, self).__init__(**kwargs)
        self.init_alpha = K.variable(init_alpha, name='init_alpha')
        self.scale_up = K.variable(scale_up, name='scale_up')
        self.scale_down = K.variable(scale_down, name='scale_down')
        self.min_alpha = K.variable(min_alpha, name='min_alpha')
        self.max_alpha = K.variable(max_alpha, name='max_alpha')

    def get_updates(self, params, loss):
        grads = self.get_gradients(loss, params)
        shapes = [K.get_variable_shape(p) for p in params]
        alphas = [K.variable(K.ones(shape) * self.init_alpha) for shape in shapes]
        old_grads = [K.zeros(shape) for shape in shapes]
        self.weights = alphas + old_grads
        self.updates = []

        for p, grad, old_grad, alpha in zip(params, grads, old_grads, alphas):
            grad = K.sign(grad)
            new_alpha = K.switch(
                K.greater(grad * old_grad, 0),
                K.minimum(alpha * self.scale_up, self.max_alpha),
                K.switch(K.less(grad * old_grad, 0), K.maximum(alpha * self.scale_down, self.min_alpha), alpha)
            )
            grad = K.switch(K.less(grad * old_grad, 0), K.zeros_like(grad), grad)
            new_p = p - grad * new_alpha

            # Apply constraints.
            if getattr(p, 'constraint', None) is not None:
                new_p = p.constraint(new_p)
            self.updates.append(K.update(p, new_p))
            self.updates.append(K.update(alpha, new_alpha))
            self.updates.append(K.update(old_grad, grad))

        return self.updates

    def get_config(self):
        config = {
            'init_alpha': float(K.get_value(self.init_alpha)),
            'scale_up': float(K.get_value(self.scale_up)),
            'scale_down': float(K.get_value(self.scale_down)),
            'min_alpha': float(K.get_value(self.min_alpha)),
            'max_alpha': float(K.get_value(self.max_alpha)),
        }
        base_config = super(iRprop_, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))
