This PPO implementation has a bug somewhere and I can't figure out what's wrong. The network returns a normal distribution and a value estimate from the critic. The last layer of the actor provides four tanh-squashed action values, which are used as the mean of the distribution; nn.Parameter(torch.zeros(action_dim)) is the standard deviation.
The trajectories of 20 parallel agents are added to the same memory. Episode length is 1000, and memory.sample() returns an np.random.permutation of the 20k memory entries as tensors in batches of size 64. Before stacking into batch tensors, the values are stored as (1, -1)-shaped tensors in collections.deques. The returned tensors are detach()ed.
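For reference, a minimal sketch of the actor-critic module described above; the class name, hidden sizes, and the treatment of the std parameter as a log-std are my own assumptions, not taken from the original code:

import torch
import torch.nn as nn

class ActorCritic(nn.Module):
    # Sketch only: tanh-squashed action means, a learned per-dimension std, and a critic value head.
    def __init__(self, state_size, action_dim=4, hidden=64):
        super().__init__()
        self.actor_body = nn.Sequential(nn.Linear(state_size, hidden), nn.ReLU())
        self.actor_head = nn.Linear(hidden, action_dim)       # tanh-squashed means
        self.log_std = nn.Parameter(torch.zeros(action_dim))  # interpreted here as a log-std so the scale stays positive
        self.critic = nn.Sequential(nn.Linear(state_size, hidden), nn.ReLU(), nn.Linear(hidden, 1))

    def forward(self, states):
        mean = torch.tanh(self.actor_head(self.actor_body(states)))
        dist = torch.distributions.Normal(mean, self.log_std.exp())
        value = self.critic(states)
        return dist, value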
environment
brain_name = envs.brain_names[0]
env_info = envs.reset(train_mode=True)[brain_name]
env_info = envs.step(actions.cpu().detach().numpy())[brain_name]
next_states = env_info.vector_observations
rewards = env_info.rewards
dones = env_info.local_done
update step
def clipped_surrogate_update(policy, memory, num_epochs=10, clip_param=0.2, gradient_clip=5, beta=0.001, value_loss_coeff=0.5):
    advantages_batch, states_batch, log_probs_old_batch, returns_batch, actions_batch = memory.sample()
    advantages_batch = (advantages_batch - advantages_batch.mean()) / advantages_batch.std()

    for _ in range(num_epochs):
        for i in range(len(advantages_batch)):
            advantages_sample = advantages_batch[i]
            states_sample = states_batch[i]
            log_probs_old_sample = log_probs_old_batch[i]
            returns_sample = returns_batch[i]
            actions_sample = actions_batch[i]

            dist, values = policy(states_sample)

            log_probs_new = dist.log_prob(actions_sample.to(device)).sum(-1).unsqueeze(-1)
            entropy = dist.entropy().sum(-1).unsqueeze(-1).mean()

            ratio = (log_probs_new - log_probs_old_sample).exp()
            clipped_ratio = torch.clamp(ratio, 1 - clip_param, 1 + clip_param)
            clipped_surrogate_loss = -torch.min(ratio * advantages_sample, clipped_ratio * advantages_sample).mean()
            value_function_loss = (returns_sample - values).pow(2).mean()

            Loss = clipped_surrogate_loss - beta * entropy + value_loss_coeff * value_function_loss

            optimizer_policy.zero_grad()
            Loss.backward()
            torch.nn.utils.clip_grad_norm_(policy.parameters(), gradient_clip)
            optimizer_policy.step()
            del Loss
data sampling
def collect_trajectories(envs, env_info, policy, memory, tmax=200, nrand=0, gae_tau=0.95, discount=0.995):
    next_episode = False
    states = env_info.vector_observations
    n_agents = len(env_info.agents)

    state_list = []
    reward_list = []
    prob_list = []
    action_list = []
    value_list = []

    if nrand > 0:
        # perform nrand random steps
        for _ in range(nrand):
            actions = np.random.randn(num_agents, action_size)
            actions = np.clip(actions, -1, 1)
            env_info = envs.step(actions)[brain_name]
            states = env_info.vector_observations

    for t in range(tmax):
        states = torch.FloatTensor(states).to(device)
        dist, values = policy(states)
        actions = dist.sample()
        probs = dist.log_prob(actions).sum(-1).unsqueeze(-1)

        env_info = envs.step(actions.cpu().detach().numpy())[brain_name]
        next_states = env_info.vector_observations
        rewards = env_info.rewards
        dones = env_info.local_done

        state_list.append(states)
        reward_list.append(rewards)
        prob_list.append(probs)
        action_list.append(actions)
        value_list.append(values)

        states = next_states

        if np.any(dones):
            next_episode = True
            break

    _, next_value = policy(torch.FloatTensor(states).to(device))

    reward_arr = np.array(reward_list)
    undiscounted_rewards = np.sum(reward_arr, axis=0)

    state_arr = torch.stack(state_list)
    prob_arr = torch.stack(prob_list)
    action_arr = torch.stack(action_list)
    value_arr = torch.stack(value_list)
    reward_arr = torch.FloatTensor(reward_arr[:, :, np.newaxis])

    advantage_list = []
    return_list = []

    returns = next_value.detach()
    advantages = torch.FloatTensor(np.zeros((n_agents, 1)))

    for i in reversed(range(state_arr.shape[0])):
        returns = reward_arr[i] + discount * returns
        td_error = reward_arr[i] + discount * next_value - value_arr[i]
        advantages = advantages * gae_tau * discount + td_error
        next_value = value_arr[i]

        advantage_list.append(advantages.detach())
        return_list.append(returns.detach())

    advantage_arr = torch.stack(advantage_list)
    return_arr = torch.stack(return_list)

    for i in range(state_arr.shape[0]):
        memory.add({'advantages': advantage_arr[i],
                    'states': state_arr[i],
                    'log_probs_old': prob_arr[i],
                    'returns': return_arr[i],
                    'actions': action_arr[i]})

    return undiscounted_rewards, next_episode
In the Generalized Advantage Estimation loop, the advantages and returns are appended in reverse (time-backwards) order, so they no longer line up with the stored states, log-probabilities, and actions. Insert them at the front instead:
advantage_list.insert(0, advantages.detach())
return_list.insert(0, returns.detach())
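An equivalent fix is to keep the append() calls and restore chronological order once after the loop (same variables as in the code above):

# after the reversed GAE loop finishes
advantage_list.reverse()
return_list.reverse()
advantage_arr = torch.stack(advantage_list)
return_arr = torch.stack(return_list)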
Related
I'm new to PyTorch, and even though I have searched for this error, I can't seem to understand where exactly I'm doing something wrong.
I'm trying to run code with a model that trades 3 different stocks. My data is a CSV file with three columns containing the closing prices of the stocks.
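As a side note, a minimal sketch of how such a file could be loaded; the file name, the use of pandas, and the array shape are my assumptions, not taken from the original code:

import pandas as pd

prices = pd.read_csv("prices.csv")               # hypothetical file with three closing-price columns
price_matrix = prices.to_numpy(dtype="float32")  # shape: (timesteps, 3)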
I'm trying to run this part of the code:
env.reset()

# In case you're running this a second time with the same model, delete the gradients
del model.rewards[:]
del model.saved_actions[:]

gamma = 0.9
log_interval = 60

def finish_episode():
    R = 0
    saved_actions = model.saved_actions
    policy_losses = []
    value_losses = []
    rewards = []
    for r in model.rewards[::-1]:
        R = r + (gamma * R)
        rewards.insert(0, R)
    rewards = torch.tensor(rewards)
    epsilon = (torch.rand(1) / 1e4) - 5e-5
    # With different architectures, I found the following standardization step sometimes
    # helpful, sometimes unhelpful.
    # rewards = (rewards - rewards.mean()) / (rewards.std(unbiased=False) + epsilon)
    # Alternatively, comment it out and use the following line instead:
    rewards += epsilon

    for (log_prob, value), r in zip(saved_actions, rewards):
        reward = torch.tensor(r - value.item()).cuda()
        policy_losses.append(-log_prob * reward)
        value_losses.append(F.smooth_l1_loss(value, torch.tensor([r]).cuda()))
    optimizer.zero_grad()
    loss = torch.stack(policy_losses).sum() + torch.stack(value_losses).sum()
    loss = torch.clamp(loss, -1e-5, 1e5)
    loss.backward()
    optimizer.step()
    del model.rewards[:]
    del model.saved_actions[:]

running_reward = 0
for episode in range(0, 4000):
    state = env.reset()
    reward = 0
    done = False
    msg = None
    while not done:
        action = model.act(state)
        state, reward, done, msg = env.step(action)
        model.rewards.append(reward)
        if done:
            break
    running_reward = running_reward * (1 - 1/log_interval) + reward * (1/log_interval)
    finish_episode()
    # Resetting the hidden state seems unnecessary - it's effectively random from the previous
    # episode anyway, more random than a bunch of zeros.
    # model.reset_hidden()
    if msg["msg"] == "done" and env.portfolio_value() > env.starting_portfolio_value * 1.1 and running_reward > 500:
        print("Early Stopping: " + str(int(reward)))
        break
    if episode % log_interval == 0:
        print("""Episode {}: started at {:.1f}, finished at {:.1f} because {} # t={}, \
last reward {:.1f}, running reward {:.1f}""".format(episode, env.starting_portfolio_value, \
            env.portfolio_value(), msg["msg"], env.cur_timestep, reward, running_reward))
But I'm getting this error:
TypeError Traceback (most recent call last)
<ipython-input-91-ce955397be85> in <module>()
45 msg = None
46 while not done:
---> 47 action = model.act(state)
48 state, reward, done, msg = env.step(action)
49 model.rewards.append(reward)
1 frames
<ipython-input-89-f463539c7fe3> in forward(self, x)
16
17 def forward(self, x):
---> 18 x = torch.tensor(x).cuda()
19 x = torch.sigmoid(self.input_layer(x))
20 x = torch.tanh(self.hidden_1(x))
TypeError: only size-1 arrays can be converted to Python scalars
This is the part of the code where the forward function is defined:
class Policy(nn.Module):
    def __init__(self):
        super(Policy, self).__init__()
        self.input_layer = nn.Linear(11, 128)
        self.hidden_1 = nn.Linear(128, 128)
        self.hidden_2 = nn.Linear(32, 31)
        self.hidden_state = torch.tensor(torch.zeros(2, 1, 32)).cuda()
        self.rnn = nn.GRU(128, 32, 2)
        self.action_head = nn.Linear(31, 5)
        self.value_head = nn.Linear(31, 1)
        self.saved_actions = []
        self.rewards = []

    def reset_hidden(self):
        self.hidden_state = torch.tensor(torch.zeros(2, 1, 32)).cuda()

    def forward(self, x):
        x = torch.tensor(x).cuda()
        x = torch.sigmoid(self.input_layer(x))
        x = torch.tanh(self.hidden_1(x))
        x, self.hidden_state = self.rnn(x.view(1, -1, 128), self.hidden_state.data)
        x = F.relu(self.hidden_2(x.squeeze()))
        action_scores = self.action_head(x)
        state_values = self.value_head(x)
        return F.softmax(action_scores, dim=-1), state_values

    def act(self, state):
        probs, state_value = self.forward(state)
        m = Categorical(probs)
        action = m.sample()
        if action == 1 and env.state[0] < 1: action = torch.LongTensor([2]).squeeze().cuda()
        if action == 4 and env.state[1] < 1: action = torch.LongTensor([2]).squeeze().cuda()
        if action == 6 and env.state[2] < 1: action = torch.LongTensor([2]).squeeze().cuda()
        self.saved_actions.append((m.log_prob(action), state_value))
        return action.item()
Can you please point me to where I should make changes? Is it the data I'm feeding the model, or something else?
Thank you so much for your help.
You are passing state = env.reset() down through:
action = model.act(state)
probs, state_value = self.forward(state)
x = torch.tensor(x).cuda()
and that is where torch raises the error. torch.tensor expects a numeric scalar or a homogeneous numeric array; if env.reset() returns something else (for example a tuple or an object-dtype array of mixed shapes), the conversion fails with "only size-1 arrays can be converted to Python scalars".
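A minimal sketch of one workaround, assuming the environment's state is a tuple or list mixing scalars and arrays (the exact structure of env.reset() is not shown in the question): flatten it into a float32 NumPy array before it reaches torch.tensor.

import numpy as np
import torch

def state_to_tensor(state, device="cuda"):
    # Flatten a possibly nested state into a plain float32 vector so torch sees
    # a numeric array rather than an object-dtype array.
    flat = np.asarray(np.hstack(state), dtype=np.float32)
    return torch.from_numpy(flat).to(device)

# e.g. in Policy.forward(): x = state_to_tensor(x) instead of torch.tensor(x).cuda()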
I am trying to use feature reconstruction and style reconstruction losses on my model. For this, I followed the example code on PyTorch website for “Neural Style Transfer”.
https://pytorch.org/tutorials/advanced/neural_style_tutorial.html
Although the feature loss is calculated without a problem, the style loss is always zero, and I cannot find the reason, since everything looks fine in the implementation. The calculations follow the standard mathematical definitions of these loss functions. Besides, as you know, the style and feature losses are almost identical in terms of computation, except for the Gram matrix step in the style loss, and the feature loss works fine.
Could anyone help me with this situation?
class Feature_and_style_losses():
    def __init__(self, ):
        self.vgg_model = models.vgg19(pretrained=True).features.cuda().eval()
        self.content_layers = ['conv_16']
        self.style_layers = ['conv_5']

    def calculate_feature_and_style_losses(self, input_, target, feature_coefficient, style_coefficient):
        i = 0
        feature_losses = []
        style_losses = []
        for layer_ in self.vgg_model.children():
            if isinstance(layer_, nn.Conv2d):
                i += 1
                name = "conv_{}".format(i)
                if name in self.content_layers:
                    features_input = self.vgg_model(input_).detach()
                    features_target = self.vgg_model(target).detach()
                    feature_losses.append(self.feature_loss(features_input, features_target))
                if name in self.style_layers:
                    style_input = self.vgg_model(input_).detach()
                    style_target = self.vgg_model(target).detach()
                    style_losses.append(self.style_loss(style_input, style_target))

        feature_loss_value = (torch.mean(torch.from_numpy(np.array(feature_losses, dtype=np.float32)))) * feature_coefficient
        style_loss_value = (torch.mean(torch.from_numpy(np.array(style_losses, dtype=np.float32)))) * style_coefficient

        return feature_loss_value, style_loss_value

    def feature_loss(self, input_, target):
        target = target.detach()
        feature_reconstruction_loss = F.mse_loss(input_, target)
        return feature_reconstruction_loss

    def gram_matrix(self, input_):
        a, b, c, d = input_.size()  # ??? check size
        features = input_.view(a*b, c*d)
        # features_t = features.transpose(1, 2)
        # G = features.bmm(features_t) / (b*c*d)
        # print(features.shape)
        G = torch.mm(features, features.t())
        return G.div(a*b*c*d)
        return G

    def style_loss(self, input_, target):
        G_input = self.gram_matrix(input_)
        G_target = self.gram_matrix(target).detach()
        # style_reconstruction_loss = self.feature_loss(G_input, G_target)
        style_reconstruction_loss = F.mse_loss(G_input, G_target)
        return style_reconstruction_loss
feature_loss_ = Feature_and_style_losses()
...
for e in range(epochs):
    for i, batch in enumerate(dataloader):
        ...
        real_C = Variable(batch["C"].type(Tensor))
        fake_C = independent_decoder(features_all)
        f_loss, s_loss = feature_loss_.calculate_feature_and_style_losses(fake_C, real_C, 1, 10)
        loss_G_3 = loss_GAN_3 + lambda_pixel * (loss_pixel_3_object + loss_pixel_3_scene) * 0.5 + f_loss + s_loss
        loss_G_3.backward(retain_graph=True)
        optimizer_independent_decoder.step()
Best.
I save the trained model after a certain number of episodes with the DDPG class's save() function (the network is saved when the reward reaches zero), but when I restore the model again using saver.restore(), the network gives a reward of approximately -1800. Why is this happening? Am I doing something wrong? My network:
import tensorflow as tf
import numpy as np
import gym

epsiode_steps = 500

# learning rate for actor
lr_a = 0.001
# learning rate for critic
lr_c = 0.002

gamma = 0.9
alpha = 0.01
memory = 10000
batch_size = 32
render = True


class DDPG(object):
    def __init__(self, no_of_actions, no_of_states, a_bound, ):
        self.memory = np.zeros((memory, no_of_states * 2 + no_of_actions + 1), dtype=np.float32)

        # initialize pointer to point to our experience buffer
        self.pointer = 0

        self.sess = tf.Session()

        self.noise_variance = 3.0

        self.no_of_actions, self.no_of_states, self.a_bound = no_of_actions, no_of_states, a_bound,

        self.state = tf.placeholder(tf.float32, [None, no_of_states], 's')
        self.next_state = tf.placeholder(tf.float32, [None, no_of_states], 's_')
        self.reward = tf.placeholder(tf.float32, [None, 1], 'r')

        with tf.variable_scope('Actor'):
            self.a = self.build_actor_network(self.state, scope='eval', trainable=True)
            a_ = self.build_actor_network(self.next_state, scope='target', trainable=False)

        with tf.variable_scope('Critic'):
            q = self.build_crtic_network(self.state, self.a, scope='eval', trainable=True)
            q_ = self.build_crtic_network(self.next_state, a_, scope='target', trainable=False)

        self.ae_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/eval')
        self.at_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/target')
        self.ce_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/eval')
        self.ct_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/target')

        # update target value
        self.soft_replace = [
            [tf.assign(at, (1 - alpha) * at + alpha * ae), tf.assign(ct, (1 - alpha) * ct + alpha * ce)]
            for at, ae, ct, ce in zip(self.at_params, self.ae_params, self.ct_params, self.ce_params)]

        q_target = self.reward + gamma * q_

        td_error = tf.losses.mean_squared_error(labels=(self.reward + gamma * q_), predictions=q)

        self.ctrain = tf.train.AdamOptimizer(lr_c).minimize(td_error, name="adam-ink", var_list=self.ce_params)

        a_loss = - tf.reduce_mean(q)

        # train the actor network with adam optimizer for minimizing the loss
        self.atrain = tf.train.AdamOptimizer(lr_a).minimize(a_loss, var_list=self.ae_params)

        tf.summary.FileWriter("logs2", self.sess.graph)

        # initialize all variables
        self.sess.run(tf.global_variables_initializer())

        self.saver = tf.train.Saver()
        self.saver.restore(self.sess, "Pendulum/nn.ckpt")

    def choose_action(self, s):
        a = self.sess.run(self.a, {self.state: s[np.newaxis, :]})[0]
        a = np.clip(np.random.normal(a, self.noise_variance), -2, 2)
        return a

    def learn(self):
        # soft target replacement
        self.sess.run(self.soft_replace)

        indices = np.random.choice(memory, size=batch_size)
        batch_transition = self.memory[indices, :]
        batch_states = batch_transition[:, :self.no_of_states]
        batch_actions = batch_transition[:, self.no_of_states: self.no_of_states + self.no_of_actions]
        batch_rewards = batch_transition[:, -self.no_of_states - 1: -self.no_of_states]
        batch_next_state = batch_transition[:, -self.no_of_states:]

        self.sess.run(self.atrain, {self.state: batch_states})
        self.sess.run(self.ctrain, {self.state: batch_states, self.a: batch_actions, self.reward: batch_rewards,
                                    self.next_state: batch_next_state})

    # we define a function store_transition which stores all the transition information in the buffer
    def store_transition(self, s, a, r, s_):
        trans = np.hstack((s, a, [r], s_))

        index = self.pointer % memory
        self.memory[index, :] = trans
        self.pointer += 1

        if self.pointer > memory:
            self.noise_variance *= 0.99995
            self.learn()

    # we define the function build_actor_network for building our actor network, and after it the critic network
    def build_actor_network(self, s, scope, trainable):
        with tf.variable_scope(scope):
            l1 = tf.layers.dense(s, 30, activation=tf.nn.tanh, name='l1', trainable=trainable)
            a = tf.layers.dense(l1, self.no_of_actions, activation=tf.nn.tanh, name='a', trainable=trainable)
            return tf.multiply(a, self.a_bound, name="scaled_a")

    def build_crtic_network(self, s, a, scope, trainable):
        with tf.variable_scope(scope):
            n_l1 = 30
            w1_s = tf.get_variable('w1_s', [self.no_of_states, n_l1], trainable=trainable)
            w1_a = tf.get_variable('w1_a', [self.no_of_actions, n_l1], trainable=trainable)
            b1 = tf.get_variable('b1', [1, n_l1], trainable=trainable)
            net = tf.nn.tanh(tf.matmul(s, w1_s) + tf.matmul(a, w1_a) + b1)

            q = tf.layers.dense(net, 1, trainable=trainable)
            return q

    def save(self):
        self.saver.save(self.sess, "Pendulum/nn.ckpt")


env = gym.make("Pendulum-v0")
env = env.unwrapped
env.seed(1)

no_of_states = env.observation_space.shape[0]
no_of_actions = env.action_space.shape[0]

a_bound = env.action_space.high
ddpg = DDPG(no_of_actions, no_of_states, a_bound)

total_reward = []

no_of_episodes = 300

# for each episode
for i in range(no_of_episodes):
    # initialize the environment
    s = env.reset()

    # episodic reward
    ep_reward = 0

    for j in range(epsiode_steps):
        env.render()

        # select action by adding noise through OU process
        a = ddpg.choose_action(s)

        # perform the action and move to the next state s
        s_, r, done, info = env.step(a)

        # store the transition in our experience buffer;
        # sample some minibatch of experience and train the network
        ddpg.store_transition(s, a, r, s_)

        # update current state as next state
        s = s_

        # add episodic rewards
        ep_reward += r

        if int(ep_reward) == 0 and i > 200:
            ddpg.save()
            print("save")
            quit()

        if j == epsiode_steps - 1:
            total_reward.append(ep_reward)
            print('Episode:', i, ' Reward: %i' % int(ep_reward))
            break
After finishing Coursera's Practical RL course on A3C, I'm trying to implement my own A3C agent using TensorFlow 2. To start, I'm training it on the CartPole environment, but I can't get good results. So far, I've launched several training runs with the following code, changing the entropy coefficient to see its impact (the results are shown below). Does this come from my implementation, or is it more of a fine-tuning issue?
# imports assumed by this snippet
import gym
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow.keras.layers import Input, Dense
from tqdm import trange
from IPython.display import clear_output


class A3C:
    def __init__(self, state_dim, n_actions, optimizer=tf.keras.optimizers.Adam(1e-3)):
        self.state_input = Input(shape=state_dim)
        self.x = Dense(256, activation='relu')(self.state_input)
        self.head_v = Dense(1, activation='linear')(self.x)
        self.head_p = Dense(n_actions, activation='linear')(self.x)
        self.network = tf.keras.Model(inputs=[self.state_input], outputs=[self.head_v, self.head_p])
        self.optimizer = optimizer

    def forward(self, state):
        return self.network(state)

    def sample(self, logits):
        policy = np.exp(logits.numpy()) / np.sum(np.exp(logits.numpy()), axis=-1, keepdims=True)
        return np.array([np.random.choice(len(p), p=p) for p in policy])


def evaluate(agent, env, n_games=1):
    """Plays a game from start till done, returns per-game rewards."""
    game_rewards = []
    for _ in range(n_games):
        state = env.reset()
        total_reward = 0
        while True:
            action = agent.sample(agent.forward(np.array([state]))[1])[0]
            state, reward, done, info = env.step(action)
            total_reward += reward
            if done: break
        game_rewards.append(total_reward)
    return game_rewards


class EnvBatch:
    def __init__(self, n_envs=10):
        self.envs = [gym.make(env_id) for _ in range(n_envs)]

    def reset(self):
        return np.array([env.reset() for env in self.envs])

    def step(self, actions):
        results = [env.step(a) for env, a in zip(self.envs, actions)]
        new_obs, rewards, done, infos = map(np.array, zip(*results))
        for i in range(len(self.envs)):
            if done[i]:
                new_obs[i] = self.envs[i].reset()
        return new_obs, rewards, done, infos


env_id = "CartPole-v0"
env = gym.make(env_id)
state_dim = env.observation_space.shape
n_actions = env.action_space.n
agent = A3C(state_dim, n_actions)
env_batch = EnvBatch(10)
batch_states = env_batch.reset()
gamma = 0.99
rewards_history = []
entropy_history = []

for i in trange(200000):
    with tf.GradientTape() as t:
        batch_values, batch_logits = agent.forward(batch_states)
        batch_actions = agent.sample(batch_logits)
        batch_next_states, batch_rewards, batch_dones, _ = env_batch.step(batch_actions)

        batch_next_values, batch_next_logits = agent.forward(batch_next_states)
        batch_next_values *= (1 - batch_dones)

        probs = tf.nn.softmax(batch_logits)
        logprobs = tf.nn.log_softmax(batch_logits)
        logp_actions = tf.reduce_sum(logprobs * tf.one_hot(batch_actions, n_actions), axis=-1)

        advantage = batch_rewards + gamma*batch_next_values - batch_values
        entropy = -tf.reduce_sum(probs * logprobs, 1, name="entropy")

        actor_loss = - tf.reduce_mean(logp_actions * tf.stop_gradient(advantage)) - 0.005 * tf.reduce_mean(entropy)
        target_state_values = batch_rewards + gamma*batch_next_values
        critic_loss = tf.reduce_mean((batch_values - tf.stop_gradient(target_state_values))**2)

        loss = actor_loss + critic_loss

    var_list = agent.network.trainable_variables
    grads = t.gradient(loss, var_list)
    agent.optimizer.apply_gradients(zip(grads, var_list))

    batch_states = batch_next_states
    entropy_history.append(np.mean(entropy))

    if i % 500 == 0:
        if i % 2500 == 0:
            rewards_history.append(np.mean(evaluate(agent, env, n_games=3)))
        clear_output(True)
        plt.figure(figsize=[8, 4])
        plt.subplot(1, 2, 1)
        plt.plot(rewards_history, label='rewards')
        plt.title("Session rewards")
        plt.grid()
        plt.legend()
        plt.subplot(1, 2, 2)
        plt.plot(entropy_history, label='entropy')
        plt.title("Policy entropy")
        plt.grid()
        plt.legend()
        plt.show()
(Reward and entropy plots omitted: Beta = 0.005, Trainings 1-3; Beta = 0.05, Trainings 1-3.)
I've looked through your code, and it doesn't look like there's any problem with the algorithm. It seems to me that the hyperparameters were chosen incorrectly; try different hyperparameter sets. If it still doesn't work properly, refer to the repository.
The critic loss is wrong. You should first compute the expected returns: predict the value of the next state, then iterate backwards over the rewards with the Bellman equation.
Here is an example:
def getExpectedReturns(self, states, next_states, done, rewards, standarize=True):
    # Get next value
    if done[-1] == 1.0:
        arr_idx = np.zeros((rewards.shape[0], 1))
        arr_idx[-1] = 1.0
        values_rewards_sum_one_hot = tf.convert_to_tensor(arr_idx, dtype=tf.float32)
        next_value = tf.reduce_sum(rewards * values_rewards_sum_one_hot, axis=0)
    else:
        values_rewards_sum = self.model_a2c(next_states)[-1]
        arr_idx = np.zeros((rewards.shape[0], 1))
        arr_idx[0] = 1.0
        values_rewards_sum_one_hot = tf.convert_to_tensor(arr_idx, dtype=tf.float32)
        next_value = tf.reduce_sum(values_rewards_sum * values_rewards_sum_one_hot, axis=0)

    # Iterate over rewards
    list_true_values = []
    for i in reversed(range(0, len(rewards))):
        if done[i] == 0.0:
            next_value = rewards[i] + next_value * self.gamma
        else:
            next_value = rewards[i]
        list_true_values.append(next_value)
    list_true_values.reverse()
    list_true_values = tf.convert_to_tensor(list_true_values, dtype=tf.float32)

    if standarize:
        list_true_values = ((list_true_values - tf.math.reduce_mean(list_true_values)) /
                            (tf.math.reduce_std(list_true_values) + tf.constant(1e-12)))

    return list_true_values

with tf.GradientTape() as tape:
    # Advantage
    returns = self.getExpectedReturns(states, next_states, done, rewards, standarize=False)
    actions_probs_logits, values = self.model_a2c(states)
    advantage = returns - values
    advantage = tf.squeeze(advantage)

    # Actions probs
    actions_probs_softmax = tf.nn.softmax(actions_probs_logits)
    actions_log_probs_softmax = tf.nn.log_softmax(actions_probs_logits)
    actions_one_hot = tf.one_hot(actions, self.num_actions, 1.0, 0.0)
    actions_log_probs = tf.reduce_sum(actions_log_probs_softmax * actions_one_hot, axis=-1)

    # Entropy
    entropy = self.entropy_coef * tf.reduce_mean(actions_probs_softmax * actions_log_probs_softmax, axis=1)

    # Losses
    actor_loss = -tf.reduce_mean(actions_log_probs * tf.stop_gradient(advantage), axis=0)
    critic_loss = self.critic_coef * tf.reduce_mean(tf.math.pow(advantage, 2), axis=0)
    total_loss = actor_loss + critic_loss - entropy
tl;dr: I input a word to my model, and am supposed to get a list of similar words and their associated measures of similarity back. I get an error: Aborted (core dumped).
My goal is to determine which words are similar to an input word, based on their feature vectors. I have a model that is already trained. I load it and call two functions:
def main(argv=None):
    model = NVDM(args)
    sess_saver = tf.train.Saver()
    sess = tf.Session()
    init = tf.initialize_all_variables()
    sess.run(init)
    loaded = load_for_similar(sess, sess_saver)  # my function
    wm = word_match(sess, loaded[0], loaded[1], "bottle", loaded[2], loaded[3], topN=5)
My problem is that I can't print out the words which are similar and the associated similarity measure. I tried (in main):
sess.run(wm)
wm[0].eval(session=sess)
print(wm)
All of which gave me the error:
F tensorflow/core/kernels/strided_slice_op.cc:316] Check failed: tmp.CopyFrom(input.Slice(begin[0], end[0]), final_shape)
Aborted (core dumped)
This tells me I'm not running the session properly. What am I doing wrong?
Details on the functions, just in case:
The function 'load_for_similar' restores the weights and bias of the decoder in my model (a variational autoencoder), and normalizes them. It also reverses the order of the keys and values in my vocabulary dictionary for later use:
def load_for_similar(sess, saver_obj):
    saver_obj.restore(sess, "./CA_checkpoints/saved_model.ckpt")
    vocab_file = '/path/to/vocab.pkl'
    t1 = loader_object(vocab_file)
    v1 = t1.get_vocab()
    v1_rev = {k: v for v, k in v1.iteritems()}
    decoder_mat = tf.get_collection(tf.GraphKeys.VARIABLES, scope='decoder')[0]
    decoder_bias = tf.get_collection(tf.GraphKeys.VARIABLES, scope='decoder')[1]
    return (find_norm(decoder_mat), find_norm(decoder_bias), v1, v1_rev)
To find similar words, I pass the normalized weight matrix and bias into a new function, along with the feature vector of my word (vec):
def find_similar(sess, Weights, vec, bias):
    dists = tf.add(tf.reduce_sum(tf.mul(Weights, vec)), bias)
    best = argsort(sess, dists, reverse=True)
    dist_sort = tf.nn.top_k(dists, k=dists.get_shape().as_list()[0], sorted=True).values
    return dist_sort, best
Finally, I want to match the words that are closest to my supplied word, "bottle":
def word_match(sess, norm_mat, norm_bias, word_, vocab, vocab_inverse, topN=10):
    idx = vocab[word_]
    similarity_meas, indexes = find_similar(sess, norm_mat, norm_mat[idx], norm_bias)
    words = tf.gather(vocab_inverse.keys(), indexes[:topN])
    return (words, similarity_meas[:topN])
EDIT: in response to mrry's comment, here is the model (I hope this is what you wanted?). This code depends on utils.py, a separate utilities file. I will include that as well. Please note that this code is heavily based on Yishu Miao's and Sarath Nair's.
class NVDM(object):
    """ Neural Variational Document Model -- BOW VAE.
    """
    def __init__(self,
                 vocab_size=15000,  # was 2000
                 n_hidden=500,
                 n_topic=50,
                 n_sample=1,
                 learning_rate=1e-5,
                 batch_size=100,  # was 64
                 non_linearity=tf.nn.tanh):
        self.vocab_size = vocab_size
        self.n_hidden = n_hidden
        self.n_topic = n_topic
        self.n_sample = n_sample
        self.non_linearity = non_linearity
        self.learning_rate = learning_rate/batch_size  # CA
        self.batch_size = batch_size

        self.x = tf.placeholder(tf.float32, [None, vocab_size], name='input')
        self.mask = tf.placeholder(tf.float32, [None], name='mask')  # mask paddings

        # encoder
        with tf.variable_scope('encoder'):
            self.enc_vec = utils.mlp(self.x, [self.n_hidden, self.n_hidden])
            self.mean = utils.linear(self.enc_vec, self.n_topic, scope='mean')
            self.logsigm = utils.linear(self.enc_vec,
                                        self.n_topic,
                                        bias_start_zero=True,
                                        matrix_start_zero=False,
                                        scope='logsigm')
            self.kld = -0.5 * tf.reduce_sum(1 - tf.square(self.mean) + 2 * self.logsigm - tf.exp(2 * self.logsigm), 1)
            self.kld = self.mask*self.kld  # mask paddings

        with tf.variable_scope('decoder'):
            if self.n_sample == 1:  # single sample
                p1 = tf.cast(tf.reduce_sum(self.mask), tf.int32)  # needed for random normal generation
                eps = tf.random_normal((p1, self.n_topic), 0, 1)
                doc_vec = tf.mul(tf.exp(self.logsigm), eps) + self.mean
                logits = tf.nn.log_softmax(utils.linear(doc_vec, self.vocab_size, scope='projection'))
                self.recons_loss = -tf.reduce_sum(tf.mul(logits, self.x), 1)
            # multiple samples
            else:
                eps = tf.random_normal((self.n_sample*batch_size, self.n_topic), 0, 1)
                eps_list = tf.split(0, self.n_sample, eps)
                recons_loss_list = []
                for i in xrange(self.n_sample):
                    if i > 0: tf.get_variable_scope().reuse_variables()
                    curr_eps = eps_list[i]
                    doc_vec = tf.mul(tf.exp(self.logsigm), curr_eps) + self.mean
                    logits = tf.nn.log_softmax(utils.linear(doc_vec, self.vocab_size, scope='projection'))
                    recons_loss_list.append(-tf.reduce_sum(tf.mul(logits, self.x), 1))
                self.recons_loss = tf.add_n(recons_loss_list) / self.n_sample

        self.objective = self.recons_loss + self.kld

        optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)

        fullvars = tf.trainable_variables()

        enc_vars = utils.variable_parser(fullvars, 'encoder')
        dec_vars = utils.variable_parser(fullvars, 'decoder')

        enc_grads = tf.gradients(self.objective, enc_vars)
        dec_grads = tf.gradients(self.objective, dec_vars)

        self.optim_enc = optimizer.apply_gradients(zip(enc_grads, enc_vars))
        self.optim_dec = optimizer.apply_gradients(zip(dec_grads, dec_vars))


def minibatch_bow(it1, Instance1, n_samples, batch_size, used_ints=set()):
    available = set(np.arange(n_samples)) - used_ints
    if len(available) < batch_size:
        indices = np.array(list(available))
    else:
        indices = np.random.choice(tuple(available), batch_size, replace=False)
    used = used_ints
    mb = itemgetter(*indices)(it1)
    batch_xs = Instance1._bag_of_words(mb, vocab_size=15000)
    batch_flattened = np.ravel(batch_xs)
    index_positions = np.where(batch_flattened > 0)[0]
    return (batch_xs, index_positions, set(indices))  # batch_xs[0] is the bag of words; batch_xs[1] is the 0/1 word used/not


def train(sess, model, train_file, vocab_file, saver_obj, training_epochs, alternate_epochs, batch_size):
    Instance1 = testchunk_Nov23.testLoader(train_file, vocab_file)
    data_set = Instance1.get_batch(batch_size)  # get all minibatches of size 100
    n_samples = Instance1.num_reviews()
    train_batches = list(data_set)  # this is an itertools.chain object
    it1_train = list(itertools.chain(*train_batches))  # length is 732,356. This is all the reviews.
    if len(it1_train) % batch_size != 0:
        total_batch = int(len(it1_train)/batch_size) + 1
    else:
        total_batch = int(len(it1_train)/batch_size)
    trainfilesave = "train_ELBO_and_perplexity_Dec1.txt"

    # Training
    train_time = time.time()
    for epoch in range(training_epochs):
        for switch in xrange(0, 2):
            if switch == 0:
                optim = model.optim_dec
                print_mode = 'updating decoder'
            else:
                optim = model.optim_enc
                print_mode = 'updating encoder'
            with open(trainfilesave, 'w') as f:
                for i in xrange(alternate_epochs):
                    loss_sum = 0.0
                    kld_sum = 0.0
                    word_count = 0
                    used_indices = set()
                    for idx_batch in range(total_batch):  # train_batches:
                        mb = minibatch_bow(it1_train, Instance1, n_samples, batch_size, used_ints=used_indices)
                        print('minibatch', idx_batch)
                        used_indices.update(mb[2])
                        num_mb = np.ones(mb[0][0].shape[0])
                        input_feed = {model.x.name: mb[0][0], model.mask: num_mb}
                        _, (loss, kld) = sess.run((optim, [model.objective, model.kld]), input_feed)
                        loss_sum += np.sum(loss)
And the utils.py file:
def linear(inputs,
           output_size,
           no_bias=False,
           bias_start_zero=False,
           matrix_start_zero=False,
           scope=None):
    """Define a linear connection."""
    with tf.variable_scope(scope or 'Linear'):
        if matrix_start_zero:
            matrix_initializer = tf.constant_initializer(0)
        else:
            matrix_initializer = None
        if bias_start_zero:
            bias_initializer = tf.constant_initializer(0)
        else:
            bias_initializer = None
        input_size = inputs.get_shape()[1].value
        matrix = tf.get_variable('Matrix', [input_size, output_size],
                                 initializer=matrix_initializer)
        bias_term = tf.get_variable('Bias', [output_size],
                                    initializer=bias_initializer)
        output = tf.matmul(inputs, matrix)
        if not no_bias:
            output = output + bias_term
    return output


def mlp(inputs,
        mlp_hidden=[],
        mlp_nonlinearity=tf.nn.tanh,
        scope=None):
    """Define an MLP."""
    with tf.variable_scope(scope or 'Linear'):
        mlp_layer = len(mlp_hidden)
        res = inputs
        for l in xrange(mlp_layer):
            res = mlp_nonlinearity(linear(res, mlp_hidden[l], scope='l'+str(l)))
        return res