Hello, my DDPG model implemented in TF 2 gets horrible results on every OpenAI Gym environment with continuous actions, and I need help finding the problem. I run this on my GPU. On the Pendulum env I get rewards around -1200/-1000 on every episode. The code comes from a Udemy course; it was written in TF 1.x and I rewrote it in TF 2, but the instructor's TF 1.x implementation got better results. Here is the code:
import tensorflow as tf
import numpy as np
import os
import gym
import random
import matplotlib.pyplot as plt
from tensorflow.keras.layers import Input, Dense, concatenate
from tensorflow.keras.models import Model
class ReplayBuffer():
def __init__(self, obs_dim, act_dim, size):
self.obs1_buf = np.zeros([size, obs_dim, ], dtype=np.float32)
self.obs2_buf = np.zeros([size, obs_dim, ], dtype=np.float32)
self.act_buf = np.zeros([size, act_dim], dtype=np.float32)
self.reward_buf = np.zeros(size, dtype=np.float32)
self.done_buf = np.zeros(size, dtype=np.float32)
self.current = 0
self.count = 0
self.size = size
def add_experience(self, state, action, reward, next_state, done):
self.obs1_buf[self.current] = state
self.act_buf[self.current] = action
self.reward_buf[self.current] = reward
self.obs2_buf[self.current] = next_state
self.done_buf[self.current] = done
self.current = (self.current + 1) % self.size
self.count = min(self.count+1, self.size)
def sample_batch(self, batch_size=32):
idx = np.random.randint(0, self.count, size=batch_size)
return dict(s=self.obs1_buf[idx],
s2=self.obs2_buf[idx],
a=self.act_buf[idx],
r=self.reward_buf[idx],
d=self.done_buf[idx])
class DDPG():
def __init__(self, env, num_states, num_actions, action_max):
self.env = env
self.num_states = num_states
self.num_actions = num_actions
self.action_max = action_max
self.gamma = 0.99
self.decay = 0.995
self.mu_optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
self.q_optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
def mu_model(hidden_layers):
inp = Input(shape=(self.num_states, ))
x = inp
for layers in hidden_layers[:-1]:
x = Dense(layers, activation='relu')(x)
x = Dense(hidden_layers[-1], activation='tanh')(x)
mu_model = Model(inp, x)
return mu_model
self.mu_model = mu_model([300, self.num_actions])
def q_model(inp_state, inp_act, hidden_layers):
inp_state = Input(shape=(inp_state, ))
inp_mu = Input(shape=(inp_act, ))
inp = concatenate([inp_state, inp_mu])
x = inp
for layers in hidden_layers[:-1]:
x = Dense(layers, activation='relu')(x)
x = Dense(hidden_layers[-1], activation='linear')(x)
q_model = Model([inp_state, inp_mu], x)
return q_model
self.q_model = q_model(self.num_states, self.num_actions, hidden_layers=[300, 1])
self.q_target_model = q_model(self.num_states, self.num_actions, hidden_layers=[300, 1])
#Eself.mu_do_minimize = tf.function(self.mu_minimize, input_signature=[
#tf.TensorSpec(shape=(None, self.num_states), dtype=tf.float32, name='state')])
self.q_do_minimize = tf.function(self.q_minimize, input_signature=[
tf.TensorSpec(shape=(None, self.num_states), dtype=tf.float32, name='state'),
tf.TensorSpec(shape=(None, self.num_actions), dtype=tf.float32, name='action'),
tf.TensorSpec(shape=(None, self.num_states), dtype=tf.float32, name='next_state'),
tf.TensorSpec(shape=(None, ), dtype=tf.float32, name='reward'),
tf.TensorSpec(shape=(None, ), dtype=tf.float32, name='done_flags')])
#tf.function
def train_mu(self, state):
with tf.GradientTape() as tape:
actions = self.mu_model(state, training=True)
critic_value = self.q_model([state, actions], training=True)
# Used `-value` as we want to maximize the value given
# by the critic for our actions
actor_loss = -tf.math.reduce_mean(critic_value)
actor_grad = tape.gradient(actor_loss, self.mu_model.trainable_variables)
self.mu_optimizer.apply_gradients(
zip(actor_grad, self.mu_model.trainable_variables)
)
def q_minimize(self, state, action, next_state, reward, done):
def calc_loss():
q_targ = reward + self.gamma * (1 - done) * self.q_target_model([next_state, action])
q = self.q_model([state, action])
cost = tf.reduce_mean((q - q_targ)**2)
return cost
self.q_optimizer.minimize(calc_loss, self.q_model.trainable_variables)
def train(self, state, action, reward, done, next_state):
state = np.atleast_2d(state)
next_state = np.atleast_2d(next_state)
action = np.atleast_2d(action)
reward = np.atleast_1d(reward)
done = np.atleast_1d(done)
self.update_target_net()
self.train_mu(state)
self.q_do_minimize(state, action, next_state, reward, done)
def update_target_net(self):
mu_weights = np.array(self.mu_model.get_weights())
q_weights = np.array(self.q_model.get_weights())
#print(mu_weights.shape)
#print(q_weights.shape)
mu_target_weights = np.array(self.mu_target_model.get_weights())
q_target_weights = np.array(self.q_target_model.get_weights())
self.q_target_model.set_weights(self.decay * q_weights + (1 - self.decay) * q_target_weights)
def get_action(self, states, noise=None):
if noise is None: noise = self.ACT_NOISE_SCALE
if len(states.shape) == 1: states = states.reshape(1,-1)
action = self.mu_model.predict_on_batch(states)[0]
if noise != 0:
action += noise * np.random.randn(self.num_actions)
action = np.clip(action, -self.action_max, self.action_max)
return action
def play_one(env, agent, replay_buffer, gamma=0.99, noise=0.1, max_episode_len=1000, start_steps=10000, num_train_ep=100, batch_size=100, test_ep_agent=25):
returns = []
num_steps = 0
for ep in range(num_train_ep):
s, ep_return, ep_len, d = env.reset(), 0, 0, False
while not (d or ep_len == max_episode_len):
env.render()
if num_steps > start_steps:
a = agent.get_action(s, noise)
else:
a = env.action_space.sample()
num_steps+=1
if num_steps == start_steps:
print("USING AGENT ACTIONS NOW")
s2, r, d, _ = env.step(a)
ep_return+=r
ep_len+=1
#print(s.shape)
d = False if ep_len == max_episode_len else d
replay_buffer.add_experience(s, a, r, s2, d)
s = s2
for _ in range(ep_len):
batch = replay_buffer.sample_batch()
state, next_state, action, reward, done = batch['s'], batch['s2'], batch['a'], batch['r'], batch['d']
loss = agent.train(state, action, reward, done, next_state)
returns.append(ep_return)
print('Iter:', ep, 'Rewards:', ep_return)
return returns
if __name__ == '__main__':
env = gym.make('Pendulum-v0')
obs_dim1 = env.observation_space.shape[0]
act_dim1 = env.action_space.shape[0]
action_max1 = env.action_space.high[0]
actor = DDPG(env, obs_dim1, act_dim1, action_max1)
replay_buffer = ReplayBuffer(obs_dim1, act_dim1, size=100000)
returns = play_one(env, actor, replay_buffer)
Thank you in advance!
The first thing that comes to mind is the learning rate: 0.01 is too high, even for Pendulum. Try a lower learning rate (e.g. 1e-3 for the actor and 5e-3 for the critic).
Also a couple of things look off in your code:
There is no target network for the actor. Why is that? DDPG uses target networks for both the actor and the critic.
Usually it is better to initialize the main and target networks with the same parameters. You can do that with target_model.set_weights(model.get_weights()) (see the sketch after this list).
In play_one the training steps are done only after playing a whole episode. That is probably OK, but there is no need for it: since Pendulum is not real-time, your code does not have to be fast, so you can train while playing.
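To make the first two points concrete, here is a minimal sketch of an actor target network plus Polyak soft updates in TF 2. The attribute names in the comments mirror the question's code, but treat this as an illustration under those assumptions rather than a drop-in patch.

def soft_update(target_model, online_model, tau=0.995):
    """Polyak update: target <- tau * target + (1 - tau) * online."""
    new_weights = [tau * t + (1.0 - tau) * o
                   for t, o in zip(target_model.get_weights(),
                                   online_model.get_weights())]
    target_model.set_weights(new_weights)

# Sketch of how this could be wired into the DDPG class from the question:
#   self.mu_target_model = mu_model([300, self.num_actions])        # actor target
#   self.mu_target_model.set_weights(self.mu_model.get_weights())   # same init
#   self.q_target_model.set_weights(self.q_model.get_weights())
# (the actor target would then provide the actions used in the critic's TD target)
# and in train(), after the gradient steps:
#   soft_update(self.mu_target_model, self.mu_model)
#   soft_update(self.q_target_model, self.q_model)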
If you want to take a look, I implemented DDPG in TensorFlow 2 a while back (linked on GitHub). It solves Pendulum in 80-ish episodes.
Related
I am trying to build the autoencoder structure detailed in this IEEE article. The autoencoder uses a separable loss function, which requires a custom loss function for its "cluster loss" term, computed as a function of the average output of the encoder. I created my own layer called RffConnected that calculates the cluster loss and registers it via the add_loss method; otherwise, this RffConnected layer should act as just a normal dense layer.
Here are my relevant code snippets:
import matplotlib.pyplot as plot
from mpl_toolkits.axes_grid1 import ImageGrid
import numpy as np
import math
from matplotlib.figure import Figure
import tensorflow as tf
import keras
from keras import layers
import random
import time
from os import listdir
#loads data from a text file
def loadData(basePath, samplesPerFile, sampleRate):
real = []
imag = []
fileOrder = []
for file in listdir(basePath):
if((file != "READ_ME") and ((file != "READ_ME.txt"))):
fid = open(basePath + "\\" + file, "r")
fileOrder.append(file)
t = 0
sampleEvery = samplesPerFile / sampleRate
temp1 = []
temp2 = []
times = []
for line in fid.readlines():
times.append(t)
samples = line.split("\t")
temp1.append(float(samples[0]))
temp2.append(float(samples[1]))
t = t + sampleEvery
real.append(temp1)
imag.append(temp2)
fid.close()
real = np.array(real)
imag = np.array(imag)
return real, imag, times, fileOrder
#####################################################################################################
#Breaks up and randomizes data
def breakUpData(real, imag, times, numPartitions, basePath):
if(len(real) % numPartitions != 0):
raise ValueError("Error: The length of the dataset must be divisible by the number of partitions.")
newReal = []
newImag = []
newTimes = []
fileOrder = listdir(basePath)
dataFiles = []
interval = int(len(real[0]) / numPartitions)
for i in range(0, interval):
newTimes.append(times[i])
for i in range(0, len(real)):
tempI = []
tempQ = []
for j in range(0, len(real[0])):
tempI.append(real[i, j])
tempQ.append(imag[i, j])
if((j + 1) % interval == 0):
newReal.append(tempI)
newImag.append(tempQ)
#fileName = fileOrder[i][0: fileOrder[i].find("_") + 3]
dataFiles.append(fileOrder[i])
tempI = []
tempQ = []
#randomizes the broken up dataset and the file list
for i in range(0, len(newReal)):
r = random.randint(0, len(newReal) - 1)
tempReal = newReal[i]
tempImag = newImag[i]
newReal[i] = newReal[r]
newImag[i] = newImag[r]
newReal[r] = tempReal
newImag[r] = tempImag
tempFile = dataFiles[i]
dataFiles[i] = dataFiles[r]
dataFiles[r] = tempFile
#return np.array(newReal), np.array(newImag), newTimes, dataFiles
return newReal, newImag, newTimes, dataFiles
#####################################################################################################
#custom loss layer for the RffAe-S that calculates the clustering loss term
class RffConnected(layers.Layer):
def __init__(self, output_dim, batchSize, beta, alpha):
super(RffConnected, self).__init__()
# self.total = tf.Variable(initial_value=tf.zeros((input_dim,)), trainable=False)
#array = np.zeros(output_dim)
self.iters = 0.0
self.beta = beta
self.alpha = alpha
self.batchSize = batchSize
self.output_dim = output_dim
self.sum = tf.zeros(output_dim, tf.float64)
self.moving_average = tf.zeros(output_dim, tf.float64)
self.clusterloss = tf.zeros(output_dim, tf.float64)
self.sum = tf.cast(self.sum, tf.float32)
self.moving_average = tf.cast(self.moving_average, tf.float32)
self.clusterloss = tf.cast(self.clusterloss, tf.float32)
# self.sum = keras.Input(shape=(self.output_dim,))
# self.moving_average = keras.Input(shape=(self.output_dim,))
# self.clusterloss = keras.Input(shape=(self.output_dim,))
def build(self, input_shape):
self.kernel = self.add_weight(name = 'kernel', \
shape = (int(input_shape[-1]), self.output_dim), \
initializer = 'normal', trainable = True)
#self.kernel = tf.cast(self.kernel, tf.float64)
super(RffConnected, self).build(int(input_shape[-1]))
def call(self, inputs):
#keeps track of training epochs
self.iters = self.iters + 1
#inputs = tf.cast(inputs, tf.float64)
#where this custom layer acts as a normal layer- the loss then uses this
#calc = keras.backend.dot(inputs, self.kernel)
calc = tf.matmul(inputs, self.kernel)
#cumulative sum of deep encoded features
#self.sum = state_ops.assign(self.sum, tf.reshape(tf.math.add(self.sum, calc), tf.shape(self.sum)))
#self.sum = tf.ops.state_ops.assign(self.sum, tf.math.add(self.sum, calc))
#self.sum.assign_add(calc)
self.sum = tf.math.add(self.sum, calc)
#calculate the moving average and loss if we have already trained one batch
if(self.iters >= self.batchSize):
self.moving_average = tf.math.divide(self.sum, self.iters)
self.clusterloss = tf.math.exp(\
tf.math.multiply(-1 * self.beta, tf.math.reduce_sum(tf.math.square(tf.math.subtract(inputs, self.moving_average)))))
#self.add_loss(tf.math.multiply(self.clusterloss, self.alpha))
self.add_loss(self.clusterloss.numpy() * self.alpha)
return calc
#####################################################################################################
def customloss(y_true, y_pred):
loss = tf.square(y_true - y_pred)
print(loss)
return loss
#####################################################################################################
realTraining = np.array(real[0:2200])
realTesting = np.array(real[2200:-1])
imagTraining = np.array(imag[0:2200])
imagTesting = np.array(imag[2200:-1])
numInputs = len(realTraining[0])
i_sig = keras.Input(shape=(numInputs,))
q_sig = keras.Input(shape=(numInputs,))
iRff = tf.keras.layers.experimental.RandomFourierFeatures(numInputs, \
kernel_initializer='gaussian', scale=9.0)(i_sig)
rff1 = keras.Model(inputs=i_sig, outputs=iRff)
qRff = tf.keras.layers.experimental.RandomFourierFeatures(numInputs, \
kernel_initializer='gaussian', scale=9.0)(q_sig)
rff2 = keras.Model(inputs=q_sig, outputs=qRff)
combined = layers.Concatenate()([iRff, qRff])
combineRff = tf.keras.layers.experimental.RandomFourierFeatures(4 * numInputs, \
kernel_initializer='gaussian', scale=10.0)(combined)
preprocess = keras.Model(inputs=[iRff, qRff], outputs=combineRff)
#print(realTraining[0:5])
preprocessedTraining = preprocess.predict([realTraining, imagTraining])
preprocessedTesting = preprocess.predict([realTesting, imagTesting])
################## Entering Encoder ######################
encoderIn = keras.Input(shape=(4*numInputs,))
#connected1 = layers.Dense(100, activation="sigmoid")(encoderIn)
clusterLossLayer = RffConnected(100, 30, 1.00, 100.00)(encoderIn)
#clusterLossLayer = myRffConnected(256)(connected1)
encoder = keras.Model(inputs=encoderIn, outputs=clusterLossLayer)
################## Entering Decoder ######################
connected2 = layers.Dense(125, activation="sigmoid")(clusterLossLayer)
relu1 = layers.ReLU()(connected2)
dropout = layers.Dropout(0.2)(relu1)
reshape1 = layers.Reshape((25, 5, 1))(dropout)
bn1 = layers.BatchNormalization()(reshape1)
trans1 = layers.Conv2DTranspose(1, (4, 2))(bn1)
ups1 = layers.UpSampling2D(size=(2, 1))(trans1)
relu2 = layers.ReLU()(ups1)
bn2 = layers.BatchNormalization()(relu2)
trans2 = layers.Conv2DTranspose(1, (4, 2))(bn2)
ups2 = layers.UpSampling2D(size=(2, 1))(trans2)
relu3 = layers.ReLU()(ups2)
bn3 = layers.BatchNormalization()(relu3)
trans3 = layers.Conv2DTranspose(1, (5, 2))(bn3)
ups3 = layers.UpSampling2D(size=(2, 1))(trans3)
relu4 = layers.ReLU()(ups3)
bn4 = layers.BatchNormalization()(relu4)
trans4 = layers.Conv2DTranspose(1, (7, 1))(bn4)
reshape2 = layers.Reshape((4*numInputs, 1, 1))(trans4)
autoencoder = keras.Model(inputs=encoderIn, outputs=reshape2)
encoded_input = keras.Input(shape=(None, 100))
decoder_layer = autoencoder.layers[-1]
#autoencoder.summary()
autoencoder.compile(optimizer='adam', loss=[autoencoder.losses[-1], customloss], metrics=['accuracy', 'accuracy'])
autoencoder.fit(preprocessedTraining, preprocessedTraining, epochs=100, batch_size=20, shuffle=True, validation_data=(preprocessedTesting, preprocessedTesting))
It seems to run for two training epochs and then it gives me this error:
ValueError: Could not interpret loss function identifier: Tensor("rff_connected_137/Const:0", shape=(100,), dtype=float32)
I've already spent a considerable amount of time debugging this thing, although if you spot any more errors I would appreciate a heads-up. Thank you in advance.
According to the Keras documentation on model training, the loss argument of compile() expects a loss function (or its identifier) that returns a float tensor of the appropriate shape (sparse loss functions, which take integer arrays, being the exception); you cannot pass an already-computed tensor.
If you need to combine two loss terms, it is better to do the math inside a single custom loss function and return one float tensor. This reference on defining a custom Keras loss might help; a short sketch follows.
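For illustration, a minimal sketch of a single callable that compile() will accept; the extra_weight argument and the way a second term would be folded in are assumptions, not taken from the article:

import tensorflow as tf

def make_combined_loss(extra_weight=1.0):
    # compile() needs a callable (or a known identifier), not a computed Tensor.
    def combined_loss(y_true, y_pred):
        reconstruction = tf.reduce_mean(tf.square(y_true - y_pred))
        # any additional penalty would be computed here, scaled by extra_weight,
        # and added so that a single float tensor is returned
        return reconstruction
    return combined_loss

# autoencoder.compile(optimizer='adam', loss=make_combined_loss(), metrics=['accuracy'])
# Note: terms registered with self.add_loss(...) inside RffConnected are added
# to the total loss automatically, so they should not be passed to compile().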
I tried making a simple implementation of DDPG with a queue, target networks and exploration noise, but the robot consistently freezes into a single pose instead of developing a walking pattern. I think the cause is badly chosen hyperparameters, but I am not sure; the code itself might be the problem.
import numpy as np
from tensorflow.keras import layers, Model
from tensorflow.keras.models import clone_model
import matplotlib.pyplot as plt
import tensorflow as tf
import gym
# take a single action
def step(env, actor, state, noise):
# returns: the conditions of the robot
action = actor(state[None])[0]
next_state, reward, done, _ = env.step(action + noise)
return np.concatenate([state, action]), reward, done, next_state
def update_queues(env, actor, state, states_actions, rewards, target_rewards,
target_actor, target_critic, discount, noise):
state_action, reward, done, next_state = step(env, actor, state, noise)
target_action = target_actor(next_state[None])[0]
target_reward = discount * target_critic(np.concatenate([next_state, target_action])[None])[0]
states_actions[-1], rewards[-1], target_rewards[-1] = state_action, reward, target_reward
states_actions[:-1], rewards[:-1], target_rewards[:-1] = states_actions[1:], rewards[1:], target_rewards[1:]
return states_actions, rewards, target_rewards, next_state, done
def update_critic(y_tag, critic_vars, critic_optimizer, target_rewards, rewards):
y = tf.convert_to_tensor((target_rewards + rewards)[None].T)
critic_optimizer.apply_gradients(zip((y-y_tag)**2, critic_vars))
def update_actor(actor_vars, actor_optimizer, y_tag, rewards):
y = tf.convert_to_tensor(rewards[None].T)
actor_optimizer.apply_gradients(zip(y-y_tag, actor_vars))
def update_target_models(actor_vars, critic_vars, target_actor_vars, target_critic_vars, update_factor):
for (a, b) in zip(target_actor_vars, actor_vars):
a.assign(b * update_factor + a * (1 - update_factor))
for (a, b) in zip(target_critic_vars, critic_vars):
a.assign(b * update_factor + a * (1 - update_factor))
# soft error somewhere inside here
def train(env, actor, critic, discount, episode_num, queue_size, update_factor, max_episode_length, noise_factors):
divisor = episode_num // 100
divisor_rewards = np.zeros(100)
reward_num = 0
target_actor, target_critic = clone_model(actor), clone_model(critic)
rewards, target_rewards = np.zeros((2, queue_size), dtype = np.float32)
states_actions = np.zeros((queue_size, state_size + action_size))
actor_vars, critic_vars = actor.trainable_variables, critic.trainable_variables
target_actor_vars, target_critic_vars = target_actor.trainable_variables, target_critic.trainable_variables
for e_n in np.arange(episode_num):
next_state, done = env.reset(), False
for i in np.arange(max_episode_length):
reward_num += 1
noise = np.random.uniform(-noise_factors[e_n], noise_factors[e_n], action_size)
states_actions, rewards, target_rewards, next_state, done = update_queues(
env, actor, next_state, states_actions, rewards,
target_rewards, target_actor, target_critic, discount, noise)
if done:
break
y_tag = critic(states_actions)
update_critic(y_tag, critic_vars, critic_optimizer, target_rewards, rewards)
update_actor(actor_vars, actor_optimizer, y_tag, rewards)
update_target_models(actor_vars, critic_vars, target_actor_vars, target_critic_vars, update_factor)
if e_n % divisor == 0:
print(f"rewards are: {np.sum(rewards[-reward_num:]) / divisor}, {int(e_n * 100 / episode_num)}%")
divisor_rewards[e_n // divisor] = np.sum(rewards[-reward_num:])
reward_num = 0
return divisor_rewards
# set up the environment
env = gym.make("BipedalWalker-v3")
state_size = 24
action_size = 4
kernel_init = tf.random_uniform_initializer(minval=-0.03, maxval=0.03)
actor_inputs = layers.Input(shape = state_size)
out = layers.Dense(action_size * 2, activation = "linear")(actor_inputs)
actor_output = layers.Dense(action_size, activation = "linear")(out)
actor = Model(actor_inputs, actor_output)
actor_optimizer = tf.optimizers.RMSprop(learning_rate=0.0001)
# set up the critic model and optimizer
critic_inputs = layers.Input(shape = state_size + action_size)
critic_output = layers.Dense(1, activation = "linear")(critic_inputs)
critic = Model(critic_inputs, critic_output)
critic_optimizer = tf.optimizers.RMSprop(learning_rate=0.001)
# train the actor and critic
progress = train(env, actor, critic, 0.99, 10000, 2000, 0.02, 100, np.linspace(0.3, 0, 10000))
plt.plot(progress)
# see how the actor walks
state = env.reset()
for i in range(300):
env.render(mode='rgb_array')
_, _, done, state_action = step(env, actor, state, 0)
state = state_action[:state_size]
if done:
break
env.close()
I have implemented PPO for the CartPole-v0 environment. However, it does not converge on some runs and sometimes gets stuck in a local optimum. I have implemented the algorithm using the TD(0) advantage, i.e.
A(S_t) = R_{t+1} + \gamma V(S_{t+1}) - V(S_t)
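For reference, the clipped surrogate objective that the actor update in the code below optimizes is the standard PPO one (restated here for clarity, not taken from the post):
r_t(\theta) = \pi_\theta(a_t | s_t) / \pi_{\theta_{old}}(a_t | s_t)
L^{CLIP}(\theta) = E_t[ \min( r_t(\theta) A_t, \mathrm{clip}(r_t(\theta), 1-\epsilon, 1+\epsilon) A_t ) ]
The actor loss in update() is the negative of this mean, with A_t given by the TD(0) expression above.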
Here is my code:
def running_average(x, n):
N = n
kernel = np.ones(N)
conv_len = x.shape[0]-N
y = np.zeros(conv_len)
for i in range(conv_len):
        y[i] = kernel @ x[i:i+N]  # '@' is the matrix multiplication operator (np.matmul)
y[i] /= N
return y
class ActorNetwork(nn.Module):
def __init__(self, state_dim, n_actions, learning_rate=0.0003, epsilon_clipping=0.3, update_epochs=10):
super().__init__()
self.n_actions = n_actions
self.model = nn.Sequential(
nn.Linear(state_dim, 64),
nn.ReLU(),
nn.Linear(64, 32),
nn.ReLU(),
nn.Linear(32, n_actions),
nn.Softmax(dim=-1)
).float()
self.optimizer = optim.Adam(self.model.parameters(), lr=learning_rate)
self.epsilon_clipping = epsilon_clipping
self.update_epochs = update_epochs
def forward(self, X):
return self.model(X)
def predict(self, state):
if state.ndim < 2:
action_probs = self.model(torch.FloatTensor(state).unsqueeze(0).float())
else:
action_probs = self.model(torch.FloatTensor(state))
return action_probs.squeeze(0).data.numpy()
def update(self, states, actions, deltas, old_prob):
batch_size = len(states)
state_batch = torch.Tensor(states)
action_batch = torch.Tensor(actions)
delta_batch = torch.Tensor(deltas)
old_prob_batch = torch.Tensor(old_prob)
for k in range(self.update_epochs):
pred_batch = self.model(state_batch)
prob_batch = pred_batch.gather(dim=1, index=action_batch.long().view(-1, 1)).squeeze()
ratio = torch.exp(torch.log(prob_batch) - torch.log(old_prob_batch))
clipped = torch.clamp(ratio, 1 - self.epsilon_clipping, 1 + self.epsilon_clipping) * delta_batch
loss_r = -torch.min(ratio*delta_batch, clipped)
loss = torch.mean(loss_r)
self.optimizer.zero_grad()
loss.backward()
self.optimizer.step()
class CriticNetwork(nn.Module):
def __init__(self, state_dim, learning_rate=0.001):
super().__init__()
self.model = nn.Sequential(
nn.Linear(state_dim, 64),
nn.ReLU(),
nn.Linear(64, 32),
nn.ReLU(),
nn.Linear(32, 1),
).float()
self.optimizer = optim.Adam(self.model.parameters(), lr=learning_rate)
def forward(self, X):
return self.model(X)
def predict(self, state):
if state.ndim < 2:
values = self.model(torch.FloatTensor(state).unsqueeze(0).float())
else:
values = self.model(torch.FloatTensor(state))
return values.data.numpy()
def update(self, states, targets):
state_batch = torch.Tensor(states)
target_batch = torch.Tensor(targets)
pred_batch = self.model(state_batch)
loss = torch.nn.functional.mse_loss(pred_batch, target_batch.unsqueeze(1))
self.optimizer.zero_grad()
loss.backward()
self.optimizer.step()
def train_ppo_agent(env, episode_length, max_episodes, gamma, visualize_step, learning_rate_actor=0.0003, learning_rate_critic=0.001, epsilon_clipping=0.2, actor_update_epochs=10):
model_actor = ActorNetwork(env.observation_space.shape[0], env.action_space.n, learning_rate=learning_rate_actor,
epsilon_clipping=epsilon_clipping, update_epochs=actor_update_epochs)
model_critic = CriticNetwork(env.observation_space.shape[0], learning_rate=learning_rate_critic)
EPISODE_LENGTH = episode_length
MAX_EPISODES = max_episodes
GAMMA = gamma
VISUALIZE_STEP = max(1, visualize_step)
score = []
for episode in range(MAX_EPISODES):
curr_state = env.reset()
done = False
all_episode_t = []
score_episode = 0
for t in range(EPISODE_LENGTH):
act_prob = model_actor.predict(curr_state)
action = np.random.choice(np.array(list(range(env.action_space.n))), p=act_prob)
value = model_critic.predict(curr_state)
prev_state = curr_state
curr_state, reward, done, info = env.step(action)
score_episode += reward
e_t = {'state': prev_state, 'action':action, 'action_prob':act_prob[action],'reward': reward, 'value': value}
all_episode_t.append(e_t)
if done:
break
score.append(score_episode)
episode_values = [all_episode_t[t]['value'] for t in range(len(all_episode_t))]
next_state_estimates = [episode_values[i].item() for i in range(1, len(episode_values))]
next_state_estimates.append(0)
boostrap_estimate = []
for t in range(len(all_episode_t)):
G = all_episode_t[t]['reward'] + GAMMA * next_state_estimates[t]
boostrap_estimate.append(G)
episode_target = np.array(boostrap_estimate)
episode_values = np.array(episode_values)
# compute the advantage for each state in the episode: R_{t+1} + \gamma * V(S_{t+1}) - V_{t}
adv_batch = episode_target-episode_values
state_batch = np.array([all_episode_t[t]['state'] for t in range(len(all_episode_t))])
action_batch = np.array([all_episode_t[t]['action'] for t in range(len(all_episode_t))])
old_actor_prob = np.array([all_episode_t[t]['action_prob'] for t in range(len(all_episode_t))])
model_actor.update(state_batch, action_batch, adv_batch, old_actor_prob)
model_critic.update(state_batch, episode_target)
# print the status after every VISUALIZE_STEP episodes
if episode % VISUALIZE_STEP == 0 and episode > 0:
print('Episode {}\tAverage Score: {:.2f}'.format(episode, np.mean(score[-VISUALIZE_STEP:-1])))
# domain knowledge applied to stop training: if the average score across last 100 episodes is greater than 195, game is solved
if np.mean(score[-100:-1]) > 195:
break
# Training plot: Episodic reward over Training Episodes
score = np.array(score)
avg_score = running_average(score, visualize_step)
plt.figure(figsize=(15, 7))
plt.ylabel("Episodic Reward", fontsize=12)
plt.xlabel("Training Episodes", fontsize=12)
plt.plot(score, color='gray', linewidth=1)
plt.plot(avg_score, color='blue', linewidth=3)
plt.scatter(np.arange(score.shape[0]), score, color='green', linewidth=0.3)
plt.savefig("temp/cartpole_ppo_training_plot.pdf")
# return the trained models
return model_actor, model_critic
def main():
env = gym.make('CartPole-v0')
episode_length = 300
n_episodes = 5000
gamma = 0.99
vis_steps = 100
learning_rate_actor = 0.0003
actor_update_epochs = 10
epsilon_clipping = 0.2
learning_rate_critic = 0.001
# train the PPO agent
model_actor, model_critic = train_ppo_agent(env, episode_length, n_episodes, gamma, vis_steps,
learning_rate_actor=learning_rate_actor,
learning_rate_critic=learning_rate_critic,
epsilon_clipping=epsilon_clipping,
actor_update_epochs=actor_update_epochs)
Am I missing something, or is this kind of behaviour expected if one uses simple TD-0 advantages for PPO, given the nature of the Cartpole environment?
If you remove the "-" (the negative marker) in line:
loss_r = -torch.min(ratio*delta_batch, clipped)
The score will then start to increase steadily over time. Before this fix you had a negative loss that grew over time, which is not how a loss should behave for neural networks: gradient descent works by minimizing the loss, so you want a positive loss that the optimizer can drive down.
Hope my answer is somewhat clear, and sorry I cannot go into deeper detail.
My run can be seen in the attached image:
I save the trained model after a certain number of episodes with the save() function of the DDPG class (the network is saved once the episode reward reaches zero), but when I restore the model again using saver.restore(), the network gets a reward of approximately -1800. Why is this happening? Am I doing something wrong? My network:
import tensorflow as tf
import numpy as np
import gym
epsiode_steps = 500
# learning rate for actor
lr_a = 0.001
# learning rate for critic
lr_c = 0.002
gamma = 0.9
alpha = 0.01
memory = 10000
batch_size = 32
render = True
class DDPG(object):
def __init__(self, no_of_actions, no_of_states, a_bound, ):
self.memory = np.zeros((memory, no_of_states * 2 + no_of_actions + 1), dtype=np.float32)
# initialize pointer to point to our experience buffer
self.pointer = 0
self.sess = tf.Session()
self.noise_variance = 3.0
self.no_of_actions, self.no_of_states, self.a_bound = no_of_actions, no_of_states, a_bound,
self.state = tf.placeholder(tf.float32, [None, no_of_states], 's')
self.next_state = tf.placeholder(tf.float32, [None, no_of_states], 's_')
self.reward = tf.placeholder(tf.float32, [None, 1], 'r')
with tf.variable_scope('Actor'):
self.a = self.build_actor_network(self.state, scope='eval', trainable=True)
a_ = self.build_actor_network(self.next_state, scope='target', trainable=False)
with tf.variable_scope('Critic'):
q = self.build_crtic_network(self.state, self.a, scope='eval', trainable=True)
q_ = self.build_crtic_network(self.next_state, a_, scope='target', trainable=False)
self.ae_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/eval')
self.at_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/target')
self.ce_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/eval')
self.ct_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/target')
# update target value
self.soft_replace = [
[tf.assign(at, (1 - alpha) * at + alpha * ae), tf.assign(ct, (1 - alpha) * ct + alpha * ce)]
for at, ae, ct, ce in zip(self.at_params, self.ae_params, self.ct_params, self.ce_params)]
q_target = self.reward + gamma * q_
td_error = tf.losses.mean_squared_error(labels=(self.reward + gamma * q_), predictions=q)
self.ctrain = tf.train.AdamOptimizer(lr_c).minimize(td_error, name="adam-ink", var_list=self.ce_params)
a_loss = - tf.reduce_mean(q)
# train the actor network with adam optimizer for minimizing the loss
self.atrain = tf.train.AdamOptimizer(lr_a).minimize(a_loss, var_list=self.ae_params)
tf.summary.FileWriter("logs2", self.sess.graph)
# initialize all variables
self.sess.run(tf.global_variables_initializer())
self.saver = tf.train.Saver()
self.saver.restore(self.sess, "Pendulum/nn.ckpt")
def choose_action(self, s):
a = self.sess.run(self.a, {self.state: s[np.newaxis, :]})[0]
a = np.clip(np.random.normal(a, self.noise_variance), -2, 2)
return a
def learn(self):
# soft target replacement
self.sess.run(self.soft_replace)
indices = np.random.choice(memory, size=batch_size)
batch_transition = self.memory[indices, :]
batch_states = batch_transition[:, :self.no_of_states]
batch_actions = batch_transition[:, self.no_of_states: self.no_of_states + self.no_of_actions]
batch_rewards = batch_transition[:, -self.no_of_states - 1: -self.no_of_states]
batch_next_state = batch_transition[:, -self.no_of_states:]
self.sess.run(self.atrain, {self.state: batch_states})
self.sess.run(self.ctrain, {self.state: batch_states, self.a: batch_actions, self.reward: batch_rewards,
self.next_state: batch_next_state})
# we define a function store_transition which stores all the transition information in the buffer
def store_transition(self, s, a, r, s_):
trans = np.hstack((s, a, [r], s_))
index = self.pointer % memory
self.memory[index, :] = trans
self.pointer += 1
if self.pointer > memory:
self.noise_variance *= 0.99995
self.learn()
    # we define build_actor_network for building our actor network, and afterwards the critic network
    def build_actor_network(self, s, scope, trainable):
with tf.variable_scope(scope):
l1 = tf.layers.dense(s, 30, activation=tf.nn.tanh, name='l1', trainable=trainable)
a = tf.layers.dense(l1, self.no_of_actions, activation=tf.nn.tanh, name='a', trainable=trainable)
return tf.multiply(a, self.a_bound, name="scaled_a")
def build_crtic_network(self, s, a, scope, trainable):
with tf.variable_scope(scope):
n_l1 = 30
w1_s = tf.get_variable('w1_s', [self.no_of_states, n_l1], trainable=trainable)
w1_a = tf.get_variable('w1_a', [self.no_of_actions, n_l1], trainable=trainable)
b1 = tf.get_variable('b1', [1, n_l1], trainable=trainable)
net = tf.nn.tanh(tf.matmul(s, w1_s) + tf.matmul(a, w1_a) + b1)
q = tf.layers.dense(net, 1, trainable=trainable)
return q
def save(self):
self.saver.save(self.sess, "Pendulum/nn.ckpt")
env = gym.make("Pendulum-v0")
env = env.unwrapped
env.seed(1)
no_of_states = env.observation_space.shape[0]
no_of_actions = env.action_space.shape[0]
a_bound = env.action_space.high
ddpg = DDPG(no_of_actions, no_of_states, a_bound)
total_reward = []
no_of_episodes = 300
# for each episodes
for i in range(no_of_episodes):
# initialize the environment
s = env.reset()
# episodic reward
ep_reward = 0
for j in range(epsiode_steps):
env.render()
# select action by adding noise through OU process
a = ddpg.choose_action(s)
        # perform the action and move to the next state s_
s_, r, done, info = env.step(a)
        # store the transition in our experience buffer
# sample some minibatch of experience and train the network
ddpg.store_transition(s, a, r, s_)
# update current state as next state
s = s_
# add episodic rewards
ep_reward += r
if int(ep_reward) == 0 and i > 200:
ddpg.save()
print("save")
quit()
if j == epsiode_steps - 1:
total_reward.append(ep_reward)
print('Episode:', i, ' Reward: %i' % int(ep_reward))
break
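As an aside on the checkpointing pattern used above, a minimal save/restore round trip with tf.train.Saver looks like this. This is a generic TF 1.x (graph-mode) sketch, not the author's exact setup:

import tensorflow as tf

# build the graph first, then create the Saver once all variables exist
w = tf.get_variable("w", shape=[1], initializer=tf.zeros_initializer())
saver = tf.train.Saver()

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    saver.save(sess, "Pendulum/nn.ckpt")      # write the checkpoint

with tf.Session() as sess:
    saver.restore(sess, "Pendulum/nn.ckpt")   # restore assigns the saved values
    print(sess.run(w))

One thing worth double-checking when evaluating a restored checkpoint: choose_action above still adds Gaussian noise with noise_variance = 3.0 until the buffer pointer exceeds the memory size, which by itself can drag down the measured reward.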
After finishing Coursera's Practical RL course on A3C, I'm trying to implement my own A3C agent using TensorFlow 2. To start, I'm training it on the CartPole environment, but I can't get good results. So far I've launched several training runs with the following code, changing the entropy coefficient to see its impact (the results are shown below). Does the problem come from my implementation, or is it more of a fine-tuning issue?
class A3C:
def __init__(self, state_dim, n_actions, optimizer=tf.keras.optimizers.Adam(1e-3)):
self.state_input = Input(shape=state_dim)
self.x = Dense(256, activation='relu')(self.state_input)
self.head_v = Dense(1, activation='linear')(self.x)
self.head_p = Dense(n_actions, activation='linear')(self.x)
self.network = tf.keras.Model(inputs=[self.state_input], outputs=[self.head_v, self.head_p])
self.optimizer = optimizer
def forward(self, state):
return self.network(state)
def sample(self, logits):
policy = np.exp(logits.numpy()) / np.sum(np.exp(logits.numpy()), axis=-1, keepdims=True)
return np.array([np.random.choice(len(p), p=p) for p in policy])
def evaluate(agent, env, n_games=1):
    """Plays a game from start till done, returns per-game rewards."""
game_rewards = []
for _ in range(n_games):
state = env.reset()
total_reward = 0
while True:
action = agent.sample(agent.forward(np.array([state]))[1])[0]
state, reward, done, info = env.step(action)
total_reward += reward
if done: break
game_rewards.append(total_reward)
return game_rewards
class EnvBatch:
def __init__(self, n_envs = 10):
self.envs = [gym.make(env_id) for _ in range(n_envs)]
def reset(self):
return np.array([env.reset() for env in self.envs])
def step(self, actions):
results = [env.step(a) for env, a in zip(self.envs, actions)]
new_obs, rewards, done, infos = map(np.array, zip(*results))
for i in range(len(self.envs)):
if done[i]:
new_obs[i] = self.envs[i].reset()
return new_obs, rewards, done, infos
env_id = "CartPole-v0"
env = gym.make(env_id)
state_dim = env.observation_space.shape
n_actions = env.action_space.n
agent = A3C(state_dim, n_actions)
env_batch = EnvBatch(10)
batch_states = env_batch.reset()
gamma=0.99
rewards_history = []
entropy_history = []
for i in trange(200000):
with tf.GradientTape() as t:
batch_values, batch_logits = agent.forward(batch_states)
batch_actions = agent.sample(batch_logits)
batch_next_states, batch_rewards, batch_dones, _ = env_batch.step(batch_actions)
batch_next_values, btach_next_logits = agent.forward(batch_next_states)
batch_next_values *= (1 - batch_dones)
probs = tf.nn.softmax(batch_logits)
logprobs = tf.nn.log_softmax(batch_logits)
logp_actions = tf.reduce_sum(logprobs * tf.one_hot(batch_actions, n_actions), axis=-1)
advantage = batch_rewards + gamma*batch_next_values - batch_values
entropy = -tf.reduce_sum(probs * logprobs, 1, name="entropy")
actor_loss = - tf.reduce_mean(logp_actions * tf.stop_gradient(advantage)) - 0.005 * tf.reduce_mean(entropy)
target_state_values = batch_rewards + gamma*batch_next_values
critic_loss = tf.reduce_mean((batch_values - tf.stop_gradient(target_state_values))**2 )
loss = actor_loss + critic_loss
var_list = agent.network.trainable_variables
grads = t.gradient(loss,var_list)
agent.optimizer.apply_gradients(zip(grads, var_list))
batch_states = batch_next_states
entropy_history.append(np.mean(entropy))
if i % 500 == 0:
if i % 2500 == 0:
rewards_history.append(np.mean(evaluate(agent, env, n_games=3)))
clear_output(True)
plt.figure(figsize=[8, 4])
plt.subplot(1, 2, 1)
plt.plot(rewards_history, label='rewards')
plt.title("Session rewards")
plt.grid()
plt.legend()
plt.subplot(1, 2, 2)
plt.plot(entropy_history, label='entropy')
plt.title("Policy entropy")
plt.grid()
plt.legend()
plt.show()
[Plots: training curves for Beta = 0.005 (runs 1-3) and Beta = 0.05 (runs 1-3)]
I've looked through your code, and it doesn't look like there is any problem with the algorithm itself. That is, it seems to me that the hyperparameters were chosen incorrectly. Try different hyperparameter sets. If it still doesn't work properly, refer to the repository.
The critic loss is wrong. You should first compute the expected returns: take the predicted value of the last next state and iterate backwards over the rewards with the Bellman equation.
Here is an example:
def getExpectedReturns(self, states, next_states, done, rewards, standarize=True):
# Get next value
if done[-1] == 1.0:
arr_idx = np.zeros((rewards.shape[0], 1))
arr_idx[-1] = 1.0
values_rewards_sum_one_hot = tf.convert_to_tensor(arr_idx, dtype=tf.float32)
next_value = tf.reduce_sum(rewards * values_rewards_sum_one_hot, axis=0)
else:
values_rewards_sum = self.model_a2c(next_states)[-1]
arr_idx = np.zeros((rewards.shape[0], 1))
arr_idx[0] = 1.0
values_rewards_sum_one_hot = tf.convert_to_tensor(arr_idx, dtype=tf.float32)
next_value = tf.reduce_sum(values_rewards_sum * values_rewards_sum_one_hot, axis=0)
# Iterate over rewards
list_true_values = []
for i in reversed(range(0, len(rewards))):
if done[i]==0.0:
next_value = rewards[i] + next_value * self.gamma
else:
next_value = rewards[i]
list_true_values.append(next_value)
list_true_values.reverse()
list_true_values = tf.convert_to_tensor(list_true_values, dtype=tf.float32)
if standarize:
list_true_values = ((list_true_values - tf.math.reduce_mean(list_true_values)) /
(tf.math.reduce_std(list_true_values) + tf.constant(1e-12)))
return list_true_values
with tf.GradientTape() as tape:
# Advantage
returns = self.getExpectedReturns(states, next_states, done, rewards, standarize=False)
actions_probs_logits, values = self.model_a2c(states)
advantage = returns - values
advantage = tf.squeeze(advantage)
# Actions probs
actions_probs_softmax = tf.nn.softmax(actions_probs_logits)
actions_log_probs_softmax = tf.nn.log_softmax(actions_probs_logits)
actions_one_hot = tf.one_hot(actions, self.num_actions, 1.0, 0.0)
actions_log_probs = tf.reduce_sum(actions_log_probs_softmax * actions_one_hot, axis=-1)
# Entropy
entropy = self.entropy_coef * tf.reduce_mean(actions_probs_softmax * actions_log_probs_softmax, axis=1)
# Losses
actor_loss = -tf.reduce_mean(actions_log_probs * tf.stop_gradient(advantage), axis=0)
critic_loss = self.critic_coef * tf.reduce_mean(tf.math.pow(advantage, 2), axis=0)
total_loss = actor_loss + critic_loss - entropy