Reinforcement Learning - only size-1 arrays can be converted to Python scalars - is it a data problem?

I'm new to PyTorch, and even though I was searching for this error, I can't seem to understand where exactly I'm doing something wrong.
I'm trying to run code with a model that trades 3 different stocks. My data is a CSV file with three columns containing the closing prices of the stocks.
I'm trying to run this part of the code:
env.reset()

# In case you're running this a second time with the same model, delete the gradients
del model.rewards[:]
del model.saved_actions[:]

gamma = 0.9
log_interval = 60

def finish_episode():
    R = 0
    saved_actions = model.saved_actions
    policy_losses = []
    value_losses = []
    rewards = []
    for r in model.rewards[::-1]:
        R = r + (gamma * R)
        rewards.insert(0, R)
    rewards = torch.tensor(rewards)
    epsilon = (torch.rand(1) / 1e4) - 5e-5
    # With different architectures, I found the following standardization step sometimes
    # helpful, sometimes unhelpful.
    # rewards = (rewards - rewards.mean()) / (rewards.std(unbiased=False) + epsilon)
    # Alternatively, comment it out and use the following line instead:
    rewards += epsilon
    for (log_prob, value), r in zip(saved_actions, rewards):
        reward = torch.tensor(r - value.item()).cuda()
        policy_losses.append(-log_prob * reward)
        value_losses.append(F.smooth_l1_loss(value, torch.tensor([r]).cuda()))
    optimizer.zero_grad()
    loss = torch.stack(policy_losses).sum() + torch.stack(value_losses).sum()
    loss = torch.clamp(loss, -1e-5, 1e5)
    loss.backward()
    optimizer.step()
    del model.rewards[:]
    del model.saved_actions[:]

running_reward = 0
for episode in range(0, 4000):
    state = env.reset()
    reward = 0
    done = False
    msg = None
    while not done:
        action = model.act(state)
        state, reward, done, msg = env.step(action)
        model.rewards.append(reward)
        if done:
            break
    running_reward = running_reward * (1 - 1/log_interval) + reward * (1/log_interval)
    finish_episode()
    # Resetting the hidden state seems unnecessary - it's effectively random from the previous
    # episode anyway, more random than a bunch of zeros.
    # model.reset_hidden()
    if msg["msg"] == "done" and env.portfolio_value() > env.starting_portfolio_value * 1.1 and running_reward > 500:
        print("Early Stopping: " + str(int(reward)))
        break
    if episode % log_interval == 0:
        print("""Episode {}: started at {:.1f}, finished at {:.1f} because {} # t={}, \
last reward {:.1f}, running reward {:.1f}""".format(episode, env.starting_portfolio_value, \
            env.portfolio_value(), msg["msg"], env.cur_timestep, reward, running_reward))
But I'm getting this error:
TypeError Traceback (most recent call last)
<ipython-input-91-ce955397be85> in <module>()
45 msg = None
46 while not done:
---> 47 action = model.act(state)
48 state, reward, done, msg = env.step(action)
49 model.rewards.append(reward)
1 frames
<ipython-input-89-f463539c7fe3> in forward(self, x)
16
17 def forward(self, x):
---> 18 x = torch.tensor(x).cuda()
19 x = torch.sigmoid(self.input_layer(x))
20 x = torch.tanh(self.hidden_1(x))
TypeError: only size-1 arrays can be converted to Python scalars
This is the part of the code where the forward function is defined:
class Policy(nn.Module):
    def __init__(self):
        super(Policy, self).__init__()
        self.input_layer = nn.Linear(11, 128)
        self.hidden_1 = nn.Linear(128, 128)
        self.hidden_2 = nn.Linear(32,31)
        self.hidden_state = torch.tensor(torch.zeros(2,1,32)).cuda()
        self.rnn = nn.GRU(128, 32, 2)
        self.action_head = nn.Linear(31, 5)
        self.value_head = nn.Linear(31, 1)
        self.saved_actions = []
        self.rewards = []

    def reset_hidden(self):
        self.hidden_state = torch.tensor(torch.zeros(2,1,32)).cuda()

    def forward(self, x):
        x = torch.tensor(x).cuda()
        x = torch.sigmoid(self.input_layer(x))
        x = torch.tanh(self.hidden_1(x))
        x, self.hidden_state = self.rnn(x.view(1,-1,128), self.hidden_state.data)
        x = F.relu(self.hidden_2(x.squeeze()))
        action_scores = self.action_head(x)
        state_values = self.value_head(x)
        return F.softmax(action_scores, dim=-1), state_values

    def act(self, state):
        probs, state_value = self.forward(state)
        m = Categorical(probs)
        action = m.sample()
        if action == 1 and env.state[0] < 1: action = torch.LongTensor([2]).squeeze().cuda()
        if action == 4 and env.state[1] < 1: action = torch.LongTensor([2]).squeeze().cuda()
        if action == 6 and env.state[2] < 1: action = torch.LongTensor([2]).squeeze().cuda()
        self.saved_actions.append((m.log_prob(action), state_value))
        return action.item()
Can you please point me to where I should make changes? Is it the data I'm feeding the model, or something different?
Thank you so much for your help.

You are passing state = env.reset() down this chain:
action = model.act(state)
probs, state_value = self.forward(state)
x = torch.tensor(x).cuda()
and torch.tensor(x) is what throws the error. It expects a number or an array-like with a uniform numeric dtype. The message "only size-1 arrays can be converted to Python scalars" usually means x is a ragged or object-dtype array (for example, an array whose elements are themselves arrays or tuples), so its elements cannot be converted to scalars one by one. Print type(state) (and its dtype/shape if it is an array) right after env.reset(), and flatten it into a plain numeric vector before feeding it to the model.
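A minimal diagnostic sketch, assuming env.reset() returns something torch.tensor() cannot convert directly (env and the 11-input model come from the question; everything else here is illustrative, not the original code):

import numpy as np
import torch

state = env.reset()
print(type(state), getattr(state, "dtype", None), getattr(state, "shape", None))

# If state turns out to be a ragged/object array (e.g. one entry per stock plus
# portfolio info), flatten it into a single float32 vector before calling the model:
flat_state = np.concatenate([np.ravel(np.asarray(s, dtype=np.float32)) for s in state])

# Inside forward(), an explicit conversion avoids the element-by-element scalar path:
x = torch.as_tensor(flat_state, dtype=torch.float32).cuda()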

Related

Python negative Value Error dimensions are not allowed

I am implementing a genetic algorithm, but I am facing an error after the first generation with the message: ValueError: negative dimensions are not allowed
I actually changed the nfilters parameter from nfilters=[74,27,23] to nfilters=[64,128,256]; I don't know if the error is due to these parameters.
I declared my Sequential subclass as follows:
class CNN(Sequential):
    def __init__(self,nfilters,sfilters):
        super().__init__()
        tf.random.set_seed(0)
        self.add(Conv2D(nfilters[0],kernel_size=(sfilters[0],sfilters[0]),padding='same',activation='relu',input_shape=(50,50,3)))
        self.add(MaxPooling2D(pool_size=(2,2),strides=(2,2)))
        self.add(Conv2D(nfilters[1],kernel_size=(sfilters[1],sfilters[1]),padding='same',activation='relu'))
        self.add(MaxPooling2D(pool_size=(2,2),strides=(2,2)))
        self.add(Conv2D(nfilters[2],kernel_size=(sfilters[2],sfilters[2]),padding='same',activation='relu'))
        self.add(Conv2D(nfilters[2], kernel_size=(sfilters[2], sfilters[2]), padding='same', activation='relu'))
        self.add(Flatten())
        self.add(Dropout(0.5))
        self.add(Dense(128,activation='relu'))
        self.add(Dropout(0.5))
        self.add(Dense(128, activation='relu'))
        self.add(Dense(num_classes, activation='sigmoid'))
        self.compile(loss=keras.losses.binary_crossentropy,
                     optimizer=tf.optimizers.Adam(learning_rate=0.001),
                     metrics=['accuracy'])

nfilters = [64,128,256] #nfilters = [74,27,23]
sfilters = [9,3,2] #sfilters = [9,3,2]
Then my Genetic class is declared as follows:
class Genetic:
    def __init__(self,pop_size,nlayers,max_nfilters,max_sfilters):
        self.pop_size = pop_size
        self.nlayers = nlayers
        self.max_nfilters = max_nfilters
        self.max_sfilters = max_sfilters
        self.max_acc = 0
        self.best_arch = np.zeros((1,6))
        self.gen_acc = []

    def generate_population(self):
        np.random.seed(0)
        pop_nlayers = np.random.randint(1,self.max_nfilters,(self.pop_size,self.nlayers))
        pop_sfilters = np.random.randint(1,self.max_sfilters,(self.pop_size,self.nlayers))
        pop_total = np.concatenate((pop_nlayers,pop_sfilters),axis=1)
        return pop_total

    def select_parents(self,pop,nparents,fitness):
        parents = np.zeros((nparents,pop.shape[1]))
        for i in range(nparents):
            best = np.argmax(fitness)
            parents[i] = pop[best]
            fitness[best] = -99999
        return parents

    def crossover(self,parents):
        nchild = self.pop_size - parents.shape[0]
        nparents = parents.shape[0]
        child = np.zeros((nchild,parents.shape[1]))
        for i in range(nchild):
            first = i % nparents
            second = (i+1) % nparents
            child[i,:2] = parents[first][:2]
            child[i,2] = parents[second][2]
            child[i,3:5] = parents[first][3:5]
            child[i,5] = parents[second][5]
        return child

    def mutation(self,child):
        for i in range(child.shape[0]):
            val = np.random.randint(1,6)
            ind = np.random.randint(1,4) - 1
            if child[i][ind] + val > 100:
                child[i][ind] -= val
            else:
                child[i][ind] += val
            val = np.random.randint(1,4)
            ind = np.random.randint(4,7) - 1
            if child[i][ind] + val > 20:
                child[i][ind] -= val
            else:
                child[i][ind] += val
        return child

    def fitness(self,pop,X,Y,epochs):
        pop_acc = []
        for i in range(pop.shape[0]):
            nfilters = pop[i][0:3]
            sfilters = pop[i][3:]
            model = CNN(nfilters,sfilters)
            #H = model.fit_generator(datagen.flow(X,Y,batch_size=256),epochs=epochs,callbacks=[early_stopping_monitor])
            H = model.fit_generator(datagen.flow(X,Y,batch_size=256),steps_per_epoch=len(X_trainRusReshaped) / batch_size,epochs=epochs,validation_data=(X_testRusReshaped, Y_testRusHot),callbacks=[early_stopping_monitor])
            acc = H.history['accuracy']
            pop_acc.append(max(acc)*100)
        if max(pop_acc) > self.max_acc:
            self.max_acc = max(pop_acc)
            self.best_arch = pop[np.argmax(pop_acc)]
        self.gen_acc.append(max(pop_acc))
        return pop_acc

    def smooth_curve(self,factor,gen):
        smoothed_points = []
        for point in self.gen_acc:
            if smoothed_points:
                prev = smoothed_points[-1]
                smoothed_points.append(prev*factor + point * (1-factor))
            else:
                smoothed_points.append(point)
        plt.plot(range(gen+1),smoothed_points,'g',label='Smoothed training acc')
        plt.xticks(np.arange(gen+1))
        plt.legend()
        plt.title('Fitness Accuracy vs Generations')
        plt.xlabel('Generations')
        plt.ylabel('Fitness (%)')
        plt.show()
        plt.savefig('smoothCurve.png')
When I run these lines of code, I get the error after 20 epochs in the first generation:
#Starting Genetic Algoritm
pop_size = 2 #10
nlayers = 3 #3
max_nfilters = 500 #100
max_sfilters = 20
epochs = 20
num_generations = 2 #10

genCNN = Genetic(pop_size,nlayers,max_nfilters,max_sfilters)
pop = genCNN.generate_population()
for i in range(num_generations+1):
    pop_acc = genCNN.fitness(pop,X_trainRusReshaped,Y_trainRusHot,epochs)
    print('Best Accuracy at the generation {}: {}'.format(i,genCNN.max_acc))
    parents = genCNN.select_parents(pop,5,pop_acc.copy())
    child = genCNN.crossover(parents)
    child = genCNN.mutation(child)
    pop = np.concatenate((parents,child),axis=0).astype('int')
Any idea where this error is coming from? I tried increasing max_nfilters from 100 to 500, but it did not solve anything.

TypeError: argument of type 'method' is not iterable using RL Classifiers

I am working on a classification problem using Reinforcement Learning, following https://github.com/gcamfer/Anomaly-ReactionRL/blob/master/Notebooks/AE_RL_awid.ipynb
I am facing an error in the part of the code shown below:
if __name__ == "__main__":
    # Train batch
    batch_size = 1
    # batch of memory ExpRep
    minibatch_size = 10
    ExpRep = True
    iterations_episode = 100

    # Initialization of the enviroment
    env = RLenv("train",batch_size=batch_size,
                iterations_episode=iterations_episode)
    # obs_size = size of the state
    obs_size = env.data_shape[1]-len(env.all_attack_names)

    #num_episodes = int(env.data_shape[0]/(iterations_episode)/10)
    num_episodes = 100

    '''
    Definition for the defensor agent.
    '''
    defender_valid_actions = list(range(len(env.attack_types))) # only detect type of attack
    defender_num_actions = len(defender_valid_actions)

    def_epsilon = 1 # exploration
    min_epsilon = 0.01 # min value for exploration
    def_gamma = 0.001
    def_decay_rate = 0.999
    def_hidden_size = 100
    def_hidden_layers = 2
    def_learning_rate = .01

    defender_agent = DefenderAgent(defender_valid_actions,obs_size,"EpsilonGreedy",
                                   epoch_length = iterations_episode,
                                   epsilon = def_epsilon,
                                   min_epsilon = min_epsilon,
                                   decay_rate = def_decay_rate,
                                   gamma = def_gamma,
                                   hidden_size=def_hidden_size,
                                   hidden_layers=def_hidden_layers,
                                   minibatch_size = minibatch_size,
                                   mem_size = 1000,
                                   learning_rate=def_learning_rate,
                                   ExpRep=ExpRep)
    #Pretrained defender
    #defender_agent.model_network.model.load_weights("models/type_model.h5")

    '''
    Definition for the attacker agent.
    In this case the exploration is better to be greater
    The correlation sould be greater too so gamma bigger
    '''
    attack_valid_actions = list(range(len(env.attack_names)))
    attack_num_actions = len(attack_valid_actions)

    att_epsilon = 1
    min_epsilon = 0.99 # min value for exploration
    att_gamma = 0.001
    att_decay_rate = 0.99
    att_hidden_layers = 1
    att_hidden_size = 100
    att_learning_rate = 0.2

    attacker_agent = AttackAgent(attack_valid_actions,obs_size,"EpsilonGreedy",
                                 epoch_length = iterations_episode,
                                 epsilon = att_epsilon,
                                 min_epsilon = min_epsilon,
                                 decay_rate = att_decay_rate,
                                 gamma = att_gamma,
                                 hidden_size=att_hidden_size,
                                 hidden_layers=att_hidden_layers,
                                 minibatch_size = minibatch_size,
                                 mem_size = 1000,
                                 learning_rate=att_learning_rate,
                                 ExpRep=ExpRep)

    # Statistics
    att_reward_chain = []
    def_reward_chain = []
    att_loss_chain = []
    def_loss_chain = []
    def_total_reward_chain = []
    att_total_reward_chain = []

    # Print parameters
    print("-------------------------------------------------------------------------------")
    print("Total epoch: {} | Iterations in epoch: {}"
          "| Minibatch from mem size: {} | Total Samples: {}|".format(num_episodes,
          iterations_episode,minibatch_size,
          num_episodes*iterations_episode))
    print("-------------------------------------------------------------------------------")
    print("Dataset shape: {}".format(env.data_shape))
    print("-------------------------------------------------------------------------------")
    print("Attacker parameters: Num_actions={} | gamma={} |"
          " epsilon={} | ANN hidden size={} | "
          "ANN hidden layers={}|".format(attack_num_actions,
          att_gamma,att_epsilon, att_hidden_size,
          att_hidden_layers))
    print("-------------------------------------------------------------------------------")
    print("Defense parameters: Num_actions={} | gamma={} | "
          "epsilon={} | ANN hidden size={} |"
          " ANN hidden layers={}|".format(defender_num_actions,
          def_gamma,def_epsilon,def_hidden_size,
          def_hidden_layers))
    print("-------------------------------------------------------------------------------")

    # Main loop
    attacks_by_epoch = []
    attack_labels_list = []
    for epoch in range(num_episodes):
        start_time = time.time()
        att_loss = 0.
        def_loss = 0.
        def_total_reward_by_episode = 0
        att_total_reward_by_episode = 0

        # Reset enviromet, actualize the data batch with random state/attacks
        states = env.reset()

        # Get actions for actual states following the policy
        attack_actions = attacker_agent.act(states)
        states = env.get_states(attack_actions)

        done = False
        attacks_list = []

        # Iteration in one episode
        for i_iteration in range(iterations_episode):
            attacks_list.append(attack_actions[0])

            # apply actions, get rewards and new state
            act_time = time.time()
            defender_actions = defender_agent.act(states)
            #Enviroment actuation for this actions
            next_states,def_reward, att_reward,next_attack_actions, done = env.act(defender_actions,attack_actions)
            # If the epoch*batch_size*iterations_episode is largest than the df

            attacker_agent.learn(states,attack_actions,next_states,att_reward,done)
            defender_agent.learn(states,defender_actions,next_states,def_reward,done)

            act_end_time = time.time()

            # Train network, update loss after at least minibatch_learns
            if ExpRep and epoch*iterations_episode + i_iteration >= minibatch_size:
                def_loss += defender_agent.update_model()
                att_loss += attacker_agent.update_model()
            elif not ExpRep:
                def_loss += defender_agent.update_model()
                att_loss += attacker_agent.update_model()

            update_end_time = time.time()

            # Update the state
            states = next_states
            attack_actions = next_attack_actions

            # Update statistics
            def_total_reward_by_episode += np.sum(def_reward,dtype=np.int32)
            att_total_reward_by_episode += np.sum(att_reward,dtype=np.int32)

        attacks_by_epoch.append(attacks_list)
        # Update user view
        def_reward_chain.append(def_total_reward_by_episode)
        att_reward_chain.append(att_total_reward_by_episode)
        def_loss_chain.append(def_loss)
        att_loss_chain.append(att_loss)

        end_time = time.time()
        print("\r\n|Epoch {:03d}/{:03d}| time: {:2.2f}|\r\n"
              "|Def Loss {:4.4f} | Def Reward in ep {:03d}|\r\n"
              "|Att Loss {:4.4f} | Att Reward in ep {:03d}|"
              .format(epoch, num_episodes,(end_time-start_time),
                      def_loss, def_total_reward_by_episode,
                      att_loss, att_total_reward_by_episode))
        print("|Def Estimated: {}| Att Labels: {}".format(env.def_estimated_labels,
                                                          env.def_true_labels))
        attack_labels_list.append(env.def_true_labels)
and the error I am facing is:
TypeError: argument of type 'method' is not iterable
Any help would be appreciated. Thanks in advance.

I get horrible results with my DDPG model TF2

Hello, my DDPG model, which I have implemented in TF 2, gets horrible results on every OpenAI Gym env with continuous actions, and I need help finding the problem. I run this on my GPU. On the Pendulum env I get rewards of -1200/-1000 on every episode. This code is from a course I took on Udemy; it was written in TF 1.x and I rewrote it in TF 2, but the instructor's TF 1.x implementation had better results. Here is the code:
import tensorflow as tf
import numpy as np
import os
import gym
import random
import matplotlib.pyplot as plt
from tensorflow.keras.layers import Input, Dense, concatenate
from tensorflow.keras.models import Model
class ReplayBuffer():
    def __init__(self, obs_dim, act_dim, size):
        self.obs1_buf = np.zeros([size, obs_dim, ], dtype=np.float32)
        self.obs2_buf = np.zeros([size, obs_dim, ], dtype=np.float32)
        self.act_buf = np.zeros([size, act_dim], dtype=np.float32)
        self.reward_buf = np.zeros(size, dtype=np.float32)
        self.done_buf = np.zeros(size, dtype=np.float32)
        self.current = 0
        self.count = 0
        self.size = size

    def add_experience(self, state, action, reward, next_state, done):
        self.obs1_buf[self.current] = state
        self.act_buf[self.current] = action
        self.reward_buf[self.current] = reward
        self.obs2_buf[self.current] = next_state
        self.done_buf[self.current] = done
        self.current = (self.current + 1) % self.size
        self.count = min(self.count+1, self.size)

    def sample_batch(self, batch_size=32):
        idx = np.random.randint(0, self.count, size=batch_size)
        return dict(s=self.obs1_buf[idx],
                    s2=self.obs2_buf[idx],
                    a=self.act_buf[idx],
                    r=self.reward_buf[idx],
                    d=self.done_buf[idx])
class DDPG():
    def __init__(self, env, num_states, num_actions, action_max):
        self.env = env
        self.num_states = num_states
        self.num_actions = num_actions
        self.action_max = action_max
        self.gamma = 0.99
        self.decay = 0.995
        self.mu_optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
        self.q_optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)

        def mu_model(hidden_layers):
            inp = Input(shape=(self.num_states, ))
            x = inp
            for layers in hidden_layers[:-1]:
                x = Dense(layers, activation='relu')(x)
            x = Dense(hidden_layers[-1], activation='tanh')(x)
            mu_model = Model(inp, x)
            return mu_model
        self.mu_model = mu_model([300, self.num_actions])

        def q_model(inp_state, inp_act, hidden_layers):
            inp_state = Input(shape=(inp_state, ))
            inp_mu = Input(shape=(inp_act, ))
            inp = concatenate([inp_state, inp_mu])
            x = inp
            for layers in hidden_layers[:-1]:
                x = Dense(layers, activation='relu')(x)
            x = Dense(hidden_layers[-1], activation='linear')(x)
            q_model = Model([inp_state, inp_mu], x)
            return q_model
        self.q_model = q_model(self.num_states, self.num_actions, hidden_layers=[300, 1])
        self.q_target_model = q_model(self.num_states, self.num_actions, hidden_layers=[300, 1])

        #Eself.mu_do_minimize = tf.function(self.mu_minimize, input_signature=[
        #tf.TensorSpec(shape=(None, self.num_states), dtype=tf.float32, name='state')])
        self.q_do_minimize = tf.function(self.q_minimize, input_signature=[
            tf.TensorSpec(shape=(None, self.num_states), dtype=tf.float32, name='state'),
            tf.TensorSpec(shape=(None, self.num_actions), dtype=tf.float32, name='action'),
            tf.TensorSpec(shape=(None, self.num_states), dtype=tf.float32, name='next_state'),
            tf.TensorSpec(shape=(None, ), dtype=tf.float32, name='reward'),
            tf.TensorSpec(shape=(None, ), dtype=tf.float32, name='done_flags')])

    #tf.function
    def train_mu(self, state):
        with tf.GradientTape() as tape:
            actions = self.mu_model(state, training=True)
            critic_value = self.q_model([state, actions], training=True)
            # Used `-value` as we want to maximize the value given
            # by the critic for our actions
            actor_loss = -tf.math.reduce_mean(critic_value)
        actor_grad = tape.gradient(actor_loss, self.mu_model.trainable_variables)
        self.mu_optimizer.apply_gradients(
            zip(actor_grad, self.mu_model.trainable_variables)
        )

    def q_minimize(self, state, action, next_state, reward, done):
        def calc_loss():
            q_targ = reward + self.gamma * (1 - done) * self.q_target_model([next_state, action])
            q = self.q_model([state, action])
            cost = tf.reduce_mean((q - q_targ)**2)
            return cost
        self.q_optimizer.minimize(calc_loss, self.q_model.trainable_variables)

    def train(self, state, action, reward, done, next_state):
        state = np.atleast_2d(state)
        next_state = np.atleast_2d(next_state)
        action = np.atleast_2d(action)
        reward = np.atleast_1d(reward)
        done = np.atleast_1d(done)
        self.update_target_net()
        self.train_mu(state)
        self.q_do_minimize(state, action, next_state, reward, done)

    def update_target_net(self):
        mu_weights = np.array(self.mu_model.get_weights())
        q_weights = np.array(self.q_model.get_weights())
        #print(mu_weights.shape)
        #print(q_weights.shape)
        mu_target_weights = np.array(self.mu_target_model.get_weights())
        q_target_weights = np.array(self.q_target_model.get_weights())
        self.q_target_model.set_weights(self.decay * q_weights + (1 - self.decay) * q_target_weights)

    def get_action(self, states, noise=None):
        if noise is None: noise = self.ACT_NOISE_SCALE
        if len(states.shape) == 1: states = states.reshape(1,-1)
        action = self.mu_model.predict_on_batch(states)[0]
        if noise != 0:
            action += noise * np.random.randn(self.num_actions)
        action = np.clip(action, -self.action_max, self.action_max)
        return action
def play_one(env, agent, replay_buffer, gamma=0.99, noise=0.1, max_episode_len=1000, start_steps=10000, num_train_ep=100, batch_size=100, test_ep_agent=25):
    returns = []
    num_steps = 0
    for ep in range(num_train_ep):
        s, ep_return, ep_len, d = env.reset(), 0, 0, False
        while not (d or ep_len == max_episode_len):
            env.render()
            if num_steps > start_steps:
                a = agent.get_action(s, noise)
            else:
                a = env.action_space.sample()
            num_steps += 1
            if num_steps == start_steps:
                print("USING AGENT ACTIONS NOW")
            s2, r, d, _ = env.step(a)
            ep_return += r
            ep_len += 1
            #print(s.shape)
            d = False if ep_len == max_episode_len else d
            replay_buffer.add_experience(s, a, r, s2, d)
            s = s2
        for _ in range(ep_len):
            batch = replay_buffer.sample_batch()
            state, next_state, action, reward, done = batch['s'], batch['s2'], batch['a'], batch['r'], batch['d']
            loss = agent.train(state, action, reward, done, next_state)
        returns.append(ep_return)
        print('Iter:', ep, 'Rewards:', ep_return)
    return returns
if __name__ == '__main__':
    env = gym.make('Pendulum-v0')
    obs_dim1 = env.observation_space.shape[0]
    act_dim1 = env.action_space.shape[0]
    action_max1 = env.action_space.high[0]
    actor = DDPG(env, obs_dim1, act_dim1, action_max1)
    replay_buffer = ReplayBuffer(obs_dim1, act_dim1, size=100000)
    returns = play_one(env, actor, replay_buffer)
Thank you in advance!
The first thing that comes to mind is the learning rate: 0.01 is too high, even for Pendulum. Try a lower learning rate (e.g. 1e-3 for the actor and 5e-3 for the critic).
Also, a couple of things look off in your code:
There is no target network for the actor. Why is that? DDPG has target networks for both the actor and the critic.
Usually it is better to initialize the main and target networks with the same parameters. You can do that with target_model.set_weights(model.get_weights()) (see the sketch after this list).
In the function play_one the training steps are done after playing a whole episode. This is probably OK, but there is no need to: because Pendulum is not real time you don't need your code to be fast, so you can train while playing.
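A minimal sketch of those two points, assuming the model and attribute names from the question (mu_model, q_model, q_target_model, decay); the mu_target_model attribute and the soft_update helper are additions for illustration, not the original code:

def soft_update(target, source, decay):
    # Polyak averaging: target <- decay * target + (1 - decay) * source
    new_weights = [decay * t + (1.0 - decay) * s
                   for t, s in zip(target.get_weights(), source.get_weights())]
    target.set_weights(new_weights)

# In DDPG.__init__, after building the networks (assumed placement):
# self.mu_target_model = mu_model([300, self.num_actions])
# self.mu_target_model.set_weights(self.mu_model.get_weights())  # same initial parameters
# self.q_target_model.set_weights(self.q_model.get_weights())    # same initial parameters

# In update_target_net (assumed placement):
# soft_update(self.mu_target_model, self.mu_model, self.decay)
# soft_update(self.q_target_model, self.q_model, self.decay)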
If you want to take a look I implemented ddpg in tensorflow 2 a while back. It solves pendulum in 80ish episodes.

Implementing A3C on TensorFlow 2

After finishing Coursera's Practical RL course on A3C, I'm trying to implement my own A3C agent using TensorFlow 2. To start, I'm training it on the CartPole environment, but I can't get good results. So far I've launched several training runs with the following code, changing the entropy coefficient to see its impact (the results are shown below). Does the problem come from my implementation, or is it more of a fine-tuning issue?
class A3C:
    def __init__(self, state_dim, n_actions, optimizer=tf.keras.optimizers.Adam(1e-3)):
        self.state_input = Input(shape=state_dim)
        self.x = Dense(256, activation='relu')(self.state_input)
        self.head_v = Dense(1, activation='linear')(self.x)
        self.head_p = Dense(n_actions, activation='linear')(self.x)
        self.network = tf.keras.Model(inputs=[self.state_input], outputs=[self.head_v, self.head_p])
        self.optimizer = optimizer

    def forward(self, state):
        return self.network(state)

    def sample(self, logits):
        policy = np.exp(logits.numpy()) / np.sum(np.exp(logits.numpy()), axis=-1, keepdims=True)
        return np.array([np.random.choice(len(p), p=p) for p in policy])

def evaluate(agent, env, n_games=1):
    """Plays a game from start till done, returns per-game rewards"""
    game_rewards = []
    for _ in range(n_games):
        state = env.reset()
        total_reward = 0
        while True:
            action = agent.sample(agent.forward(np.array([state]))[1])[0]
            state, reward, done, info = env.step(action)
            total_reward += reward
            if done: break
        game_rewards.append(total_reward)
    return game_rewards

class EnvBatch:
    def __init__(self, n_envs = 10):
        self.envs = [gym.make(env_id) for _ in range(n_envs)]

    def reset(self):
        return np.array([env.reset() for env in self.envs])

    def step(self, actions):
        results = [env.step(a) for env, a in zip(self.envs, actions)]
        new_obs, rewards, done, infos = map(np.array, zip(*results))
        for i in range(len(self.envs)):
            if done[i]:
                new_obs[i] = self.envs[i].reset()
        return new_obs, rewards, done, infos

env_id = "CartPole-v0"
env = gym.make(env_id)
state_dim = env.observation_space.shape
n_actions = env.action_space.n

agent = A3C(state_dim, n_actions)
env_batch = EnvBatch(10)
batch_states = env_batch.reset()
gamma = 0.99
rewards_history = []
entropy_history = []

for i in trange(200000):
    with tf.GradientTape() as t:
        batch_values, batch_logits = agent.forward(batch_states)
        batch_actions = agent.sample(batch_logits)
        batch_next_states, batch_rewards, batch_dones, _ = env_batch.step(batch_actions)
        batch_next_values, btach_next_logits = agent.forward(batch_next_states)
        batch_next_values *= (1 - batch_dones)
        probs = tf.nn.softmax(batch_logits)
        logprobs = tf.nn.log_softmax(batch_logits)
        logp_actions = tf.reduce_sum(logprobs * tf.one_hot(batch_actions, n_actions), axis=-1)
        advantage = batch_rewards + gamma*batch_next_values - batch_values
        entropy = -tf.reduce_sum(probs * logprobs, 1, name="entropy")
        actor_loss = - tf.reduce_mean(logp_actions * tf.stop_gradient(advantage)) - 0.005 * tf.reduce_mean(entropy)
        target_state_values = batch_rewards + gamma*batch_next_values
        critic_loss = tf.reduce_mean((batch_values - tf.stop_gradient(target_state_values))**2 )
        loss = actor_loss + critic_loss
    var_list = agent.network.trainable_variables
    grads = t.gradient(loss, var_list)
    agent.optimizer.apply_gradients(zip(grads, var_list))
    batch_states = batch_next_states
    entropy_history.append(np.mean(entropy))
    if i % 500 == 0:
        if i % 2500 == 0:
            rewards_history.append(np.mean(evaluate(agent, env, n_games=3)))
        clear_output(True)
        plt.figure(figsize=[8, 4])
        plt.subplot(1, 2, 1)
        plt.plot(rewards_history, label='rewards')
        plt.title("Session rewards")
        plt.grid()
        plt.legend()
        plt.subplot(1, 2, 2)
        plt.plot(entropy_history, label='entropy')
        plt.title("Policy entropy")
        plt.grid()
        plt.legend()
        plt.show()
[Training-curve plots omitted: Beta = 0.005 (Trainings 1-3) and Beta = 0.05 (Trainings 1-3)]
I've looked through your code, and it doesn't look like there's any problem with the algorithm itself; it seems to me that the hyperparameters were chosen incorrectly. Try different hyperparameter sets. If it still doesn't work properly, refer to the repository.
The critic loss is wrong. You should first compute the expected returns: predict the value of the next state, then iterate backwards over the rewards with the Bellman equation.
Here is an example:
def getExpectedReturns(self, states, next_states, done, rewards, standarize=True):
    # Get next value
    if done[-1] == 1.0:
        arr_idx = np.zeros((rewards.shape[0], 1))
        arr_idx[-1] = 1.0
        values_rewards_sum_one_hot = tf.convert_to_tensor(arr_idx, dtype=tf.float32)
        next_value = tf.reduce_sum(rewards * values_rewards_sum_one_hot, axis=0)
    else:
        values_rewards_sum = self.model_a2c(next_states)[-1]
        arr_idx = np.zeros((rewards.shape[0], 1))
        arr_idx[0] = 1.0
        values_rewards_sum_one_hot = tf.convert_to_tensor(arr_idx, dtype=tf.float32)
        next_value = tf.reduce_sum(values_rewards_sum * values_rewards_sum_one_hot, axis=0)

    # Iterate over rewards
    list_true_values = []
    for i in reversed(range(0, len(rewards))):
        if done[i] == 0.0:
            next_value = rewards[i] + next_value * self.gamma
        else:
            next_value = rewards[i]
        list_true_values.append(next_value)
    list_true_values.reverse()
    list_true_values = tf.convert_to_tensor(list_true_values, dtype=tf.float32)

    if standarize:
        list_true_values = ((list_true_values - tf.math.reduce_mean(list_true_values)) /
                            (tf.math.reduce_std(list_true_values) + tf.constant(1e-12)))
    return list_true_values

with tf.GradientTape() as tape:
    # Advantage
    returns = self.getExpectedReturns(states, next_states, done, rewards, standarize=False)
    actions_probs_logits, values = self.model_a2c(states)
    advantage = returns - values
    advantage = tf.squeeze(advantage)

    # Actions probs
    actions_probs_softmax = tf.nn.softmax(actions_probs_logits)
    actions_log_probs_softmax = tf.nn.log_softmax(actions_probs_logits)
    actions_one_hot = tf.one_hot(actions, self.num_actions, 1.0, 0.0)
    actions_log_probs = tf.reduce_sum(actions_log_probs_softmax * actions_one_hot, axis=-1)

    # Entropy
    entropy = self.entropy_coef * tf.reduce_mean(actions_probs_softmax * actions_log_probs_softmax, axis=1)

    # Losses
    actor_loss = -tf.reduce_mean(actions_log_probs * tf.stop_gradient(advantage), axis=0)
    critic_loss = self.critic_coef * tf.reduce_mean(tf.math.pow(advantage, 2), axis=0)
    total_loss = actor_loss + critic_loss - entropy

Pytorch PPO implementation is not learning

This PPO implementation has a bug somewhere and I can't figure out what's wrong. The network returns a normal distribution and a value estimate from the critic. The last layer of the actor provides four F.tanhed action values, which are used as the mean of the distribution. nn.Parameter(torch.zeros(action_dim)) is the standard deviation.
The trajectories for 20 parallel agents are added to the same memory. Episode length is 1000, and memory.sample() returns a np.random.permutation of the 20k memory entries as tensors in batches of size 64. Before stacking the batch tensors, the values are stored as (1,-1) tensors in collections.deques. The returned tensors are detach()ed.
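For reference, the policy network itself is not shown in the question; a minimal sketch of what the description implies (a Gaussian actor with tanh-squashed means, a state-independent std parameter, and a critic value head) could look like the following. The class and layer names here are hypothetical, not the original code:

import torch
import torch.nn as nn

class ActorCritic(nn.Module):
    def __init__(self, state_dim, action_dim, hidden=64):
        super().__init__()
        self.actor = nn.Sequential(nn.Linear(state_dim, hidden), nn.ReLU(),
                                   nn.Linear(hidden, action_dim))
        self.critic = nn.Sequential(nn.Linear(state_dim, hidden), nn.ReLU(),
                                    nn.Linear(hidden, 1))
        # The question uses nn.Parameter(torch.zeros(action_dim)) directly as the std;
        # treating it as a log-std (exponentiated here) avoids a zero std at initialization.
        self.log_std = nn.Parameter(torch.zeros(action_dim))

    def forward(self, states):
        mean = torch.tanh(self.actor(states))   # tanh-squashed action means
        dist = torch.distributions.Normal(mean, self.log_std.exp())
        value = self.critic(states)
        return dist, value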
environment
brain_name = envs.brain_names[0]
env_info = envs.reset(train_mode=True)[brain_name]
env_info = envs.step(actions.cpu().detach().numpy())[brain_name]
next_states = env_info.vector_observations
rewards = env_info.rewards
dones = env_info.local_done
update step
def clipped_surrogate_update(policy, memory, num_epochs=10, clip_param=0.2, gradient_clip=5, beta=0.001, value_loss_coeff=0.5):
    advantages_batch, states_batch, log_probs_old_batch, returns_batch, actions_batch = memory.sample()
    advantages_batch = (advantages_batch - advantages_batch.mean()) / advantages_batch.std()

    for _ in range(num_epochs):
        for i in range(len(advantages_batch)):
            advantages_sample = advantages_batch[i]
            states_sample = states_batch[i]
            log_probs_old_sample = log_probs_old_batch[i]
            returns_sample = returns_batch[i]
            actions_sample = actions_batch[i]

            dist, values = policy(states_sample)
            log_probs_new = dist.log_prob(actions_sample.to(device)).sum(-1).unsqueeze(-1)
            entropy = dist.entropy().sum(-1).unsqueeze(-1).mean()

            ratio = (log_probs_new - log_probs_old_sample).exp()
            clipped_ratio = torch.clamp(ratio, 1-clip_param, 1+clip_param)
            clipped_surrogate_loss = -torch.min(ratio*advantages_sample, clipped_ratio*advantages_sample).mean()
            value_function_loss = (returns_sample - values).pow(2).mean()

            Loss = clipped_surrogate_loss - beta * entropy + value_loss_coeff * value_function_loss

            optimizer_policy.zero_grad()
            Loss.backward()
            torch.nn.utils.clip_grad_norm_(policy.parameters(), gradient_clip)
            optimizer_policy.step()
            del Loss
data sampling
def collect_trajectories(envs, env_info, policy, memory, tmax=200, nrand=0, gae_tau = 0.95, discount = 0.995):
    next_episode = False
    states = env_info.vector_observations
    n_agents = len(env_info.agents)

    state_list = []
    reward_list = []
    prob_list = []
    action_list = []
    value_list = []

    if nrand > 0:
        # perform nrand random steps
        for _ in range(nrand):
            actions = np.random.randn(num_agents, action_size)
            actions = np.clip(actions, -1, 1)
            env_info = envs.step(actions)[brain_name]
            states = env_info.vector_observations

    for t in range(tmax):
        states = torch.FloatTensor(states).to(device)
        dist, values = policy(states)
        actions = dist.sample()
        probs = dist.log_prob(actions).sum(-1).unsqueeze(-1)

        env_info = envs.step(actions.cpu().detach().numpy())[brain_name]
        next_states = env_info.vector_observations
        rewards = env_info.rewards
        dones = env_info.local_done

        state_list.append(states)
        reward_list.append(rewards)
        prob_list.append(probs)
        action_list.append(actions)
        value_list.append(values)

        states = next_states
        if np.any(dones):
            next_episode = True
            break

    _, next_value = policy(torch.FloatTensor(states).to(device))

    reward_arr = np.array(reward_list)
    undiscounted_rewards = np.sum(reward_arr, axis=0)

    state_arr = torch.stack(state_list)
    prob_arr = torch.stack(prob_list)
    action_arr = torch.stack(action_list)
    value_arr = torch.stack(value_list)
    reward_arr = torch.FloatTensor(reward_arr[:, :, np.newaxis])

    advantage_list = []
    return_list = []
    returns = next_value.detach()
    advantages = torch.FloatTensor(np.zeros((n_agents, 1)))

    for i in reversed(range(state_arr.shape[0])):
        returns = reward_arr[i] + discount * returns
        td_error = reward_arr[i] + discount * next_value - value_arr[i]
        advantages = advantages * gae_tau * discount + td_error
        next_value = value_arr[i]
        advantage_list.append(advantages.detach())
        return_list.append(returns.detach())

    advantage_arr = torch.stack(advantage_list)
    return_arr = torch.stack(return_list)

    for i in range(state_arr.shape[0]):
        memory.add({'advantages': advantage_arr[i],
                    'states': state_arr[i],
                    'log_probs_old': prob_arr[i],
                    'returns': return_arr[i],
                    'actions': action_arr[i]})
    return undiscounted_rewards, next_episode
In the Generalized Advantage Estimation loop, advantages and returns are appended in reversed order, so they no longer line up with the states. Insert them at the front instead:
advantage_list.insert(0, advantages.detach())
return_list.insert(0, returns.detach())
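For context, a sketch of the corrected loop under that fix (variable names follow the question's code; only the two list calls change):

    for i in reversed(range(state_arr.shape[0])):
        returns = reward_arr[i] + discount * returns
        td_error = reward_arr[i] + discount * next_value - value_arr[i]
        advantages = advantages * gae_tau * discount + td_error
        next_value = value_arr[i]
        # prepend so that advantage_arr[i] and return_arr[i] line up with state_arr[i]
        advantage_list.insert(0, advantages.detach())
        return_list.insert(0, returns.detach())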
