Sarsa algorithm: why do the Q-values tend to zero? - python

I'm trying to implement the Sarsa algorithm to solve the Frozen Lake environment from OpenAI Gym. I've only recently started working with this, but I think I understand the basics.
I also understand how the Sarsa algorithm works; there are many sites where you can find the pseudocode, and I get it. I've implemented the algorithm for my problem following all the steps, but when I check the final Q function after all the episodes, I notice that all the values tend to zero and I don't know why.
Here is my code; I hope someone can tell me why that happens.
import gym
import random
import numpy as np
env = gym.make('FrozenLake-v0')
#Initialize the Q matrix 16(rows)x4(columns)
Q = np.zeros([env.observation_space.n, env.action_space.n])
for i in range(env.observation_space.n):
    if (i != 5) and (i != 7) and (i != 11) and (i != 12) and (i != 15):
        for j in range(env.action_space.n):
            Q[i,j] = np.random.rand()
#Epsilon-Greedy policy, given a state the agent chooses the action that it believes has the best long-term effect with probability 1-eps, otherwise, it chooses an action uniformly at random. Epsilon may change its value.
bestreward = 0
epsilon = 0.1
discount = 0.99
learning_rate = 0.1
num_episodes = 50000
a = [0,0,0,0,0,0,0,0,0,0]
for i_episode in range(num_episodes):
    # Observe current state s
    observation = env.reset()
    currentState = observation
    # Select action a using a policy based on Q
    if np.random.rand() <= epsilon: #pick randomly
        currentAction = random.randint(0,env.action_space.n-1)
    else: #pick greedily
        currentAction = np.argmax(Q[currentState, :])
    totalreward = 0
    while True:
        env.render()
        # Carry out an action a
        observation, reward, done, info = env.step(currentAction)
        if done is True:
            break
        # Observe reward r and state s'
        totalreward += reward
        nextState = observation
        # Select action a' using a policy based on Q
        if np.random.rand() <= epsilon: #pick randomly
            nextAction = random.randint(0,env.action_space.n-1)
        else: #pick greedily
            nextAction = np.argmax(Q[nextState, :])
        # Update Q using the Sarsa rule
        Q[currentState, currentAction] += learning_rate * (reward + discount * Q[nextState, nextAction] - Q[currentState, currentAction])
        currentState = nextState
        currentAction = nextAction
    print "Episode: %d reward %d best %d epsilon %f" % (i_episode, totalreward, bestreward, epsilon)
    if totalreward > bestreward:
        bestreward = totalreward
    if i_episode > num_episodes/2:
        epsilon = epsilon * 0.9999
    if i_episode >= num_episodes-10:
        a.insert(0, totalreward)
        a.pop()
print a
for i in range(env.observation_space.n):
    print "-----"
    for j in range(env.action_space.n):
        print Q[i,j]

When an episode ends, you break out of the while loop before updating the Q function. Therefore, whenever the agent receives a non-zero reward (i.e. it has reached the goal state), that reward never makes it into the Q function, so the randomly initialized values are only ever pulled toward zero.
You should check for the end of the episode at the end of the while loop body, after the Q update.
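A minimal sketch of the reordered inner loop, reusing the question's variable names and the old four-value gym step API; dropping the bootstrap term on the terminal transition is an extra assumption on my part, but it keeps the terminal update consistent:
while True:
    env.render()
    # Carry out action a, observe r and s'
    observation, reward, done, info = env.step(currentAction)
    totalreward += reward
    nextState = observation
    # Select a' epsilon-greedily from Q
    if np.random.rand() <= epsilon:
        nextAction = random.randint(0, env.action_space.n - 1)
    else:
        nextAction = np.argmax(Q[nextState, :])
    # The Sarsa update now runs on every transition, including the terminal one,
    # so the +1 goal reward can actually enter the table.
    target = reward if done else reward + discount * Q[nextState, nextAction]
    Q[currentState, currentAction] += learning_rate * (target - Q[currentState, currentAction])
    currentState, currentAction = nextState, nextAction
    # Only now check whether the episode has ended.
    if done:
        break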

Related

Custom DQN agent can't output correct action

I have customized a DQN agent to solve a circuit problem. The state is a 1D input representing five nodes' values (node_0 to node_4), with shape=(5,), and an action picks one of six components (with values [0,1,2,3,4,5]) to place in the circuit, which produces a new state named state_. So the action space is (6,). My goal is to make all five values in a state reach a fixed threshold, namely to get them higher than 0.95. For example, if the initial state is [0.8,0.7,0.9,0.98,0.9] and I place a component of value 3 at node_0, it goes from 0.8 to 0.95 and meets the requirement; node_3 doesn't need a component because it is already 0.98. I also set a limit that the sum of the placed components' values can't exceed 10.
Here are some of the hyperparameters:
gamma = 0.9
TARGET_REPLACE_ITER = 500
nodes = 5
memory_capability = 1000
batch_size = 30
epsilon_start =1
epsilon_end = 0.0001
epsilon_decay = 0.06
learning_rate = 0.001
epsilon = 1
n_state = 5
n_action = 6
I use two neural networks: one is eval_net and the other is target_net. The code is below:
import torch
import torch.nn as nn
import torch.nn.functional as F

class NN(nn.Module):
    def __init__(self):
        super(NN, self).__init__()
        self.fc1 = nn.Linear(n_state, 16)
        self.fc1.weight.data.normal_(0, 0.1)
        self.fc2 = nn.Linear(16, 32)
        self.out = nn.Linear(32, n_action)
        self.out.weight.data.normal_(0, 0.1)

    def forward(self, x):
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)
        x = F.relu(x)
        action_value = self.out(x)
        return action_value
Here is the agent:
class Ckt_Opt(object):
    def __init__(self):
        self.learn_step_counter = 0
        self.memory = np.zeros((memory_capability, n_state * 2 + 2))
        self.memory_cntr = 0
        self.eval_net, self.target_net = NN(), NN()
        self.loss_func = nn.MSELoss()
        self.optimizer = torch.optim.Adam(self.eval_net.parameters(), lr=learning_rate)

    def choose_action(self, state):
        state = torch.unsqueeze(torch.FloatTensor(state), 0)
        if random.random() < epsilon:
            # com_data is a list of the component values, from 0 to 5
            action = random.randint(0, len(com_data) - 1)  # choose a random component value
        else:
            action_value = self.eval_net.forward(state)
            action = torch.max(action_value, 1)[1].numpy()[0]  # I copied this from others; I guess it picks the max-Q action
        return action

    def step(self, action):
        # vio_node[x] == 1 means that node's state value is lower than 0.95;
        # ran_node means I randomly select a node to place a component on
        self.ran_node = random.choice([a for a, x in enumerate(vio_node) if x == 1])
        str1 = '.param cap_0_%d_val=%e\n' % (self.ran_node, com_data[action])
        self.decap_param[self.ran_node] = com_data[action]  # these two lines change the placed component; they don't matter here

    def learn(self):
        # target net update
        if self.learn_step_counter % TARGET_REPLACE_ITER == 0:
            self.target_net.load_state_dict(self.eval_net.state_dict())
        self.learn_step_counter += 1
        # sample from memory
        sample_index = np.random.choice(memory_capability, batch_size)
        b_memory = self.memory[sample_index, :]
        b_s = torch.FloatTensor(b_memory[:, :n_state])
        b_a = torch.LongTensor(b_memory[:, n_state:n_state + 1].astype(int))
        b_r = torch.FloatTensor(b_memory[:, n_state + 1:n_state + 2])
        b_s_ = torch.FloatTensor(b_memory[:, -n_state:])
        q_eval = self.eval_net(b_s).gather(1, b_a)  # shape (batch, 1)
        q_next = self.target_net(b_s_).detach()
        q_target = b_r + gamma * q_next.max(1)[0]  # shape (batch, 1)
        loss = self.loss_func(q_eval, q_target)
        # calculate the loss and update eval_net
        self.optimizer.zero_grad()
        loss.backward()  # I don't understand these three lines
        self.optimizer.step()
The main function is below:
ckt = Ckt_Opt()
for i in range(0, 50):
    ckt.reset()  # no component is placed; get the initial state
    state = read_result()  # a function to read the state after taking an action; here it reads the initial state
    for j in range(500):
        action = pdn.choose_action(state)
        state_, Cof, vio_node = pdn.step(action)
        # Cof means whether the sum of component values exceeds the limit (1 = over the limit, 0 = not);
        # vio_node means whether a node still needs a component placed
        reward = sum(-((state_ - 0.95) * vio_node)**2 * 500) + (nodes - sum(vio_node)) if Cof == 0 else \
            sum(-((state_ - 0.95) * vio_node)**2 * 5000)
        # The equations give priority to placing components on nodes with a low state value, to improve the reward.
        # For example, if node_0 is 0.6 and node_1 is 0.9, the penalty (negative reward) of node_0 is
        # -(0.6-0.95)^2 * 500 = -61.25, and node_1's penalty is -(0.9-0.95)^2 * 500 = -1.25.
        ckt.store_transition(state, action, reward, state_)  # just store in the experience memory
        state = state_
    epsilon = epsilon_end + (epsilon_start - epsilon_end) * math.exp(-1. * epsilon_decay * i)
My goal is to find a solution with the best reward. For example, with an initial state of [0.6,0.7,0.8,0.9,0.97] and placed component values of [5,4,0,1,0], the best reachable state is [0.85,0.9,0.91,0.93,0.97]; it can't push every value above 0.95 for various reasons.
But I ran it many times and always get a weird solution like [1,1,1,1,0] or [2,2,2,2,0], which doesn't make sense. I think something must be wrong with the choose_action function or the learn function, but I can't find it because I am new to DQN.
Could anyone help me? Thanks a lot.

K-Arms Bandit Epsilon-Greedy Policy

I have been trying to implement exercise 2.5 from the Reinforcement Learning book (Sutton and Barto).
I have written this piece of code according to its pseudocode version:
class k_arm:
    def __init__(self, iter, method="incrementally"):
        # self.iter placeholder
        self.iter = iter
        self.k = 10
        self.eps = .1
        # here are Q(a) and N(a)
        self.qStar = np.zeros(self.k)
        self.n = np.zeros(self.k)
        # method is just for experimenting with different update functions
        self.method = method

    def pull(self):
        # select the argmax(Q(A)) action with probability (1 - eps)
        eps = np.random.uniform(0, 1, 1)
        if eps < self.eps or self.qStar.argmax() == 0:
            a = np.random.randint(10)
        else:
            a = self.qStar.argmax()
        # R <- bandit(A)
        r = np.random.normal(0, 0.01, 1)
        # N(A) <- N(A) + 1
        self.n[a] += 1
        # Q(A) <- Q(A) + 1/N(A) * (R - Q(A))
        if self.method == "incrementally":
            self.qStar[a] += (r - self.qStar[a]) / self.n[a]
            return self.qStar[a]

iter = 1000
rewards = np.zeros(iter)
c = k_arm(iter, method="incrementally")
for i in range(iter):
    k = c.pull()
    rewards[i] = k
And I get an average reward of around 0 as a result, where I was expecting results like those shown in the book.
I have been trying to understand where I went wrong here, but I couldn't.
Your average reward is around 0 because that is the correct estimate. Your reward function is defined as:
# R bandit(A)
r = np.random.normal(0, 0.01, 1)
This means the expected value of your reward distribution is 0, with a standard deviation of 0.01. In the book the authors use a different reward function. While this alone would still have a fundamental issue, you could earn rewards similar to the book's if you changed your code to
# R bandit(A)
r = np.random.normal(1.25, 0.01, 1)
It makes sense to give each bandit a different reward distribution; otherwise all your action values will be the same. So what you really should do is sample from k different distributions with different expected values, otherwise action selection is meaningless.
Add this to your __init__ function:
self.expected_vals = np.random.uniform(0, 2, self.k)
and change the calculation of the reward so that it depends on the action:
r = np.random.normal(self.expected_vals[a], 0.5, 1)
I've also increased the standard deviation to 0.5, as 0.01 is a basically meaningless spread in the context of bandits. If your agent works correctly, its average reward should be approximately equal to np.max(self.expected_vals).
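Putting these suggestions together, a minimal sketch of the modified class could look like this (it keeps the asker's structure; the per-arm means drawn from Uniform(0, 2) and the normal reward with standard deviation 0.5 follow the suggestions above, while returning the received reward instead of the running estimate is my own change so the reward curve can be averaged):
import numpy as np

class k_arm:
    def __init__(self, iter, method="incrementally"):
        self.iter = iter
        self.k = 10
        self.eps = .1
        self.qStar = np.zeros(self.k)   # Q(a) estimates
        self.n = np.zeros(self.k)       # N(a) pull counts
        self.method = method
        # each arm gets its own expected reward, so action selection matters
        self.expected_vals = np.random.uniform(0, 2, self.k)

    def pull(self):
        # epsilon-greedy action selection
        if np.random.uniform(0, 1) < self.eps:
            a = np.random.randint(self.k)
        else:
            a = self.qStar.argmax()
        # reward drawn around the chosen arm's true mean
        r = np.random.normal(self.expected_vals[a], 0.5)
        self.n[a] += 1
        if self.method == "incrementally":
            # Q(A) <- Q(A) + 1/N(A) * (R - Q(A))
            self.qStar[a] += (r - self.qStar[a]) / self.n[a]
        return r

iter = 1000
bandit = k_arm(iter, method="incrementally")
rewards = np.array([bandit.pull() for _ in range(iter)])
print(rewards[-100:].mean(), bandit.expected_vals.max())  # should end up reasonably close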

Adding prizes to OpenAI Gym maze environment

Is there a way to modify the OpenAI Gym maze environment slightly by adding a fixed number of prizes (e.g. 2) in random places on the map, so that the agent not only finds the exit but also picks up the prizes on the way?
I have the following code for a Q-learning algorithm using maze-v0:
import gym
import gym_maze
import numpy as np
env = gym.make("maze-v0")
states_dic = {} #dictionary to keep the states/coordinates of the Q table
count = 0
for i in range(5):
    for j in range(5):
        states_dic[i, j] = count
        count += 1
n_actions = env.action_space.n
# Initialize the Q-table to 0
Q_table = np.zeros((len(states_dic), n_actions))
# number of episodes we will run
n_episodes = 10000
# maximum number of iterations per episode
max_iter_episode = 100
# initialize the exploration probability to 1
exploration_proba = 1
# exploration decay for an exponential decrease
exploration_decreasing_decay = 0.001
# minimum exploration probability
min_exploration_proba = 0.01
# discount factor
gamma = 0.99
# learning rate
lr = 0.1
rewards_per_episode = list()
# we iterate over episodes
for e in range(n_episodes):
    # we initialize the first state of the episode
    current_state = env.reset()
    done = False
    # sum the rewards that the agent gets from the environment
    total_episode_reward = 0
    for i in range(max_iter_episode):
        env.render()  # for an image you MUST call this
        current_coordinate_x = int(current_state[0])
        current_coordinate_y = int(current_state[1])
        current_Q_table_coordinates = states_dic[current_coordinate_x, current_coordinate_y]
        if np.random.uniform(0, 1) < exploration_proba:
            action = env.action_space.sample()
        else:
            action = int(np.argmax(Q_table[current_Q_table_coordinates]))
        next_state, reward, done, _ = env.step(action)
        next_coordinate_x = int(next_state[0])  # get coordinates to be used in the dictionary
        next_coordinate_y = int(next_state[1])  # get coordinates to be used in the dictionary
        # update our Q-table using the Q-learning iteration
        next_Q_table_coordinates = states_dic[next_coordinate_x, next_coordinate_y]
        Q_table[current_Q_table_coordinates, action] = (1 - lr) * Q_table[current_Q_table_coordinates, action] + lr * (reward + gamma * max(Q_table[next_Q_table_coordinates, :]))
        total_episode_reward = total_episode_reward + reward
        # If the episode is finished, we leave the for loop
        if done:
            break
        current_state = next_state
    # We update the exploration proba using the exponential decay formula
    exploration_proba = max(min_exploration_proba, np.exp(-exploration_decreasing_decay * e))
    rewards_per_episode.append(total_episode_reward)
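One possible way to get the prize behaviour without modifying the maze package itself is to wrap the environment in a gym.Wrapper that places a few prize cells at reset and adds a one-time bonus reward when the agent steps on them. This is only a sketch under the assumption that the observation is the agent's (x, y) cell, as in the code above; PrizeWrapper, the prize count, the bonus value and the 5x5 grid size are illustrative choices, not part of gym_maze:
import gym
import numpy as np

class PrizeWrapper(gym.Wrapper):
    # Adds a fixed number of one-time bonus rewards at random cells (illustrative sketch).
    def __init__(self, env, n_prizes=2, prize_reward=0.5, grid_size=5):
        super().__init__(env)
        self.n_prizes = n_prizes
        self.prize_reward = prize_reward
        self.grid_size = grid_size
        self.prizes = set()

    def reset(self, **kwargs):
        obs = self.env.reset(**kwargs)
        # place prizes on random cells, avoiding the start cell
        cells = [(x, y) for x in range(self.grid_size) for y in range(self.grid_size)
                 if (x, y) != (int(obs[0]), int(obs[1]))]
        idx = np.random.choice(len(cells), self.n_prizes, replace=False)
        self.prizes = {cells[i] for i in idx}
        return obs

    def step(self, action):
        obs, reward, done, info = self.env.step(action)
        cell = (int(obs[0]), int(obs[1]))
        if cell in self.prizes:
            reward += self.prize_reward  # one-time bonus
            self.prizes.remove(cell)
        return obs, reward, done, info

# usage: env = PrizeWrapper(gym.make("maze-v0"), n_prizes=2)
Note that if the Q-table state does not encode which prizes have already been collected, the task is no longer strictly Markovian, so you may also want to extend the state with a collected-prizes flag.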

Would it be possible to use a Neural Network / AI to 'optimise' the time taken for the race?

The program, when completed, will aim to use AI to get the quickest possible time. The car can accelerate, brake, or move at constant speed. There will be sections throughout the code (which represent corners) where the speed will have to be equal to or under a certain value (depending on how tight the corner is), and I want the program to be able to decide when the best moments to accelerate, brake, and move at constant speed would be.
Is this even possible with Python? Could you create a neural network which would progressively get a better time? If so, how would I go about doing something like this?
Thanks!
import time

x = 0

def TrackSimulation(distance, speed, acceleration, loopbreak, time1):
    while loopbreak == 1:
        if x == 1:
            acceleration = 9
        elif x == 2:
            acceleration = -9
        elif x == 0:
            acceleration = 0
        else:
            print("Error")
        if distance >= 0 and distance < 80:
            speed = speed + acceleration * 0.1
            distance = distance + speed * 0.1
            time1 = time1 + 0.1
            print(speed, " M/s")
            print(distance, "M")
            time.sleep(0.1)
        elif distance >= 80 and distance <= 110:
            if speed >= 30:
                print("Too fast!")
                loopbreak = 2
                break
            else:
                print("You are in the speed checker")
                speed = speed + acceleration * 0.1
                distance = distance + speed * 0.1
                time1 = time1 + 0.1
                print(speed, " M/s")
                print(distance, "M")
                time.sleep(0.1)
        elif distance >= 110 and distance < 200:
            speed = speed + acceleration * 0.1
            distance = distance + speed * 0.1
            time1 = time1 + 0.1
            print(speed, " M/s")
            print(distance, "M")
            time.sleep(0.1)
        elif distance >= 200:
            print("race over")
            finaltime = round(time1, 3)
            print("This was your time,", finaltime)
            loopbreak = 2
            break
I would recommend checking out how reinforcement learning works. The core idea is this:
Reinforcement learning is about taking suitable action to maximize reward in a particular situation.
So in your case, for example, you have this track and you need to build an algorithm that allows the car to reach the goal in minimum time. This means you need to train a reinforcement learning model that minimizes the time taken to reach the goal. The model will have a few input parameters such as velocity, acceleration, left steer, right steer, brake, etc. It will start by taking random actions in this input space, trying to reach the end goal while staying on track and minimizing the time taken.
OpenAI Gym provides an excellent set of tools in Python to practice and learn reinforcement learning algorithms such as Q-learning. It contains various games implemented in Python that let you build your own models and try training your agents against a reward; check out the car racing environment implemented there.
There is also a video on using reinforcement learning to train Mario in Mario Kart to win the race.
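As a very rough sketch of how tabular Q-learning could be bolted onto the track simulation above (this is not the answer's code; the distance/speed discretization, the reward of -0.1 per time step, and the crash penalty are assumptions made purely for illustration):
import numpy as np

# Discretize (distance, speed) into bins so a tabular Q-learning agent can be used.
DIST_BINS = np.arange(0, 210, 10)    # 0..200 m in 10 m steps
SPEED_BINS = np.arange(0, 65, 5)     # 0..60 m/s in 5 m/s steps
ACTIONS = [0, 9, -9]                 # constant speed, accelerate, brake (m/s^2)
Q = np.zeros((len(DIST_BINS), len(SPEED_BINS), len(ACTIONS)))

alpha, gamma, eps, dt = 0.1, 0.99, 0.1, 0.1

def state_index(distance, speed):
    return (min(np.digitize(distance, DIST_BINS), len(DIST_BINS) - 1),
            min(np.digitize(speed, SPEED_BINS), len(SPEED_BINS) - 1))

def simulate_step(distance, speed, accel):
    # One physics step; penalize elapsed time, crash if over 30 m/s in the 80-110 m corner.
    speed = max(speed + accel * dt, 0.0)
    distance += speed * dt
    crashed = 80 <= distance <= 110 and speed >= 30
    finished = distance >= 200
    reward = -dt - (100.0 if crashed else 0.0)   # minimizing time == maximizing -time
    return distance, speed, reward, crashed or finished

for episode in range(20000):
    distance, speed = 0.0, 0.0
    for step in range(2000):                     # cap episode length so it always ends
        s = state_index(distance, speed)
        a = np.random.randint(len(ACTIONS)) if np.random.rand() < eps else int(np.argmax(Q[s]))
        distance, speed, r, done = simulate_step(distance, speed, ACTIONS[a])
        s2 = state_index(distance, speed)
        target = r if done else r + gamma * np.max(Q[s2])
        Q[s + (a,)] += alpha * (target - Q[s + (a,)])
        if done:
            break
The learned table then tells the car, per (distance, speed) cell, whether accelerating, braking, or holding speed gives the shortest expected finish time.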

np.argmax returning impossible values on Q table

I am experimenting with Q-learning using a Super Mario Bros gym. I am trying to retrieve the best possible action using np.argmax, which should return an index between 0 and 11, but it is returning values like 224440. It only returns these values sometimes, and seems to do so more frequently as the program goes on.
I have tried logging the shape of the actions to see if I am making a mistake somewhere else, and I have tried printing almost every value to see if something is being set improperly, but I can't seem to find anything.
Currently I'm catching these improper actions so they don't crash the program and replacing them with a random action; this obviously is not a solution, but it helps for debugging purposes.
from nes_py.wrappers import JoypadSpace
import gym_super_mario_bros
from gym_super_mario_bros.actions import COMPLEX_MOVEMENT
from collections import defaultdict
#imports
import random
import numpy as np
env = gym_super_mario_bros.make('SuperMarioBros-v0')
env = JoypadSpace(env, COMPLEX_MOVEMENT)
Q = np.zeros((240 * 256 * 3, env.action_space.n)) # state size is based on 3 dimensional values of the screen
# hyper-parameters
epsilon = 0.1
alpha = 0.5 # Learning rate
gamma = 0.5 # Decay
# number of GAMES
episodes = 500000000000
for episode in range(1, episodes):
    print("Starting episode: " + str(episode))
    state = env.reset()
    finished = False
    # number of steps
    while not finished:
        if random.uniform(0, 1) < epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(Q[state])
        ## FIX THIS!
        if action > 12 or action < 0:
            #print("Random: " + str(np.argmax(Q[state, :])))
            print(action)
            print(Q.shape)
            action = env.action_space.sample()
        new_state, reward, done, info = env.step(action)
        Q[state, action] = Q[state, action] + alpha * (reward + gamma * np.max(Q[new_state, :]) - Q[state, action])
        state = new_state
        env.render()
        if done:
            finished = True
env.close()
It might very well be that I am misunderstanding some concepts here, as I am still learning and experimenting with this. Any input or help would be greatly appreciated.
