I have been trying to implement exercise 2.5 from the Reinforcement Learning book.
I have written this piece of code according to the pseudocode version:
import numpy as np

class k_arm:
    def __init__(self, iter, method="incrementally"):
        # number of time steps to run
        self.iter = iter
        self.k = 10
        self.eps = .1
        # here are Q(a) and N(a)
        self.qStar = np.zeros(self.k)
        self.n = np.zeros(self.k)
        # method is just for experimenting with different update rules
        self.method = method

    def pull(self):
        # selecting argmax(Q(A)) action with prob. (1 - eps)
        eps = np.random.uniform(0, 1, 1)
        if eps < self.eps or self.qStar.argmax() == 0:
            a = np.random.randint(10)
        else:
            a = self.qStar.argmax()
        # R <- bandit(A)
        r = np.random.normal(0, 0.01, 1)
        # N(A) <- N(A) + 1
        self.n[a] += 1
        # Q(A) <- Q(A) + 1/N(A) * (R - Q(A))
        if self.method == "incrementally":
            self.qStar[a] += (r - self.qStar[a]) / self.n[a]
        return self.qStar[a]

iter = 1000
rewards = np.zeros(iter)
c = k_arm(iter, method="incrementally")

for i in range(iter):
    k = c.pull()
    rewards[i] = k
And I get this as a result:
Where I am expecting this kind of result:
I have been trying to understand where I went wrong here, but I couldn't.
Your average reward is around 0 because that is the correct estimate. Your reward function is defined as:
# R bandit(A)
r = np.random.normal(0, 0.01, 1)
This means the expected value of your reward distribution is 0 with a standard deviation of 0.01. In the book the authors use a different reward function. While this still has a more fundamental issue, you could earn similar rewards if you change your code to
# R bandit(A)
r = np.random.normal(1.25, 0.01, 1)
It makes sense to give each bandit a different reward function, or all your action values will be the same. So what you really should do is sample from k different distributions with different expected values; otherwise action selection is meaningless.
Add this to your __init__ function:
self.expected_vals = np.random.uniform(0, 2, self.k)
and change the calculation of the reward so that it depends on the action:
r = np.random.normal(self.expected_vals[a], 0.5, 1)
I've also increased the standard deviation to 0.5, as 0.01 is basically meaningless noise in the context of bandits. If your agent works correctly, its average reward should be approximately equal to np.max(self.expected_vals).
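If it helps, here is a minimal sketch that puts the suggested pieces together: per-arm expected values, a Gaussian reward around them, and the incremental update. The class and variable names are mine, not from the book, and note that it tracks the received reward r rather than the estimate Q(a), which is what the book's average-reward plots show.
import numpy as np

class KArmBandit:
    def __init__(self, k=10, eps=0.1):
        self.k = k
        self.eps = eps
        self.q = np.zeros(k)                              # Q(a): estimated action values
        self.n = np.zeros(k)                              # N(a): action counts
        self.expected_vals = np.random.uniform(0, 2, k)   # true mean reward per arm

    def pull(self):
        # epsilon-greedy action selection
        if np.random.rand() < self.eps:
            a = np.random.randint(self.k)
        else:
            a = self.q.argmax()
        # reward drawn from the chosen arm's own distribution
        r = np.random.normal(self.expected_vals[a], 0.5)
        # incremental sample-average update
        self.n[a] += 1
        self.q[a] += (r - self.q[a]) / self.n[a]
        return r

bandit = KArmBandit()
rewards = np.array([bandit.pull() for _ in range(1000)])
print(rewards.mean(), bandit.expected_vals.max())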
I customized a DQN agent to solve a circuit problem. The state is a 1D input which represents five nodes' values (node_0 to node_4), shape=(5,), and the actions are choosing one of six components (whose values are [0,1,2,3,4,5]) to place in the circuit in order to get a new state, named state_. So the action space is (6,). My goal is to make the five values in the state reach a fixed target if possible. For example, the initial state is [0.8,0.7,0.9,0.98,0.9] and my goal is to make all five values higher than 0.95. That is, if I place a component whose value is 3 at node_0 and it goes from 0.8 to 0.95, it meets the requirement; node_3 doesn't need a component because it is already 0.98. I also set up a limit: the sum of the placed components' values can't exceed 10.
Here are some hyperparameters:
gamma = 0.9
TARGET_REPLACE_ITER = 500
nodes = 5
memory_capability = 1000
batch_size = 30
epsilon_start =1
epsilon_end = 0.0001
epsilon_decay = 0.06
learning_rate = 0.001
epsilon = 1
n_state = 5
n_action = 6
I built two neural networks: one is eval_net and the other is target_net. The code is below:
import torch
import torch.nn as nn
import torch.nn.functional as F

class NN(nn.Module):
    def __init__(self):
        super(NN, self).__init__()
        self.fc1 = nn.Linear(n_state, 16)
        self.fc1.weight.data.normal_(0, 0.1)
        self.fc2 = nn.Linear(16, 32)
        self.out = nn.Linear(32, n_action)
        self.out.weight.data.normal_(0, 0.1)

    def forward(self, x):
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)
        x = F.relu(x)
        action_value = self.out(x)
        return action_value
Here is the agent:
class Ckt_Opt(object):
    def __init__(self):
        self.learn_step_counter = 0
        self.memory = np.zeros((memory_capability, n_state * 2 + 2))
        self.memory_cntr = 0
        self.eval_net, self.target_net = NN(), NN()
        self.loss_func = nn.MSELoss()
        self.optimizer = torch.optim.Adam(self.eval_net.parameters(), lr=learning_rate)

    def choose_action(self, state):
        state = torch.unsqueeze(torch.FloatTensor(state), 0)
        if random.random() < epsilon:
            # choose a random component value; com_data is a list of component values, from 0 to 5
            action = random.randint(0, len(com_data) - 1)
        else:
            action_value = self.eval_net.forward(state)
            # I copied this line from others; I guess it picks the action with the max Q value
            action = torch.max(action_value, 1)[1].numpy()[0]
        return action

    def step(self, action):
        # x == 1 means the node's state value is lower than 0.95;
        # ran_node means I randomly select one of those nodes to place a component on
        self.ran_node = random.choice([a for a, x in enumerate(vio_node) if x == 1])
        str1 = '.param cap_0_%d_val=%e\n' % (self.ran_node, com_data[action])
        # these two lines change the placed component; the details don't matter here
        self.decap_param[self.ran_node] = com_data[action]

    def learn(self):
        # target net update
        if self.learn_step_counter % TARGET_REPLACE_ITER == 0:
            self.target_net.load_state_dict(self.eval_net.state_dict())
        self.learn_step_counter += 1
        # sample from memory
        sample_index = np.random.choice(memory_capability, batch_size)
        b_memory = self.memory[sample_index, :]
        b_s = torch.FloatTensor(b_memory[:, :n_state])
        b_a = torch.LongTensor(b_memory[:, n_state:n_state + 1].astype(int))
        b_r = torch.FloatTensor(b_memory[:, n_state + 1:n_state + 2])
        b_s_ = torch.FloatTensor(b_memory[:, -n_state:])
        q_eval = self.eval_net(b_s).gather(1, b_a)   # shape (batch, 1)
        q_next = self.target_net(b_s_).detach()
        q_target = b_r + gamma * q_next.max(1)[0]    # shape (batch, 1)
        loss = self.loss_func(q_eval, q_target)
        # calculate gradients and update eval_net
        # (I don't understand these three lines)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
The main function is below:
ckt = Ckt_Opt()
for i in range(0, 50):
    ckt.reset()             # no component is placed; get the initial state
    state = read_result()   # a function to read the state after taking an action; here it reads the initial state
    for j in range(500):
        action = ckt.choose_action(state)
        state_, Cof, vio_node = ckt.step(action)
        # Cof means whether the sum of component values exceeds the limit (1 = over the limit, 0 = not);
        # vio_node means whether a component was placed
        reward = sum(-((state_ - 0.95) * vio_node) ** 2 * 500) + (nodes - sum(vio_node)) if Cof == 0 else \
                 sum(-((state_ - 0.95) * vio_node) ** 2 * 5000)
        # the equations give priority to placing components in nodes with a low state value, to improve the reward.
        # For example, if node_0 is 0.6 and node_1 is 0.9, the penalty (negative reward) of node_0 is
        # -(0.6 - 0.95)^2 * 500 = -61.25, and node_1's penalty is -(0.9 - 0.95)^2 * 500 = -1.25
        ckt.store_transition(state, action, reward, state_)  # just store in the experience memory
        state = state_
    epsilon = epsilon_end + (epsilon_start - epsilon_end) * math.exp(-1. * epsilon_decay * i)
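store_transition isn't shown in the question; given the memory layout np.zeros((memory_capability, n_state * 2 + 2)), it presumably packs each transition into one row of a ring buffer, along the lines of the sketch below (the method body and indexing are assumptions, not the original code):
def store_transition(self, state, action, reward, state_):
    # pack the transition into one row: [s (5 values), a (1), r (1), s_ (5 values)]
    transition = np.hstack((state, [action, reward], state_))
    # overwrite the oldest entry once the buffer is full (ring buffer)
    index = self.memory_cntr % memory_capability
    self.memory[index, :] = transition
    self.memory_cntr += 1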
My goal is to find a solution that has the best reward. For example, if the initial state is [0.6,0.7,0.8,0.9,0.97] and the placed component values are [5,4,0,1,0], the best state I can get is [0.85,0.9,0.91,0.93,0.97]; it can't push every state value above 0.95 for some reason.
But!!! I ran it many times and I always get a weird solution like [1,1,1,1,0] or [2,2,2,2,0], which doesn't make sense. I think there must be something wrong with the choose_action function or the learn function, but I can't find it because I am new to DQN.
Could anyone help me? Thanks a lot.
I'm using the L-BFGS-B optimizer to find the minima of a function. This will help me calculate the sharpness of the function. However, I'm not sure if the following message is considered normal (i.e. is there something wrong with my program, or is this message typical?). See below:
RUNNING THE L-BFGS-B CODE
* * *
Machine precision = 2.220D-16
N = 28149514 M = 10
At X0 0 variables are exactly at the bounds
At iterate 0 f= -3.59325D+00 |proj g|= 2.10249D-03
At iterate 1 f= -2.47853D+01 |proj g|= 4.20499D-03
Bad direction in the line search;
refresh the lbfgs memory and restart the iteration.
At iterate 2 f= -2.53202D+01 |proj g|= 4.17686D-03
At iterate 3 f= -2.53202D+01 |proj g|= 4.17686D-03
* * *
Tit = total number of iterations
Tnf = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip = number of BFGS updates skipped
Nact = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F = final function value
* * *
N Tit Tnf Tnint Skip Nact Projg F
***** 3 43 ****** 0 ***** 4.177D-03 -2.532D+01
F = -25.320247650146484
CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH
Warning: more than 10 function and gradient
evaluations in the last line search. Termination
may possibly be caused by a bad search direction.
I got the following sharpness anyway, which is relatively consistent with the paper I'm trying to reproduce; it's just that I'm a bit concerned about the above message.
tensor(473.0201)
Here is my code for computing sharpness:
def get_sharpness(data_loader, model, criterion, epsilon, manifolds=0):
    # extract current x0
    x0 = None
    for p in model.parameters():
        if x0 is None:
            x0 = p.data.view(-1)
        else:
            x0 = torch.cat((x0, p.data.view(-1)))
    x0 = x0.cpu().numpy()

    # get current f_x
    f_x0, _ = get_minus_cross_entropy(x0, data_loader, model, criterion)
    f_x0 = -f_x0
    logging.info('min loss f_x0 = {loss:.4f}'.format(loss=f_x0))

    # find the minimum
    if 0==manifolds:
        x_min = np.reshape(x0 - epsilon * (np.abs(x0) + 1), (x0.shape[0], 1))
        x_max = np.reshape(x0 + epsilon * (np.abs(x0) + 1), (x0.shape[0], 1))
        bounds = np.concatenate([x_min, x_max], 1)
        func = lambda x: get_minus_cross_entropy(x, data_loader, model, criterion, training=True)
        init_guess = x0
    else:
        warnings.warn("Small manifolds may not be able to explore the space.")
        assert(manifolds<=x0.shape[0])
        #transformer = rp.GaussianRandomProjection(n_components=manifolds)
        #transformer.fit(np.random.rand(manifolds, x0.shape[0]))
        #A_plus = transformer.components_
        #A = np.linalg.pinv(A_plus)
        A_plus = np.random.rand(manifolds, x0.shape[0])*2.-1.
        # normalize each column to unit length
        A_plus_norm = np.linalg.norm(A_plus, axis=1)
        A_plus = A_plus / np.reshape(A_plus_norm, (manifolds,1))
        A = np.linalg.pinv(A_plus)
        abs_bound = epsilon * (np.abs(np.dot(A_plus, x0))+1)
        abs_bound = np.reshape(abs_bound, (abs_bound.shape[0], 1))
        bounds = np.concatenate([-abs_bound, abs_bound], 1)

        def func(y):
            floss, fg = get_minus_cross_entropy(x0 + np.dot(A, y), data_loader, model, criterion, training=True)
            return floss, np.dot(np.transpose(A), fg)
        #func = lambda y: get_minus_cross_entropy(x0+np.dot(A, y), data_loader, model, criterion, training=True)
        init_guess = np.zeros(manifolds)
        #rand_selections = (np.random.rand(bounds.shape[0])+1e-6)*0.99
        #init_guess = np.multiply(1.-rand_selections, bounds[:,0])+np.multiply(rand_selections, bounds[:,1])

    minimum_x, f_x, d = sciopt.fmin_l_bfgs_b(func, init_guess, maxiter=10, bounds=list(bounds), disp=1, iprint=101)
                                             #factr=10.,
                                             #pgtol=1.e-12,
    f_x = -f_x
    logging.info('max loss f_x = {loss:.4f}'.format(loss=f_x))
    sharpness = (f_x - f_x0)/(1+f_x0)*100
    print(sharpness)

    # recover the model
    x0 = torch.from_numpy(x0).float()
    x0 = x0.cuda()
    x_start = 0
    for p in model.parameters():
        psize = p.data.size()
        peltnum = 1
        for s in psize:
            peltnum *= s
        x_part = x0[x_start:x_start + peltnum]
        p.data = x_part.view(psize)
        x_start += peltnum

    return sharpness
Which was taken from this repository:
https://github.com/wenwei202/smoothout/blob/master/measure_sharpness.py
I'm concerned about exact accuracy.
First, L-BFGS-B will only find the global minimum for a convex function.
the message
CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH
is the normal convergence message.
The warning you are getting says that there were a lot of function/gradient evaluations in the last line search; this can often happen when you use L-BFGS-B on non-convex functions. So if the thing you're minimizing is non-convex (and it seems like it might be, just by glancing at the code), I would say this is normal.
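If you want to convince yourself, you can reproduce the same kind of log (the per-iteration output and the convergence message) with a small non-convex toy function; this is just an illustrative sketch, not your objective:
import numpy as np
import scipy.optimize as sciopt

# a simple non-convex function: a quadratic bowl with a sine ripple on top
def f(x):
    return np.sum(0.1 * x ** 2 + np.sin(3.0 * x))

x0 = np.random.uniform(-2.0, 2.0, size=50)
bounds = [(-3.0, 3.0)] * x0.size

# approx_grad=True lets L-BFGS-B estimate gradients by finite differences;
# iprint=101 prints the same per-iteration log and final convergence message
x_min, f_min, info = sciopt.fmin_l_bfgs_b(f, x0, approx_grad=True,
                                          bounds=bounds, maxiter=10, iprint=101)
print(f_min, info['warnflag'])  # warnflag 0 means it converged normally
As a side note, the factr argument (the commented-out factr=10. in your call) only controls how small a relative reduction in f counts as convergence for that REL_REDUCTION_OF_F message; tightening it makes the stopping criterion stricter but doesn't change the non-convexity issue.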
I'm trying to solve the XOR problem using a neural network. For training I'm using a genetic algorithm.
population size : 200
max_generations : 10000
crossover rate : 0.8
mutation rate : 0.1
number of weights : 9
activation function : sigmoid
selection method : higher probability for the ones with the best fitness
Code:
def crossover(self, wfather, wmother):
    r = np.random.random()
    if r <= self.crossover_perc:
        new_weight = self.crossover_perc * wfather + (1 - self.crossover_perc) * wmother
        new_weight2 = self.crossover_perc * wmother + (1 - self.crossover_perc) * wfather
        return new_weight, new_weight2
    else:
        return wfather, wmother

def select(self, fits):
    percentuais = np.array(fits) / float(sum(fits))
    vet = [percentuais[0]]
    for p in percentuais[1:]:
        vet.append(vet[-1] + p)
    r = np.random.random()
    #print(len(vet), r)
    for i in range(len(vet)):
        if r <= vet[i]:
            return i

def mutate(self, weight):
    r = np.random.random()
    if r <= self.mut_perc:
        mutr = np.random.randint(self.number_weights)
        weight[mutr] = weight[mutr] + np.random.normal()
    return weight

def activation_fuction(self, net):
    return 1 / (1 + math.exp(-net))
Problem:
~5/10 runs work fine.
Expected Output:
0,0 0
0,1 1
1,0 1
1,1 0
Tests:
It's inconsistent: sometimes I get four 0's, sometimes three 1's, varying results.
Could you help me find the error?
Edit:
All Code:
def create_initial_population(self):
    population = np.random.uniform(-40, 40, [self.population_size, self.number_weights])
    return population

def feedforward(self, inp1, inp2, weights):
    bias = 1
    x = self.activation_fuction(bias * weights[0] + (inp1 * weights[1]) + (inp2 * weights[2]))
    x2 = self.activation_fuction(bias * weights[3] + (inp1 * weights[4]) + (inp2 * weights[5]))
    out = self.activation_fuction(bias * weights[6] + (x * weights[7]) + (x2 * weights[8]))
    print(inp1, inp2, out)
    return out

def fitness(self, weights):
    y1 = abs(0.0 - self.feedforward(0.0, 0.0, weights))
    y2 = abs(1.0 - self.feedforward(0.0, 1.0, weights))
    y3 = abs(1.0 - self.feedforward(1.0, 0.0, weights))
    y4 = abs(0.0 - self.feedforward(1.0, 1.0, weights))
    error = (y1 + y2 + y3 + y4) ** 2
    # print("Error: ", 1/error)
    return 1 / error

def sortpopbest(self, pop):
    pop_with_fit = [(weights, self.fitness(weights)) for weights in pop]
    sorted_population = sorted(pop_with_fit, key=lambda weights_fit: weights_fit[1])  # worst -> best
    fits = []
    pop = []
    for i in sorted_population:
        pop.append(i[0])
        fits.append(i[1])
    return pop, fits

def execute(self):
    pop = self.create_initial_population()
    for g in range(self.max_generations):  # maximum number of generations
        pop, fits = self.sortpopbest(pop)
        nova_pop = []
        for c in range(int(self.population_size / 2)):
            weights = pop[self.select(fits)]
            weights2 = pop[self.select(fits)]
            new_weights, new_weights2 = self.crossover(weights, weights2)
            new_weights = self.mutate(new_weights)
            new_weights2 = self.mutate(new_weights2)
            #print(fits)
            nova_pop.append(new_weights)   # add to nova_pop
            nova_pop.append(new_weights2)
        pop = nova_pop
        print(len(fits), fits)
Some input:
XOR is a simple problem. With a few hundred random initializations, you should have some lucky ones that solve it immediately (if "solved" means that the output is correct after applying a threshold). This is a good test to see if your initialization and feed-forward pass are correct, without debugging the whole GA all at once. Or you could just hand-craft the correct weights and biases and see if that works (see the sketch below).
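For example, one known hand-crafted solution for the 9-weight layout used in your feedforward (an OR unit and an AND unit combined into XOR) looks like this; the exact values are just one possibility, and the free-standing functions mirror your methods:
import math

def sigmoid(net):
    return 1 / (1 + math.exp(-net))

def feedforward(inp1, inp2, weights):
    bias = 1
    h1 = sigmoid(bias * weights[0] + inp1 * weights[1] + inp2 * weights[2])
    h2 = sigmoid(bias * weights[3] + inp1 * weights[4] + inp2 * weights[5])
    return sigmoid(bias * weights[6] + h1 * weights[7] + h2 * weights[8])

# hidden unit 1 ~ OR, hidden unit 2 ~ AND, output ~ h1 AND NOT h2 == XOR
xor_weights = [-10, 20, 20,    # h1: bias, inp1, inp2
               -30, 20, 20,    # h2: bias, inp1, inp2
               -10, 20, -20]   # out: bias, h1, h2

for a, b in [(0, 0), (0, 1), (1, 0), (1, 1)]:
    print(a, b, round(feedforward(a, b, xor_weights)))  # prints 0, 1, 1, 0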
Your initial weights (uniform -40...+40) are way too large. I guess for XOR this may be okay-ish, but in general initial weights should be such that most neurons don't saturate, while not being fully in the linear zone of the sigmoid either.
After your implementation works, have a look at this numpy implementation of the feed-forward pass of a neural network for how to do it with less code.
Let's suppose we have a list to which an integer between 15 and 32 is appended in each iteration (let's call the integer rand). I want to design an algorithm which assigns a reward around 1 (between 0.75 and 1.25) to each rand. The rule for assigning the reward goes like this:
First we calculate the average of the list. Then, if rand is more than the average, we expect the reward to be less than 1, and if rand is less than the average, the reward gets higher than 1. The greater the distance between the average and rand, the more the reward increases/decreases.
for example:
rand = 15, avg = 23 then reward = 1.25
rand = 32, avg = 23 then reward = 0.75
rand = 23, avg = 23 then reward = 1
and so on.
I developed the code below for this algorithm:
import numpy as np

rollouts = np.array([])
i = 0

def modify_reward(lst, rand):
    reward = 1
    constant1 = 0.25
    constant2 = 1
    std = np.std(lst)
    global avg
    avg = np.mean(lst)
    sub = np.subtract(avg, rand)
    landa = sub / std if std != 0 else 0
    coefficient = -1 + (2 / (1 + np.exp(-constant2 * landa)))
    md_reward = reward + (reward * constant1 * coefficient)
    return md_reward

while i < 100:
    rand = np.random.randint(15, 33)
    rollouts = np.append(rollouts, rand)
    modified_reward = modify_reward(rollouts, rand)
    i += 1
    print([i, rand, avg, modified_reward])

# test the reward for upper bound and lower bound
rand1, rand2 = 15, 32
reward1, reward2 = modify_reward(rollouts, rand1), modify_reward(rollouts, rand2)
print(['reward for upper bound', rand1, avg, reward1])
print(['reward for lower bound', rand2, avg, reward2])
The algorithm works quite fine, but if you look at the examples below, you will notice the problem with it.
rand = 15, avg = 23.94 then reward = 1.17 # which has to be 1.25
rand = 32, avg = 23.94 then reward = 0.84 # which has to be 0.75
rand = 15, avg = 27.38 then reward = 1.15 # which has to be 1.25
rand = 32, avg = 27.38 then reward = 0.93 # which has to be 0.75
As you might have noticed, the algorithm doesn't consider the distance between avg and the bounds (15, 32).
The more avg moves towards the lower or upper bound, the more unbalanced modified_reward gets.
I need modified_reward to be assigned uniformly, no matter whether avg moves toward the upper bound or the lower bound.
Can anyone suggest a modification to this algorithm which takes into account the distance between avg and the bounds of the list?
Putting together these two requirements:
if rand is more than average, we expect the reward to be less than 1, and if rand is less than average, the reward gets higher than 1.
I need modified_reward to be uniformly assigned, no matter avg moves toward upper bound or lower bound.
is slightly tricky, depending on what you mean by 'uniformly'.
If you want 15 to always be rewarded with 1.25, and 32 to always be rewarded with 0.75, you can't have a single linear relationship while also respecting your first requirement.
If you are happy with two linear relationships, you can aim for a situation where modified_reward depends on rand like this:
which I produced with this Wolfram Alpha query. As you can see, this is two linear relationships, with a 'knee' at avg. I expect you'll be able to derive the formulae for each part without too much trouble.
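For reference, the two pieces would presumably look something like this (taking 15 and 32 as the fixed bounds):
reward = 1 - 0.25 * (rand - avg) / (32 - avg)    for rand >= avg
reward = 1 + 0.25 * (avg - rand) / (avg - 15)    for rand < avg
which gives 1 at avg, 0.75 at 32 and 1.25 at 15, wherever the current average sits.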
This code implements a linear distribution of weights proportional to the distance from average towards your given limits.
import numpy as np

class Rewarder(object):
    lo = 15
    hi = 32
    weight = 0.25

    def __init__(self):
        self.lst = np.array([])

    def append(self, x):
        self.lst = np.append(self.lst, [x])

    def average(self):
        return np.mean(self.lst)

    def distribution(self, a, x, b):
        '''
        Return a number between 0 and 1 proportional to
        the distance of x from a towards b.
        Note: modify this fraction if you want a normal distribution
        or quadratic etc.
        '''
        return (x - a) / (b - a)

    def reward(self, x):
        avg = self.average()
        if x > avg:
            w = self.distribution(avg, x, self.hi)
        else:
            w = -self.distribution(avg, x, self.lo)
        return 1 - self.weight * w

rollouts = Rewarder()
rollouts.append(23)
print(rollouts.reward(15))
print(rollouts.reward(32))
print(rollouts.reward(23))
Producing:
1.25
0.75
1.0
The code in your question seems to be using np.std which I presume is an attempt to get a normal distribution. Remember that the normal distribution never actually gets to zero.
If you tell me what shape you want for the distribution we can modify Rewarder.distribution to suit.
Edit:
I can't access the paper you refer to, but I infer that you want a sigmoid-style distribution of rewards giving 0 at the mean and approximately +/-0.25 at the min and max. Using the error function as the weighting, if we scale by 2 we get approximately 0.995 at the min and max.
Override the Rewarder.distribution:
import math

class RewarderERF(Rewarder):
    def distribution(self, a, x, b):
        """
        Return an error function (sigmoid) weighting of the distance from a.
        Note: scaled to reduce the error at max to ~0.003
        ref: https://en.wikipedia.org/wiki/Sigmoid_function
        """
        return math.erf(2.0 * super(RewarderERF, self).distribution(a, x, b))

rollouts = RewarderERF()
rollouts.append(23)
print(rollouts.reward(15))
print(rollouts.reward(32))
print(rollouts.reward(23))
results in:
1.24878131454
0.75121868546
1.0
You can choose which error function suits your application and how much error you can accept at min and max. I'd also expect you to integrate all these functions into your class; I've split everything out so we can see the parts.
Regarding calculating the mean: do you need to keep the list of values and recalculate each time, or can you keep a count and a running total of the sum? Then you would not need numpy for this calculation.
I don't understand why you are calculating md_reward like this; please provide the logic and reasoning behind it. But
landa = sub / std if std != 0 else 0
coefficient = -1 + ( 2 / (1 + np.exp(-constant2 * landa)))
md_reward = reward + (reward * constant1 * coefficient)
will not give what you are looking for. Consider the cases below:
for md_reward to be .75
--> coefficient should be -1
--> landa == -infinite (negative large value, i.e. , rand should be much larger than 32)
for md_reward to be 1
--> coefficient should be 0
--> landa == 0 (std == 0 or sub == 0) # which is possible
for md_reward to be 1.25
--> coefficient should be 1
--> landa == infinite (positive large value, i.e. , rand should be much smaller than 15)
If you want to normalize the reward from avg to max and from avg to min, check the links below:
https://stats.stackexchange.com/questions/70801/how-to-normalize-data-to-0-1-range
https://stats.stackexchange.com/questions/70553/what-does-normalization-mean-and-how-to-verify-that-a-sample-or-a-distribution
Now modify your function to something like the below:
def modify_reward(lst, rand):
    reward = 1
    constant1 = 0.25
    min_value = 15
    max_value = 32
    avg = np.mean(lst)
    if rand >= avg:
        md_reward = reward - constant1 * (rand - avg) / (max_value - avg)               # normalize rand from avg to max
    else:
        md_reward = reward + constant1 * (1 - (rand - min_value) / (avg - min_value))   # normalize rand from min to avg
    return md_reward
I have used the method below.
Normalized:
(X − min(X)) / (max(X) − min(X))
For the case rand >= avg,
min(X) is avg and max(X) is max_value,
and for the case rand < avg,
min(X) is min_value and max(X) is avg.
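As a quick sanity check with an arbitrary example list (the values are made up, chosen so the average is pulled toward the upper bound), the bounds now map to 1.25 and 0.75 regardless of where the average sits:
import numpy as np

lst = [15, 20, 28, 32, 31, 30]           # example rollout values; mean is 26.0
print(modify_reward(lst, 15))            # 1.25
print(modify_reward(lst, 32))            # 0.75
print(modify_reward(lst, np.mean(lst)))  # 1.0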
Hope this helps.
Try this
def modify_reward(lst, rand):
    reward = 1
    constant = 0.25  # think of this as the +/- amount from the initial reward
    global avg
    avg = np.mean(lst)
    sub = np.subtract(avg, rand)
    dreward = 0
    if sub > 0:
        dreward = sub / (avg - 15)  # put your lower boundary instead of 15
    elif sub < 0:
        dreward = sub / (32 - avg)  # put your higher boundary instead of 32
    md_reward = reward + (dreward * constant)
    return md_reward
This is the linear solution inspired by @AakashM. I don't know if this is what you were looking for, but it fits your description.
I'm trying to implement the Sarsa algorithm for solving a Frozen Lake environment from OpenAI gym. I've only recently started working with this, but I think I understand it.
I also understand how the Sarsa algorithm works; there are many sites where you can find pseudocode, and I get it. I've implemented this algorithm in my problem following all the steps, but when I check the final Q function after all the episodes, I notice that all values tend to zero and I don't know why.
Here is my code, I hope someone can tell me why that happens.
import gym
import random
import numpy as np

env = gym.make('FrozenLake-v0')

# Initialize the Q matrix, 16 (rows) x 4 (columns)
Q = np.zeros([env.observation_space.n, env.action_space.n])

for i in range(env.observation_space.n):
    if (i != 5) and (i != 7) and (i != 11) and (i != 12) and (i != 15):
        for j in range(env.action_space.n):
            Q[i,j] = np.random.rand()

# Epsilon-greedy policy: given a state, the agent chooses the action that it believes has the
# best long-term effect with probability 1-eps; otherwise, it chooses an action uniformly at
# random. Epsilon may change its value.

bestreward = 0
epsilon = 0.1
discount = 0.99
learning_rate = 0.1
num_episodes = 50000
a = [0,0,0,0,0,0,0,0,0,0]

for i_episode in range(num_episodes):
    # Observe current state s
    observation = env.reset()
    currentState = observation
    # Select action a using a policy based on Q
    if np.random.rand() <= epsilon:  # pick randomly
        currentAction = random.randint(0, env.action_space.n - 1)
    else:  # pick greedily
        currentAction = np.argmax(Q[currentState, :])
    totalreward = 0
    while True:
        env.render()
        # Carry out action a
        observation, reward, done, info = env.step(currentAction)
        if done is True:
            break
        # Observe reward r and state s'
        totalreward += reward
        nextState = observation
        # Select action a' using a policy based on Q
        if np.random.rand() <= epsilon:  # pick randomly
            nextAction = random.randint(0, env.action_space.n - 1)
        else:  # pick greedily
            nextAction = np.argmax(Q[nextState, :])
        # Update Q with the Sarsa update
        Q[currentState, currentAction] += learning_rate * (reward + discount * Q[nextState, nextAction] - Q[currentState, currentAction])
        currentState = nextState
        currentAction = nextAction
    print "Episode: %d reward %d best %d epsilon %f" % (i_episode, totalreward, bestreward, epsilon)
    if totalreward > bestreward:
        bestreward = totalreward
    if i_episode > num_episodes / 2:
        epsilon = epsilon * 0.9999
    if i_episode >= num_episodes - 10:
        a.insert(0, totalreward)
        a.pop()

print a

for i in range(env.observation_space.n):
    print "-----"
    for j in range(env.action_space.n):
        print Q[i,j]
When an episode ends, you are breaking out of the while loop before updating the Q function. Therefore, when the reward received by the agent is different from zero (the agent has reached the goal state), the Q function is never updated with that reward.
You should check for the end of the episode in the last part of the while loop.
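One way to rearrange the inner loop (keeping the rest of your code as it is) so that the terminal transition still updates Q; note that the value of a terminal state is taken as 0:
while True:
    env.render()
    observation, reward, done, info = env.step(currentAction)
    totalreward += reward
    nextState = observation
    # Select action a' using a policy based on Q
    if np.random.rand() <= epsilon:
        nextAction = random.randint(0, env.action_space.n - 1)
    else:
        nextAction = np.argmax(Q[nextState, :])
    # Sarsa update; bootstrap with 0 when the episode has ended
    target = reward if done else reward + discount * Q[nextState, nextAction]
    Q[currentState, currentAction] += learning_rate * (target - Q[currentState, currentAction])
    currentState = nextState
    currentAction = nextAction
    # check for the end of the episode only after updating Q
    if done:
        break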