Is there a way to modify the OpenAI Gym environment slightly by adding a fixed number of prizes (e.g. 2) in random places on the map, such that the agent will not only find the exit but also pick up the prizes on the way?
I have the following code for a Q-learning algorithm using maze-v0:
import gym
import gym_maze
import numpy as np

env = gym.make("maze-v0")

states_dic = {}  # dictionary to keep the states/coordinates of the Q-table
count = 0
for i in range(5):
    for j in range(5):
        states_dic[i, j] = count
        count += 1

n_actions = env.action_space.n

# Initialize the Q-table to 0
Q_table = np.zeros((len(states_dic), n_actions))
# number of episodes we will run
n_episodes = 10000
# maximum number of iterations per episode
max_iter_episode = 100
# initialize the exploration probability to 1
exploration_proba = 1
# exploration decay rate for exponential decrease
exploration_decreasing_decay = 0.001
# minimum exploration probability
min_exploration_proba = 0.01
# discount factor
gamma = 0.99
# learning rate
lr = 0.1

rewards_per_episode = list()

# we iterate over episodes
for e in range(n_episodes):
    # we initialize the first state of the episode
    current_state = env.reset()
    done = False
    # sum of the rewards that the agent gets from the environment
    total_episode_reward = 0
    for i in range(max_iter_episode):
        env.render()  # required to display the maze
        current_coordinate_x = int(current_state[0])
        current_coordinate_y = int(current_state[1])
        current_Q_table_coordinates = states_dic[current_coordinate_x, current_coordinate_y]
        if np.random.uniform(0, 1) < exploration_proba:
            action = env.action_space.sample()
        else:
            action = int(np.argmax(Q_table[current_Q_table_coordinates]))
        next_state, reward, done, _ = env.step(action)
        next_coordinate_x = int(next_state[0])  # coordinates to be used in the dictionary
        next_coordinate_y = int(next_state[1])
        # update our Q-table using the Q-learning iteration
        next_Q_table_coordinates = states_dic[next_coordinate_x, next_coordinate_y]
        Q_table[current_Q_table_coordinates, action] = (1 - lr) * Q_table[current_Q_table_coordinates, action] + \
            lr * (reward + gamma * max(Q_table[next_Q_table_coordinates, :]))
        total_episode_reward = total_episode_reward + reward
        # If the episode is finished, we leave the for loop
        if done:
            break
        current_state = next_state
    # We update the exploration probability using an exponential decay formula
    exploration_proba = max(min_exploration_proba, np.exp(-exploration_decreasing_decay * e))
    rewards_per_episode.append(total_episode_reward)
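One possible direction for the prize part of the question, sketched only as an untested illustration (the PrizeWrapper class, the bonus value, and the assumption that the observation is an (x, y) coordinate pair are mine, not features of gym_maze): wrap the environment, scatter a fixed number of prize cells at reset, and add a bonus reward the first time the agent steps on one.

import gym
import numpy as np

class PrizeWrapper(gym.Wrapper):
    """Hypothetical wrapper: scatters n_prizes bonus cells over a 5x5 maze."""
    def __init__(self, env, n_prizes=2, bonus=1.0):
        super().__init__(env)
        self.n_prizes = n_prizes
        self.bonus = bonus
        self.prizes = set()

    def reset(self, **kwargs):
        obs = self.env.reset(**kwargs)
        cells = [(i, j) for i in range(5) for j in range(5)]
        picked = np.random.choice(len(cells), self.n_prizes, replace=False)
        self.prizes = {cells[k] for k in picked}
        return obs

    def step(self, action):
        obs, reward, done, info = self.env.step(action)
        pos = (int(obs[0]), int(obs[1]))
        if pos in self.prizes:  # prize picked up: add the bonus once
            reward += self.bonus
            self.prizes.remove(pos)
        return obs, reward, done, info

# env = PrizeWrapper(gym.make("maze-v0"), n_prizes=2)

Note that with a Q-table keyed only on (x, y), the agent cannot represent which prizes are still left, so the state would also have to encode the remaining prizes for it to reliably learn to collect them.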
I customized a DQN agent to solve a circuit problem. The state is a 1D input of shape (5,) that represents the values of five nodes (node_0 to node_4), and an action consists of choosing one of six components (with values [0, 1, 2, 3, 4, 5]) to place in the circuit, which yields a new state, state_. So the action space has size 6. My goal is to push all five values in a state above a fixed threshold where possible. For example, the initial state is [0.8, 0.7, 0.9, 0.98, 0.9] and my goal is to make all five values higher than 0.95: if I place a component with value 3 at node_0 and it rises from 0.8 to 0.95, that node meets the requirement, while node_3 does not need a component because it is already at 0.98. I also set a limit: the sum of the placed components' values cannot exceed 10.
Here are some hyperparameters:
gamma = 0.9
TARGET_REPLACE_ITER = 500
nodes = 5
memory_capability = 1000
batch_size = 30
epsilon_start =1
epsilon_end = 0.0001
epsilon_decay = 0.06
learning_rate = 0.001
epsilon = 1
n_state = 5
n_action = 6
I built two neural networks, eval_net and target_net. The code is below:
import math
import random
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

class NN(nn.Module):
    def __init__(self):
        super(NN, self).__init__()
        self.fc1 = nn.Linear(n_state, 16)
        self.fc1.weight.data.normal_(0, 0.1)
        self.fc2 = nn.Linear(16, 32)
        self.out = nn.Linear(32, n_action)
        self.out.weight.data.normal_(0, 0.1)

    def forward(self, x):
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)
        x = F.relu(x)
        action_value = self.out(x)
        return action_value
Here is the agent:

class Ckt_Opt(object):
    def __init__(self):
        self.learn_step_counter = 0
        self.memory = np.zeros((memory_capability, n_state * 2 + 2))
        self.memory_cntr = 0
        self.eval_net, self.target_net = NN(), NN()
        self.loss_func = nn.MSELoss()
        self.optimizer = torch.optim.Adam(self.eval_net.parameters(), lr=learning_rate)

    def choose_action(self, state):
        state = torch.unsqueeze(torch.FloatTensor(state), 0)
        if random.random() < epsilon:
            # choose a random component value; com_data is the list of component values, from 0 to 5
            action = random.randint(0, len(com_data) - 1)
        else:
            action_value = self.eval_net.forward(state)
            # I copied this from others; I guess it picks the action with the maximum Q-value
            action = torch.max(action_value, 1)[1].numpy()[0]
        return action

    def step(self, action):
        # x == 1 means the node's state value is lower than 0.95; ran_node is a randomly chosen node to place a component on
        self.ran_node = random.choice([a for a, x in enumerate(vio_node) if x == 1])
        str1 = '.param cap_0_%d_val=%e\n' % (self.ran_node, com_data[action])
        # these two lines change the placed component; the details don't matter here
        self.decap_param[self.ran_node] = com_data[action]

    def learn(self):
        # target net update
        if self.learn_step_counter % TARGET_REPLACE_ITER == 0:
            self.target_net.load_state_dict(self.eval_net.state_dict())
        self.learn_step_counter += 1
        # sample from memory
        sample_index = np.random.choice(memory_capability, batch_size)
        b_memory = self.memory[sample_index, :]
        b_s = torch.FloatTensor(b_memory[:, :n_state])
        b_a = torch.LongTensor(b_memory[:, n_state:n_state + 1].astype(int))
        b_r = torch.FloatTensor(b_memory[:, n_state + 1:n_state + 2])
        b_s_ = torch.FloatTensor(b_memory[:, -n_state:])
        q_eval = self.eval_net(b_s).gather(1, b_a)  # shape (batch, 1)
        q_next = self.target_net(b_s_).detach()
        q_target = b_r + gamma * q_next.max(1)[0]  # shape (batch, 1)
        loss = self.loss_func(q_eval, q_target)
        # compute gradients and update eval_net
        self.optimizer.zero_grad()
        loss.backward()  # I don't understand these three lines
        self.optimizer.step()
The main function is below:

ckt = Ckt_Opt()
for i in range(0, 50):
    ckt.reset()  # no component is placed; get the initial state
    state = read_result()  # a function that reads the state after taking an action; here it reads the initial state
    for j in range(500):
        action = ckt.choose_action(state)
        # Cof is 1 if the sum of the placed component values exceeds the limit, 0 otherwise; vio_node marks which nodes still need a component
        state_, Cof, vio_node = ckt.step(action)
        reward = sum(-((state_ - 0.95) * vio_node) ** 2 * 500) + (nodes - sum(vio_node)) if Cof == 0 else \
            sum(-((state_ - 0.95) * vio_node) ** 2 * 5000)
        # the equations give priority to placing components on nodes with a low state value, to improve the reward.
        # For example, if node_0 is 0.6 and node_1 is 0.9, node_0's penalty (negative reward) is -(0.6 - 0.95)^2 * 500 = -61.25,
        # while node_1's penalty is -(0.9 - 0.95)^2 * 500 = -1.25
        ckt.store_transition(state, action, reward, state_)  # just store the transition in the experience memory
        state = state_
    epsilon = epsilon_end + (epsilon_start - epsilon_end) * math.exp(-1. * epsilon_decay * i)
My goal is to find a solution with the best reward. For example, with the initial state [0.6, 0.7, 0.8, 0.9, 0.97] and placed component values [5, 4, 0, 1, 0], the best reachable state is [0.85, 0.9, 0.91, 0.93, 0.97]; for some reason it cannot push every value above 0.95.
But I ran it many times and always get a weird solution like [1, 1, 1, 1, 0] or [2, 2, 2, 2, 0], which does not make sense. I think something must be wrong with the choose_action or learn function, but I can't find it because I am new to DQN.
Could anyone help me? Thanks a lot.
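One detail worth checking in learn(), offered as my own observation rather than something stated in the thread: b_r has shape (batch, 1) while q_next.max(1)[0] has shape (batch,), so their sum broadcasts to (batch, batch) and no longer matches q_eval. A tiny sketch of the broadcasting behaviour:

import torch

batch = 4
b_r = torch.zeros(batch, 1)            # shape (batch, 1)
q_next_max = torch.zeros(batch)        # shape (batch,)
print((b_r + 0.9 * q_next_max).shape)                 # torch.Size([4, 4])
print((b_r + 0.9 * q_next_max.unsqueeze(1)).shape)    # torch.Size([4, 1])

Many DQN examples reshape this term, e.g. q_target = b_r + gamma * q_next.max(1)[0].view(batch_size, 1), so that q_target has the same (batch, 1) shape as q_eval before the MSE loss.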
I have been trying to implement exercise 2.5 from the Reinforcement Learning book.
I have written this piece of code according to the book's pseudocode version:
import numpy as np

class k_arm:
    def __init__(self, iter, method="incrementally"):
        # self.iter placeholder
        self.iter = iter
        self.k = 10
        self.eps = .1
        # here are Q(a) and N(a)
        self.qStar = np.zeros(self.k)
        self.n = np.zeros(self.k)
        # method is just for experimenting with different update rules
        self.method = method

    def pull(self):
        # select argmax(Q(A)) with probability (1 - eps)
        eps = np.random.uniform(0, 1, 1)
        if eps < self.eps or self.qStar.argmax() == 0:
            a = np.random.randint(10)
        else:
            a = self.qStar.argmax()
        # R <- bandit(A)
        r = np.random.normal(0, 0.01, 1)
        # N(A) <- N(A) + 1
        self.n[a] += 1
        # Q(A) <- Q(A) + 1/N(A) * (R - Q(A))
        if self.method == "incrementally":
            self.qStar[a] += (r - self.qStar[a]) / self.n[a]
        return self.qStar[a]
iter = 1000
rewards = np.zeros(iter)
c = k_arm(iter, method="incrementally")
for i in range(iter):
    k = c.pull()
    rewards[i] = k
And I get an average reward around 0 as a result, where I was expecting results like the curves in the book.
I have been trying to understand where I went wrong here, but I couldn't.
Your average reward is around 0 because that is the correct estimate. Your reward function is defined as:
# R bandit(A)
r = np.random.normal(0, 0.01, 1)
This means the expected value of your reward distribution is 0, with a standard deviation of 0.01. In the book the authors use a different reward function. While this still has a fundamental issue, you could earn similar rewards if you change your code to
# R bandit(A)
r = np.random.normal(1.25, 0.01, 1)
It makes sense to give each bandit a different reward function or all your action values will be the same. So what you really should do is sample from k different distributions with different expected values. Otherwise action selection is meaningless.
Add this to your init function:
self.expected_vals = np.random.uniform(0, 2, self.k)
and change the calculation of the reward so that it depends on the action:
r = np.random.normal(self.expected_vals[a], 0.5, 1)
I've also increased the spread of the reward to 0.5, as 0.01 is basically meaningless noise in the context of bandits. If your agent works correctly, its average reward should be approximately equal to np.max(self.expected_vals).
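Putting those changes together, a minimal sketch of the per-arm version (the class mirrors the question's code; returning the sampled reward from pull() instead of the current estimate is my own adjustment):

import numpy as np

class k_arm:
    def __init__(self, k=10, eps=0.1):
        self.k = k
        self.eps = eps
        self.qStar = np.zeros(k)   # Q(a): estimated action values
        self.n = np.zeros(k)       # N(a): action counts
        self.expected_vals = np.random.uniform(0, 2, k)  # one true mean per arm

    def pull(self):
        # epsilon-greedy action selection
        if np.random.uniform(0, 1) < self.eps:
            a = np.random.randint(self.k)
        else:
            a = int(self.qStar.argmax())
        # the reward now depends on which arm was pulled
        r = np.random.normal(self.expected_vals[a], 0.5)
        # incremental update: Q(A) <- Q(A) + (R - Q(A)) / N(A)
        self.n[a] += 1
        self.qStar[a] += (r - self.qStar[a]) / self.n[a]
        return r

iters = 1000
bandit = k_arm()
rewards = np.array([bandit.pull() for _ in range(iters)])
print(rewards.mean(), bandit.expected_vals.max())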
I am trying to solve a capacitated facility location problem from the OR-Library dataset named capa.
OR files here
The answer should be:
capa 19240822.449 (capacity 8000)
But I found:
Best objective 3.145815023928e+08
It is surely a problem in my code... Can someone help me?
Maybe I am doing something wrong in the optimization formula, or in the construction of my shipping demand. I really do not know, because it looks correct to me.
Any help would be appreciated... thanks in advance
from itertools import product
from math import sqrt
import gurobipy as gp
from gurobipy import GRB
import time

# Get the numbers of facilities and clients
def getFacilities_Clients(file_list):
    return int(file_list[0]), int(file_list[1])

# Get facility capacities and fixed costs
def getFacilities_STRCapacity_FixedCosts(file_list, num_facilities):
    shift = 2
    capacity = []
    cost = []
    # loop to get all facility costs
    for i in range(0, num_facilities * 2, 2):
        # capacity.append(file_list[i + shift])
        capacity.append(8000)
        cost.append(int(file_list[i + 1 + shift].replace(".", "")))
    return capacity, cost

# Get the demand of each customer j and its allocation cost to each facility i
def getClient_Demand_AllocationCosts(file_list, num_facilities, num_customers):
    shift = 2 + (num_facilities * 2)
    demand = []
    allocation_cost = []
    # loop over all customers
    j = 0
    for r in range(0, num_customers):
        # get demand
        demand.append(int(file_list[j + shift]))
        # loop to get all facility allocation costs
        for i in range(0, num_facilities):
            allocation_cost.append(float(file_list[j + 1 + i + shift]))
        # advance j
        j += num_facilities + 1
    return demand, allocation_cost

# Read file from the OR-Library datasets
fileName = 'datasets/ORcapa'
ORlist = []
with open(fileName, "r") as f:
    ORlist = f.read().split()

##### Sets and indices #####
num_facilities, num_customers = getFacilities_Clients(ORlist)
capacity, fixed_cost = getFacilities_STRCapacity_FixedCosts(ORlist, num_facilities)
cartesian_prod = list(product(range(num_customers), range(num_facilities)))

# shipping costs
demand, alloc_cost = getClient_Demand_AllocationCosts(ORlist, num_facilities, num_customers)
shipping_cost = dict(zip(cartesian_prod, alloc_cost))
shipping_demand = {}
for k, v in shipping_cost.items():
    shipping_demand[k] = v * demand[k[0]]

# setup cost
setup_cost = fixed_cost
# demand of each customer
dc = demand
# maximum production (capacity)
maxp = capacity

start = time.time()

# MIP model formulation
m = gp.Model('CFLP')

##### Decision variables #####
x = m.addVars(num_facilities, vtype=GRB.BINARY, name='x')
y = m.addVars(cartesian_prod, ub=1, vtype=GRB.CONTINUOUS, name='y')

##### Constraints #####
m.addConstrs((y[(c, f)] <= x[f] for c, f in cartesian_prod), name='Shipping')
m.addConstrs((gp.quicksum(y[(c, f)] for f in range(num_facilities)) == 1 for c in range(num_customers)), name='Demand')
m.addConstrs((gp.quicksum(dc[c] * y[(c, f)] for c in range(num_customers)) <= maxp[f] * x[f] for f in range(num_facilities)), name='Capacity')

##### Objective function #####
m.setObjective(x.prod(setup_cost) + y.prod(shipping_demand), GRB.MINIMIZE)

m.Params.Method = 1
# Options are: -1=automatic, 0=primal simplex, 1=dual simplex, 2=barrier, 3=concurrent, 4=deterministic concurrent, 5=deterministic concurrent simplex
m.optimize()
end = time.time()
This solved my issue:
m.setObjective(x.prod(setup_cost) + y.prod(shipping_cost), GRB.MINIMIZE)
I was computing a different value for the shipping: the allocation costs in the OR-Library cap files already cover serving the customer's entire demand, so multiplying them by the demand again (as in shipping_demand) inflates the objective.
I am experimenting with Q-learning using a Super Mario Bros gym environment. I am trying to retrieve the best possible action using np.argmax, which should return an index between 0 and 11, but it is returning values like 224440. It only returns such values sometimes, and it seems to do so more frequently as the program goes on.
I have tried logging the shape of the actions to see if I am making a mistake somewhere else, and I have tried printing almost every value to see if something is being improperly set, but I can't seem to find anything.
Currently I'm catching these improper actions so they don't crash the program and replacing them with a random action. This obviously is not a solution, but it helps for debugging.
from nes_py.wrappers import JoypadSpace
import gym_super_mario_bros
from gym_super_mario_bros.actions import COMPLEX_MOVEMENT
from collections import defaultdict
# imports
import random
import numpy as np

env = gym_super_mario_bros.make('SuperMarioBros-v0')
env = JoypadSpace(env, COMPLEX_MOVEMENT)

Q = np.zeros((240 * 256 * 3, env.action_space.n))  # state size is based on the 3-dimensional values of the screen

# hyper-parameters
epsilon = 0.1
alpha = 0.5  # learning rate
gamma = 0.5  # decay
# number of GAMES
episodes = 500000000000

for episode in range(1, episodes):
    print("Starting episode: " + str(episode))
    state = env.reset()
    finished = False
    # number of steps
    while not finished:
        if random.uniform(0, 1) < epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(Q[state])
        ## FIX THIS!
        if action > 12 or action < 0:
            # print("Random: " + str(np.argmax(Q[state, :])))
            print(action)
            print(Q.shape)
            action = env.action_space.sample()
        new_state, reward, done, info = env.step(action)
        Q[state, action] = Q[state, action] + alpha * (reward + gamma * np.max(Q[new_state, :]) - Q[state, action])
        state = new_state
        env.render()
        if done:
            finished = True
env.close()
It might very well be possible that I am misunderstanding some concepts here as I am still learning and experimenting with this. Any input or help would be greatly appreciated.
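One likely reason for indices like 224440, offered as my own diagnosis rather than something confirmed in the thread: state here is the raw screen, an array of shape (240, 256, 3), so Q[state] does not select one row of the table but fancy-indexes it with every pixel value, and np.argmax then runs over that huge result. A small illustration (the state_index helper is hypothetical, just one possible workaround):

import numpy as np

Q = np.random.rand(240 * 256 * 3, 12)
state = np.random.randint(0, 256, size=(240, 256, 3))  # stand-in for a raw screen
print(Q[state].shape)       # (240, 256, 3, 12) -- one row per pixel value
print(np.argmax(Q[state]))  # a flat index into that array, far above 11

# One possible workaround: reduce each observation to a single row index first.
def state_index(obs, n_states):
    return hash(obs.tobytes()) % n_states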
I'm trying to implement the SARSA algorithm to solve the FrozenLake environment from OpenAI Gym. I've only recently started working with this, but I think I understand it.
I also understand how the SARSA algorithm works; there are many sites where you can find its pseudocode, and I get it. I've implemented the algorithm in my problem following all the steps, but when I check the final Q function after all the episodes I notice that all values tend to zero and I don't know why.
Here is my code, I hope someone can tell me why that happens.
import gym
import random
import numpy as np

env = gym.make('FrozenLake-v0')

# Initialize the Q matrix, 16 (rows) x 4 (columns)
Q = np.zeros([env.observation_space.n, env.action_space.n])
for i in range(env.observation_space.n):
    if (i != 5) and (i != 7) and (i != 11) and (i != 12) and (i != 15):
        for j in range(env.action_space.n):
            Q[i, j] = np.random.rand()

# Epsilon-greedy policy: given a state, the agent chooses the action it believes has the best
# long-term effect with probability 1-eps; otherwise it chooses an action uniformly at random.
# Epsilon may change its value.
bestreward = 0
epsilon = 0.1
discount = 0.99
learning_rate = 0.1
num_episodes = 50000
a = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

for i_episode in range(num_episodes):
    # Observe current state s
    observation = env.reset()
    currentState = observation
    # Select action a using a policy based on Q
    if np.random.rand() <= epsilon:  # pick randomly
        currentAction = random.randint(0, env.action_space.n - 1)
    else:  # pick greedily
        currentAction = np.argmax(Q[currentState, :])
    totalreward = 0
    while True:
        env.render()
        # Carry out action a
        observation, reward, done, info = env.step(currentAction)
        if done is True:
            break
        # Observe reward r and state s'
        totalreward += reward
        nextState = observation
        # Select action a' using a policy based on Q
        if np.random.rand() <= epsilon:  # pick randomly
            nextAction = random.randint(0, env.action_space.n - 1)
        else:  # pick greedily
            nextAction = np.argmax(Q[nextState, :])
        # update Q with the SARSA rule
        Q[currentState, currentAction] += learning_rate * (reward + discount * Q[nextState, nextAction] - Q[currentState, currentAction])
        currentState = nextState
        currentAction = nextAction
    print "Episode: %d reward %d best %d epsilon %f" % (i_episode, totalreward, bestreward, epsilon)
    if totalreward > bestreward:
        bestreward = totalreward
    if i_episode > num_episodes / 2:
        epsilon = epsilon * 0.9999
    if i_episode >= num_episodes - 10:
        a.insert(0, totalreward)
        a.pop()

print a
for i in range(env.observation_space.n):
    print "-----"
    for j in range(env.action_space.n):
        print Q[i, j]
When an episode ends you are breaking out of the while loop before updating the Q function. Therefore, when the reward received by the agent is non-zero (i.e. the agent has reached the goal state), the Q function is never updated with that reward.
You should check for the end of the episode at the end of the while loop body, after the Q update.
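As a rough sketch of what this suggests, using the question's variable names, the inner loop would perform the update first and only then break on done:

while True:
    env.render()
    # Carry out action a and observe r and s'
    observation, reward, done, info = env.step(currentAction)
    totalreward += reward
    nextState = observation
    # Select action a' with the epsilon-greedy policy based on Q
    if np.random.rand() <= epsilon:
        nextAction = random.randint(0, env.action_space.n - 1)
    else:
        nextAction = np.argmax(Q[nextState, :])
    # SARSA update (now also applied to the transition into the terminal state)
    Q[currentState, currentAction] += learning_rate * (reward + discount * Q[nextState, nextAction] - Q[currentState, currentAction])
    currentState = nextState
    currentAction = nextAction
    # check for the end of the episode only after the update
    if done:
        break

Because the Q values of the terminal states are initialised to zero and never randomised, Q[nextState, nextAction] is zero on that last transition, so the update correctly backs up only the terminal reward.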