I have a custom environment for keras-rl with the following configuration in the constructor:
def __init__(self, data):
#Declare the episode as the first episode
self.episode=1
#Initialize data
self.data=data
#Declare low and high as vectors with -inf values
self.low = numpy.array([-numpy.inf])
self.high = numpy.array([+numpy.inf])
self.observation_space = spaces.Box(self.low, self.high, dtype=numpy.float32)
#Define the space of actions as 3 (I want them to be 0, 1 and 2)
self.action_space = spaces.Discrete(3)
self.currentObservation = 0
self.limit = len(data)
#Initiates the values to be returned by the environment
self.reward = None
As you can see, my agent can perform 3 actions; depending on the action, a different reward is calculated in the step() function below:
def step(self, action):
assert self.action_space.contains(action)
#Initiates the reward
self.reward=0
#get the reward
self.possibleGain = self.data.iloc[self.currentObservation]['delta_next_day']
#If action is 1, calculate the reward
if(action == 1):
self.reward = self.possibleGain-self.operationCost
#If action is 2, calculate the reward as negative
elif(action==2):
self.reward = (-self.possibleGain)-self.operationCost
#If action is 0, no reward
elif(action==0):
self.reward = 0
#Finish episode
self.done=True
self.episode+=1
self.currentObservation+=1
if(self.currentObservation>=self.limit):
self.currentObservation=0
#Return the state, reward and if its done or not
return self.getObservation(), self.reward, self.done, {}
The problem is that if I print the actions at every episode, they are 0, 2, and 4. I want them to be 0, 1, and 2. How can I force the agent to recognize only these 3 actions with keras-rl?
I am not sure why self.action_space = spaces.Discrete(3) is giving you actions 0, 2, and 4, since I cannot reproduce the error with the code snippet you posted. As a workaround, I would suggest the following for defining your action space:
self.action_space = gym.spaces.Box(low=np.array([1]), high=np.array([3]), dtype=np.int64)
And this is what I get when I sample from the action space:
actions = gym.spaces.Box(low=np.array([1]), high=np.array([3]), dtype=np.int64)
for i in range(10):
print(actions.sample())
[1]
[3]
[2]
[2]
[3]
[3]
[1]
[1]
[2]
[3]
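If you want to stay with spaces.Discrete(3), one thing worth double-checking on the keras-rl side is that the agent's nb_actions and the model's output layer are tied to env.action_space.n, since the agent can only emit action indices in that range. Below is a minimal sketch, not a confirmed fix for the 0/2/4 behaviour: it assumes env is an instance of the custom environment above, the network architecture is purely illustrative, and the imports assume keras-rl2 with tf.keras (adjust to plain keras for the original keras-rl).
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.optimizers import Adam
from rl.agents.dqn import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory

nb_actions = env.action_space.n  # 3, so the agent can only pick 0, 1 or 2

# The final Dense layer must have exactly nb_actions units.
model = Sequential([
    Flatten(input_shape=(1,) + env.observation_space.shape),
    Dense(16, activation='relu'),
    Dense(nb_actions, activation='linear'),
])

agent = DQNAgent(model=model, nb_actions=nb_actions,
                 memory=SequentialMemory(limit=50000, window_length=1),
                 policy=BoltzmannQPolicy(),
                 nb_steps_warmup=100, target_model_update=1e-2)
agent.compile(Adam(1e-3), metrics=['mae'])
agent.fit(env, nb_steps=10000, visualize=False, verbose=1)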
I am attempting to create a small working example of how to use MultiDiscrete action spaces together with a Box observation space. One of the problems I have run into is that the dimension returned by a normal policy does not fit the Box dimensions: the base policy returns something of size 25, whereas I need something that is (5, 5).
I have tried to alleviate this problem by generating a custom "policy" (actually a network) where I, as the last step, reshape the output to (5,5) rather than 25. This has resulted in an array of problems. I have attempted to read the documentation for how to create custom policies; however, I cannot for the life of me find the issue.
1. I have attempted to use policy_kwargs; however, I don't know how to specify that the network output should be reshaped.
2. I have attempted to use a BaseFeaturesExtractor, with no luck either.
3. Various combinations of 1 and 2.
I have included some of the error messages that I get for the various different attempts that I have made. Does anyone know what I am missing? Is it something completely fundamental that I have misunderstood?
import numpy as np
import gym
import torch.nn as nn
import torch as th
from stable_baselines3 import PPO
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor # don't know if this is necessary
# -------- Attempt using BaseFeaturesExtractor
# class CustomPolicy(BaseFeaturesExtractor): # Don't know if BaseFeaturesExtractor is correct
# def __init__(self, observation_space, action_space, features_dim: int = 25): # Features should perhaps be (5,5)
# super().__init__(observation_space, features_dim)
# --------
# Define a custom neural network architecture
class CustomPolicy():
def __init__(self, observation_space, action_space):
super().__init__()
# Define the layers of the neural network
self.fc1 = nn.Linear(observation_space.shape[0], 64)
self.fc2 = nn.Linear(64, 64)
self.fc3 = nn.Linear(64, action_space.shape[0])
# Reshape the output to match the Box observation space shape
def forward(self, x):
x = nn.functional.relu(self.fc1(x))
x = nn.functional.relu(self.fc2(x))
x = self.fc3(x)
x = th.reshape(x, (5, 5))
return x
# Define the grid world environment
class GridWorldEnv(gym.Env):
def __init__(self):
self.observation_space = gym.spaces.Box(low=0, high=1, shape=(5, 5), dtype=np.float32)
self.action_space = gym.spaces.MultiDiscrete([5, 3]) # 5 movement directions, 3 movement distances
self.state = np.zeros((5, 5))
self.state[0, 0] = 1 # Start location
self.goal = (4, 4) # Goal location
self.steps = 0
self.state.flatten()
def reset(self):
self.state = np.zeros((5, 5))
self.state[0, 0] = 1 # Start location
self.goal = (4, 4) # Goal location
self.steps = 0
return self.state.flatten()
def step(self, action):
direction, distance = action
reward = -1
done = False
# Calculate the movement offset based on the selected direction and distance
if direction == 0:
offset = (distance, 0)
elif direction == 1:
offset = (-distance, 0)
elif direction == 2:
offset = (0, distance)
elif direction == 3:
offset = (0, -distance)
else:
offset = (0, 0)
# Calculate the new position based on the current position and movement offset
current_pos = np.argwhere(self.state == 1)[0]
new_pos = tuple(np.clip(current_pos + np.array(offset), 0, 4))
# Update the state with the new position
self.state[current_pos] = 0
self.state[new_pos] = 1
# Check if the agent has reached the goal
if np.argmax(self.state) == np.ravel_multi_index(self.goal, self.state.shape):
reward = 10
done = True
# Increment step count and check if episode should end
self.steps += 1
if self.steps >= 50:
done = True
return self.state, reward, done, {}
# Press the green button in the gutter to run the script.
if __name__ == '__main__':
# Create an environment with the CustomEnv environment
env = GridWorldEnv()
# Create policy
policy = CustomPolicy(env.observation_space, env.action_space)
# Create a PPO agent with the CustomPolicy
model = PPO(policy=policy, env=env, verbose=1)
# --------- TypeError: 'CustomPolicy' object is not callable
# --------- Attempt at using policy_kwargs
# policy_kwargs = dict(activation_fn=th.nn.ReLU,
# net_arch=dict(pi=[32, 32], vf=[32, 32]))
# model = PPO("MlpPolicy", env=env, verbose=1, policy_kwargs=policy_kwargs)
# --------- ValueError: could not broadcast input array from shape (25,) into shape (5,5)
# --------- Attempt at using policy_kwargs with custom policy
# policy_kwargs = dict(
# features_extractor_class=CustomPolicy,
# features_extractor_kwargs=dict(features_dim=25), # should perhaps be (5,5)
# )
# model = PPO(policy=policy, env=env, verbose=1, policy_kwargs=policy_kwargs)
# --------- TypeError: CustomPolicy.forward() got an unexpected keyword argument 'use_sde'
# Train the agent for 1000 steps
model.learn(total_timesteps=1000)
Thank you in advance.
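One way to read the errors above (offered as a sketch, not a definitive answer): in stable-baselines3 the action heads for a MultiDiscrete space are built by the policy itself, so a custom network usually only needs to map the observation to a flat feature vector; reshaping the output to (5, 5) is not required. Here is a minimal sketch of the BaseFeaturesExtractor route, assuming reset()/step() return observations that actually match the declared (5, 5) Box (i.e. self.state rather than self.state.flatten()); the class name GridFeatures and features_dim=64 are arbitrary choices.
import gym
import numpy as np
import torch as th
import torch.nn as nn
from stable_baselines3 import PPO
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor

class GridFeatures(BaseFeaturesExtractor):
    """Flattens the (5, 5) Box observation and maps it to features_dim features."""
    def __init__(self, observation_space: gym.spaces.Box, features_dim: int = 64):
        super().__init__(observation_space, features_dim)
        n_input = int(np.prod(observation_space.shape))  # 25
        self.net = nn.Sequential(
            nn.Flatten(),
            nn.Linear(n_input, 64), nn.ReLU(),
            nn.Linear(64, features_dim), nn.ReLU(),
        )

    def forward(self, observations: th.Tensor) -> th.Tensor:
        return self.net(observations)

env = GridWorldEnv()  # the environment defined above
model = PPO(
    "MlpPolicy",
    env,
    policy_kwargs=dict(
        features_extractor_class=GridFeatures,
        features_extractor_kwargs=dict(features_dim=64),
    ),
    verbose=1,
)
model.learn(total_timesteps=1000)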
I have code that trains a DQN in a distributed setting. When using a standard replay buffer that all the workers push their experiences to, the code runs perfectly fine.
However, when I switch to a prioritised replay buffer things start to slow down massively. The code for the two buffers is given below:
@ray.remote
class PrioritizedReplayBuffer:
def __init__(self, capacity, alpha=0.6, beta=0.4, beta_increment_per_sampling=0.001, batch_size=128):
self.capacity = capacity
self.alpha = alpha
self.beta = beta
self.beta_increment_per_sampling = beta_increment_per_sampling
self.buffer = []
self.pos = 0
self.priorities = []
self.batch_size = batch_size
def push(self, data):
for experience in data:
max_priority = max(self.priorities) if self.buffer else 1.0
if len(self.buffer) < self.capacity:
self.buffer.append(experience)
self.priorities.append(max_priority)
else:
self.buffer[self.pos] = experience
self.priorities[self.pos] = max_priority
self.pos = (self.pos + 1) % self.capacity
def sample(self):
start = time.time()
N = len(self.buffer)
if N == self.capacity:
priorities = np.array(self.priorities)
else:
priorities = np.array(self.priorities[:self.pos])
self.beta = min(1.0, self.beta + self.beta_increment_per_sampling)
sampling_probabilities = priorities ** self.alpha
sampling_probabilities = sampling_probabilities / sampling_probabilities.sum()
indices = random.choices(range(N), k=self.batch_size, weights=sampling_probabilities)
experiences = [self.buffer[idx] for idx in indices]
weights = np.array([(self.capacity * priorities[i]) ** -self.beta for i in indices])
weights = weights / weights.max()
end = time.time()
print(f"sampling took {(end - start) / 60} minutes")
return experiences, np.array(indices), weights
def update_priorities(self, indices, priorities):
for idx, priority in zip(indices, priorities):
self.priorities[idx] = priority
def __len__(self):
return len(self.buffer)
@ray.remote
class ReplayBuffer:
def __init__(self, capacity, batch_size=128):
self.capacity = capacity
self.buffer = []
self.batch_size = batch_size
def push(self, data):
for experience in data:
self.buffer.append(experience)
def sample(self):
return random.sample(self.buffer, self.batch_size)
def __len__(self):
return len(self.buffer)
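For readers unfamiliar with Ray actors, here is a minimal usage sketch of a buffer decorated with @ray.remote, assuming the class definitions above (and their imports) are available; the transition tuple is just a placeholder.
import ray
ray.init(ignore_reinit_error=True)

# With @ray.remote applied, the buffer lives in its own process and all
# interaction goes through .remote() handles.
buffer = ReplayBuffer.remote(capacity=100_000, batch_size=128)

# Workers push lists of transitions; a learner samples asynchronously.
dummy_transition = (0, 1, -1.0, 1, False)  # (state, action, reward, next_state, done)
ray.get(buffer.push.remote([dummy_transition] * 256))
batch = ray.get(buffer.sample.remote())
print(len(batch))  # 128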
The code for my workers looks like this:
@ray.remote
class Actor(object):
def __init__(self, state_dim, action_dim, exploration_decay, exploration_min, worker_id=None, replay_buffer=None, param_server=None, push_size=20, num_grad_steps=1e6):
self.worker_id = worker_id
self.env = gym.make('LunarLander-v2')
self.net = Net(state_dim, action_dim)
# get ray_remote objects; centralized buffer and parameter server
self.replay_buffer = replay_buffer
self.param_server = param_server
self.push_size = push_size # this is how much data we need until we push to the centralised buffer
self.num_grad_steps = num_grad_steps
self.epsilon = 1
self.exploration_decay = exploration_decay
self.exploration_min = exploration_min
self.action_dim = action_dim
def act(self, state):
if np.random.uniform() < self.epsilon:
self.epsilon = max(self.epsilon * self.exploration_decay, self.exploration_min)
return np.random.randint(0, self.action_dim)
else:
state = torch.FloatTensor(state)
with torch.no_grad():
values = self.net(state)
action = torch.argmax(values)
return int(action)
def sync_with_param_server(self):
new_actor_params = ray.get(self.param_server.return_params.remote())
for param in new_actor_params:
new_actor_params[param] = torch.from_numpy(new_actor_params[param]).float()
self.net.load_state_dict(new_actor_params)
def run(self):
state = self.env.reset()
episode_reward = 0
episode = 0
ep_length = 0
grad_steps = 0
intermediate_memory = [] # this is what we will push to the buffer at once
while grad_steps < self.num_grad_steps:
ep_length += 1
action = self.act(state)
next_state, reward, done, _ = self.env.step(action)
intermediate_memory.append((state, action, reward, next_state, done))
if len(intermediate_memory) >= self.push_size:
self.replay_buffer.push.remote(intermediate_memory)
intermediate_memory = []
self.sync_with_param_server()
grad_steps = ray.get(self.param_server.return_grad_steps.remote())
# time.sleep(60 * 5)
episode_reward += reward
if done:
# print results locally
# print(f"Episode {episode}: {episode_reward}")
# print_status(self.env, time_step)
# prepare new rollout
episode += 1
episode_reward = 0
ep_length = 0
next_state = self.env.reset()
state = next_state
I've narrowed the problem down somewhat -- when I uncomment the sleep command in the actor, the speed of the code goes back to normal once the sleep is in effect, i.e. when the actors aren't pushing any data to the buffer. What I don't understand is why actors pushing to the prioritised replay buffer would slow down the learning steps, when it makes no difference for the vanilla replay buffer.
Any help pinpointing what is causing the problem and how to fix it would be greatly appreciated.
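One place that may be worth profiling (a guess, not a diagnosis): push recomputes max(self.priorities) once per experience and sample builds size-N Python lists on every call, so both get slower as the buffer fills, whereas the vanilla buffer's push is O(1). Below is a sketch of the same buffer with the priorities held in a preallocated NumPy array and the running maximum tracked incrementally; the importance weights here follow the standard (N * P(i))^-beta form. Re-add the @ray.remote decorator if it is used as an actor.
import numpy as np

class PrioritizedReplayBufferFast:
    """Same interface as above, but with O(1) pushes and vectorised sampling."""

    def __init__(self, capacity, alpha=0.6, beta=0.4,
                 beta_increment_per_sampling=0.001, batch_size=128):
        self.capacity = capacity
        self.alpha = alpha
        self.beta = beta
        self.beta_increment_per_sampling = beta_increment_per_sampling
        self.batch_size = batch_size
        self.buffer = [None] * capacity
        self.priorities = np.zeros(capacity, dtype=np.float64)
        self.pos = 0
        self.size = 0
        self.max_priority = 1.0  # running maximum, updated incrementally

    def push(self, data):
        for experience in data:
            self.buffer[self.pos] = experience
            self.priorities[self.pos] = self.max_priority
            self.pos = (self.pos + 1) % self.capacity
            self.size = min(self.size + 1, self.capacity)

    def sample(self):
        self.beta = min(1.0, self.beta + self.beta_increment_per_sampling)
        scaled = self.priorities[:self.size] ** self.alpha
        probs = scaled / scaled.sum()
        indices = np.random.choice(self.size, self.batch_size, p=probs)
        experiences = [self.buffer[i] for i in indices]
        weights = (self.size * probs[indices]) ** -self.beta
        weights = weights / weights.max()
        return experiences, indices, weights

    def update_priorities(self, indices, priorities):
        self.priorities[indices] = priorities
        self.max_priority = max(self.max_priority, float(np.max(priorities)))

    def __len__(self):
        return self.size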
PROBLEM
I'm writing a Monte-Carlo tree search algorithm to play chess in Python. I replaced the simulation stage with a custom evaluation function. My code looks correct to me, but for some reason it behaves strangely. It recognizes instant wins easily enough, but it cannot recognize checkmate-in-2 and checkmate-in-3 positions. Any ideas?
WHAT I'VE TRIED
I tried giving it more time to search but it still cannot find the best move even when it leads to a guaranteed win in two moves. However, I noticed that results improve when I turn off the custom evaluation and use classic Monte Carlo Tree Search simulation. (To turn off custom evaluation, just don't pass any arguments into the Agent constructor.) But I really need it to work with custom evaluation because I am working on a machine learning technique for board evaluation.
I tried printing out the results of the searches to see which moves the algorithm thinks are good. It consistently ranks the best move in mate-in-2 and mate-in-3 situations among the worst. The rankings are based on the number of times the move was explored (which is how MCTS picks the best moves).
MY CODE
I've included the whole code because everything is relevant to the problem. To run this code, you may need to install python-chess (pip install python-chess).
I've struggled with this for more than a week and it's getting frustrating. Any ideas?
import math
import random
import time
import chess
import chess.engine
class Node:
def __init__(self, state, parent, action):
"""Initializes a node structure for a Monte-Carlo search tree."""
self.state = state
self.parent = parent
self.action = action
self.unexplored_actions = list(self.state.legal_moves)
random.shuffle(self.unexplored_actions)
self.colour = self.state.turn
self.children = []
self.w = 0 # number of wins
self.n = 0 # number of simulations
class Agent:
def __init__(self, custom_evaluation=None):
"""Initializes a Monte-Carlo tree search agent."""
if custom_evaluation:
self._evaluate = custom_evaluation
def mcts(self, state, time_limit=float('inf'), node_limit=float('inf')):
"""Runs Monte-Carlo tree search and returns an evaluation."""
nodes_searched = 0
start_time = time.time()
# Initialize the root node.
root = Node(state, None, None)
while (time.time() - start_time) < time_limit and nodes_searched < node_limit:
# Select a leaf node.
leaf = self._select(root)
# Add a new child node to the tree.
if leaf.unexplored_actions:
child = self._expand(leaf)
else:
child = leaf
# Evaluate the node.
result = self._evaluate(child)
# Backpropagate the results.
self._backpropagate(child, result)
nodes_searched += 1
result = max(root.children, key=lambda node: node.n)
return result
def _uct(self, node):
"""Returns the Upper Confidence Bound 1 of a node."""
c = math.sqrt(2)
# We want every WHITE node to choose the worst BLACK node and vice versa.
# Scores for each node are relative to that colour.
w = node.n - node.w
n = node.n
N = node.parent.n
try:
ucb = (w / n) + (c * math.sqrt(math.log(N) / n))
except ZeroDivisionError:
ucb = float('inf')
return ucb
def _select(self, node):
"""Returns a leaf node that either has unexplored actions or is a terminal node."""
while (not node.unexplored_actions) and node.children:
# Pick the child node with highest UCB.
selection = max(node.children, key=self._uct)
# Move to the next node.
node = selection
return node
def _expand(self, node):
"""Adds one child node to the tree."""
# Pick an unexplored action.
action = node.unexplored_actions.pop()
# Create a copy of the node state.
state_copy = node.state.copy()
# Carry out the action on the copy.
state_copy.push(action)
# Create a child node.
child = Node(state_copy, node, action)
# Add the child node to the list of children.
node.children.append(child)
# Return the child node.
return child
def _evaluate(self, node):
"""Returns an evaluation of a given node."""
# If no custom evaluation function was passed into the object constructor,
# use classic simulation.
return self._simulate(node)
def _simulate(self, node):
"""Randomly plays out to the end and returns a static evaluation of the terminal state."""
board = node.state.copy()
while not board.is_game_over():
# Pick a random action.
move = random.choice(list(board.legal_moves))
# Perform the action.
board.push(move)
return self._calculate_static_evaluation(board)
def _backpropagate(self, node, result):
"""Updates a node's values and subsequent parent values."""
# Update the node's values.
node.w += result.pov(node.colour).expectation()
node.n += 1
# Back up values to parent nodes.
while node.parent is not None:
node.parent.w += result.pov(node.parent.colour).expectation()
node.parent.n += 1
node = node.parent
def _calculate_static_evaluation(self, board):
"""Returns a static evaluation of a *terminal* board state."""
result = board.result(claim_draw=True)
if result == '1-0':
wdl = chess.engine.Wdl(wins=1000, draws=0, losses=0)
elif result == '0-1':
wdl = chess.engine.Wdl(wins=0, draws=0, losses=1000)
else:
wdl = chess.engine.Wdl(wins=0, draws=1000, losses=0)
return chess.engine.PovWdl(wdl, chess.WHITE)
def custom_evaluation(node):
"""Returns a static evaluation of a board state."""
board = node.state
# Evaluate terminal states.
if board.is_game_over(claim_draw=True):
result = board.result(claim_draw=True)
if result == '1-0':
wdl = chess.engine.Wdl(wins=1000, draws=0, losses=0)
elif result == '0-1':
wdl = chess.engine.Wdl(wins=0, draws=0, losses=1000)
else:
wdl = chess.engine.Wdl(wins=0, draws=1000, losses=0)
return chess.engine.PovWdl(wdl, chess.WHITE)
# Evaluate material.
material_balance = 0
material_balance += len(board.pieces(chess.PAWN, chess.WHITE)) * +100
material_balance += len(board.pieces(chess.PAWN, chess.BLACK)) * -100
material_balance += len(board.pieces(chess.ROOK, chess.WHITE)) * +500
material_balance += len(board.pieces(chess.ROOK, chess.BLACK)) * -500
material_balance += len(board.pieces(chess.KNIGHT, chess.WHITE)) * +300
material_balance += len(board.pieces(chess.KNIGHT, chess.BLACK)) * -300
material_balance += len(board.pieces(chess.BISHOP, chess.WHITE)) * +300
material_balance += len(board.pieces(chess.BISHOP, chess.BLACK)) * -300
material_balance += len(board.pieces(chess.QUEEN, chess.WHITE)) * +900
material_balance += len(board.pieces(chess.QUEEN, chess.BLACK)) * -900
# TODO: Evaluate mobility.
mobility = 0
# Aggregate values.
centipawn_evaluation = material_balance + mobility
# Convert evaluation from centipawns to wdl.
wdl = chess.engine.Cp(centipawn_evaluation).wdl(model='lichess')
static_evaluation = chess.engine.PovWdl(wdl, chess.WHITE)
return static_evaluation
m1 = chess.Board('8/8/7k/8/8/8/5R2/6R1 w - - 0 1') # f2h2
# WHITE can win in one move. Best move is f2-h2.
m2 = chess.Board('8/6k1/8/8/8/8/1K2R3/5R2 w - - 0 1')
# WHITE can win in two moves. Best move is e2-g2.
m3 = chess.Board('8/8/5k2/8/8/8/3R4/4R3 w - - 0 1')
# WHITE can win in three moves. Best move is d2-f2.
agent = Agent(custom_evaluation)
result = agent.mcts(m2, time_limit=30)
print(result)
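A small diagnostic that uses only attributes already defined above: mcts() returns the most-visited child of the root, so result.parent is the root itself, and its children can be ranked the same way mcts() ranks them (by visit count).
print('chosen move:', result.action)
for child in sorted(result.parent.children, key=lambda c: c.n, reverse=True):
    print(child.action, 'visits:', child.n, 'wins:', round(child.w, 1))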
This part is the action transition probability:
def _calculate_transition_prob(self, current, delta):
new_position = np.array(current) + np.array(delta)
new_position =self._limit_coordinates(new_position).astype(int)
new_state = np.ravel_multi_index(tuple(new_position), self.shape)
reward = self.reward
is_done = self._cliff[tuple(new_position)] or (tuple(new_position) == (4,11))
return [(1.0, new_state, reward, is_done)]
In this part I want to use the reward function as an argument:
def reward(reward, self):
self.reward = -100.0 if self._cliff[tuple(new_position)] else -1.0
return reward
This part is the Q-learning (RL) algorithm:
def q_learning(env, num_episodes, discount_factor=1.0, alpha=0.5, epsilon=0.1):
Q = defaultdict(lambda: np.zeros(env.action_space.n))
episode_lengths = np.zeros(num_episodes)
episode_rewards = np.zeros(num_episodes)
policy = epsilon_greedy_policy(Q, epsilon, env.action_space.n)
for i_episode in range(num_episodes):
state = env.reset()
for t in itertools.count():
action_probs = policy(state)
action = np.random.choice(np.arange(len(action_probs)), p = action_probs)
next_state, reward, done, _ = env.step(action)
episode_rewards[i_episode] += reward
episode_lengths[i_episode] = t
Look at what you're doing with that statement: you try to add the function object reward to the left-hand side. What does it mean to add a function object to something? You need to write your code more clearly, so that you don't confuse the local reward variable with the visible reward() function.
I suspect that what you need is the return value from the function -- you would need to call it. Again, I recommend that you give the variable and function separate names.
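A minimal sketch of what that looks like in practice. The names here (TinyCliff, default_reward, reward_fn) are illustrative, not from the original code; the point is only that the environment stores a reward function and calls it where a numeric reward is needed.
import numpy as np

def default_reward(on_cliff):
    """-100 for stepping onto the cliff, -1 for any other move."""
    return -100.0 if on_cliff else -1.0

class TinyCliff:
    def __init__(self, reward_fn=default_reward):
        self.shape = (4, 12)
        self._cliff = np.zeros(self.shape, dtype=bool)
        self._cliff[3, 1:-1] = True        # cliff cells along the bottom row
        self.reward_fn = reward_fn         # store the callable itself

    def _calculate_transition_prob(self, current, delta):
        new_position = np.clip(np.array(current) + np.array(delta),
                               [0, 0], [self.shape[0] - 1, self.shape[1] - 1])
        new_state = np.ravel_multi_index(tuple(new_position), self.shape)
        on_cliff = bool(self._cliff[tuple(new_position)])
        reward = self.reward_fn(on_cliff)  # call it here to get a number
        is_done = on_cliff or tuple(new_position) == (3, 11)
        return [(1.0, new_state, reward, is_done)]

env = TinyCliff()
print(env._calculate_transition_prob((3, 0), (0, 1)))  # step onto the cliff -> reward -100.0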
import random
import scipy
import networkx as nx
from collections import defaultdict
from collections import Counter
def test_transmission(u, v, p):
return random.random()<p
def discrete_SIR(G,w,initial_infecteds,beta,Vl,duration):
if G.has_node(initial_infecteds):
initial_infecteds=[initial_infecteds]
N=G.order()
#t = [tmin]
S = [N-len(initial_infecteds)]
#I = [len(initial_infecteds)]
R = [0]
V = [0]
susceptible = defaultdict(lambda: True)
#above line is equivalent to u.susceptible=True for all nodes.
for u in initial_infecteds:
susceptible[u] = False
infecteds = [{}]*duration #bunch of empty sets
infecteds[0] = set(initial_infecteds)
I = [sum(map(len, infecteds))] #set I[0] to be the total number of infections
while I[-1]>0 :
new_infecteds = set()
vaccinated= set()
for u in infecteds:
for v in G.neighbors(u):
if len(vaccinated)+V[-1]< (Vl*N) : #check if vaccination over or not
if susceptible[v] and test_transmission(u, v, w):
vaccinated.add(v)
susceptible[v] = False
# print('transmitting vaccination')
elif susceptible[v] and test_transmission(u,v,beta):
new_infecteds.add(v)
susceptible[v]=False
# print('transmitting infection')
else:
# print("BYE")
if susceptible[v] and test_transmission(u, v,beta):
new_infecteds.add(v)
susceptible[v] = False
#infector[v] = [u]
recovering_nodes = infecteds.pop()
infecteds.insert(0,new_infecteds)
infecteds = new_infecteds
I.append(sum(map(len, infecteds)))
R.append(R[-1]+I[-1])
V.append(len(vaccinated)+V[-1])
S.append(N-V[-1]-I[-1]-R[-1])
return scipy.array(S),scipy.array(V), scipy.array(I),scipy.array(R)
m=100
w=0.2
#ran=nx.gnp_random_graph(100,0.003)
G=nx.grid_2d_graph(m,m,periodic=True)
initial_infections = [(u,v) for (u,v) in G if u==int(m/2) and v==int(m/2)]
S, V, I, R = discrete_SIR(G,w,initial_infecteds=initial_infections,beta=0.5,Vl=1,duration=8)
This is code for an SIR model, but it is for a recovery rate of 1. I want to change the code so that the recovery rate is a variable parameter rather than the default of 1, and I have tried to modify the code to include that. The base code is a standard SIR model.
I added the changes from Joel's post to my modified SIR model.
For bookkeeping:
next_time = t[-1]+1
if next_time <= tmax:
for i in infecteds:
for u in i:
node_history[u][0].append(next_time+duration)
node_history[u][1].append('R')
for j in new_infecteds:
for v in j:
node_history[v][0].append(next_time)
node_history[v][1].append('I')
Let infecteds be a list of sets, such that infecteds[T] are those just infected, infecteds[T-1] are those that have been infected for 1 time step, etc. Then pop off infecteds[0] [e.g., recovering_nodes = infecteds.pop(0)] and append the newly infected nodes to the list.
Then for each time step, just iterate through all the sets in infecteds.
Here's some relevant pseudocode:
duration = 8
infecteds = [set() for _ in range(duration)] # a bunch of distinct empty sets
infecteds[0] = {1,2,3}
I = [sum(map(len, infecteds))] #set I[0] to be the total number of infections
while I[-1] >0:
new_infecteds = set()
for infected_set in infecteds:
for infected_node in infected_set:
Do some stuff with the node and its neighbors.
new_infecteds gets some things added to it.
recovering_nodes = infecteds.pop()
infecteds.insert(0,new_infecteds)
for node in recovering_nodes:
update status and do any bookkeeping.
I.append(sum(map(len, infecteds)))
Be careful about your use of the word "rate". A higher rate should mean faster recovery, and thus shorter duration (duration is like 1/rate). Your comment seems to use the word "rate" to mean "duration", so that for you a higher "rate" is actually a longer "duration". This is the inverse of what most people would understand you to mean.
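To make that bookkeeping concrete, here is a runnable toy version of the list-of-sets idea. The infection step is replaced by a stub (each infected node infects one new node with probability p_spread), so only the fixed-duration recovery mechanics are shown; all names are illustrative.
import random

def run_toy(duration=3, p_spread=0.5, max_steps=20):
    infecteds = [set() for _ in range(duration)]   # index 0 = newest cohort
    infecteds[0] = {1, 2, 3}                       # initially infected nodes
    I = [sum(map(len, infecteds))]                 # total currently infected
    recovered = set()
    next_node = 4
    while I[-1] > 0 and len(I) <= max_steps:
        new_infecteds = set()
        for cohort in infecteds:
            for _node in cohort:
                if random.random() < p_spread:     # stub for "infect a susceptible neighbour"
                    new_infecteds.add(next_node)
                    next_node += 1
        recovering_nodes = infecteds.pop()         # oldest cohort recovers after `duration` steps
        recovered |= recovering_nodes
        infecteds.insert(0, new_infecteds)         # newest cohort goes to the front
        I.append(sum(map(len, infecteds)))
    return I, sorted(recovered)

print(run_toy())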