Source code for rlgraph.components.explorations.exploration

# Copyright 2018 The RLgraph authors. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from rlgraph import get_backend
from rlgraph.utils.rlgraph_errors import RLGraphError
from rlgraph.components.component import Component
from rlgraph.components.explorations.epsilon_exploration import EpsilonExploration
from rlgraph.components.common.noise_components import NoiseComponent
from rlgraph.spaces import IntBox, FloatBox
from rlgraph.spaces.space_utils import sanity_check_space
from rlgraph.utils.util import dtype
from rlgraph.utils.decorators import rlgraph_api, graph_fn

if get_backend() == "tf":
    import tensorflow as tf
elif get_backend() == "pytorch":
    import torch

class Exploration(Component):
    """
    A Component that can be plugged on top of a Policy's output to produce action choices.

    Depending on the constructor arguments, it applies either epsilon-based exploration (random actions are
    swapped in per batch item) or additive noise to the incoming actions, or passes them through unchanged.
    Epsilon exploration and noise cannot be combined.
    """
    def __init__(self, epsilon_spec=None, noise_spec=None, scope="exploration", **kwargs):
        """
        Args:
            epsilon_spec (any): The spec or Component object itself to construct an EpsilonExploration Component.
            noise_spec (dict): The specification dict for a noise generator that adds noise to the NN's output.

        Raises:
            RLGraphError: If both `epsilon_spec` and `noise_spec` are given.
        """
        super(Exploration, self).__init__(scope=scope, **kwargs)

        # The actual action space (may not have batch-rank, just the plain space).
        # Set in `check_input_spaces`.
        self.action_space = None

        self.epsilon_exploration = None
        self.noise_component = None

        # For define-by-run sampling (lazily created in the pytorch branch of `_graph_fn_pick`).
        self.sample_obj = None

        # Don't allow both epsilon and noise component.
        if epsilon_spec and noise_spec:
            raise RLGraphError("Cannot use both epsilon exploration and a noise component at the same time.")

        # The `get_action` API-method is defined per configuration below, so that its signature and body
        # match the chosen exploration type.

        # Add epsilon component.
        if epsilon_spec:
            self.epsilon_exploration = EpsilonExploration.from_spec(epsilon_spec)
            self.add_components(self.epsilon_exploration)

            # Define our interface.
            @rlgraph_api(component=self)
            def get_action(self, actions, time_step, use_exploration=True):
                """
                Action depends on time-step (e.g. epsilon-decay).
                """
                epsilon_decisions = self.epsilon_exploration.do_explore(actions, time_step)
                return self._graph_fn_pick(use_exploration, epsilon_decisions, actions)

        # Add noise component.
        elif noise_spec:
            self.noise_component = NoiseComponent.from_spec(noise_spec)
            self.add_components(self.noise_component)

            @rlgraph_api(component=self)
            def get_action(self, actions, time_step=0, use_exploration=True):
                """
                Noise is added to the sampled action.
                """
                noise = self.noise_component.get_noise()
                return self._graph_fn_add_noise(use_exploration, noise, actions)

        # Don't explore at all. Simple pass-through.
        else:
            @rlgraph_api(component=self)
            def get_action(self, actions, time_step=0, use_exploration=False):
                """
                Action is returned as is.
                """
                return actions

    def check_input_spaces(self, input_spaces, action_space=None):
        """
        Validates the incoming "actions" space and stores the plain action space on this Component.

        Args:
            input_spaces (dict): The input spaces; only the "actions" entry is read here.
            action_space (Space): The action space to store in `self.action_space`. Must not be None.

        Raises:
            RLGraphError: If both an epsilon-exploration and a noise component are set.
        """
        action_sample_space = input_spaces["actions"]

        # The batch-rank requirement is only enforced for the tf backend.
        if get_backend() == "tf":
            sanity_check_space(action_sample_space, must_have_batch_rank=True)

        assert action_space is not None, "`action_space` must be provided to Exploration.check_input_spaces()!"
        self.action_space = action_space

        if self.epsilon_exploration and self.noise_component:
            # Check again at graph creation? This is currently redundant to the check in __init__
            raise RLGraphError("Cannot use both epsilon exploration and a noise component at the same time.")

        if self.epsilon_exploration:
            # Epsilon exploration draws random category indices -> needs an IntBox with categories.
            sanity_check_space(self.action_space, allowed_types=[IntBox], must_have_categories=True,
                               num_categories=(1, None))
        elif self.noise_component:
            # Additive noise requires a FloatBox action space.
            sanity_check_space(self.action_space, allowed_types=[FloatBox])

    @graph_fn
    def _graph_fn_pick(self, use_exploration, epsilon_decisions, sample):
        """
        Exploration for discrete action spaces.
        Either pick a random action (if `use_exploration` and `epsilon_decision` are True),
        or return non-exploratory action.

        Args:
            use_exploration (DataOp): The master switch determining, whether to use exploration or not.
                May also be a literal Python bool, in which case it is resolved without a tensor condition.
            epsilon_decisions (DataOp): The bool coming from the epsilon-exploration component specifying
                whether to use exploration or not (per batch item).
            sample (DataOp): The output from a distribution's "sample_deterministic" OR "sample_stochastic".

        Returns:
            DataOp: The DataOp representing the action. This will match the shape of self.action_space.
        """
        if get_backend() == "tf":
            random_actions = tf.random_uniform(
                shape=tf.shape(sample),
                maxval=self.action_space.num_categories,
                dtype=dtype("int")
            )

            if use_exploration is False:
                return sample
            else:
                return tf.where(
                    # `use_exploration` given as actual bool or as tensor?
                    condition=epsilon_decisions if use_exploration is True else tf.logical_and(
                        use_exploration, epsilon_decisions
                    ),
                    x=random_actions,
                    y=sample
                )
        elif get_backend() == "pytorch":
            # N.b. different order versus TF because we don't want to execute the sampling below.
            if use_exploration is False:
                return sample

            if self.sample_obj is None:
                # Don't create new sample objects every time.
                self.sample_obj = torch.distributions.Uniform(0, self.action_space.num_categories)

            random_actions = self.sample_obj.sample(sample.shape).int()
            if use_exploration is True:
                return torch.where(epsilon_decisions, random_actions, sample)
            else:
                # Both masks are converted to byte tensors before being combined with `&`.
                if not isinstance(use_exploration, torch.ByteTensor):
                    use_exploration = use_exploration.byte()
                if not isinstance(epsilon_decisions, torch.ByteTensor):
                    epsilon_decisions = epsilon_decisions.byte()
                return torch.where(use_exploration & epsilon_decisions, random_actions, sample)

    @graph_fn
    def _graph_fn_add_noise(self, use_exploration, noise, sample):
        """
        Noise for continuous action spaces.
        Return the action with added noise.

        Args:
            use_exploration (DataOp): The master switch determining, whether to add noise or not.
                May also be a literal Python bool (handled like in `_graph_fn_pick`).
            noise (DataOp): The noise coming from the noise component.
            sample (DataOp): The output from a distribution's "sample_deterministic" or "sample_stochastic"
                API-method.

        Returns:
            DataOp: The DataOp representing the action. This will match the shape of self.action_space.
        """
        if get_backend() == "tf":
            # `tf.cond` does not accept a Python bool as predicate -> resolve literal bools statically,
            # consistent with how `_graph_fn_pick` treats `use_exploration`.
            if use_exploration is True:
                return sample + noise
            if use_exploration is False:
                return sample
            return tf.cond(
                use_exploration, true_fn=lambda: sample + noise, false_fn=lambda: sample
            )
        elif get_backend() == "pytorch":
            if use_exploration:
                return sample + noise
            else:
                return sample