# Source code for rlgraph.components.explorations.epsilon_exploration

# Copyright 2018 The RLgraph authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from rlgraph import get_backend
from rlgraph.components.component import Component
from rlgraph.components.common.decay_components import DecayComponent
from rlgraph.spaces.space_utils import sanity_check_space
from rlgraph.utils.decorators import rlgraph_api, graph_fn

if get_backend() == "tf":
    import tensorflow as tf
elif get_backend() == "pytorch":
    import torch


class EpsilonExploration(Component):
    """
    A component implementing the decision part of epsilon-exploration.

    Given a data sample and the current time step, it produces a boolean
    decision per batch item (and per time slot, if the sample space has a time
    rank) on whether to explore or not. The time step is fed into a child
    decay component whose output is the current epsilon value; a decision is
    True (meaning: do explore) wherever a freshly drawn uniform random number
    is smaller than that epsilon.

    API:
        do_explore(sample, time_step): Returns the explore-decisions for
            the given sample at the given time step.
    """
    def __init__(self, decay_spec=None, scope="epsilon-exploration", **kwargs):
        """
        Args:
            decay_spec (Optional[dict,DecayComponent]): The spec-dict for the
                DecayComponent to use or a DecayComponent object directly.
                Handed to `DecayComponent.from_spec`.
            scope (str): The scope passed on to the parent constructor.

        Keyword Args:
            Forwarded unchanged to the parent `Component` constructor.
        """
        super(EpsilonExploration, self).__init__(scope=scope, **kwargs)

        # Space of the samples we have to produce epsilon decisions for.
        # Stays None until `check_input_spaces` has been called.
        self.sample_space = None

        # Build our (epsilon) decay component and register it as a child.
        self.decay_component = DecayComponent.from_spec(decay_spec)
        self.add_components(self.decay_component)

    def check_input_spaces(self, input_spaces, action_space=None):
        """
        Stores the space of the "sample" input and - for the tf backend only -
        verifies that this space has a batch rank.

        Args:
            input_spaces (dict): Input spaces by name; must contain the key
                "sample".
            action_space: Not used by this method.
        """
        sample_space = input_spaces["sample"]
        self.sample_space = sample_space

        # The batch-rank requirement is only enforced for the tf backend.
        if get_backend() != "tf":
            return
        sanity_check_space(sample_space, must_have_batch_rank=True)

    @rlgraph_api
    def do_explore(self, sample, time_step=0):
        """
        API-method taking a sample and a time step and returning a bool type
        tensor on whether to explore or not (per batch item).

        Args:
            sample (SingleDataOp): A data sample from which we can extract
                the batch size (and time size, if applicable).
            time_step (SingleDataOp): The current global time step.

        Returns:
            SingleDataOp: Single decisions over a batch on whether to explore
                or not.
        """
        # Current epsilon, as computed by the decay component.
        epsilon = self.decay_component.decayed_value(time_step)
        return self._graph_fn_get_random_actions(epsilon, sample)

    @graph_fn
    def _graph_fn_get_random_actions(self, decayed_value, sample):
        """
        Draws one uniform random number per batch item (per batch/time item
        if the sample space has a time rank) and compares it to the epsilon.

        Args:
            decayed_value (SingleDataOp): The current epsilon value.
            sample (SingleDataOp): The sample whose leading dimension(s)
                determine the shape of the result.

        Returns:
            SingleDataOp: True wherever the random draw is smaller than
                `decayed_value`. None, if the backend is neither "tf" nor
                "pytorch".
        """
        backend = get_backend()

        if backend == "tf":
            dims = tf.shape(sample)
            # Always the batch dimension; the time dimension only if present.
            draw_shape = (dims[0],)
            if self.sample_space.has_time_rank is True:
                draw_shape += (dims[1],)
            return tf.random_uniform(shape=draw_shape) < decayed_value

        if backend == "pytorch":
            # A 0-dim sample gets an artificial leading dimension of size 1.
            if sample.dim() == 0:
                sample = sample.unsqueeze(-1)
            dims = sample.shape
            draw_shape = (dims[0],)
            if self.sample_space.has_time_rank is True:
                draw_shape += (dims[1],)
            decisions = torch.rand(draw_shape) < decayed_value
            return decisions