# Copyright 2018 The RLgraph authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from rlgraph import get_backend
from rlgraph.utils.rlgraph_errors import RLGraphError
from rlgraph.spaces import IntBox, FloatBox
from rlgraph.components.component import Component
from rlgraph.components.distributions import Normal, Categorical
from rlgraph.components.neural_networks.neural_network import NeuralNetwork
from rlgraph.components.action_adapters.action_adapter import ActionAdapter
from rlgraph.components.action_adapters.baseline_action_adapter import BaselineActionAdapter
from rlgraph.components.layers.preprocessing.reshape import ReShape
from rlgraph.utils.decorators import rlgraph_api, graph_fn
if get_backend() == "tf":
import tensorflow as tf
elif get_backend() == "pytorch":
import torch
class Policy(Component):
"""
A Policy is a wrapper Component that contains a NeuralNetwork, an ActionAdapter and a Distribution Component.
"""
def __init__(self, network_spec, action_space=None, action_adapter_spec=None,
max_likelihood=True, scope="policy", **kwargs):
"""
Args:
network_spec (Union[NeuralNetwork,dict]): The NeuralNetwork Component or a specification dict to build
one.
            action_space (Optional[Space]): The action Space within which this Component will create actions.
                If None, the Space is taken from the constructed ActionAdapter.
action_adapter_spec (Optional[dict]): A spec-dict to create an ActionAdapter. Use None for the default
ActionAdapter object.
            max_likelihood (bool): Whether to pick actions greedily (via the max-likelihood value) or by
                sampling from the distribution. Default: True.
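
        Example (an illustrative sketch, not part of the original docstring; the dense-layer spec and
        discrete action space below are assumptions):
            policy = Policy(
                network_spec=[{"type": "dense", "units": 64}],
                action_space=IntBox(4)
            )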
"""
super(Policy, self).__init__(scope=scope, **kwargs)
self.neural_network = NeuralNetwork.from_spec(network_spec)
if action_space is None:
self.action_adapter = ActionAdapter.from_spec(action_adapter_spec)
action_space = self.action_adapter.action_space
else:
self.action_adapter = ActionAdapter.from_spec(action_adapter_spec, action_space=action_space)
self.action_space = action_space
self.max_likelihood = max_likelihood
# TODO: Hacky trick to implement IMPALA post-LSTM256 time-rank folding and unfolding.
# TODO: Replace entirely via sonnet-like BatchApply Component.
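        # (Folding merges the time and batch ranks, e.g. a time-major [T, B, ...] tensor becomes
        # [T*B, ...], so the post-LSTM pass runs over a single batch rank; unfolding restores
        # [T, B, ...] afterwards. Shape notation here is illustrative.)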
is_impala = "IMPALANetwork" in type(self.neural_network).__name__
# Add API-method to get baseline output (if we use an extra value function baseline node).
if isinstance(self.action_adapter, BaselineActionAdapter):
# TODO: IMPALA attempt to speed up final pass after LSTM.
if is_impala:
self.time_rank_folder = ReShape(fold_time_rank=True, scope="time-rank-fold")
self.time_rank_unfolder_v = ReShape(unfold_time_rank=True, time_major=True, scope="time-rank-unfold-v")
self.time_rank_unfolder_a_probs = ReShape(unfold_time_rank=True, time_major=True,
scope="time-rank-unfold-a-probs")
self.time_rank_unfolder_logits = ReShape(unfold_time_rank=True, time_major=True,
scope="time-rank-unfold-logits")
self.time_rank_unfolder_log_probs = ReShape(unfold_time_rank=True, time_major=True,
scope="time-rank-unfold-log-probs")
self.add_components(
self.time_rank_folder,
self.time_rank_unfolder_v,
self.time_rank_unfolder_a_probs,
self.time_rank_unfolder_log_probs,
self.time_rank_unfolder_logits
)
@rlgraph_api(component=self)
def get_state_values_logits_probabilities_log_probs(self, nn_input, internal_states=None):
nn_output = self.neural_network.apply(nn_input, internal_states)
last_internal_states = nn_output.get("last_internal_states")
nn_output = nn_output["output"]
                # TODO: IMPALA attempt to speed up final pass after LSTM.
                if is_impala:
                    # Fold the time rank into the batch rank for the ActionAdapter pass, then unfold all
                    # outputs again, passing the original (pre-fold) NN output so the unfolders can
                    # recover the time- and batch-rank sizes.
                    folded_nn_output = self.time_rank_folder.apply(nn_output)
                    out = self.action_adapter.get_logits_probabilities_log_probs(folded_nn_output)
                    state_values = self.time_rank_unfolder_v.apply(out["state_values"], nn_output)
                    logits = self.time_rank_unfolder_logits.apply(out["logits"], nn_output)
                    probs = self.time_rank_unfolder_a_probs.apply(out["probabilities"], nn_output)
                    log_probs = self.time_rank_unfolder_log_probs.apply(out["log_probs"], nn_output)
                else:
                    out = self.action_adapter.get_logits_probabilities_log_probs(nn_output)
                    state_values = out["state_values"]
                    logits = out["logits"]
                    probs = out["probabilities"]
                    log_probs = out["log_probs"]
return dict(state_values=state_values, logits=logits, probabilities=probs, log_probs=log_probs,
last_internal_states=last_internal_states)
# Figure out our Distribution.
if isinstance(action_space, IntBox):
self.distribution = Categorical()
# Continuous action space -> Normal distribution (each action needs mean and variance from network).
elif isinstance(action_space, FloatBox):
self.distribution = Normal()
else:
            raise RLGraphError("ERROR: `action_space` of type {} is not allowed in {} Component!".
                               format(type(action_space).__name__, self.name))
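        # For example: IntBox(4) (four discrete actions) -> Categorical over 4 classes;
        # FloatBox(shape=(2,)) -> Normal over a 2-dimensional continuous action vector.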
self.add_components(self.neural_network, self.action_adapter, self.distribution)
# Define our interface.
@rlgraph_api
def get_nn_output(self, nn_input, internal_states=None):
"""
Args:
nn_input (any): The input to our neural network.
internal_states (Optional[any]): The initial internal states going into an RNN-based neural network.
Returns:
            any: The raw output of the neural network (before it is cleaned up and passed through the ActionAdapter).
"""
out = self.neural_network.apply(nn_input, internal_states)
return dict(output=out["output"], last_internal_states=out.get("last_internal_states"))
@rlgraph_api
def get_action(self, nn_input, internal_states=None, max_likelihood=None):
"""
Returns an action based on NN output, action adapter output and distribution sampling.
Args:
nn_input (any): The input to our neural network.
internal_states (Optional[any]): The initial internal states going into an RNN-based neural network.
max_likelihood (Optional[bool]): If not None, use this to determine whether actions should be drawn
from the distribution in max-likelihood or stochastic fashion.
Returns:
any: The drawn action.
"""
max_likelihood = self.max_likelihood if max_likelihood is None else max_likelihood
nn_output = self.get_nn_output(nn_input, internal_states)
        # Skip our distribution iff we have a discrete action space and act greedily (max-likelihood).
        # In that case, there is no need to construct a distribution in the graph on each act: softmax is
        # strictly monotonic, so the argmax over the logits equals the argmax over the (log-)probabilities.
        out = self.action_adapter.get_logits_probabilities_log_probs(nn_output["output"])
        if max_likelihood is True and isinstance(self.action_space, IntBox):
            action = self._graph_fn_get_max_likelihood_action_wo_distribution(out["logits"])
        else:
            action = self.distribution.draw(out["probabilities"], max_likelihood)
return dict(action=action, last_internal_states=nn_output["last_internal_states"])
@rlgraph_api
def get_max_likelihood_action(self, nn_input, internal_states=None):
"""
Args:
nn_input (any): The input to our neural network.
internal_states (Optional[any]): The initial internal states going into an RNN-based neural network.
Returns:
            any: See `get_action`, but with `max_likelihood` forced to True.
"""
out = self.get_logits_probabilities_log_probs(nn_input, internal_states)
if isinstance(self.action_space, IntBox):
action = self._graph_fn_get_max_likelihood_action_wo_distribution(out["logits"])
else:
action = self.distribution.sample_deterministic(out["probabilities"])
return dict(action=action, last_internal_states=out["last_internal_states"])
@rlgraph_api
def get_stochastic_action(self, nn_input, internal_states=None):
"""
Args:
nn_input (any): The input to our neural network.
internal_states (Optional[any]): The initial internal states going into an RNN-based neural network.
Returns:
            any: See `get_action`, but with `max_likelihood` forced to False.
"""
out = self.get_logits_probabilities_log_probs(nn_input, internal_states)
action = self.distribution.sample_stochastic(out["probabilities"])
return dict(action=action, last_internal_states=out["last_internal_states"])
@rlgraph_api
def get_action_layer_output(self, nn_input, internal_states=None):
"""
Args:
nn_input (any): The input to our neural network.
internal_states (Optional[any]): The initial internal states going into an RNN-based neural network.
Returns:
            any: The raw output of the action layer of the ActionAdapter (possibly including the last
                internal states of an RNN-based NN).
"""
nn_output = self.get_nn_output(nn_input, internal_states)
action_layer_output = self.action_adapter.get_action_layer_output(nn_output["output"])
# Add last internal states to return value.
return dict(output=action_layer_output["output"], last_internal_states=nn_output["last_internal_states"])
@rlgraph_api
def get_logits_probabilities_log_probs(self, nn_input, internal_states=None):
"""
Args:
nn_input (any): The input to our neural network.
internal_states (Optional[any]): The initial internal states going into an RNN-based neural network.
Returns:
Dict:
logits: The (reshaped) logits from the ActionAdapter.
                probabilities: The probabilities obtained by softmaxing the logits.
log_probs: The log(probabilities) values.
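
                Note (illustrative): for an IntBox(n) action space, `logits` has shape [..., n],
                `probabilities` equals softmax(logits), and `log_probs` equals log(probabilities).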
"""
nn_output = self.get_nn_output(nn_input, internal_states)
aa_output = self.action_adapter.get_logits_probabilities_log_probs(nn_output["output"])
return dict(
logits=aa_output["logits"], probabilities=aa_output["probabilities"], log_probs=aa_output["log_probs"],
last_internal_states=nn_output["last_internal_states"]
)
@rlgraph_api
def get_entropy(self, nn_input, internal_states=None):
"""
Args:
nn_input (any): The input to our neural network.
internal_states (Optional[any]): The initial internal states going into an RNN-based neural network.
Returns:
any: See Distribution component.
"""
out = self.get_logits_probabilities_log_probs(nn_input, internal_states)
entropy = self.distribution.entropy(out["probabilities"])
return dict(entropy=entropy, last_internal_states=out["last_internal_states"])
@graph_fn
def _graph_fn_get_max_likelihood_action_wo_distribution(self, logits):
"""
        Use this function only for discrete action spaces to circumvent using a full-blown
        backend-specific distribution object (e.g. tf.distributions.Multinomial).
Args:
logits (SingleDataOp): Logits over which to pick the argmax (greedy action).
Returns:
SingleDataOp: The argmax over the last rank of the input logits.
"""
if get_backend() == "tf":
return tf.argmax(logits, axis=-1, output_type=tf.int32)
elif get_backend() == "pytorch":
return torch.argmax(logits, dim=-1).int()