Source code for rlgraph.components.neural_networks.policy

# Copyright 2018 The RLgraph authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from rlgraph import get_backend
from rlgraph.utils.rlgraph_errors import RLGraphError
from rlgraph.spaces import IntBox, FloatBox
from rlgraph.components.component import Component
from rlgraph.components.distributions import Normal, Categorical
from rlgraph.components.neural_networks.neural_network import NeuralNetwork
from rlgraph.components.action_adapters.action_adapter import ActionAdapter
from rlgraph.components.action_adapters.baseline_action_adapter import BaselineActionAdapter
from rlgraph.components.layers.preprocessing.reshape import ReShape
from rlgraph.utils.decorators import rlgraph_api, graph_fn

if get_backend() == "tf":
    import tensorflow as tf
elif get_backend() == "pytorch":
    import torch


class Policy(Component):
    """
    A Policy is a wrapper Component that contains a NeuralNetwork, an ActionAdapter and a Distribution Component.
    """
    def __init__(self, network_spec, action_space=None, action_adapter_spec=None, max_likelihood=True,
                 scope="policy", **kwargs):
        """
        Args:
            network_spec (Union[NeuralNetwork,dict]): The NeuralNetwork Component or a specification dict to build
                one.
            action_space (Space): The action Space within which this Component will create actions.
            action_adapter_spec (Optional[dict]): A spec-dict to create an ActionAdapter. Use None for the default
                ActionAdapter object.
            max_likelihood (bool): Whether to pick actions according to the max-likelihood value or via sampling.
                Default: True.
        """
        super(Policy, self).__init__(scope=scope, **kwargs)

        self.neural_network = NeuralNetwork.from_spec(network_spec)
        if action_space is None:
            self.action_adapter = ActionAdapter.from_spec(action_adapter_spec)
            action_space = self.action_adapter.action_space
        else:
            self.action_adapter = ActionAdapter.from_spec(action_adapter_spec, action_space=action_space)
        self.action_space = action_space
        self.max_likelihood = max_likelihood

        # TODO: Hacky trick to implement IMPALA post-LSTM256 time-rank folding and unfolding.
        # TODO: Replace entirely via sonnet-like BatchApply Component.
        is_impala = "IMPALANetwork" in type(self.neural_network).__name__

        # Add API-method to get baseline output (if we use an extra value function baseline node).
        if isinstance(self.action_adapter, BaselineActionAdapter):
            # TODO: IMPALA attempt to speed up final pass after LSTM.
            if is_impala:
                self.time_rank_folder = ReShape(fold_time_rank=True, scope="time-rank-fold")
                self.time_rank_unfolder_v = ReShape(unfold_time_rank=True, time_major=True,
                                                    scope="time-rank-unfold-v")
                self.time_rank_unfolder_a_probs = ReShape(unfold_time_rank=True, time_major=True,
                                                          scope="time-rank-unfold-a-probs")
                self.time_rank_unfolder_logits = ReShape(unfold_time_rank=True, time_major=True,
                                                         scope="time-rank-unfold-logits")
                self.time_rank_unfolder_log_probs = ReShape(unfold_time_rank=True, time_major=True,
                                                            scope="time-rank-unfold-log-probs")
                self.add_components(
                    self.time_rank_folder, self.time_rank_unfolder_v, self.time_rank_unfolder_a_probs,
                    self.time_rank_unfolder_log_probs, self.time_rank_unfolder_logits
                )

            @rlgraph_api(component=self)
            def get_state_values_logits_probabilities_log_probs(self, nn_input, internal_states=None):
                nn_output = self.neural_network.apply(nn_input, internal_states)
                last_internal_states = nn_output.get("last_internal_states")
                nn_output = nn_output["output"]

                # TODO: IMPALA attempt to speed up final pass after LSTM.
                if is_impala:
                    nn_output = self.time_rank_folder.apply(nn_output)

                out = self.action_adapter.get_logits_probabilities_log_probs(nn_output)

                # TODO: IMPALA attempt to speed up final pass after LSTM.
                if is_impala:
                    state_values = self.time_rank_unfolder_v.apply(out["state_values"], nn_output)
                    logits = self.time_rank_unfolder_logits.apply(out["logits"], nn_output)
                    probs = self.time_rank_unfolder_a_probs.apply(out["probabilities"], nn_output)
                    log_probs = self.time_rank_unfolder_log_probs.apply(out["log_probs"], nn_output)
                else:
                    state_values = out["state_values"]
                    logits = out["logits"]
                    probs = out["probabilities"]
                    log_probs = out["log_probs"]

                return dict(state_values=state_values, logits=logits, probabilities=probs, log_probs=log_probs,
                            last_internal_states=last_internal_states)

        # Figure out our Distribution.
        if isinstance(action_space, IntBox):
            self.distribution = Categorical()
        # Continuous action space -> Normal distribution (each action needs mean and variance from network).
        elif isinstance(action_space, FloatBox):
            self.distribution = Normal()
        else:
            raise RLGraphError("ERROR: `action_space` is of type {} and not allowed in {} Component!".format(
                type(action_space).__name__, self.name
            ))

        self.add_components(self.neural_network, self.action_adapter, self.distribution)

        if is_impala:
            self.add_components(
                self.time_rank_folder, self.time_rank_unfolder_v, self.time_rank_unfolder_a_probs,
                self.time_rank_unfolder_log_probs, self.time_rank_unfolder_logits
            )

    # Define our interface.
    @rlgraph_api
    def get_nn_output(self, nn_input, internal_states=None):
        """
        Args:
            nn_input (any): The input to our neural network.
            internal_states (Optional[any]): The initial internal states going into an RNN-based neural network.

        Returns:
            any: The raw output of the neural network (before it's cleaned up and passed through the ActionAdapter).
        """
        out = self.neural_network.apply(nn_input, internal_states)
        return dict(output=out["output"], last_internal_states=out.get("last_internal_states"))

    @rlgraph_api
    def get_action(self, nn_input, internal_states=None, max_likelihood=None):
        """
        Returns an action based on NN output, action adapter output and distribution sampling.

        Args:
            nn_input (any): The input to our neural network.
            internal_states (Optional[any]): The initial internal states going into an RNN-based neural network.
            max_likelihood (Optional[bool]): If not None, use this to determine whether actions should be drawn
                from the distribution in max-likelihood or stochastic fashion.

        Returns:
            any: The drawn action.
        """
        max_likelihood = self.max_likelihood if max_likelihood is None else max_likelihood

        nn_output = self.get_nn_output(nn_input, internal_states)

        # Skip our distribution, iff discrete action-space and max-likelihood acting (greedy).
        # In that case, one does not need to create a distribution in the graph for each act call (only to get the
        # argmax over the logits, which is the same as the argmax over the probabilities (or log-probabilities)).
        if max_likelihood is True and isinstance(self.action_space, IntBox):
            out = self.action_adapter.get_logits_probabilities_log_probs(nn_output["output"])
            action = self._graph_fn_get_max_likelihood_action_wo_distribution(out["logits"])
        else:
            out = self.action_adapter.get_logits_probabilities_log_probs(nn_output["output"])
            action = self.distribution.draw(out["probabilities"], max_likelihood)

        return dict(action=action, last_internal_states=nn_output["last_internal_states"])

    @rlgraph_api
    def get_max_likelihood_action(self, nn_input, internal_states=None):
        """
        Args:
            nn_input (any): The input to our neural network.
            internal_states (Optional[any]): The initial internal states going into an RNN-based neural network.

        Returns:
            any: See `get_action`, but with `max_likelihood` forced to True.
        """
        out = self.get_logits_probabilities_log_probs(nn_input, internal_states)

        if isinstance(self.action_space, IntBox):
            action = self._graph_fn_get_max_likelihood_action_wo_distribution(out["logits"])
        else:
            action = self.distribution.sample_deterministic(out["probabilities"])

        return dict(action=action, last_internal_states=out["last_internal_states"])

    @rlgraph_api
    def get_stochastic_action(self, nn_input, internal_states=None):
        """
        Args:
            nn_input (any): The input to our neural network.
            internal_states (Optional[any]): The initial internal states going into an RNN-based neural network.
        Returns:
            any: See `get_action`, but with `max_likelihood` forced to False.
        """
        out = self.get_logits_probabilities_log_probs(nn_input, internal_states)
        action = self.distribution.sample_stochastic(out["probabilities"])

        return dict(action=action, last_internal_states=out["last_internal_states"])

    @rlgraph_api
    def get_action_layer_output(self, nn_input, internal_states=None):
        """
        Args:
            nn_input (any): The input to our neural network.
            internal_states (Optional[any]): The initial internal states going into an RNN-based neural network.

        Returns:
            any: The raw output of the action layer of the ActionAdapter (possibly including the last internal
                states of an RNN-based NN).
        """
        nn_output = self.get_nn_output(nn_input, internal_states)
        action_layer_output = self.action_adapter.get_action_layer_output(nn_output["output"])
        # Add last internal states to return value.
        return dict(output=action_layer_output["output"], last_internal_states=nn_output["last_internal_states"])

    @rlgraph_api
    def get_logits_probabilities_log_probs(self, nn_input, internal_states=None):
        """
        Args:
            nn_input (any): The input to our neural network.
            internal_states (Optional[any]): The initial internal states going into an RNN-based neural network.

        Returns:
            Dict:
                logits: The (reshaped) logits from the ActionAdapter.
                probabilities: The probabilities gained from the softmaxed logits.
                log_probs: The log(probabilities) values.
        """
        nn_output = self.get_nn_output(nn_input, internal_states)
        aa_output = self.action_adapter.get_logits_probabilities_log_probs(nn_output["output"])

        return dict(
            logits=aa_output["logits"], probabilities=aa_output["probabilities"], log_probs=aa_output["log_probs"],
            last_internal_states=nn_output["last_internal_states"]
        )

    @rlgraph_api
    def get_entropy(self, nn_input, internal_states=None):
        """
        Args:
            nn_input (any): The input to our neural network.
            internal_states (Optional[any]): The initial internal states going into an RNN-based neural network.

        Returns:
            any: See Distribution component.
        """
        out = self.get_logits_probabilities_log_probs(nn_input, internal_states)
        entropy = self.distribution.entropy(out["probabilities"])

        return dict(entropy=entropy, last_internal_states=out["last_internal_states"])

    @graph_fn
    def _graph_fn_get_max_likelihood_action_wo_distribution(self, logits):
        """
        Use this function only for discrete action spaces to circumvent using a full-blown backend-specific
        distribution object (e.g. tf.distribution.Multinomial).

        Args:
            logits (SingleDataOp): Logits over which to pick the argmax (greedy action).

        Returns:
            SingleDataOp: The argmax over the last rank of the input logits.
        """
        if get_backend() == "tf":
            return tf.argmax(logits, axis=-1, output_type=tf.int32)
        elif get_backend() == "pytorch":
            return torch.argmax(logits, dim=-1).int()
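

if __name__ == "__main__":
    # Illustrative construction sketch (an assumption-laden example, not an API defined in this module):
    # the list-of-layer-dicts network spec and its "type"/"units"/"activation" keys are assumed inputs to
    # NeuralNetwork.from_spec. Constructing the Policy only wires up its sub-Components (NeuralNetwork,
    # default ActionAdapter and, for the IntBox space below, a Categorical distribution); API methods such
    # as `get_action` become executable only after the Component is built into a graph by an Agent or
    # GraphExecutor.
    example_policy = Policy(
        network_spec=[{"type": "dense", "units": 32, "activation": "relu"}],  # assumed layer-spec format
        action_space=IntBox(4),  # 4 discrete actions -> Categorical distribution (see __init__ above)
        max_likelihood=False  # draw actions by sampling rather than via argmax by default
    )
    print(example_policy.scope)  # -> "policy"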