Source code for rlgraph.components.action_adapters.dueling_action_adapter

# Copyright 2018 The RLgraph authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import math

from rlgraph import get_backend
from rlgraph.components.action_adapters.action_adapter import ActionAdapter
from rlgraph.components.layers.nn.dense_layer import DenseLayer
from rlgraph.spaces import IntBox, FloatBox
from rlgraph.utils.decorators import rlgraph_api, graph_fn
from rlgraph.utils.util import SMALL_NUMBER, get_rank
from rlgraph.utils.ops import DataOpTuple


if get_backend() == "tf":
    import tensorflow as tf
elif get_backend() == "pytorch":
    import torch


class DuelingActionAdapter(ActionAdapter):
    """
    An ActionAdapter that adds a dueling Q calculation to the flattened output of a neural network.

    API:
        get_action_layer_output(nn_output): The single state-value node output and the (still flat)
            advantage-value output of the two dueling streams.
        get_logits_probabilities_log_probs(nn_output): The state-value, the (reshaped) q-values (logits),
            the probabilities and the log-probs obtained via the dueling aggregation.
    """
    def __init__(self, units_state_value_stream, units_advantage_stream,
                 weights_spec_state_value_stream=None, biases_spec_state_value_stream=None,
                 activation_state_value_stream="relu",
                 weights_spec_advantage_stream=None, biases_spec_advantage_stream=None,
                 activation_advantage_stream="relu",
                 scope="dueling-action-adapter", **kwargs):
        # TODO: change add_units=-1 once we have a true base class for action-adapters.
        super(DuelingActionAdapter, self).__init__(add_units=0, scope=scope, **kwargs)

        # The state-value stream.
        self.units_state_value_stream = units_state_value_stream
        self.weights_spec_state_value_stream = weights_spec_state_value_stream
        self.biases_spec_state_value_stream = biases_spec_state_value_stream
        self.activation_state_value_stream = activation_state_value_stream

        # The advantage stream.
        self.units_advantage_stream = units_advantage_stream
        self.weights_spec_advantage_stream = weights_spec_advantage_stream
        self.biases_spec_advantage_stream = biases_spec_advantage_stream
        self.activation_advantage_stream = activation_advantage_stream

        # Create the extra DenseLayers: one hidden layer per stream plus the single state-value node.
        # The inherited self.action_layer serves as the advantage output layer.
        self.dense_layer_state_value_stream = DenseLayer(
            units=self.units_state_value_stream,
            weights_spec=self.weights_spec_state_value_stream,
            biases_spec=self.biases_spec_state_value_stream,
            activation=self.activation_state_value_stream,
            scope="dense-layer-state-value-stream"
        )
        self.dense_layer_advantage_stream = DenseLayer(
            units=self.units_advantage_stream,
            weights_spec=self.weights_spec_advantage_stream,
            biases_spec=self.biases_spec_advantage_stream,
            activation=self.activation_advantage_stream,
            scope="dense-layer-advantage-stream"
        )
        self.state_value_node = DenseLayer(
            units=1,
            activation="linear",
            scope="state-value-node"
        )

        self.add_components(
            self.dense_layer_state_value_stream, self.dense_layer_advantage_stream, self.state_value_node
        )

    @rlgraph_api
    def get_action_layer_output(self, nn_output):
        """
        Args:
            nn_output (DataOpRecord): The NN output of the preceding neural network.

        Returns:
            dict:
                `state_value_node` (DataOpRecord): The output of the single state-value node after passing
                    `nn_output` through the state-value stream.
                `output` (DataOpRecord): The output of the advantage layer after passing `nn_output` through
                    the advantage stream. These are flat advantage nodes that have not yet been reshaped
                    according to the action_space.
        """
        output_state_value_dense = self.dense_layer_state_value_stream.apply(nn_output)
        output_advantage_dense = self.dense_layer_advantage_stream.apply(nn_output)
        state_value_node = self.state_value_node.apply(output_state_value_dense)
        advantage_nodes = self.action_layer.apply(output_advantage_dense)
        return dict(state_value_node=state_value_node, output=advantage_nodes)

    @rlgraph_api
    def get_logits_probabilities_log_probs(self, nn_output):
        """
        Args:
            nn_output (DataOpRecord): The NN output of the preceding neural network.

        Returns:
            dict (4x DataOpRecord):
                - The single state-value node output.
                - The (already reshaped) q-values (the logits).
                - The probabilities obtained by softmaxing the q-values.
                - The log-probs.
""" out = self.get_action_layer_output(nn_output) advantage_values_reshaped = self.reshape.apply(out["output"]) q_values = self._graph_fn_calculate_q_values(out["state_value_node"], advantage_values_reshaped) probabilities, log_probs = self._graph_fn_get_probabilities_log_probs(q_values) return dict(state_values=out["state_value_node"], logits=q_values, probabilities=probabilities, log_probs=log_probs) @graph_fn def _graph_fn_calculate_q_values(self, state_value, advantage_values): """ Args: state_value (SingleDataOp): The single node state-value output. advantage_values (SingleDataOp): The already reshaped advantage-values. Returns: SingleDataOp: The calculated, reshaped Q values (for each composite action) based on: Q = V + [A - mean(A)] """ # Use the very first node as value function output. # Use all following nodes as advantage function output. if get_backend() == "tf": ## Separate out the single state-value node. #state_value, advantages = tf.split( # value=inputs, num_or_size_splits=(1, self.num_advantage_values), axis=-1 #) # Now we have to reshape the advantages according to our action space. #shape = list(self.target_space.get_shape(with_batch_rank=-1, with_category_rank=True)) #advantages = tf.reshape(tensor=advantage_values, shape=shape) # Calculate the q-values according to [1] and return. mean_advantages = tf.reduce_mean(input_tensor=advantage_values, axis=-1, keepdims=True) # Make sure we broadcast the state_value correctly for the upcoming q_value calculation. state_value_expanded = state_value for _ in range(get_rank(advantage_values) - 2): state_value_expanded = tf.expand_dims(state_value_expanded, axis=1) q_values = state_value_expanded + advantage_values - mean_advantages ## state-value, advantages, q_values # q-values return q_values #tf.squeeze(state_value, axis=-1), advantages, elif get_backend() == "pytorch": mean_advantages = torch.mean(advantage_values, dim=-1, keepdim=True) # Make sure we broadcast the state_value correctly for the upcoming q_value calculation. state_value_expanded = state_value for _ in range(get_rank(advantage_values) - 2): state_value_expanded = torch.unsqueeze(state_value_expanded, dim=1) q_values = state_value_expanded + advantage_values - mean_advantages ## state-value, advantages, q_values # q-values return q_values # TODO: Use a SoftMax Component instead (uses the same code as the one below). @graph_fn def _graph_fn_get_probabilities_log_probs(self, logits): """ Creates properties/parameters and log-probs from some reshaped output. Args: logits (SingleDataOp): The output of some layer that is already reshaped according to our action Space. Returns: tuple (2x SingleDataOp): parameters (DataOp): The parameters, ready to be passed to a Distribution object's get_distribution API-method (usually some probabilities or loc/scale pairs). log_probs (DataOp): Simply the log(parameters). """ if get_backend() == "tf": if isinstance(self.action_space, IntBox): # Discrete actions. parameters = tf.maximum(x=tf.nn.softmax(logits=logits, axis=-1), y=SMALL_NUMBER) # Log probs. log_probs = tf.log(x=parameters) elif isinstance(self.action_space, FloatBox): # Continuous actions. mean, log_sd = tf.split(value=logits, num_or_size_splits=2, axis=1) # Remove moments rank. mean = tf.squeeze(input=mean, axis=1) log_sd = tf.squeeze(input=log_sd, axis=1) # Clip log_sd. log(SMALL_NUMBER) is negative. log_sd = tf.clip_by_value( t=log_sd, clip_value_min=math.log(SMALL_NUMBER), clip_value_max=-math.log(SMALL_NUMBER) ) # Turn log sd into sd. 
                sd = tf.exp(x=log_sd)

                parameters = DataOpTuple(mean, sd)
                log_probs = DataOpTuple(tf.log(x=mean), log_sd)
            else:
                raise NotImplementedError

            return parameters, log_probs

        elif get_backend() == "pytorch":
            if isinstance(self.action_space, IntBox):
                # Discrete actions.
                parameters = torch.max(torch.softmax(logits, dim=-1), torch.tensor(SMALL_NUMBER))
                # Log probs.
                log_probs = torch.log(parameters)
            elif isinstance(self.action_space, FloatBox):
                # Continuous actions. Split into the two moments (mean and log-sd) along the moments rank;
                # torch.chunk mirrors tf.split with num_or_size_splits=2.
                mean, log_sd = torch.chunk(logits, chunks=2, dim=1)
                # Remove moments rank.
                mean = torch.squeeze(mean, dim=1)
                log_sd = torch.squeeze(log_sd, dim=1)
                # Clip log_sd. log(SMALL_NUMBER) is negative.
                log_sd = torch.clamp(
                    log_sd, min=math.log(SMALL_NUMBER), max=-math.log(SMALL_NUMBER)
                )
                # Turn log sd into sd.
                sd = torch.exp(log_sd)

                parameters = DataOpTuple(mean, sd)
                log_probs = DataOpTuple(torch.log(mean), log_sd)
            else:
                raise NotImplementedError

            return parameters, log_probs
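
For reference, a minimal standalone sketch (plain NumPy, not RLgraph API; the batch size and composite action shape below are illustrative assumptions) of the dueling aggregation that _graph_fn_calculate_q_values performs, Q = V + (A - mean(A)), including the rank expansion used to broadcast the single state-value node against a composite-action advantage tensor:

    import numpy as np

    # Illustrative shapes (assumptions): batch of 2, composite action space of shape (3, 4).
    state_value = np.array([[0.5], [-1.0]])   # shape (2, 1): output of the single state-value node
    advantages = np.random.randn(2, 3, 4)     # shape (2, 3, 4): reshaped advantage values

    # Broadcast the state value: add one axis per extra action rank (rank(A) - 2 times).
    state_value_expanded = state_value
    for _ in range(advantages.ndim - 2):
        state_value_expanded = np.expand_dims(state_value_expanded, axis=1)  # -> (2, 1, 1)

    # Dueling aggregation: Q = V + (A - mean(A)), mean taken over the last (category) axis.
    q_values = state_value_expanded + advantages - advantages.mean(axis=-1, keepdims=True)
    print(q_values.shape)  # (2, 3, 4)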
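
Similarly, a standalone sketch (again plain NumPy, not RLgraph API) of what _graph_fn_get_probabilities_log_probs computes for a discrete (IntBox) action space: softmax the q-values over the category axis, floor them at a small constant, and take the log. The value of SMALL_NUMBER below is an assumption standing in for rlgraph.utils.util.SMALL_NUMBER, whose role is only to keep the log away from zero.

    import numpy as np

    SMALL_NUMBER = 1e-6  # assumption: stands in for rlgraph.utils.util.SMALL_NUMBER

    def probabilities_log_probs(q_values):
        # Numerically stable softmax over the last (category) axis.
        exps = np.exp(q_values - q_values.max(axis=-1, keepdims=True))
        probs = exps / exps.sum(axis=-1, keepdims=True)
        # Floor at SMALL_NUMBER so the subsequent log never sees an exact zero.
        probs = np.maximum(probs, SMALL_NUMBER)
        return probs, np.log(probs)

    probs, log_probs = probabilities_log_probs(np.array([[1.0, 2.0, 0.5]]))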