Source code for rlgraph.components.loss_functions.impala_loss_function

# Copyright 2018 The RLgraph authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from rlgraph import get_backend
from rlgraph.components.helpers.v_trace_function import VTraceFunction
from rlgraph.components.loss_functions import LossFunction
from rlgraph.spaces import IntBox
from rlgraph.spaces.space_utils import sanity_check_space

if get_backend() == "tf":
    import tensorflow as tf


class IMPALALossFunction(LossFunction):
    """
    The IMPALA loss function, based on v-trace off-policy policy-gradient corrections, described in detail in [1].

    The three terms of the loss function are:
    1) The policy-gradient term:
       L[pg] = (rho_pg * advantages) * nabla log(pi(a|s)), where (rho_pg * advantages) = pg_advantages in the
       code below.
    2) The value-function baseline term:
       L[V] = 0.5 * (vs - V(xs))^2, such that dL[V]/dtheta = (vs - V(xs)) * nabla V(xs)
    3) The entropy regularizer term:
       L[E] = - SUM[all actions a] pi(a|s) * log pi(a|s)

    [1] IMPALA: Scalable Distributed Deep-RL with Importance Weighted Actor-Learner Architectures - Espeholt,
        Soyer, Munos et al. - 2018 (https://arxiv.org/abs/1802.01561)
    """
    def __init__(self, discount=0.99, reward_clipping="clamp_one",
                 weight_pg=None, weight_baseline=None, weight_entropy=None, **kwargs):
        """
        Args:
            discount (float): The discount factor (gamma) to use.
            reward_clipping (Optional[str]): One of None, "clamp_one" or "soft_asymmetric". Default: "clamp_one".
            weight_pg (float): The coefficient used for the policy-gradient loss term (L[pg]).
            weight_baseline (float): The coefficient used for the value-function baseline term (L[V]).
            weight_entropy (float): The coefficient used for the entropy regularization term (L[E]).
                In the paper, values between 0.01 and 0.00005 are used via log-uniform search.
        """
        super(IMPALALossFunction, self).__init__(scope=kwargs.pop("scope", "impala-loss-func"), **kwargs)

        self.discount = discount
        self.v_trace_function = VTraceFunction()

        self.reward_clipping = reward_clipping

        self.weight_pg = weight_pg if weight_pg is not None else 1.0
        self.weight_baseline = weight_baseline if weight_baseline is not None else 0.5
        self.weight_entropy = weight_entropy if weight_entropy is not None else 0.00025

        self.action_space = None

        self.add_components(self.v_trace_function)

    def check_input_spaces(self, input_spaces, action_space=None):
        assert action_space is not None
        self.action_space = action_space
        # Check for IntBox and num_categories.
        sanity_check_space(
            self.action_space, allowed_types=[IntBox], must_have_categories=True
        )

    def loss(self, logits_actions_pi, action_probs_mu, values, actions, rewards, terminals):
        """
        API-method that calculates the total loss (average over the per-batch-item losses) from the original
        inputs via the per-item loss.

        Args:
            see `self._graph_fn_loss_per_item`.

        Returns:
            Tuple (2x SingleDataOp):
                - The tensor specifying the final (total) loss over the entire batch.
                - The loss values per item in the batch.
        """
        loss_per_item = self._graph_fn_loss_per_item(
            logits_actions_pi, action_probs_mu, values, actions, rewards, terminals
        )
        total_loss = self._graph_fn_loss_average(loss_per_item)
        return total_loss, loss_per_item

    def _graph_fn_loss_per_item(self, logits_actions_pi, action_probs_mu, values, actions,
                                rewards, terminals):
        """
        Calculates the loss per batch item (summed over all timesteps) using the formula described above in the
        docstring to this class.

        Args:
            logits_actions_pi (DataOp): The logits for all possible actions coming from the learner's policy (pi).
                Dimensions are: (time+1) x batch x num-categories (of the action space).
                The +1 is due to the last-next-state (aka "bootstrapped") value.
            action_probs_mu (DataOp): The probabilities for all actions coming from the actors' policies (mu).
                Dimensions are: (time+1) x batch x num-categories (of the action space).
            values (DataOp): The state-value estimates coming from the baseline node of the learner's policy (pi).
                Dimensions are: (time+1) x batch. The +1 is due to the last-next-state (aka "bootstrapped") value.
            actions (DataOp): The actually taken (already one-hot flattened) actions.
                Dimensions are: (time+1) x batch x N (N=number of discrete actions).
            rewards (DataOp): The received rewards. Dimensions are: (time+1) x batch.
            terminals (DataOp): The observed terminal signals. Dimensions are: (time+1) x batch.

        Returns:
            SingleDataOp: The loss values per item in the batch, summed over all timesteps.
        """
        if get_backend() == "tf":
            # Split off the bootstrapped value (last time-slot) from the baseline value estimates.
            values, bootstrapped_values = values[:-1], values[-1:]
            logits_actions_pi = logits_actions_pi[:-1]

            # Ignore the very first actions/rewards (these are the previous ones, only used as part of the state
            # input for the network).
            actions_flat = actions[1:]
            actions = tf.reduce_sum(
                tf.cast(actions_flat * tf.range(self.action_space.num_categories, dtype=tf.float32), dtype=tf.int32),
                axis=-1
            )
            rewards = rewards[1:]
            terminals = terminals[1:]
            action_probs_mu = action_probs_mu[1:]

            # Discounts are simply 0.0, if there is a terminal, otherwise: `discount`.
            discounts = tf.expand_dims(tf.to_float(~terminals) * self.discount, axis=-1, name="discounts")

            # `clamp_one`: Clamp rewards between -1.0 and 1.0.
            if self.reward_clipping == "clamp_one":
                rewards = tf.clip_by_value(rewards, -1, 1, name="reward-clipping")
            # `soft_asymmetric`: Negative rewards are squashed harder than positive rewards.
            elif self.reward_clipping == "soft_asymmetric":
                squeezed = tf.tanh(rewards / 5.0)
                rewards = tf.where(rewards < 0.0, 0.3 * squeezed, squeezed) * 5.0

            # Let the v-trace helper function calculate the v-trace values (vs) and the pg-advantages
            # (already multiplied by rho_t_pg): A = rho_t_pg * (r_t + gamma * vs_{t+1} - V(x_t)).
            # Both vs and pg_advantages block the gradient as they should be treated as constants by the gradient
            # calculator of this loss function.
            vs, pg_advantages = self.v_trace_function.calc_v_trace_values(
                logits_actions_pi, tf.log(action_probs_mu), actions, actions_flat, discounts, rewards,
                values, bootstrapped_values
            )

            cross_entropy = tf.expand_dims(tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=actions, logits=logits_actions_pi
            ), axis=-1)

            # Make sure the advantage values are treated as constants for the gradient calculation.
            pg_advantages = tf.stop_gradient(pg_advantages)

            # The policy-gradient loss.
            loss_pg = pg_advantages * cross_entropy
            loss = tf.reduce_sum(loss_pg, axis=0)  # reduce over the time-rank
            if self.weight_pg != 1.0:
                loss = self.weight_pg * loss

            # The value-function baseline loss.
            loss_baseline = 0.5 * tf.square(x=tf.subtract(vs, values))
            loss_baseline = tf.reduce_sum(loss_baseline, axis=0)  # reduce over the time-rank
            loss += self.weight_baseline * loss_baseline

            # The entropy regularizer term.
            policy = tf.nn.softmax(logits=logits_actions_pi)
            log_policy = tf.nn.log_softmax(logits=logits_actions_pi)
            loss_entropy = tf.reduce_sum(-policy * log_policy, axis=-1)
            loss_entropy = -tf.reduce_sum(loss_entropy, axis=0)  # reduce over the time-rank
            loss += self.weight_entropy * loss_entropy

            return loss
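

# Illustrative usage sketch (not part of the original module): it shows how this loss component might be
# instantiated with the constructor defaults documented above. The surrounding agent/graph wiring and the actual
# API-call with learner logits, actor action-probs, baseline values, actions, rewards and terminals are assumed
# and omitted here; see `loss()` and `_graph_fn_loss_per_item()` above for the expected input shapes.
if __name__ == "__main__":
    impala_loss = IMPALALossFunction(
        discount=0.99,                # gamma
        reward_clipping="clamp_one",  # clip rewards into [-1.0, 1.0]
        weight_pg=1.0,                # coefficient for the policy-gradient term L[pg]
        weight_baseline=0.5,          # coefficient for the value-function baseline term L[V]
        weight_entropy=0.00025        # coefficient for the entropy regularizer term L[E]
    )
    # The coefficients are stored on the component as plain attributes.
    print(impala_loss.weight_pg, impala_loss.weight_baseline, impala_loss.weight_entropy)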