Source code for rlgraph.agents.impala_agent

# Copyright 2018 The RLgraph authors. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import copy

from rlgraph import get_backend
from rlgraph.utils.decorators import rlgraph_api
from rlgraph.utils import RLGraphError
from rlgraph.agents.agent import Agent
from rlgraph.components.common.dict_merger import DictMerger
from rlgraph.components.common.container_splitter import ContainerSplitter
from rlgraph.components.common.slice import Slice
from rlgraph.components.common.staging_area import StagingArea
from rlgraph.components.common.environment_stepper import EnvironmentStepper
from rlgraph.components.layers.preprocessing.reshape import ReShape
from rlgraph.components.layers.preprocessing.transpose import Transpose
from rlgraph.components.layers.preprocessing.concat import Concat
from rlgraph.components.neural_networks.actor_component import ActorComponent
from rlgraph.components.neural_networks.dynamic_batching_policy import DynamicBatchingPolicy
from rlgraph.components.loss_functions.impala_loss_function import IMPALALossFunction
from rlgraph.components.memories.fifo_queue import FIFOQueue
from rlgraph.components.memories.queue_runner import QueueRunner
from rlgraph.spaces import FloatBox, Dict, Tuple
from rlgraph.utils.util import default_dict

if get_backend() == "tf":
    import tensorflow as tf

class IMPALAAgent(Agent):
    """
    An Agent implementing the IMPALA algorithm described in [1]. The Agent contains both learner and actor
    API-methods, which will be put into the graph depending on the type (`single`, `actor` or `learner`).

    [1] IMPALA: Scalable Distributed Deep-RL with Importance Weighted Actor-Learner Architectures - Espeholt, Soyer,
        Munos et al. - 2018 (https://arxiv.org/abs/1802.01561)
    """

    # Internal (LSTM) states space used if none is given: two float vectors of size 256, no batch rank.
    default_internal_states_space = Tuple(FloatBox(shape=(256,)), FloatBox(shape=(256,)), add_batch_rank=False)
    # Environment used by "single" and "actor" agents if no `environment_spec` is passed in.
    default_environment_spec = dict(
        type="deepmind_lab", level_id="seekavoid_arena_01", observations=["RGB_INTERLEAVED", "INSTR"], frameskip=4
    )

    # TODO: capacity FIFO Queue make configurable.
    def __init__(self, discount=0.99, fifo_queue_spec=None, architecture="large", environment_spec=None,
                 weight_pg=None, weight_baseline=None, weight_entropy=None, worker_sample_size=100,
                 dynamic_batching=False, **kwargs):
        """
        Args:
            discount (float): The discount factor gamma.
            fifo_queue_spec (Optional[dict,FIFOQueue]): The spec for the FIFOQueue to use for the IMPALA algorithm.
                Default: A queue with capacity 1.
            architecture (str): Which IMPALA architecture to use. One of "small" or "large". Will be ignored if
                `network_spec` is given explicitly in kwargs. Default: "large".
            environment_spec (dict): The spec for constructing an Environment object for an actor-type IMPALA agent.
                Ignored for learners. Default: `default_environment_spec`.
            weight_pg (float): See IMPALALossFunction Component.
            weight_baseline (float): See IMPALALossFunction Component.
            weight_entropy (float): See IMPALALossFunction Component.
            worker_sample_size (int): How many steps the actor will perform in the environment each sample-run.
            dynamic_batching (bool): Whether to use the deepmind's custom dynamic batching op for wrapping the
                optimizer's step call. The file must be compiled for this to work (see Docker file).
                Default: False.

        Keyword Args:
            type (str): One of "single", "actor" or "learner". Default: "single".
            num_actors (int): If type is "single", how many actors should be run in separate threads.
            network_spec, action_adapter_spec, exploration_spec, optimizer_spec, observe_spec, update_spec,
                execution_spec, name: Popped here, adjusted to the agent type and forwarded to the Agent base
                class. `preprocessing_spec` is popped and replaced by a hard-coded dict-preprocessor-stack.
                All remaining kwargs are passed to the base constructor unchanged; `action_space` must be
                among them.
        """
        type_ = kwargs.pop("type", "single")
        assert type_ in ["single", "actor", "learner"]
        self.type = type_
        if self.type == "single":
            self.num_actors = kwargs.pop("num_actors", 1)
        else:
            self.num_actors = 0
        self.worker_sample_size = worker_sample_size
        self.dynamic_batching = dynamic_batching

        # Network-spec by default is a "large architecture" IMPALA network.
        # Actors feed one time step at a time; everyone else sees a whole rollout plus the last next-state.
        network_spec = default_dict(
            kwargs.pop(
                "network_spec",
                dict(type="rlgraph.components.papers.impala.impala_networks.{}IMPALANetwork".format(
                    "Large" if architecture == "large" else "Small"
                ))
            ),
            dict(worker_sample_size=1 if self.type == "actor" else self.worker_sample_size + 1)
        )
        action_adapter_spec = kwargs.pop("action_adapter_spec", dict(type="baseline-action-adapter"))

        # Depending on the job-type, remove the pieces from the Agent-spec/graph we won't need.
        exploration_spec = kwargs.pop("exploration_spec", None)
        optimizer_spec = kwargs.pop("optimizer_spec", None)
        observe_spec = kwargs.pop("observe_spec", None)

        # Run everything in a single process.
        if self.type == "single":
            environment_spec = environment_spec or self.default_environment_spec
            update_spec = kwargs.pop("update_spec", None)
        # Actors won't need to learn (no optimizer needed in graph).
        elif self.type == "actor":
            optimizer_spec = None
            update_spec = kwargs.pop("update_spec", dict(do_updates=False))
            environment_spec = environment_spec or self.default_environment_spec
        # Learners won't need to explore (act) or observe (insert into Queue).
        else:
            exploration_spec = None
            observe_spec = None
            update_spec = kwargs.pop("update_spec", None)
            environment_spec = None

        # Add previous-action/reward preprocessors to env-specific preprocessor spec.
        # TODO: remove this empty hard-coded preprocessor.
        kwargs.pop("preprocessing_spec", None)
        preprocessing_spec = dict(type="dict-preprocessor-stack", preprocessors=dict(
            # Flatten actions.
            previous_action=[
                dict(type="reshape", flatten=True, flatten_categories=kwargs.get("action_space").num_categories)
            ],
            # Bump reward, so that it can be concatenated by the Concat layer.
            previous_reward=[
                dict(type="reshape", new_shape=(1,))
            ]
        ))

        # Limit communication in distributed mode between each actor and the learner (never between actors).
        # Note: the passed-in `execution_spec` dict is modified in place.
        execution_spec = kwargs.pop("execution_spec", None)
        if execution_spec is not None and execution_spec.get("mode") == "distributed":
            execution_spec["session_config"] = dict(
                type="monitored-training-session",
                allow_soft_placement=True,
                log_device_placement=False,
                device_filters=["/job:learner/task:0"] + (
                    ["/job:actor/task:{}".format(execution_spec["distributed_spec"]["task_index"])]
                    if self.type == "actor" else ["/job:learner/task:0"]
                )
            )
            # If Actor, make non-chief in either case (even if task idx == 0).
            if self.type == "actor":
                execution_spec["distributed_spec"]["is_chief"] = False
                # Hard-set device to the CPU for actors.
                execution_spec["device_strategy"] = "custom"
                execution_spec["default_device"] = "/job:{}/task:{}/cpu".format(
                    self.type, execution_spec["distributed_spec"]["task_index"]
                )

        # Now that we fixed the Agent's spec, call the super constructor.
        super(IMPALAAgent, self).__init__(
            discount=discount,
            preprocessing_spec=preprocessing_spec,
            network_spec=network_spec,
            action_adapter_spec=action_adapter_spec,
            exploration_spec=exploration_spec,
            optimizer_spec=optimizer_spec,
            observe_spec=observe_spec,
            update_spec=update_spec,
            execution_spec=execution_spec,
            name=kwargs.pop("name", "impala-{}-agent".format(self.type)),
            **kwargs
        )

        # True (truthy) only when running distributed with an actual cluster spec; in that case the first
        # learner hosts all shared variables and the FIFO queue.
        distributed_with_cluster = self.execution_spec["mode"] == "distributed" and \
            self.execution_spec["distributed_spec"]["cluster_spec"]

        # If we use dynamic batching, wrap the dynamic batcher around the policy's graph_fn that we
        # actually call below during our build.
        if self.dynamic_batching:
            self.policy = DynamicBatchingPolicy(policy_spec=self.policy, scope="")
        # Manually set the reuse_variable_scope for our policies (actor: mu, learner: pi).
        self.policy.propagate_sub_component_properties(dict(reuse_variable_scope="shared"))
        # Always use 1st learner as the parameter server for all policy variables.
        if distributed_with_cluster:
            self.policy.propagate_sub_component_properties(dict(device=dict(variables="/job:learner/task:0/cpu")))

        # Check whether we have an RNN.
        self.has_rnn = self.neural_network.has_rnn()
        # Check, whether we are running with GPU.
        self.has_gpu = self.execution_spec["gpu_spec"]["gpus_enabled"] is True and \
            self.execution_spec["gpu_spec"]["num_gpus"] > 0

        # Some FIFO-queue specs.
        self.fifo_queue_keys = ["terminals", "states", "action_probs", "initial_internal_states"]
        self.fifo_record_space = Dict(
            {
                "terminals": bool,
                "states": default_dict(copy.deepcopy(self.state_space), dict(
                    previous_action=self.action_space,
                    previous_reward=FloatBox()
                )),
                "action_probs": FloatBox(shape=(self.action_space.num_categories,)),
                "initial_internal_states": self.internal_states_space
            },
            add_batch_rank=False, add_time_rank=(self.worker_sample_size + 1)
        )
        # Take away again time-rank from initial-states (comes in only for one time-step).
        self.fifo_record_space["initial_internal_states"] = \
            self.fifo_record_space["initial_internal_states"].with_time_rank(False)

        # Create our FIFOQueue (actors will enqueue, learner(s) will dequeue).
        self.fifo_queue = FIFOQueue.from_spec(
            fifo_queue_spec or dict(capacity=1),
            reuse_variable_scope="shared-fifo-queue",
            only_insert_single_records=True,
            record_space=self.fifo_record_space,
            device="/job:learner/task:0/cpu" if distributed_with_cluster else None
        )

        # Remove `states` key from input_spaces: not needed.
        del self.input_spaces["states"]

        # Add all our sub-components to the core.
        if self.type == "single":
            # Create a queue runner that takes care of pushing items into the queue from our actors.
            self.env_output_splitter = ContainerSplitter(tuple_length=8, scope="env-output-splitter")
            self.fifo_output_splitter = ContainerSplitter(*self.fifo_queue_keys, scope="fifo-output-splitter")
            self.staging_area = StagingArea(num_data=len(self.fifo_queue_keys))

            # Slice some data from the EnvStepper (e.g only first internal states are needed).
            self.internal_states_slicer = Slice(scope="internal-states-slicer", squeeze=True)
            # TODO: add state transposer, remove action/rewards transposer (part of state).
            self.transpose_actions = ReShape(
                flip_batch_and_time_rank=True, time_major=True, scope="transpose-a", flatten_categories=False
            )
            self.transpose_rewards = ReShape(flip_batch_and_time_rank=True, time_major=True, scope="transpose-r")
            self.transpose_terminals = ReShape(flip_batch_and_time_rank=True, time_major=True, scope="transpose-t")
            self.transpose_action_probs = ReShape(
                flip_batch_and_time_rank=True, time_major=True, scope="transpose-a-probs-mu"
            )

            self.concat = Concat(axis=1)  # 1=the time rank (at the time of the concat, we are still batch-major)

            # Create an IMPALALossFunction with some parameters.
            self.loss_function = IMPALALossFunction(
                discount=self.discount, weight_pg=weight_pg, weight_baseline=weight_baseline,
                weight_entropy=weight_entropy
            )

            # Merge back to insert into FIFO.
            self.fifo_input_merger = DictMerger(*self.fifo_queue_keys)

            dummy_flattener = ReShape(flatten=True)  # dummy Flattener to calculate action-probs space

            # One EnvironmentStepper (with its own ActorComponent/policy copy) per actor thread.
            self.environment_steppers = list()
            for i in range(self.num_actors):
                policy_spec = dict(
                    network_spec=network_spec,
                    action_adapter_spec=dict(type="baseline-action-adapter"),
                    action_space=self.action_space
                )

                env_stepper = EnvironmentStepper(
                    environment_spec=environment_spec,
                    actor_component_spec=ActorComponent(
                        preprocessor_spec=preprocessing_spec,
                        policy_spec=policy_spec,
                        exploration_spec=exploration_spec
                    ),
                    state_space=self.state_space.with_batch_rank(),
                    reward_space=float,  # TODO <- float64 for deepmind? may not work for other envs
                    internal_states_space=self.internal_states_space,
                    num_steps=self.worker_sample_size,
                    add_previous_action=True,
                    add_previous_reward=True,
                    add_action_probs=True,
                    action_probs_space=dummy_flattener.get_preprocessed_space(self.action_space),
                    scope="env-stepper-{}".format(i)
                )
                if self.dynamic_batching:
                    # Detach the plain policy and re-attach it wrapped in a DynamicBatchingPolicy.
                    env_stepper.actor_component.policy.parent_component = None
                    env_stepper.actor_component.policy = DynamicBatchingPolicy(
                        policy_spec=env_stepper.actor_component.policy, scope=""
                    )
                    env_stepper.actor_component.add_components(env_stepper.actor_component.policy)

                # All actor policies share their variables with the learner's policy.
                env_stepper.actor_component.policy.propagate_sub_component_properties(
                    dict(reuse_variable_scope="shared")
                )
                self.environment_steppers.append(env_stepper)

            # Create the QueueRunners (one for each env-stepper).
            # - Take return value 1 of API-method step as record to insert.
            self.queue_runner = QueueRunner(
                self.fifo_queue, "step", 1, self.env_output_splitter, self.fifo_input_merger,
                self.internal_states_slicer, *self.environment_steppers
            )

            # Order matters: it must match the signature of `define_api_methods_single`.
            sub_components = [
                self.fifo_output_splitter, self.fifo_queue, self.queue_runner, self.transpose_actions,
                self.transpose_rewards, self.transpose_terminals, self.transpose_action_probs, self.preprocessor,
                self.staging_area, self.concat, self.policy, self.loss_function, self.optimizer
            ]

        elif self.type == "actor":
            # No learning, no loss function.
            self.loss_function = None
            # A Dict Splitter to split things from the EnvStepper.
            self.env_output_splitter = ContainerSplitter(tuple_length=4, scope="env-output-splitter")

            self.states_dict_splitter = ContainerSplitter(
                "RGB_INTERLEAVED", "INSTR", "previous_action", "previous_reward", scope="states-dict-splitter"
            )

            # Slice some data from the EnvStepper (e.g only first internal states are needed).
            self.internal_states_slicer = Slice(scope="internal-states-slicer", squeeze=True)
            # Merge back to insert into FIFO.
            self.fifo_input_merger = DictMerger(*self.fifo_queue_keys)

            dummy_flattener = ReShape(flatten=True)  # dummy Flattener to calculate action-probs space
            self.environment_stepper = EnvironmentStepper(
                environment_spec=environment_spec,
                actor_component_spec=ActorComponent(self.preprocessor, self.policy, self.exploration),
                state_space=self.state_space.with_batch_rank(),
                reward_space=float,  # TODO <- float64 for deepmind? may not work for other envs
                internal_states_space=self.internal_states_space,
                num_steps=self.worker_sample_size,
                add_previous_action=True,
                add_previous_reward=True,
                add_action_probs=True,
                action_probs_space=dummy_flattener.get_preprocessed_space(self.action_space)
            )
            # Order matters: it must match the signature of `define_api_methods_actor`.
            sub_components = [
                self.environment_stepper, self.env_output_splitter,
                self.internal_states_slicer, self.fifo_input_merger,
                self.states_dict_splitter, self.fifo_queue
            ]

        # Learner.
        else:
            learner_cpu = "/job:learner/task:0/cpu"
            learner_gpu = "/job:learner/task:0/gpu"

            self.environment_stepper = None

            # A Dict splitter to split up items from the queue.
            self.fifo_input_merger = None
            self.fifo_output_splitter = ContainerSplitter(*self.fifo_queue_keys, scope="fifo-output-splitter")
            self.states_dict_splitter = ContainerSplitter(
                "INSTR", "RGB_INTERLEAVED", "previous_action", "previous_reward", scope="states-dict-splitter"
            )
            self.internal_states_slicer = None

            # Batch-major -> time-major flips happen on the learner's CPU.
            self.transpose_states = Transpose(scope="transpose-states", device=dict(ops=learner_cpu))
            self.transpose_terminals = Transpose(scope="transpose-terminals", device=dict(ops=learner_cpu))
            self.transpose_action_probs = Transpose(scope="transpose-a-probs-mu", device=dict(ops=learner_cpu))

            self.staging_area = StagingArea(num_data=len(self.fifo_queue_keys))

            # Create an IMPALALossFunction with some parameters.
            self.loss_function = IMPALALossFunction(
                discount=self.discount, weight_pg=weight_pg, weight_baseline=weight_baseline,
                weight_entropy=weight_entropy, device=learner_gpu
            )

            # Policy variables live on the CPU (parameter server), policy ops run on the GPU.
            self.policy.propagate_sub_component_properties(
                dict(device=dict(variables=learner_cpu, ops=learner_gpu))
            )
            for component in [self.staging_area, self.preprocessor, self.optimizer]:
                component.propagate_sub_component_properties(dict(device=learner_gpu))

            # Order matters: it must match the signature of `define_api_methods_learner`.
            sub_components = [
                self.fifo_output_splitter, self.fifo_queue, self.states_dict_splitter,
                self.transpose_states, self.transpose_terminals, self.transpose_action_probs,
                self.staging_area, self.preprocessor, self.policy, self.loss_function, self.optimizer
            ]

        # Add all the agent's sub-components to the root.
        self.root_component.add_components(*sub_components)

        # Define the Agent's (root Component's) API.
        self.define_api_methods(*sub_components)

        if self.auto_build:
            if self.type == "learner":
                build_options = dict(
                    build_device_context="/job:learner/task:0/cpu",
                    pin_global_variable_device="/job:learner/task:0/cpu"
                )
                self._build_graph(
                    [self.root_component], self.input_spaces, optimizer=self.optimizer,
                    build_options=build_options
                )
            else:
                self._build_graph(
                    [self.root_component], self.input_spaces, optimizer=self.optimizer, build_options=None
                )

            self.graph_built = True

            # NOTE(review): the nesting of the following statements under `auto_build`/`has_gpu` was
            # reconstructed from a whitespace-collapsed listing - confirm against the upstream file.
            if self.has_gpu:
                # Get 1st return op of API-method `stage` of sub-component `staging-area` (which is the stage-op).
                self.stage_op = self.root_component.sub_components["staging-area"].api_methods["stage"]. \
                    out_op_columns[0].op_records[0].op
                # Initialize the stage (run the stage-op once so the first `unstage` has data to return).
                self.graph_executor.monitored_session.run_step_fn(
                    lambda step_context: step_context.session.run(self.stage_op)
                )

                # TODO remove after full refactor.
                self.dequeue_op = self.root_component.sub_components["fifo-queue"].api_methods["get_records"]. \
                    out_op_columns[0].op_records[0].op
            if self.type == "actor":
                self.enqueue_op = self.root_component.sub_components["fifo-queue"].api_methods["insert_records"]. \
                    out_op_columns[0].op_records[0].op

    def define_api_methods(self, *sub_components):
        """
        Defines the root component's API by dispatching to the type-specific definer.

        Args:
            *sub_components (Component): The sub-components assembled in the constructor, in exactly the order
                expected by the `define_api_methods_[single|actor|learner]` method for this agent's type.
        """
        # TODO: Unify agents with/w/o synchronizable policy.
        # TODO: Unify Agents with/w/o get_action method (w/ env-stepper vs w/o).

        # Assemble the specific agent.
        if self.type == "single":
            self.define_api_methods_single(*sub_components)
        elif self.type == "actor":
            self.define_api_methods_actor(*sub_components)
        else:
            self.define_api_methods_learner(*sub_components)

    def define_api_methods_single(self, fifo_output_splitter, fifo_queue, queue_runner, transpose_actions,
                                  transpose_rewards, transpose_terminals, transpose_action_probs, preprocessor,
                                  staging_area, concat, policy, loss_function, optimizer):
        """
        Defines the API-methods used by a single-process IMPALA agent (actors and learner in one graph):
        `setup_queue_runner`, `get_queue_size` and `update_from_memory`.

        Args:
            fifo_output_splitter (ContainerSplitter): Splits a dequeued batch into its items.
            fifo_queue (FIFOQueue): The queue filled by the actors' env-steppers.
            queue_runner (QueueRunner): Pushes the env-steppers' outputs into the queue.
            transpose_actions (ReShape): Flips actions to time-major.
            transpose_rewards (ReShape): Flips rewards to time-major.
            transpose_terminals (ReShape): Flips terminals to time-major.
            transpose_action_probs (ReShape): Flips the behavior policy's action probs to time-major.
            preprocessor (PreprocessorStack): Preprocesses the last next-state.
            staging_area (StagingArea): Stages the batch if a GPU is used.
            concat (Concat): Appends the last next-state to the other states (along the time rank).
            policy (Policy): The Policy Component to update.
            loss_function (IMPALALossFunction): The IMPALALossFunction Component.
            optimizer (Optimizer): The optimizer that we use to calculate an update and apply it.
        """
        @rlgraph_api(component=self.root_component)
        def setup_queue_runner(self_):
            return queue_runner.setup()

        @rlgraph_api(component=self.root_component)
        def get_queue_size(self_):
            return fifo_queue.get_size()

        @rlgraph_api(component=self.root_component)
        def update_from_memory(self_):
            # Pull n records from the queue.
            # Note that everything will come out as batch-major and must be transposed before the main-LSTM.
            # This is done by the network itself for all network inputs:
            # - preprocessed_s
            # - preprocessed_last_s_prime
            # But must still be done for actions, rewards, terminals here in this API-method via separate ReShapers.
            records = fifo_queue.get_records(self.update_spec["batch_size"])

            # NOTE(review): 7 items are unpacked here, but `fifo_output_splitter` is constructed from the
            # 4 `fifo_queue_keys` - verify that the splitter's outputs match this unpacking.
            preprocessed_s, actions, rewards, terminals, last_s_prime, action_probs_mu, \
                initial_internal_states = fifo_output_splitter.split(records)

            preprocessed_last_s_prime = preprocessor.preprocess(last_s_prime)

            # Append last-next-state to the rest before sending it through the network.
            preprocessed_s_all = concat.apply(preprocessed_s, preprocessed_last_s_prime)

            # Flip actions, rewards, terminals to time-major.
            # TODO: Create components that are less input-space sensitive (those that have no variables should
            # TODO: be reused for any kind of processing)
            actions = transpose_actions.apply(actions)
            rewards = transpose_rewards.apply(rewards)
            terminals = transpose_terminals.apply(terminals)
            action_probs_mu = transpose_action_probs.apply(action_probs_mu)

            # If we use a GPU: Put everything on staging area (adds 1 time step policy lag, but makes copying
            # data into GPU more efficient).
            if self.has_gpu:
                # NOTE(review): 6 items are staged, but the StagingArea is created with
                # `num_data=len(self.fifo_queue_keys)` (4) - confirm.
                stage_op = staging_area.stage(
                    preprocessed_s_all, actions, rewards, terminals, action_probs_mu, initial_internal_states
                )
                # Get data from stage again and continue.
                preprocessed_s_all, actions, rewards, terminals, action_probs_mu, \
                    initial_internal_states = staging_area.unstage()
            else:
                # TODO: No-op component?
                stage_op = None

            # Get the pi-action probs AND the values for all our states.
            state_values_pi, logits_pi, probs_pi, log_probabilities_pi, current_internal_states = \
                policy.get_state_values_logits_probabilities_log_probs(
                    preprocessed_s_all, initial_internal_states
                )

            # Calculate the loss.
            loss, loss_per_item = loss_function.loss(
                log_probabilities_pi, action_probs_mu, state_values_pi, actions, rewards, terminals
            )
            # With dynamic batching, the variables are taken from the first actor's (wrapped) policy.
            if self.dynamic_batching:
                policy_vars = queue_runner.data_producing_components[0].actor_component.policy._variables()
            else:
                policy_vars = policy._variables()

            # TODO: dynbatching tboard check

            # Pass vars and loss values into optimizer.
            step_op, loss, loss_per_item = optimizer.step(policy_vars, loss, loss_per_item)

            # Return optimizer op and all loss values.
            # TODO: Make it possible to return None from API-method without messing with the meta-graph.
            # Without a GPU there is no stage-op: return the step-op in its slot instead of None.
            return step_op, (stage_op if stage_op is not None else step_op), loss, loss_per_item

    def define_api_methods_actor(self, env_stepper, env_output_splitter, internal_states_slicer, merger,
                                 states_dict_splitter, fifo_queue):
        """
        Defines the API-methods used by an IMPALA actor. Actors only step through an environment (n-steps at
        a time), collect the results and push them into the FIFO queue. Results include: The actions actually
        taken, the discounted accumulated returns for each action, the probability of each taken action according to
        the behavior policy.

        Args:
            env_stepper (EnvironmentStepper): The EnvironmentStepper Component to step through the Env n steps
                in a single op call.
            env_output_splitter (ContainerSplitter): Splits the env-stepper's output tuple into terminals, states,
                action-probs and internal states.
            internal_states_slicer (Slice): Slices out the internal states of the first time step.
            merger (DictMerger): Merges the single items back into one record (dict) for the queue.
            states_dict_splitter (ContainerSplitter): Not used by the actor's API-methods.
            fifo_queue (FIFOQueue): The FIFOQueue Component used to enqueue env sample runs (n-step).
        """
        # Perform n-steps in the env and insert the results into our FIFO-queue.
        @rlgraph_api(component=self.root_component)
        def perform_n_steps_and_insert_into_fifo(self_):
            # Take n steps in the environment.
            step_results = env_stepper.step()

            terminals, states, action_log_probs, internal_states = env_output_splitter.split(step_results)

            # Only the internal states at the first time step (index 0) go into the record.
            initial_internal_states = internal_states_slicer.slice(internal_states, 0)

            record = merger.merge(terminals, states, action_log_probs, initial_internal_states)

            # Insert results into the FIFOQueue.
            insert_op = fifo_queue.insert_records(record)

            return insert_op, terminals

        @rlgraph_api(component=self.root_component)
        def reset(self_):
            # Resets the environment running inside the agent.
            reset_op = env_stepper.reset()
            return reset_op

    def define_api_methods_learner(
            self, fifo_output_splitter, fifo_queue, states_dict_splitter,
            transpose_states, transpose_terminals, transpose_action_probs,
            staging_area, preprocessor, policy, loss_function, optimizer
    ):
        """
        Defines the API-methods used by an IMPALA learner. Its job is basically: Pull a batch from the
        FIFOQueue, split it up into its components and pass these through the loss function and into the optimizer for
        a learning update.

        Args:
            fifo_output_splitter (ContainerSplitter): The splitter Component to split up a batch from the queue
                along its items.
            fifo_queue (FIFOQueue): The FIFOQueue Component used to enqueue env sample runs (n-step).
            states_dict_splitter (ContainerSplitter): Splits the (preprocessed) states dict; used to isolate
                previous actions and rewards.
            transpose_states (Transpose): Flips the states to time-major.
            transpose_terminals (Transpose): Flips the terminals to time-major.
            transpose_action_probs (Transpose): Flips the behavior policy's action probs to time-major.
            staging_area (StagingArea): Stages the batch if a GPU is used.
            preprocessor (PreprocessorStack): Preprocesses previous actions and rewards inside the states.
            policy (Policy): The Policy Component, which to update.
            loss_function (IMPALALossFunction): The IMPALALossFunction Component.
            optimizer (Optimizer): The optimizer that we use to calculate an update and apply it.
        """
        @rlgraph_api(component=self.root_component)
        def update_from_memory(self_):
            # Pull n records from the queue.
            # Note that everything will come out as batch-major and must be transposed before the main-LSTM.
            records = fifo_queue.get_records(self.update_spec["batch_size"])

            terminals, states, action_probs_mu, initial_internal_states = fifo_output_splitter.split(records)

            # Flip everything to time-major.
            # TODO: Create components that are less input-space sensitive (those that have no variables should
            # TODO: be reused for any kind of processing)
            states = transpose_states.apply(states)
            terminals = transpose_terminals.apply(terminals)
            action_probs_mu = transpose_action_probs.apply(action_probs_mu)

            # If we use a GPU: Put everything on staging area (adds 1 time step policy lag, but makes copying
            # data into GPU more efficient).
            if self.has_gpu:
                stage_op = staging_area.stage(states, terminals, action_probs_mu, initial_internal_states)
                # Get data from stage again and continue.
                states, terminals, action_probs_mu, initial_internal_states = staging_area.unstage()
            else:
                # TODO: No-op component?
                stage_op = None

            # Preprocess actions and rewards inside the state (actions: flatten one-hot, rewards: expand).
            states = preprocessor.preprocess(states)

            # Only retrieve logits and do faster sparse softmax in loss.
            state_values_pi, logits, _, _, current_internal_states = \
                policy.get_state_values_logits_probabilities_log_probs(states, initial_internal_states)

            # Isolate actions and rewards from states.
            _, _, actions, rewards = states_dict_splitter.split(states)

            # Calculate the loss.
            loss, loss_per_item = loss_function.loss(
                logits, action_probs_mu, state_values_pi, actions, rewards, terminals
            )
            policy_vars = policy._variables()

            # Pass vars and loss values into optimizer.
            step_op, loss, loss_per_item = optimizer.step(policy_vars, loss, loss_per_item)

            # Return optimizer op and all loss values.
            # TODO: Make it possible to return None from API-method without messing with the meta-graph.
            # Without a GPU there is no stage-op: return the step-op in its slot instead of None.
            return step_op, (stage_op if stage_op is not None else step_op), loss, loss_per_item

        @rlgraph_api(component=self.root_component)
        def get_queue_size(self_):
            return fifo_queue.get_size()

    def get_action(self, states, internal_states=None, use_exploration=True, extra_returns=None):
        """
        Not implemented for the IMPALA agent: does nothing and returns None (acting happens inside the
        in-graph environment stepper(s) created in the constructor).
        """
        pass

    def _observe_graph(self, preprocessed_states, actions, internals, rewards, terminals):
        """
        Executes the root API-method "insert_records" with the given states, actions, rewards and terminals
        (`internals` is ignored).
        """
        # NOTE(review): none of the `define_api_methods_*` methods of this class defines an "insert_records"
        # API-method on the root component - verify that this call can actually succeed.
        self.graph_executor.execute(("insert_records", [preprocessed_states, actions, rewards, terminals]))

    def update(self, batch=None):
        """
        Performs one update from the FIFO queue by executing the "update_from_memory" API-method.

        Args:
            batch: Must be None. The IMPALA agent only learns from its queue.

        Returns:
            The results of the "update_from_memory" call. Without a GPU, only the return values at
            indices 0, 2 and 3 are requested (the stage-op slot at index 1 is skipped).

        Raises:
            RLGraphError: If `batch` is not None.
        """
        if batch is None:
            # Include stage_op or not?
            if self.has_gpu:
                return self.graph_executor.execute("update_from_memory")
            else:
                return self.graph_executor.execute(("update_from_memory", None, [0, 2, 3]))
        else:
            raise RLGraphError("Cannot call update-from-batch on an IMPALA Agent.")

    def __repr__(self):
        return "IMPALAAgent(type={})".format(self.type)