
Commit e8c9c18

btaba authored and copybara-github committed
Bootstrap on timeout.
PiperOrigin-RevId: 844813746 Change-Id: I0d2d18892ff41e9e1597f6bb0ebcf36d9020c9a0
1 parent a6b0c6b commit e8c9c18

File tree: 5 files changed (+46, −8 lines)

brax/envs/inverted_pendulum.py

Lines changed: 4 additions & 2 deletions
@@ -124,8 +124,9 @@ def reset(self, rng: jax.Array) -> State:
     obs = self._get_obs(pipeline_state)
     reward, done = jp.zeros(2)
     metrics = {}
+    info = {'time_out': done}  # allows bootstrap_on_timeout for PPO
 
-    return State(pipeline_state, obs, reward, done, metrics)
+    return State(pipeline_state, obs, reward, done, metrics, info)
 
   def step(self, state: State, action: jax.Array) -> State:
     """Run one timestep of the environment's dynamics."""
@@ -140,7 +141,8 @@ def step(self, state: State, action: jax.Array) -> State:
     reward = 1.0
     done = jp.where(jp.abs(obs[1]) > 0.2, 1.0, 0.0)
     return state.replace(
-        pipeline_state=pipeline_state, obs=obs, reward=reward, done=done
+        pipeline_state=pipeline_state, obs=obs, reward=reward, done=done,
+        info={**state.info, 'time_out': done}
     )
 
   @property
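The pendulum change above derives `time_out` from its existing `done` flag. An environment that ends episodes on a step limit would instead derive the flag from a counter; a minimal sketch of such a `step` ending, assuming an illustrative `steps` counter in `state.info` and a `max_steps` attribute (neither is part of the Brax API):

    # 'steps' and 'max_steps' are hypothetical names used only for illustration.
    steps = state.info['steps'] + 1
    time_out = jp.where(steps >= self.max_steps, 1.0, 0.0)
    done = jp.maximum(done, time_out)  # episode also ends when it times out
    return state.replace(
        pipeline_state=pipeline_state, obs=obs, reward=reward, done=done,
        info={**state.info, 'steps': steps, 'time_out': time_out},
    )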

brax/training/agents/ppo/networks.py

Lines changed: 13 additions & 3 deletions
@@ -32,8 +32,13 @@ class PPONetworks:
   parametric_action_distribution: distribution.ParametricDistribution
 
 
-def make_inference_fn(ppo_networks: PPONetworks):
-  """Creates params and inference function for the PPO agent."""
+def make_inference_fn(ppo_networks: PPONetworks, compute_value: bool = False):
+  """Creates params and inference function for the PPO agent.
+
+  Args:
+    ppo_networks: The PPO networks.
+    compute_value: If True, compute value during rollouts.
+  """
 
   def make_policy(
       params: types.Params, deterministic: bool = False
@@ -55,11 +60,16 @@ def policy(
       postprocessed_actions = parametric_action_distribution.postprocess(
           raw_actions
       )
-      return postprocessed_actions, {
+      extras = {
           'log_prob': log_prob,
           'raw_action': raw_actions,
           'distribution_params': logits,
       }
+      if compute_value:
+        extras['value'] = ppo_networks.value_network.apply(
+            params[0], params[2], observations
+        )
+      return postprocessed_actions, extras
 
     return policy
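With the new flag, callers that want per-step value estimates in the rollout extras request them when building the inference function. A minimal usage sketch; `ppo_network`, `params`, `observations`, and `key_sample` are assumed to come from the usual training setup and are not defined in this commit:

    make_policy = make_inference_fn(ppo_network, compute_value=True)
    policy = make_policy(params)
    actions, extras = policy(observations, key_sample)
    # 'value' is present only when compute_value=True. Note that params must
    # include the value-network params, since the apply call above reads
    # params[0] (normalizer) and params[2] (value network).
    values = extras['value']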

brax/training/agents/ppo/train.py

Lines changed: 24 additions & 2 deletions
@@ -224,6 +224,7 @@ def train(
     max_grad_norm: Optional[float] = None,
     normalize_advantage: bool = True,
     vf_loss_coefficient: float = 0.5,
+    bootstrap_on_timeout: bool = False,
     desired_kl: float = 0.01,
     learning_rate_schedule: Optional[
         Union[str, ppo_optimizer.LRSchedule]
@@ -299,6 +300,10 @@ def train(
     max_grad_norm: gradient clipping norm value. If None, no clipping is done
     normalize_advantage: whether to normalize advantage estimate
     vf_loss_coefficient: Coefficient for value function loss.
+    bootstrap_on_timeout: if True, bootstrap value on time_out steps using
+      reward += gamma * V(s) * time_out. Environments should set
+      state.info['time_out'] = 1.0 and done=True for steps where the episode
+      ends due to a time_out.
     desired_kl: Desired KL divergence for adaptive KL divergence learning rate
       schedule.
     learning_rate_schedule: Learning rate schedule for the optimizer.
@@ -431,7 +436,9 @@ def reset_fn_donated_env_state(env_state_donated, key_envs):
   ppo_network = network_factory(
       obs_shape, env.action_size, preprocess_observations_fn=normalize
   )
-  make_policy = ppo_networks.make_inference_fn(ppo_network)
+  make_policy = ppo_networks.make_inference_fn(
+      ppo_network, compute_value=bootstrap_on_timeout
+  )
 
   # Optimizer.
   base_optimizer = optax.adam(learning_rate=learning_rate)
@@ -551,13 +558,16 @@ def training_step(
     def f(carry, unused_t):
       current_state, current_key = carry
       current_key, next_key = jax.random.split(current_key)
+      extra_fields = ['truncation', 'episode_metrics', 'episode_done']
+      if bootstrap_on_timeout:
+        extra_fields.append('time_out')
       next_state, data = acting.generate_unroll(
           env,
           current_state,
           policy,
          current_key,
          unroll_length,
-          extra_fields=('truncation', 'episode_metrics', 'episode_done'),
+          extra_fields=tuple(extra_fields),
      )
       return (next_state, next_key), data
 
@@ -574,6 +584,18 @@ def f(carry, unused_t):
     )
     assert data.discount.shape[1:] == (unroll_length,)
 
+    if bootstrap_on_timeout:  # bootstrap reward on timeout
+      time_out = data.extras['state_extras']['time_out']
+      value = data.extras['policy_extras']['value']
+      data = types.Transition(
+          observation=data.observation,
+          action=data.action,
+          reward=data.reward + discounting * time_out * value,
+          discount=data.discount,
+          next_observation=data.next_observation,
+          extras=data.extras,
+      )
+
     normalizer_params = training_state.normalizer_params
     if not lr_is_adaptive_kl:
       # Update normalization params before SGD for backwards compatibility.
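The correction itself is the one-line reward adjustment from the docstring: on a timeout step the trajectory is cut short rather than failed, so the critic's estimate stands in for the missing future return. A small numeric sketch of the same arithmetic (the array values are made up for illustration):

    import jax.numpy as jp

    discounting = 0.99
    reward = jp.array([1.0, 1.0, 1.0])    # rewards along an unroll
    time_out = jp.array([0.0, 0.0, 1.0])  # only the last step hit the time limit
    value = jp.array([10.0, 10.0, 10.0])  # V(s) recorded by the policy at rollout time
    reward = reward + discounting * time_out * value
    # -> [1.0, 1.0, 10.9]; only the timeout step receives the bootstrapped value.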

brax/training/agents/ppo/train_test.py

Lines changed: 4 additions & 1 deletion
@@ -62,9 +62,11 @@ def testTrain(self, obs_mode):
           dict(distribution_type='tanh_normal', noise_std_type='log'),
       ),
       normalize_mode=['welford', 'ema'],
+      bootstrap_on_timeout=[True, False],
   )
   def testTrainWithNetworkParams(
-      self, distribution_type, noise_std_type, normalize_mode
+      self, distribution_type, noise_std_type, normalize_mode,
+      bootstrap_on_timeout
   ):
     """Test PPO runs with different network params."""
     network_factory = functools.partial(
@@ -99,6 +101,7 @@ def testTrainWithNetworkParams(
         network_factory=network_factory,
         learning_rate_schedule='ADAPTIVE_KL',
         normalize_observations_mode=normalize_mode,
+        bootstrap_on_timeout=bootstrap_on_timeout,
     )
 
   def testTrainAsymmetricActorCritic(self):

docs/release-notes/next-release.md

Lines changed: 1 addition & 0 deletions
@@ -9,3 +9,4 @@
 * Add `donate_argnums` to brax PPO to somewhat mitigate repeated graph captures when using MJX-Warp.
 * Add `normalize_observations_mode` to PPO to allow using EMA for running statistics instead of Welford. EMA is more stable for longer training runs.
 * Fix bug in PPO training metric logging frequency for multi-GPU devices.
+* Add value bootstrap on `timeout` for PPO. `reward += gamma * V(s) * time_out` if `bootstrap_on_timeout` is set to True.
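For context, a hedged end-to-end sketch of enabling the flag; the environment name and hyperparameter values below are illustrative and not part of this commit:

    from brax import envs
    from brax.training.agents.ppo import train as ppo

    env = envs.get_environment('inverted_pendulum')
    make_policy, params, metrics = ppo.train(
        environment=env,
        num_timesteps=1_000_000,    # illustrative settings only
        episode_length=1000,
        num_envs=256,
        bootstrap_on_timeout=True,  # new flag added in this commit
    )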
