Skip to content

Commit ca7649c

Browse files
btaba and copybara-github
authored and committed
Fix counting issue in running statistics.
PiperOrigin-RevId: 841283704 Change-Id: Ie2cf7e925293e89433609000c8876d70b0be8ce5
1 parent 12c29bd commit ca7649c

File tree

5 files changed

+57
-20
lines changed

5 files changed

+57
-20
lines changed

brax/training/acme/running_statistics.py

Lines changed: 36 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,9 @@
1919
https://github.com/deepmind/acme/blob/master/acme/jax/running_statistics.py
2020
"""
2121

22-
from typing import Any, Optional, Tuple
22+
from typing import Optional, Tuple, Union
2323

24+
from brax.training import types as training_types
2425
from brax.training.acme import types
2526
from flax import struct
2627
import jax
@@ -45,21 +46,28 @@ class NestedMeanStd:
4546
@struct.dataclass
4647
class RunningStatisticsState(NestedMeanStd):
4748
"""Full state of running statistics computation."""
48-
count: jnp.ndarray
49+
count: Union[jnp.ndarray, training_types.UInt64]
4950
summed_variance: types.Nest
51+
std_eps: float = 0.0
5052

5153

52-
def init_state(nest: types.Nest) -> RunningStatisticsState:
53-
"""Initializes the running statistics for the given nested structure."""
54+
def init_state(nest: types.Nest, std_eps: float = 0.0) -> RunningStatisticsState:
55+
"""Initializes the running statistics for the given nested structure.
56+
57+
Args:
58+
nest: Nested structure to initialize statistics for.
59+
std_eps: Epsilon for numerical stability when getting std.
60+
"""
5461
dtype = jnp.float64 if jax.config.jax_enable_x64 else jnp.float32
5562

5663
return RunningStatisticsState(
57-
count=jnp.zeros((), dtype=dtype),
64+
count=training_types.UInt64(hi=0, lo=0),
5865
mean=_zeros_like(nest, dtype=dtype),
5966
summed_variance=_zeros_like(nest, dtype=dtype),
6067
# Initialize with ones to make sure normalization works correctly
6168
# in the initial state.
62-
std=_ones_like(nest, dtype=dtype))
69+
std=_ones_like(nest, dtype=dtype),
70+
std_eps=std_eps)
6371

6472

6573
def _validate_batch_shapes(batch: types.NestedArray,
@@ -99,10 +107,10 @@ def update(state: RunningStatisticsState,
99107
100108
Note: data batch and state elements (mean, etc.) must have the same structure.
101109
102-
Note: by default will use int32 for counts and float32 for accumulated
103-
variance. This results in an integer overflow after 2^31 data points and
104-
degrading precision after 2^24 batch updates or even earlier if variance
105-
updates have large dynamic range.
110+
Note: by default uses UInt64 for counts that get converted to float32 for division.
111+
This conversion has a small precision loss for large counts. float32 is used
112+
to accumulate variance, so can also suffer from precision loss due to the 24 bit
113+
mantissa for float32.
106114
To improve precision, consider setting jax_enable_x64 to True, see
107115
https://jax.readthedocs.io/en/latest/notebooks/Common_Gotchas_in_JAX.html#double-64bit-precision
108116
@@ -133,13 +141,21 @@ def update(state: RunningStatisticsState,
133141
jax.tree_util.tree_leaves(state.mean)[0].ndim]
134142
batch_axis = range(len(batch_dims))
135143
if weights is None:
136-
step_increment = jnp.prod(jnp.array(batch_dims))
144+
step_increment = jnp.prod(jnp.array(batch_dims)).astype(jnp.int32)
137145
else:
138-
step_increment = jnp.sum(weights)
146+
step_increment = jnp.sum(weights).astype(jnp.int32)
139147
if pmap_axis_name is not None:
140148
step_increment = jax.lax.psum(step_increment, axis_name=pmap_axis_name)
141149
count = state.count + step_increment
142150

151+
if isinstance(count, training_types.UInt64):
152+
# Convert UInt64 count to float32 for division operations.
153+
# Note: small precision loss due to float32's 24-bit mantissa.
154+
count_float = (jnp.float32(count.hi) * jnp.float32(2.0**32) +
155+
jnp.float32(count.lo))
156+
else:
157+
count_float = jnp.float32(count)
158+
143159
# Validation is important. If the shapes don't match exactly, but are
144160
# compatible, arrays will be silently broadcasted resulting in incorrect
145161
# statistics.
@@ -162,7 +178,7 @@ def _compute_node_statistics(
162178
weights,
163179
list(weights.shape) + [1] * (batch.ndim - weights.ndim))
164180
diff_to_old_mean = diff_to_old_mean * expanded_weights
165-
mean_update = jnp.sum(diff_to_old_mean, axis=batch_axis) / count
181+
mean_update = jnp.sum(diff_to_old_mean, axis=batch_axis) / count_float
166182
if pmap_axis_name is not None:
167183
mean_update = jax.lax.psum(
168184
mean_update, axis_name=pmap_axis_name)
@@ -188,14 +204,19 @@ def compute_std(summed_variance: jnp.ndarray,
188204
assert isinstance(summed_variance, jnp.ndarray)
189205
# Summed variance can get negative due to rounding errors.
190206
summed_variance = jnp.maximum(summed_variance, 0)
191-
std = jnp.sqrt(summed_variance / count)
207+
std = jnp.sqrt(summed_variance / count_float + state.std_eps)
192208
std = jnp.clip(std, std_min_value, std_max_value)
193209
return std
194210

195211
std = jax.tree_util.tree_map(compute_std, summed_variance, state.std)
196212

197213
return RunningStatisticsState(
198-
count=count, mean=mean, summed_variance=summed_variance, std=std)
214+
count=count,
215+
mean=mean,
216+
summed_variance=summed_variance,
217+
std=std,
218+
std_eps=state.std_eps,
219+
)
199220

200221

201222
def normalize(batch: types.NestedArray,

brax/training/agents/ppo/checkpoint_test.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,8 @@ def test_save_and_load_checkpoint(self):
8585
value=ppo_network.value_network.init(dummy_key),
8686
)
8787
normalizer_params = running_statistics.init_state(
88-
jax.tree_util.tree_map(jp.zeros, config.observation_size)
88+
jax.tree_util.tree_map(jp.zeros, config.observation_size),
89+
std_eps=0.02,
8990
)
9091
params = (normalizer_params, network_params.policy, network_params.value)
9192

@@ -103,6 +104,10 @@ def test_save_and_load_checkpoint(self):
103104
out = policy_fn(jp.zeros(1), jax.random.PRNGKey(0))
104105
self.assertEqual(out[0].shape, (3,))
105106

107+
loaded_params = checkpoint.load(epath.Path(path.full_path) / "000000000001")
108+
loaded_normalizer = loaded_params[0]
109+
self.assertEqual(loaded_normalizer.std_eps, 0.02)
110+
106111

107112
if __name__ == "__main__":
108113
absltest.main()

brax/training/agents/ppo/train.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -216,6 +216,7 @@ def train(
216216
num_updates_per_batch: int = 2,
217217
num_resets_per_eval: int = 0,
218218
normalize_observations: bool = False,
219+
normalize_observations_std_eps: float = 0.0,
219220
reward_scaling: float = 1.0,
220221
clipping_epsilon: float = 0.3,
221222
gae_lambda: float = 0.95,
@@ -287,6 +288,8 @@ def train(
287288
num_resets_per_eval: the number of environment resets to run between each
288289
eval. The environment resets occur on the host
289290
normalize_observations: whether to normalize observations
291+
normalize_observations_std_eps: small value added to the standard deviation
292+
for obs normalization to improve numerical stability
290293
reward_scaling: float scaling for reward
291294
clipping_epsilon: clipping epsilon for PPO loss
292295
gae_lambda: General advantage estimation lambda
@@ -672,7 +675,7 @@ def training_epoch_with_timing(
672675
optimizer_state=optimizer.init(init_params), # pytype: disable=wrong-arg-types # numpy-scalars
673676
params=init_params,
674677
normalizer_params=running_statistics.init_state(
675-
_remove_pixels(obs_shape)
678+
_remove_pixels(obs_shape), std_eps=normalize_observations_std_eps
676679
),
677680
env_steps=types.UInt64(hi=0, lo=0),
678681
)

brax/training/checkpoint.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -193,7 +193,12 @@ def load(
193193
target = orbax_checkpointer.restore(
194194
path, ocp.args.PyTreeRestore(restore_args=restore_args), item=None
195195
)
196-
target[0] = running_statistics.RunningStatisticsState(**target[0])
196+
197+
# Reconstruct UInt64 count if it was saved as dict.
198+
state_dict = target[0]
199+
if isinstance(state_dict['count'], dict) and 'hi' in state_dict['count']:
200+
state_dict['count'] = types.UInt64(**state_dict['count'])
201+
target[0] = running_statistics.RunningStatisticsState(**state_dict)
197202

198203
return target
199204

brax/training/types.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -113,8 +113,11 @@ def to_numpy(self):
113113

114114
def __post_init__(self):
115115
"""Cast post init."""
116-
object.__setattr__(self, "hi", jnp.uint32(self.hi))
117-
object.__setattr__(self, "lo", jnp.uint32(self.lo))
116+
# Only convert known types - avoids issues with checkpoint serialization.
117+
if isinstance(self.hi, (int, np.integer, np.ndarray, jax.Array)):
118+
object.__setattr__(self, "hi", jnp.uint32(self.hi))
119+
if isinstance(self.lo, (int, np.integer, np.ndarray, jax.Array)):
120+
object.__setattr__(self, "lo", jnp.uint32(self.lo))
118121

119122
def __add__(self, other):
120123
other = _sanitize_uint64_input(other)

0 commit comments

Comments
 (0)