feat: sim2real domain randomization + reward fixes for rotary cartpole

Close the sim2real gap for the Furuta pendulum (swings up but can't balance on hardware). Root causes were (a) no domain randomization, so the policy overfit one deterministic sim instance, and (b) reward design flaws that produced degenerate policies.

Domain randomization (runner-level, backend-agnostic):
- BaseRunner: domain_rand config; per-env action-delay buffer (latency), Gaussian qpos/qvel sensor noise, per-env dynamics-scale sampling (friction/damping/torque), resampled per episode. Sensor noise is sampled per step.
- privileged_obs/privileged_dim expose normalized DR factors (mu) for RMA.
- step() now uses clean state for reward/termination, noisy state for the observation the policy sees.
- MuJoCoRunner: applies per-env friction/damping/torque scales.
- robot.py: compute_motor_force gains friction/damping scale args.
- Configs: DR blocks for mujoco (full) and mjx (delay+noise); clean defaults for mujoco_single/serial; noise/delay anchored to recordings.

Reward fixes (rotary_cartpole):
- Shift upright reward to [0,1] (was [-1,1]) + alive_bonus, so surviving always beats ending early (kills the "suicide into the limit" policy).
- Add balance_bonus * upright * stillness so reward requires upright AND near-zero pendulum velocity (kills the "spin in full loops" policy).

Deploy:
- eval.py load_policy reconstructs the history/adaptation encoder (auto-detects its dim from the checkpoint) so DR+embedding policies load.

Fixes:
- MuJoCoRunner._sim_reset referenced self._env (typo) -> self.env, which was breaking every rotary-cartpole reset.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-09 20:48:25 +02:00
parent 8cc84d6a21
commit b37cd26690
22 changed files with 1219 additions and 781 deletions
--- a/scripts/eval.py
+++ b/scripts/eval.py
@@ -74,14 +74,33 @@ def _infer_hidden_sizes(state_dict: dict[str, torch.Tensor]) -> tuple[int, ...]:
    return tuple(sizes)


+def _infer_encoder_out_dim(state_dict: dict[str, torch.Tensor]) -> int | None:
+    """Return the history/adaptation encoder output dim, if present.
+
+    Lets eval reconstruct an embedding policy without knowing the training
+    embedding_dim/latent_dim — read it straight from the saved weights.
+    """
+    for key in ("history_encoder.fc.weight", "adaptation_module.fc.weight"):
+        if key in state_dict:
+            return state_dict[key].shape[0]
+    return None
+
+
 def load_policy(
    checkpoint_path: str,
    observation_space: spaces.Space,
    action_space: spaces.Space,
    device: torch.device = torch.device("cpu"),
+    history_length: int = 0,
+    rma_mode: str = "none",
+    raw_obs_dim: int = 0,
 ) -> tuple[SharedMLP, RunningStandardScaler]:
    """Load a trained SharedMLP + observation normalizer from a checkpoint.

+    For DR + history-embedding policies (history_length > 0) or RMA deploy
+    policies (rma_mode="deploy"), the history/adaptation encoder must be
+    reconstructed too — its output dim is read back from the saved weights.
+
    Returns:
        (model, state_preprocessor) ready for inference.
    """
@@ -89,13 +108,20 @@ def load_policy(

    # Infer architecture from saved weights.
    hidden_sizes = _infer_hidden_sizes(ckpt["policy"])
+    enc_out = _infer_encoder_out_dim(ckpt["policy"])

-    # Reconstruct model.
+    # Reconstruct model — pass through the encoder config so a DR+embedding
+    # checkpoint rebuilds the history encoder with matching dimensions.
    model = SharedMLP(
        observation_space=observation_space,
        action_space=action_space,
        device=device,
        hidden_sizes=hidden_sizes,
+        history_length=history_length,
+        rma_mode=rma_mode,
+        raw_obs_dim=raw_obs_dim,
+        embedding_dim=enc_out or 32,   # legacy "none" + history
+        latent_dim=enc_out or 8,       # RMA deploy adaptation module
    )
    model.load_state_dict(ckpt["policy"])
    model.eval()
@@ -194,7 +220,10 @@ def _eval_sim(cfg: DictConfig, env_name: str, checkpoint_path: str) -> None:

    device = runner.device
    model, preprocessor = load_policy(
-        checkpoint_path, runner.observation_space, runner.action_space, device
+        checkpoint_path, runner.observation_space, runner.action_space, device,
+        history_length=runner.config.history_length,
+        rma_mode=runner.config.rma_mode,
+        raw_obs_dim=runner.env.observation_space.shape[0],
    )

    mj_model = runner._model
@@ -280,7 +309,10 @@ def _eval_serial(cfg: DictConfig, env_name: str, checkpoint_path: str) -> None:

    device = serial_runner.device
    model, preprocessor = load_policy(
-        checkpoint_path, serial_runner.observation_space, serial_runner.action_space, device
+        checkpoint_path, serial_runner.observation_space, serial_runner.action_space, device,
+        history_length=serial_runner.config.history_length,
+        rma_mode=serial_runner.config.rma_mode,
+        raw_obs_dim=serial_runner.env.observation_space.shape[0],
    )

    # Set up digital-twin MuJoCo model for visualization.