feat: sim2real domain randomization + reward fixes for rotary cartpole

Close the sim2real gap for the Furuta pendulum (swings up but can't
balance on hardware). Root causes were (a) no domain randomization, so
the policy overfit one deterministic sim instance, and (b) reward design
flaws that produced degenerate policies.

Domain randomization (runner-level, backend-agnostic):
- BaseRunner: domain_rand config; per-env action-delay buffer (latency),
  Gaussian qpos/qvel sensor noise, and per-env dynamics-scale sampling
  (friction/damping/torque) that is resampled per episode. Sensor noise is
  applied per step.
- privileged_obs/privileged_dim expose normalized DR factors (mu) for RMA.
- step() now uses clean state for reward/termination, noisy state for the
  observation the policy sees.
- MuJoCoRunner: applies per-env friction/damping/torque scales.
- robot.py: compute_motor_force gains friction/damping scale args.
- Configs: DR blocks for mujoco (full) and mjx (delay+noise); clean
  defaults for mujoco_single/serial; noise/delay anchored to recordings.

Reward fixes (rotary_cartpole):
- Shift upright reward to [0,1] (was [-1,1]) + alive_bonus, so surviving
  always beats ending early (kills the "suicide into the limit" policy).
- Add balance_bonus * upright * stillness so reward requires upright AND
  near-zero pendulum velocity (kills the "spin in full loops" policy).

Deploy:
- eval.py load_policy reconstructs the history/adaptation encoder
  (auto-detects its dim from the checkpoint) so DR+embedding policies load.

Fixes:
- MuJoCoRunner._sim_reset referenced self._env (typo) -> self.env, which
  was breaking every rotary-cartpole reset.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-09 20:48:25 +02:00
parent 8cc84d6a21
commit b37cd26690
22 changed files with 1219 additions and 781 deletions

View File

@@ -1,94 +0,0 @@
"""Unit tests for MuJoCoRunner domain randomization and history buffer."""
import dataclasses
import numpy as np
import pytest
import torch
from gymnasium import spaces
from src.runners.mujoco import DomainRandConfig, MuJoCoRunnerConfig
from src.models.mlp import SharedMLP, HistoryEncoder
class TestDomainRandConfig:
    """Checks on ``DomainRandConfig`` defaults and keyword construction."""

    def test_default_all_zero(self) -> None:
        """A config built without arguments reports 0.0 for each checked fraction."""
        default_cfg = DomainRandConfig()
        for field_name in ("mass_frac", "friction_frac", "gear_frac"):
            assert getattr(default_cfg, field_name) == 0.0

    def test_from_dict(self) -> None:
        """Keyword-expanding a dict sets the given fields; others stay at 0.0."""
        overrides = {"mass_frac": 0.15, "gear_frac": 0.1}
        built = DomainRandConfig(**overrides)
        assert (built.mass_frac, built.gear_frac) == (0.15, 0.1)
        # damping_frac was not part of the overrides
        assert built.damping_frac == 0.0
class TestMuJoCoRunnerConfig:
    """Checks on ``MuJoCoRunnerConfig`` defaults and its ``domain_rand`` field."""

    def test_default_dr_disabled(self) -> None:
        """The default config holds a ``DomainRandConfig`` whose mass_frac is 0.0."""
        dr = MuJoCoRunnerConfig().domain_rand
        assert isinstance(dr, DomainRandConfig)
        assert dr.mass_frac == 0.0

    def test_domain_rand_from_dict(self) -> None:
        """A plain dict for ``domain_rand`` must end up as a ``DomainRandConfig``.

        Hydra passes nested configs as dicts, so this exercises the
        conversion done in ``__post_init__``.
        """
        raw = {"mass_frac": 0.2, "friction_frac": 0.3}
        runner_cfg = MuJoCoRunnerConfig(domain_rand=raw)  # type: ignore[arg-type]
        dr = runner_cfg.domain_rand
        assert isinstance(dr, DomainRandConfig)
        assert (dr.mass_frac, dr.friction_frac) == (0.2, 0.3)

    def test_history_length_default(self) -> None:
        """``history_length`` is 0 on a default-constructed config."""
        assert MuJoCoRunnerConfig().history_length == 0
class TestHistoryEncoder:
    """Output-shape checks for ``HistoryEncoder``."""

    def test_output_shape(self) -> None:
        """A (4, 10, 7) input yields a (4, 32) output with embedding_dim=32."""
        batch, steps, feat, emb = 4, 10, 7, 32
        encoder = HistoryEncoder(
            history_length=steps, step_dim=feat, embedding_dim=emb
        )
        encoded = encoder(torch.randn(batch, steps, feat))
        assert encoded.shape == (batch, emb)

    def test_different_embedding_dim(self) -> None:
        """A (2, 5, 7) input yields a (2, 16) output with embedding_dim=16."""
        batch, steps, feat, emb = 2, 5, 7, 16
        encoder = HistoryEncoder(
            history_length=steps, step_dim=feat, embedding_dim=emb
        )
        encoded = encoder(torch.randn(batch, steps, feat))
        assert encoded.shape == (batch, emb)
class TestSharedMLPWithHistory:
    """Shape checks for ``SharedMLP`` with and without a history encoder."""

    def test_no_history(self) -> None:
        """Without history arguments, ``history_encoder`` is None and the policy runs."""
        device = torch.device("cpu")
        observation_space = spaces.Box(low=-1.0, high=1.0, shape=(6,))
        action_space = spaces.Box(low=-1.0, high=1.0, shape=(1,))
        net = SharedMLP(
            observation_space, action_space, device, hidden_sizes=(32, 32)
        )
        assert net.history_encoder is None

        batch = {"states": torch.randn(4, 6)}
        mean, _log_std, _ = net.compute(batch, role="policy")
        assert mean.shape == (4, 1)

    def test_with_history(self) -> None:
        """With history arguments, an encoder exists and both roles give expected shapes."""
        obs_width = 6
        action_width = 1
        history_len = 10
        per_step = obs_width + action_width  # 7
        augmented = obs_width + history_len * per_step  # 6 + 70 = 76

        device = torch.device("cpu")
        observation_space = spaces.Box(low=-1.0, high=1.0, shape=(augmented,))
        action_space = spaces.Box(low=-1.0, high=1.0, shape=(action_width,))
        net = SharedMLP(
            observation_space,
            action_space,
            device,
            hidden_sizes=(32, 32),
            history_length=history_len,
            raw_obs_dim=obs_width,
            embedding_dim=32,
        )
        assert net.history_encoder is not None

        batch = {"states": torch.randn(4, augmented)}
        mean, _log_std, _ = net.compute(batch, role="policy")
        assert mean.shape == (4, action_width)

        value, _ = net.compute(batch, role="value")
        assert value.shape == (4, 1)