feat: RMA-style history-conditioned policy for sim2real adaptation

Added a temporal observation history buffer and 1D-CNN encoder so the policy can implicitly infer environment parameters (mass, friction, gear ratios, etc.) from recent (obs, action) dynamics.

Architecture: history window [(obs₀,a₀), ..., (obs_{H-1},a_{H-1})] → 1D-CNN HistoryEncoder → embedding (32-dim) → concat [current_obs, embedding] → MLP → action

Components:
- BaseRunner: history ring buffer, _push_history/_reset_history, augmented obs space (6 + H×7 = 76 with H=10)
- HistoryEncoder (src/models/mlp.py): 2-layer temporal Conv1d + GAP
- SharedMLP: optional history_length/raw_obs_dim/embedding_dim params; splits augmented obs, encodes history, feeds [obs, emb] to MLP
- TrainerConfig: history_length, embedding_dim fields
- All runner configs: history_length=10 by default
- Tests: encoder shape, model with/without history, config defaults
2026-03-28 18:58:24 +01:00
parent 8ed9afe583
commit 8cc84d6a21
9 changed files with 209 additions and 9 deletions
--- a/tests/test_sim2real.py
+++ b/tests/test_sim2real.py
@@ -1,11 +1,14 @@
-"""Unit tests for MuJoCoRunner domain randomization."""
+"""Unit tests for MuJoCoRunner domain randomization and history buffer."""

 import dataclasses

 import numpy as np
 import pytest
+import torch
+from gymnasium import spaces

 from src.runners.mujoco import DomainRandConfig, MuJoCoRunnerConfig
+from src.models.mlp import SharedMLP, HistoryEncoder


 class TestDomainRandConfig:
@@ -37,3 +40,55 @@ class TestMuJoCoRunnerConfig:
        assert isinstance(cfg.domain_rand, DomainRandConfig)
        assert cfg.domain_rand.mass_frac == 0.2
        assert cfg.domain_rand.friction_frac == 0.3
+
+    def test_history_length_default(self) -> None:  # pins the history_length of a config built with no arguments
+        cfg = MuJoCoRunnerConfig()  # no overrides, so only built-in defaults are exercised
+        assert cfg.history_length == 0  # NOTE(review): commit message says runner configs default to history_length=10 — confirm which is intended
+
+
+class TestHistoryEncoder:  # output-shape checks for HistoryEncoder called on a random batch
+    def test_output_shape(self) -> None:  # (4, 10, 7) input is expected to yield a (4, 32) output
+        enc = HistoryEncoder(history_length=10, step_dim=7, embedding_dim=32)
+        x = torch.randn(4, 10, 7)  # batch=4, H=10, step_dim=7
+        out = enc(x)  # encoder is invoked directly on the 3-D tensor
+        assert out.shape == (4, 32)  # (batch, embedding_dim)
+
+    def test_different_embedding_dim(self) -> None:  # same check with a shorter window and a smaller embedding
+        enc = HistoryEncoder(history_length=5, step_dim=7, embedding_dim=16)
+        x = torch.randn(2, 5, 7)  # batch=2, H=5, step_dim=7
+        out = enc(x)  # encoder is invoked directly on the 3-D tensor
+        assert out.shape == (2, 16)  # (batch, embedding_dim)
+
+
+class TestSharedMLPWithHistory:  # SharedMLP output shapes with and without the history kwargs
+    def test_no_history(self) -> None:
+        """Without history, model works as before."""
+        obs_space = spaces.Box(low=-1.0, high=1.0, shape=(6,))  # plain 6-dim observation, nothing appended
+        act_space = spaces.Box(low=-1.0, high=1.0, shape=(1,))  # single action dimension
+        model = SharedMLP(obs_space, act_space, torch.device("cpu"),
+                          hidden_sizes=(32, 32))  # no history_length/raw_obs_dim/embedding_dim passed
+        assert model.history_encoder is None  # encoder attribute must be unset in this mode
+        inp = {"states": torch.randn(4, 6)}  # batch of 4 observations under the "states" key
+        mean, log_std, _ = model.compute(inp, role="policy")  # log_std is unpacked but not asserted on
+        assert mean.shape == (4, 1)  # (batch, action dim)
+
+    def test_with_history(self) -> None:
+        """With history, model splits obs and encodes history."""
+        raw_obs_dim = 6
+        act_dim = 1
+        H = 10  # number of history steps
+        step_dim = raw_obs_dim + act_dim  # 7
+        aug_dim = raw_obs_dim + H * step_dim  # 6 + 70 = 76
+
+        obs_space = spaces.Box(low=-1.0, high=1.0, shape=(aug_dim,))  # observation space uses the augmented width
+        act_space = spaces.Box(low=-1.0, high=1.0, shape=(act_dim,))
+        model = SharedMLP(obs_space, act_space, torch.device("cpu"),
+                          hidden_sizes=(32, 32),
+                          history_length=H, raw_obs_dim=raw_obs_dim,
+                          embedding_dim=32)  # all three history kwargs supplied together
+        assert model.history_encoder is not None  # encoder attribute must be populated in this mode
+        inp = {"states": torch.randn(4, aug_dim)}  # batch of 4 augmented observations
+        mean, log_std, _ = model.compute(inp, role="policy")  # policy role is unpacked as a 3-tuple here
+        assert mean.shape == (4, act_dim)  # (batch, act_dim)
+        value, _ = model.compute(inp, role="value")  # value role is unpacked as a 2-tuple here
+        assert value.shape == (4, 1)  # one value per batch row