feat: RMA-style history-conditioned policy for sim2real adaptation

Added a temporal observation history buffer and a 1D-CNN encoder so the policy can implicitly infer environment parameters (mass, friction, gear ratios, etc.) from recent (obs, action) dynamics.

Architecture:
  history window [(obs₀,a₀), ..., (obs_{H-1},a_{H-1})]
    → 1D-CNN HistoryEncoder → embedding (32-dim)
    → concat [current_obs, embedding] → MLP → action

Components:
- BaseRunner: history ring buffer, _push_history/_reset_history/_augment_obs, augmented obs space (obs_dim + H × (obs_dim + act_dim); 6 + 10 × 7 = 76 with obs_dim=6, act_dim=1, H=10)
- HistoryEncoder (src/models/mlp.py): 2-layer temporal Conv1d + global average pooling (GAP) + linear projection to the embedding
- SharedMLP: optional history_length/raw_obs_dim/embedding_dim params; splits the augmented obs, encodes the history, feeds [obs, emb] to the MLP
- TrainerConfig: history_length, embedding_dim fields
- All runner YAML configs and configs/training/ppo.yaml: history_length=10 (the dataclass defaults stay at 0, i.e. history disabled)
- Tests: encoder output shape, model with/without history, config default
2026-03-28 18:58:24 +01:00
parent 8ed9afe583
commit 8cc84d6a21
9 changed files with 209 additions and 9 deletions
--- a/configs/runner/mjx.yaml
+++ b/configs/runner/mjx.yaml
@@ -2,3 +2,4 @@ num_envs: 1024       # MJX shines with many parallel envs
 device: auto         # auto = cuda if available, else cpu
 dt: 0.002
 substeps: 10
+history_length: 10   # RMA-style: 10-step window of (obs, action) pairs
--- a/configs/runner/mujoco.yaml
+++ b/configs/runner/mujoco.yaml
@@ -2,6 +2,7 @@ num_envs: 64
 device: auto  # auto = cuda if available, else cpu
 dt: 0.002
 substeps: 10
+history_length: 10  # RMA-style: 10-step window of (obs, action) pairs

 # ── Sim2real: domain randomization ───────────────────────────────
 domain_rand:
--- a/configs/runner/mujoco_single.yaml
+++ b/configs/runner/mujoco_single.yaml
@@ -5,3 +5,4 @@ num_envs: 1
 device: cpu
 dt: 0.002
 substeps: 10
+history_length: 10
--- a/configs/runner/serial.yaml
+++ b/configs/runner/serial.yaml
@@ -8,3 +8,4 @@ port: /dev/cu.usbserial-0001
 baud: 115200
 dt: 0.02                    # control loop period (50 Hz, matches training)
 no_data_timeout: 2.0        # seconds of silence before declaring disconnect
+history_length: 10           # must match training runner
--- a/configs/training/ppo.yaml
+++ b/configs/training/ppo.yaml
@@ -18,6 +18,10 @@ max_log_std: 2.0

 record_video_every: 10000

+# RMA-style history encoder
+history_length: 10       # temporal window (must match runner)
+embedding_dim: 32        # history encoder output dimension
+
 # ClearML remote execution (GPU worker)
 remote: false

--- a/src/core/runner.py
+++ b/src/core/runner.py
@@ -14,6 +14,7 @@ T = TypeVar("T")
 class BaseRunnerConfig:
    num_envs: int = 1
    device: str = "cpu"
+    history_length: int = 0  # 0 = no history (single obs); >0 = number of past (obs, action) steps appended to each obs

 class BaseRunner(abc.ABC, Generic[T]):
    def __init__(self, env: BaseEnv, config: T) -> None:
@@ -36,6 +37,26 @@ class BaseRunner(abc.ABC, Generic[T]):
            self.config.num_envs, dtype=torch.long, device=self.config.device
        )

+        # ── History buffer (RMA-style adaptation) ────────────────
+        self._history_len: int = getattr(self.config, "history_length", 0)
+        if self._history_len > 0:
+            obs_dim = self.observation_space.shape[0]
+            act_dim = self.action_space.shape[0]
+            self._history_obs_dim = obs_dim
+            self._history_act_dim = act_dim
+            self._history_step_dim = obs_dim + act_dim  # each step stores (obs, action)
+            # Rolling window, oldest step first, newest last: (num_envs, history_length, obs_dim + act_dim)
+            self._history_buf = torch.zeros(
+                self.config.num_envs, self._history_len, self._history_step_dim,
+                device=self.config.device,
+            )
+            # Augmented observation space: [current_obs, history_flat]
+            from gymnasium import spaces
+            aug_dim = obs_dim + self._history_len * self._history_step_dim
+            self.observation_space = spaces.Box(
+                low=-torch.inf, high=torch.inf, shape=(aug_dim,)
+            )
+
    @property
    @abc.abstractmethod
    def num_envs(self) -> int:
@@ -63,14 +84,44 @@ class BaseRunner(abc.ABC, Generic[T]):
        if hasattr(self, "_offscreen_renderer") and self._offscreen_renderer is not None:
            self._offscreen_renderer.close()

+    def _augment_obs(self, obs: torch.Tensor) -> torch.Tensor:
+        """Concatenate history buffer to current obs if history is enabled."""
+        if self._history_len <= 0:
+            return obs
+        # Flatten history: (num_envs, H, step_dim) → (num_envs, H * step_dim)
+        hist_flat = self._history_buf.reshape(obs.shape[0], -1)
+        return torch.cat([obs, hist_flat], dim=-1)
+
+    def _push_history(self, obs: torch.Tensor, actions: torch.Tensor,
+                      env_ids: torch.Tensor | None = None) -> None:
+        """Push full-batch (obs, action) into the history (shift left, append right); env_ids limits which rows update."""
+        if self._history_len <= 0:
+            return
+        step = torch.cat([obs, actions.reshape(obs.shape[0], -1)], dim=-1)
+        if env_ids is None:
+            # All envs.
+            self._history_buf = torch.roll(self._history_buf, -1, dims=1)
+            self._history_buf[:, -1] = step
+        else:
+            self._history_buf[env_ids] = torch.roll(
+                self._history_buf[env_ids], -1, dims=1
+            )
+            self._history_buf[env_ids, -1] = step[env_ids]
+
+    def _reset_history(self, env_ids: torch.Tensor) -> None:
+        """Zero the history buffer for reset envs."""
+        if self._history_len > 0:
+            self._history_buf[env_ids] = 0.0
+
    def reset(self) -> tuple[torch.Tensor, dict[str, Any]]:
        all_ids = torch.arange(self.num_envs, device=self.device)
        qpos, qvel = self._sim_reset(all_ids)
        self.step_counts.zero_()
+        self._reset_history(all_ids)

        state = self.env.build_state(qpos, qvel)
        obs = self.env.compute_observations(state)
-        return obs, {}
+        return self._augment_obs(obs), {}
    
    def step(self, actions: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, dict[str, Any]]:
        self._last_actions = actions
@@ -83,23 +134,27 @@ class BaseRunner(abc.ABC, Generic[T]):
        terminated = self.env.compute_terminations(state)
        truncated = self.env.compute_truncations(self.step_counts)

+        # Push (obs, action) first so the augmented obs built below already includes this step.
+        self._push_history(obs, actions)
+
        info: dict[str, Any] = {}

        done = terminated | truncated
        done_ids = done.nonzero(as_tuple=False).squeeze(-1)

        if done_ids.numel() > 0:
-            info["final_observations"] = obs[done_ids].clone()
+            info["final_observations"] = self._augment_obs(obs)[done_ids].clone()
            info["final_env_ids"] = done_ids.clone()

            reset_qpos, reset_qvel = self._sim_reset(done_ids)
            self.step_counts[done_ids] = 0
+            self._reset_history(done_ids)

            reset_state = self.env.build_state(reset_qpos, reset_qvel)
            obs[done_ids] = self.env.compute_observations(reset_state)

        # skrl expects (num_envs, 1) for rewards/terminated/truncated
-        return obs, rewards.unsqueeze(-1), terminated.unsqueeze(-1), truncated.unsqueeze(-1), info
+        return self._augment_obs(obs), rewards.unsqueeze(-1), terminated.unsqueeze(-1), truncated.unsqueeze(-1), info
        
    def _render_frame(self, env_idx: int = 0) -> np.ndarray:
        """Return a raw RGB frame. Override in subclass."""
--- a/src/models/mlp.py
+++ b/src/models/mlp.py
@@ -3,14 +3,72 @@ import torch.nn as nn
 from gymnasium import spaces
 from skrl.models.torch import Model, GaussianMixin, DeterministicMixin

+
+class HistoryEncoder(nn.Module):
+    """1D-CNN encoder over a temporal window of (obs, action) pairs.
+
+    Input:  (batch, history_length, step_dim)
+    Output: (batch, embedding_dim)
+
+    Architecture: two temporal conv layers → global average pool → linear.
+    This lets the policy implicitly infer environment parameters (mass,
+    friction, gear, etc.) from recent dynamics — the core of RMA-style
+    adaptation for sim2real.
+    """
+
+    def __init__(
+        self,
+        history_length: int,
+        step_dim: int,
+        embedding_dim: int = 32,
+        hidden_channels: int = 32,
+    ) -> None:
+        super().__init__()
+        self.conv = nn.Sequential(
+            # (batch, step_dim, history_length) after transpose
+            nn.Conv1d(step_dim, hidden_channels, kernel_size=3, padding=1),
+            nn.ELU(),
+            nn.Conv1d(hidden_channels, hidden_channels, kernel_size=3, padding=1),
+            nn.ELU(),
+        )
+        self.fc = nn.Linear(hidden_channels, embedding_dim)
+
+    def forward(self, history: torch.Tensor) -> torch.Tensor:
+        """history: (batch, history_length, step_dim)."""
+        # Conv1d expects (batch, channels, seq_len).
+        x = history.transpose(1, 2)
+        x = self.conv(x)
+        # Global average pool over time.
+        x = x.mean(dim=-1)
+        return self.fc(x)
+
+
 class SharedMLP(GaussianMixin, DeterministicMixin, Model):
-    def __init__(self, observation_space: spaces.Space, action_space: spaces.Space, device: torch.device, hidden_sizes: tuple[int, ...] = (32, 32), clip_actions: bool = False, clip_log_std: bool = True, min_log_std: float = -2.0, max_log_std: float = 2.0, initial_log_std: float = 0.0):
+    def __init__(self, observation_space: spaces.Space, action_space: spaces.Space, device: torch.device, hidden_sizes: tuple[int, ...] = (32, 32), clip_actions: bool = False, clip_log_std: bool = True, min_log_std: float = -2.0, max_log_std: float = 2.0, initial_log_std: float = 0.0, history_length: int = 0, raw_obs_dim: int = 0, embedding_dim: int = 32):
        Model.__init__(self, observation_space, action_space, device)
        GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std)
        DeterministicMixin.__init__(self, clip_actions)

-        layers = []
-        in_dim: int = self.num_observations
+        self._history_length = history_length
+        self._raw_obs_dim = raw_obs_dim
+        self._embedding_dim = embedding_dim
+
+        if history_length > 0 and raw_obs_dim > 0:
+            # The observation is [current_obs(raw_obs_dim), history_flat(H * step_dim)].
+            act_dim = self.num_actions
+            step_dim = raw_obs_dim + act_dim
+            self.history_encoder = HistoryEncoder(
+                history_length=history_length,
+                step_dim=step_dim,
+                embedding_dim=embedding_dim,
+            )
+            # MLP input = raw obs + history embedding.
+            in_dim = raw_obs_dim + embedding_dim
+        else:
+            self.history_encoder = None
+            in_dim = self.num_observations
+
+        layers: list[nn.Module] = []
        for hidden_size in hidden_sizes:
            layers.append(nn.Linear(in_dim, hidden_size))
            layers.append(nn.ELU())
@@ -32,17 +90,29 @@ class SharedMLP(GaussianMixin, DeterministicMixin, Model):
        elif role == "value":
            return DeterministicMixin.act(self, inputs, role)
       
+    def _encode(self, states: torch.Tensor) -> torch.Tensor:
+        """Split augmented obs into current obs + history, encode, concat."""
+        if self.history_encoder is None:
+            return self.net(states)
+
+        obs = states[:, :self._raw_obs_dim]
+        hist_flat = states[:, self._raw_obs_dim:]
+        step_dim = self._raw_obs_dim + self.num_actions
+        history = hist_flat.reshape(-1, self._history_length, step_dim)
+        embedding = self.history_encoder(history)
+        return self.net(torch.cat([obs, embedding], dim=-1))
+
    def compute(
        self, inputs: dict[str, torch.Tensor], role: str = ""
   ) -> tuple[torch.Tensor, ...]:
        if role == "policy":
-            self._shared_output = self.net(inputs["states"])
+            self._shared_output = self._encode(inputs["states"])
            return self.mean_layer(self._shared_output), self.log_std_parameter, {}
        elif role == "value":
            shared_output = (
                self._shared_output
                if self._shared_output is not None
-                else self.net(inputs["states"])
+                else self._encode(inputs["states"])
            )
            self._shared_output = None
            return self.value_layer(shared_output), {}
--- a/src/training/trainer.py
+++ b/src/training/trainer.py
@@ -48,6 +48,10 @@ class TrainerConfig:
    record_video_every: int = 10_000   # 0 = disabled
    record_video_fps: int = 0          # 0 = derive from sim dt×substeps

+    # History encoder (RMA-style adaptation)
+    history_length: int = 0     # 0 = disabled, >0 = temporal window size
+    embedding_dim: int = 32     # history encoder output dimension
+

 # ── Video-recording trainer ──────────────────────────────────────────

@@ -173,6 +177,11 @@ class Trainer:
            device=device,
        )

+        # Determine raw obs dim (without history augmentation).
+        raw_obs_dim = 0
+        if self.config.history_length > 0:
+            raw_obs_dim = self.runner.env.observation_space.shape[0]
+
        self.model = SharedMLP(
            observation_space=obs_space,
            action_space=act_space,
@@ -181,6 +190,9 @@ class Trainer:
            initial_log_std=self.config.initial_log_std,
            min_log_std=self.config.min_log_std,
            max_log_std=self.config.max_log_std,
+            history_length=self.config.history_length,
+            raw_obs_dim=raw_obs_dim,
+            embedding_dim=self.config.embedding_dim,
        )

        models = {"policy": self.model, "value": self.model}
--- a/tests/test_sim2real.py
+++ b/tests/test_sim2real.py
@@ -1,11 +1,14 @@
-"""Unit tests for MuJoCoRunner domain randomization."""
+"""Unit tests for MuJoCoRunner domain randomization and the history encoder/model."""

 import dataclasses

 import numpy as np
 import pytest
+import torch
+from gymnasium import spaces

 from src.runners.mujoco import DomainRandConfig, MuJoCoRunnerConfig
+from src.models.mlp import SharedMLP, HistoryEncoder


 class TestDomainRandConfig:
@@ -37,3 +40,55 @@ class TestMuJoCoRunnerConfig:
        assert isinstance(cfg.domain_rand, DomainRandConfig)
        assert cfg.domain_rand.mass_frac == 0.2
        assert cfg.domain_rand.friction_frac == 0.3
+
+    def test_history_length_default(self) -> None:
+        cfg = MuJoCoRunnerConfig()
+        assert cfg.history_length == 0
+
+
+class TestHistoryEncoder:
+    def test_output_shape(self) -> None:
+        enc = HistoryEncoder(history_length=10, step_dim=7, embedding_dim=32)
+        x = torch.randn(4, 10, 7)  # batch=4, H=10, step_dim=7
+        out = enc(x)
+        assert out.shape == (4, 32)
+
+    def test_different_embedding_dim(self) -> None:
+        enc = HistoryEncoder(history_length=5, step_dim=7, embedding_dim=16)
+        x = torch.randn(2, 5, 7)
+        out = enc(x)
+        assert out.shape == (2, 16)
+
+
+class TestSharedMLPWithHistory:
+    def test_no_history(self) -> None:
+        """Without history, model works as before."""
+        obs_space = spaces.Box(low=-1.0, high=1.0, shape=(6,))
+        act_space = spaces.Box(low=-1.0, high=1.0, shape=(1,))
+        model = SharedMLP(obs_space, act_space, torch.device("cpu"),
+                          hidden_sizes=(32, 32))
+        assert model.history_encoder is None
+        inp = {"states": torch.randn(4, 6)}
+        mean, log_std, _ = model.compute(inp, role="policy")
+        assert mean.shape == (4, 1)
+
+    def test_with_history(self) -> None:
+        """With history, model splits obs and encodes history."""
+        raw_obs_dim = 6
+        act_dim = 1
+        H = 10
+        step_dim = raw_obs_dim + act_dim  # 7
+        aug_dim = raw_obs_dim + H * step_dim  # 6 + 70 = 76
+
+        obs_space = spaces.Box(low=-1.0, high=1.0, shape=(aug_dim,))
+        act_space = spaces.Box(low=-1.0, high=1.0, shape=(act_dim,))
+        model = SharedMLP(obs_space, act_space, torch.device("cpu"),
+                          hidden_sizes=(32, 32),
+                          history_length=H, raw_obs_dim=raw_obs_dim,
+                          embedding_dim=32)
+        assert model.history_encoder is not None
+        inp = {"states": torch.randn(4, aug_dim)}
+        mean, log_std, _ = model.compute(inp, role="policy")
+        assert mean.shape == (4, act_dim)
+        value, _ = model.compute(inp, role="value")
+        assert value.shape == (4, 1)