feat: RMA-style history-conditioned policy for sim2real adaptation

Added a temporal observation history buffer and a 1D-CNN encoder so the policy can implicitly infer environment parameters (mass, friction, gear ratios, etc.) from recent (obs, action) dynamics.

Architecture:
  history window [(obs_0, a_0), ..., (obs_{H-1}, a_{H-1})]
    → 1D-CNN HistoryEncoder
    → embedding (32-dim)
    → concat [current_obs, embedding]
    → MLP
    → action

Components:
- BaseRunner: history ring buffer, _push_history/_reset_history, augmented obs space (6 + H×7 = 76 with H=10)
- HistoryEncoder (src/models/mlp.py): 2-layer temporal Conv1d + GAP (global average pooling)
- SharedMLP: optional history_length/raw_obs_dim/embedding_dim params; splits the augmented obs, encodes the history, and feeds [obs, emb] to the MLP
- TrainerConfig: history_length and embedding_dim fields
- All runner configs: history_length=10 by default
- Tests: encoder shape, model with/without history, config defaults
2026-03-28 18:58:24 +01:00
parent 8ed9afe583
commit 8cc84d6a21
9 changed files with 209 additions and 9 deletions
--- a/configs/runner/mjx.yaml
+++ b/configs/runner/mjx.yaml
@@ -2,3 +2,4 @@ num_envs: 1024       # MJX shines with many parallel envs
 device: auto         # auto = cuda if available, else cpu
 dt: 0.002
 substeps: 10
+history_length: 10   # RMA-style: 10-step window of (obs, action) pairs
--- a/configs/runner/mujoco.yaml
+++ b/configs/runner/mujoco.yaml
@@ -2,6 +2,7 @@ num_envs: 64
 device: auto  # auto = cuda if available, else cpu
 dt: 0.002
 substeps: 10
+history_length: 10  # RMA-style: 10-step window of (obs, action) pairs

 # ── Sim2real: domain randomization ───────────────────────────────
 domain_rand:
--- a/configs/runner/mujoco_single.yaml
+++ b/configs/runner/mujoco_single.yaml
@@ -5,3 +5,4 @@ num_envs: 1
 device: cpu
 dt: 0.002
 substeps: 10
+history_length: 10
--- a/configs/runner/serial.yaml
+++ b/configs/runner/serial.yaml
@@ -8,3 +8,4 @@ port: /dev/cu.usbserial-0001
 baud: 115200
 dt: 0.02                    # control loop period (50 Hz, matches training)
 no_data_timeout: 2.0        # seconds of silence before declaring disconnect
+history_length: 10           # must match training runner
--- a/configs/training/ppo.yaml
+++ b/configs/training/ppo.yaml
@@ -18,6 +18,10 @@ max_log_std: 2.0

 record_video_every: 10000

+# RMA-style history encoder
+history_length: 10       # temporal window (must match runner)
+embedding_dim: 32        # history encoder output dimension
+
 # ClearML remote execution (GPU worker)
 remote: false