♻️ full agent refactor

2026-06-10 21:15:34 +02:00
parent a98e86ef66
commit 1e0836e1bc
49 changed files with 1309 additions and 829 deletions
--- a/configs/config.yaml
+++ b/configs/config.yaml
@@ -1,5 +1,5 @@
 defaults:
-  - env: cartpole
+  - env: rotary_cartpole
  - runner: mujoco
  - training: ppo
-  - _self_
+  - _self_
--- a/configs/env/cartpole.yaml
+++ b/configs/env/cartpole.yaml
@@ -1,7 +0,0 @@
-max_steps: 500
-robot_path: assets/cartpole
-angle_threshold: 0.418
-cart_limit: 2.4
-reward_alive: 1.0
-reward_pole_upright_scale: 1.0
-reward_action_penalty_scale: 0.01
--- a/configs/env/rotary_cartpole.yaml
+++ b/configs/env/rotary_cartpole.yaml
@@ -9,6 +9,7 @@ balance_vel_scale: 0.5           # how fast the balance bonus decays with pendul
 motor_vel_penalty: 0.01          # penalise high motor angular velocity
 motor_angle_penalty: 0.05        # penalise deviation from centre
 action_penalty: 0.05             # penalise large actions (energy cost)
+action_rate_penalty: 0.01        # penalise action changes (real-motor smoothness)

 # ── Initial state randomisation ──────────────────────────────────────
 pendulum_init_range_deg: 180.0   # pendulum starts in [-180°, +180°]
@@ -22,5 +23,6 @@ hpo:
  motor_vel_penalty: {min: 0.001, max: 0.1}
  motor_angle_penalty: {min: 0.01, max: 0.2}
  action_penalty: {min: 0.01, max: 0.2}
+  action_rate_penalty: {min: 0.001, max: 0.1}
  pendulum_init_range_deg: {min: 30.0, max: 180.0}
  max_steps: {values: [500, 1000, 2000]}
--- a/configs/runner/mjx.yaml
+++ b/configs/runner/mjx.yaml
@@ -2,9 +2,7 @@ num_envs: 1024       # MJX shines with many parallel envs
 device: auto         # auto = cuda if available, else cpu
 dt: 0.002
 substeps: 10
-history_length: 10   # RMA-style: 10-step window of (obs, action) pairs
-
-rma_mode: "none"    # "none" | "teacher" | "deploy"
+history_length: 10   # (obs, action) window for implicit adaptation

 # ── Domain randomization (sim-to-real) ──────────────────────────────
 # Full DR on GPU: latency + sensor noise + per-env dynamics scales
--- a/configs/runner/mujoco.yaml
+++ b/configs/runner/mujoco.yaml
@@ -2,9 +2,7 @@ num_envs: 64
 device: auto  # auto = cuda if available, else cpu
 dt: 0.002
 substeps: 10
-history_length: 10   # must match training.history_length (DR + embedding)
-
-rma_mode: "none"     # "none" | "teacher" | "deploy"
+history_length: 10   # (obs, action) window for implicit adaptation

 # ── Domain randomization (sim-to-real) ──────────────────────────────
 # Noise/delay levels anchored to the real recordings (~50 Hz, ~0.5 rad/s
--- a/configs/runner/mujoco_single.yaml
+++ b/configs/runner/mujoco_single.yaml
@@ -7,8 +7,6 @@ dt: 0.002
 substeps: 10
 history_length: 10

-rma_mode: "none"    # "none" | "teacher" | "deploy"
-
 # Clean by default (deterministic eval).  Confirming-experiment example —
 # re-eval an existing checkpoint in sim with a fixed 1-step action delay:
 #   mjpython scripts/eval.py env=rotary_cartpole runner=mujoco_single \
--- a/configs/runner/serial.yaml
+++ b/configs/runner/serial.yaml
@@ -9,5 +9,3 @@ baud: 115200
 dt: 0.02                    # control loop period (50 Hz, matches training)
 no_data_timeout: 2.0        # seconds of silence before declaring disconnect
 history_length: 10           # must match training runner
-
-rma_mode: "none"    # "none" | "teacher" | "deploy"
--- a/configs/training/ppo.yaml
+++ b/configs/training/ppo.yaml
@@ -1,14 +1,18 @@
+# PPO defaults — sized for the CPU MuJoCo runner (64 parallel envs).
+# 128 rollout steps × 64 envs ≈ 8K samples per update.
+
 hidden_sizes: [256, 256]
-total_timesteps: 5000000
-rollout_steps: 2048
-learning_epochs: 10
-mini_batches: 8
+total_timesteps: 500000          # × 64 envs = 32M env steps
+rollout_steps: 128
+learning_epochs: 5
+mini_batches: 4
 discount_factor: 0.99
 gae_lambda: 0.95
 learning_rate: 0.0003
 clip_ratio: 0.2
 value_loss_scale: 0.5
 entropy_loss_scale: 0.01
+kl_threshold: 0.01               # KL-adaptive LR; 0 = fixed learning rate
 log_interval: 1000
 checkpoint_interval: 50000

@@ -18,13 +22,9 @@ max_log_std: 2.0

 record_video_every: 10000

-# RMA-style history encoder
-history_length: 10       # temporal window (must match runner)
-embedding_dim: 32        # history encoder output dimension
-
-# RMA (Rapid Motor Adaptation)
-rma_mode: "none"         # "none" | "teacher" | "deploy"
-latent_dim: 8            # env encoder / adaptation latent dimension
+# History encoder output dim — the window size itself comes from
+# runner.history_length (single source of truth).
+embedding_dim: 32

 # ClearML remote execution (GPU worker)
 remote: false
--- a/configs/training/ppo_mjx.yaml
+++ b/configs/training/ppo_mjx.yaml
@@ -1,15 +1,20 @@
-# PPO tuned for MJX (1024+ parallel envs on GPU).
+# PPO sized for MJX (1024+ parallel envs on GPU).
 # Inherits defaults + HPO ranges from ppo.yaml.
-# With 1024 envs, each timestep collects 1024 samples, so total_timesteps
-# can be much lower than the CPU config.
+#
+# Short rollouts × many envs is the GPU-PPO sweet spot:
+# 24 steps × 1024 envs ≈ 25K samples per update (~6K per mini-batch).
+# (The old rollout_steps=2048 inherited from the CPU config meant a
+#  2048 × 1024 ≈ 2M-sample memory per update — GBs of VRAM and glacial updates.)

 defaults:
  - ppo
  - _self_

-total_timesteps: 300000         # 300K × 1024 envs ≈ 307M env steps
-mini_batches: 32                # keep mini-batch size similar (~32K)
-learning_rate: 0.001            # ~3x higher LR for 16x larger batch (sqrt scaling)
+rollout_steps: 24
+mini_batches: 4
+learning_epochs: 5
+learning_rate: 0.0003           # initial LR; KL-adaptive scheduler (kl_threshold) adjusts it
+total_timesteps: 100000         # × 1024 envs ≈ 100M env steps
 log_interval: 100
 checkpoint_interval: 10000