♻️ full agent refactor

2026-06-10 21:15:34 +02:00
parent a98e86ef66
commit 1e0836e1bc
49 changed files with 1309 additions and 829 deletions
--- a/configs/training/ppo.yaml
+++ b/configs/training/ppo.yaml
@@ -1,14 +1,18 @@
+# PPO defaults — sized for the CPU MuJoCo runner (64 parallel envs).
+# 128 rollout steps × 64 envs = 8192 samples collected per policy update.
+
 hidden_sizes: [256, 256]
-total_timesteps: 5000000
-rollout_steps: 2048
-learning_epochs: 10
-mini_batches: 8
+total_timesteps: 500000          # per-env steps; × 64 envs = 32M total env steps
+rollout_steps: 128
+learning_epochs: 5
+mini_batches: 4
 discount_factor: 0.99
 gae_lambda: 0.95
 learning_rate: 0.0003
 clip_ratio: 0.2
 value_loss_scale: 0.5
 entropy_loss_scale: 0.01
+kl_threshold: 0.01               # KL-adaptive LR; 0 = fixed learning rate
 log_interval: 1000
 checkpoint_interval: 50000

@@ -18,13 +22,9 @@ max_log_std: 2.0

 record_video_every: 10000

-# RMA-style history encoder
-history_length: 10       # temporal window (must match runner)
-embedding_dim: 32        # history encoder output dimension
-
-# RMA (Rapid Motor Adaptation)
-rma_mode: "none"         # "none" | "teacher" | "deploy"
-latent_dim: 8            # env encoder / adaptation latent dimension
+# Output dimension of the history encoder. The history window length is not
+# set here: it is read from runner.history_length (single source of truth).
+embedding_dim: 32

 # ClearML remote execution (GPU worker)
 remote: false