♻️ full agent refactor

2026-06-10 21:15:34 +02:00
parent a98e86ef66
commit 1e0836e1bc
49 changed files with 1309 additions and 829 deletions
--- a/configs/training/ppo_mjx.yaml
+++ b/configs/training/ppo_mjx.yaml
@@ -1,15 +1,20 @@
-# PPO tuned for MJX (1024+ parallel envs on GPU).
+# PPO sized for MJX (1024+ parallel envs on GPU).
 # Inherits defaults + HPO ranges from ppo.yaml.
-# With 1024 envs, each timestep collects 1024 samples, so total_timesteps
-# can be much lower than the CPU config.
+#
+# Short rollouts × many envs is the GPU-PPO sweet spot:
+# 24 steps × 1024 envs ≈ 25K samples per update (~6K per mini-batch).
+# (The old rollout_steps=2048 inherited from the CPU config meant a 2M-sample
+#  memory per update (2048 steps × 1024 envs) — GBs of VRAM and glacial updates.)

 defaults:
  - ppo
  - _self_

-total_timesteps: 300000         # 300K × 1024 envs ≈ 307M env steps
-mini_batches: 32                # keep mini-batch size similar (~32K)
-learning_rate: 0.001            # ~3x higher LR for 16x larger batch (sqrt scaling)
+rollout_steps: 24
+mini_batches: 4
+learning_epochs: 5
+learning_rate: 0.0003           # KL-adaptive scheduler handles the rest
+total_timesteps: 100000         # 100K × 1024 envs ≈ 102M env steps
 log_interval: 100
 checkpoint_interval: 10000