---
# PPO tuned for MJX (1024+ parallel envs on GPU).
# With 1024 envs, each timestep collects 1024 samples, so total_timesteps
# can be much lower than the CPU config.
hidden_sizes: [128, 128]
total_timesteps: 300000  # 300K × 1024 envs ≈ 307M env steps
rollout_steps: 1024  # PPO batch = 1024 envs × 1024 steps = 1M samples
learning_epochs: 4
mini_batches: 32  # keep mini-batch size similar to CPU config (~32K)
discount_factor: 0.99
gae_lambda: 0.95
learning_rate: 0.001  # ~3x higher LR for 16x larger batch (sqrt scaling)
clip_ratio: 0.2
value_loss_scale: 0.5
entropy_loss_scale: 0.05
log_interval: 100  # log more often (shorter run)
checkpoint_interval: 10000
record_video_every: 10000

# ClearML remote execution (GPU worker)
remote: false