# PPO tuned for single-env real-time training on real hardware.
# Inherits defaults + HPO ranges from ppo.yaml.
# ~50 Hz control × 1 env = ~50 timesteps/s.
# 100k timesteps ≈ 33 minutes of wall-clock training.

# Base ppo config is composed first; `_self_` comes last so the values in
# this file override the inherited ones.
defaults:
  - ppo
  - _self_

hidden_sizes: [256, 256]

# At ~50 timesteps/s this is roughly 11 hours of wall-clock training.
total_timesteps: 2000000
learning_epochs: 10
learning_rate: 0.0005  # conservative — can't undo real-world damage
# NOTE(review): 0.01 is above the hpo.entropy_loss_scale max (0.001) declared
# at the bottom of this file — confirm which of the two is intended.
entropy_loss_scale: 0.01
rollout_steps: 2048
mini_batches: 8

log_interval: 2048
checkpoint_interval: 5000  # frequent saves — can't rewind real hardware

initial_log_std: -0.5  # moderate initial exploration
min_log_std: -4.0
# Assuming natural-log std: exp(2.0) ≈ 7.4, so this caps σ at ≈ 7.4.
# NOTE(review): this line was previously annotated "cap σ at 1.0", which
# would require max_log_std: 0.0. Value left unchanged — confirm the intent.
max_log_std: 2.0

# Never run real-hardware training remotely
remote: false

# Tighter HPO ranges for real hardware (override base ppo.yaml ranges)
hpo:
  entropy_loss_scale: {min: 0.00005, max: 0.001}
  learning_rate: {min: 0.0003, max: 0.003}