---
# PPO tuned for single-env simulation — mimics real hardware training.
# Inherits defaults + HPO ranges from ppo.yaml.
# Same 50 Hz control (runner=mujoco_single), 1 env, conservative hypers.
# Sim runs ~100× faster than real time, so we can afford more timesteps.
defaults:
  - ppo
  - _self_

# Policy/value network width; two hidden layers.
hidden_sizes: [256, 256]

# Training budget and optimizer settings (conservative for single-env rollouts).
total_timesteps: 500000
learning_epochs: 5
learning_rate: 0.001
entropy_loss_scale: 0.0001

# Logging / checkpointing cadence (in environment steps).
log_interval: 1024
checkpoint_interval: 10000

# Exploration noise bounds (log standard deviation of the Gaussian policy).
initial_log_std: -0.5
min_log_std: -4.0
max_log_std: 0.0

# Record an evaluation video every N steps; run locally (not on remote hardware).
record_video_every: 50000
remote: false