# PPO tuned for single-env simulation — mimics real hardware training.
# Inherits defaults + HPO ranges from ppo.yaml.
# Same 50 Hz control (runner=mujoco_single), 1 env, conservative hypers.
# Sim runs ~100× faster than real time, so we can afford more timesteps.
defaults:
  - ppo
  - _self_

# Network architecture (overrides ppo.yaml defaults).
hidden_sizes: [256, 256]

# Training budget — larger than real-hardware runs, per the sim-speed note above.
total_timesteps: 2000000
learning_epochs: 10
learning_rate: 0.0003
entropy_loss_scale: 0.01
rollout_steps: 2048
mini_batches: 8

# Logging / checkpointing cadence (in timesteps).
log_interval: 2048
checkpoint_interval: 10000

# Policy log-std bounds for the Gaussian action distribution.
# NOTE(review): semantics assumed from key names — confirm against ppo.yaml.
initial_log_std: -0.5
min_log_std: -4.0
max_log_std: 2.0

record_video_every: 50000
remote: false