---
# PPO tuned for single-env simulation — mimics real hardware training.
# Inherits defaults + HPO ranges from ppo.yaml.
# Same 50 Hz control (runner=mujoco_single), 1 env, conservative hypers.
# Sim runs ~100× faster than real time, so we can afford more timesteps.
defaults:
  - ppo
  - _self_

# Policy/value network width; two hidden layers.
hidden_sizes: [256, 256]

# Training budget and optimizer settings (conservative for single-env rollouts).
total_timesteps: 500000
learning_epochs: 5
learning_rate: 0.001
entropy_loss_scale: 0.0001

# Logging / checkpointing cadence (in environment steps).
log_interval: 1024
checkpoint_interval: 10000

# Exploration noise bounds (log standard deviation of the Gaussian policy).
initial_log_std: -0.5
min_log_std: -4.0
max_log_std: 0.0

# Record an evaluation video every N steps; run locally (not on remote hardware).
record_video_every: 50000
remote: false