# PPO tuned for single-env simulation — mimics real hardware training.
# Inherits defaults + HPO ranges from ppo.yaml.
# Same 50 Hz control (runner=mujoco_single), 1 env, conservative hypers.
# Sim runs ~100× faster than real time, so we can afford more timesteps.
defaults:
  - ppo
  - _self_

# Network architecture (overrides ppo.yaml defaults).
hidden_sizes: [256, 256]

# Training budget — larger than real-hardware runs, per the sim-speed note above.
total_timesteps: 2000000
learning_epochs: 10
learning_rate: 0.0003
entropy_loss_scale: 0.01
rollout_steps: 2048
mini_batches: 8

# Logging / checkpointing cadence (in timesteps).
log_interval: 2048
checkpoint_interval: 10000

# Policy log-std bounds for the Gaussian action distribution.
# NOTE(review): semantics assumed from key names — confirm against ppo.yaml.
initial_log_std: -0.5
min_log_std: -4.0
max_log_std: 2.0

record_video_every: 50000
remote: false