# PPO tuned for single-env real-time training on real hardware.
# Inherits defaults + HPO ranges from ppo.yaml.
# ~50 Hz control × 1 env = ~50 timesteps/s.
# 100k timesteps ≈ 33 minutes of wall-clock training (100000 / 50 = 2000 s).

# Load the base `ppo` config first, then this file (`_self_`), so the values
# below take precedence over the inherited ones (assuming Hydra-style
# defaults-list semantics — confirm against the config loader in use).
defaults:
  - ppo
  - _self_

hidden_sizes: [256, 256]
total_timesteps: 100000
learning_epochs: 5
learning_rate: 0.001  # conservative — can't undo real-world damage
entropy_loss_scale: 0.0001
log_interval: 1024
checkpoint_interval: 5000  # frequent saves — can't rewind real hardware
initial_log_std: -0.5  # moderate initial exploration
min_log_std: -4.0
max_log_std: 0.0  # cap σ at 1.0

# Never run real-hardware training remotely
remote: false

# Tighter HPO ranges for real hardware (override base ppo.yaml ranges).
# These entries must stay nested under `hpo`: at top level they would be
# duplicate keys that replace the scalar `entropy_loss_scale` and
# `learning_rate` values set above. Each range brackets its scalar default.
hpo:
  entropy_loss_scale: {min: 0.00005, max: 0.001}
  learning_rate: {min: 0.0003, max: 0.003}