# PPO defaults — sized for the CPU MuJoCo runner (64 parallel envs).
# 128 rollout steps × 64 envs ≈ 8K samples per update.

# Core PPO hyperparameters.
hidden_sizes: [256, 256]
total_timesteps: 500000  # × 64 envs = 32M env steps
rollout_steps: 128
learning_epochs: 5
mini_batches: 4
discount_factor: 0.99
gae_lambda: 0.95
learning_rate: 0.0003
clip_ratio: 0.2
value_loss_scale: 0.5
entropy_loss_scale: 0.01
kl_threshold: 0.01  # KL-adaptive LR; 0 = fixed learning rate
# NOTE(review): the unit of the two intervals below (timesteps vs. updates)
# is not stated in this file — confirm against the trainer.
log_interval: 1000
checkpoint_interval: 50000

# NOTE(review): presumably the starting value and clamp bounds of the
# policy's log standard deviation — confirm against TrainerConfig.
initial_log_std: -0.5
min_log_std: -4.0
max_log_std: 2.0

# NOTE(review): interval unit (timesteps vs. updates) is not stated in this
# file — confirm against the trainer.
record_video_every: 10000

# History encoder output dim — the window size itself comes from
# runner.history_length (single source of truth), so it is deliberately
# not repeated here.
embedding_dim: 32

# ClearML remote execution (GPU worker)
remote: false

# ── HPO search ranges ────────────────────────────────────────────────
# Read by scripts/hpo.py — ignored by TrainerConfig during training.
# Every range must stay nested under `hpo:`. At top level these keys would
# duplicate the training defaults of the same name (learning_rate,
# clip_ratio, ...) and leave `hpo` itself null.
hpo:
  learning_rate: {min: 0.00005, max: 0.001}
  clip_ratio: {min: 0.1, max: 0.3}
  discount_factor: {min: 0.98, max: 0.999}
  gae_lambda: {min: 0.9, max: 0.99}
  entropy_loss_scale: {min: 0.0001, max: 0.1}
  value_loss_scale: {min: 0.1, max: 1.0}
  learning_epochs: {min: 2, max: 8, type: int}
  mini_batches: {values: [2, 4, 8, 16]}