---
# PPO defaults — sized for the CPU MuJoCo runner (64 parallel envs).
# 128 rollout steps × 64 envs ≈ 8K samples per update.

# ── Network ──────────────────────────────────────────────────────────
hidden_sizes: [256, 256]

# ── Rollout / optimisation ───────────────────────────────────────────
total_timesteps: 500000  # × 64 envs = 32M env steps
rollout_steps: 128
learning_epochs: 5
mini_batches: 4
discount_factor: 0.99
gae_lambda: 0.95
learning_rate: 0.0003
clip_ratio: 0.2
value_loss_scale: 0.5
entropy_loss_scale: 0.01
kl_threshold: 0.01  # KL-adaptive LR; 0 = fixed learning rate

# ── Logging / checkpointing ──────────────────────────────────────────
# NOTE(review): the units of these intervals are not stated in this
# file — confirm against the trainer.
log_interval: 1000
checkpoint_interval: 50000
record_video_every: 10000

# ── Policy log-std ───────────────────────────────────────────────────
# NOTE(review): presumably the initial value and clamp bounds for the
# policy's log standard deviation — confirm against TrainerConfig.
initial_log_std: -0.5
min_log_std: -4.0
max_log_std: 2.0

# History encoder output dim — the window size itself comes from
# runner.history_length (single source of truth).
embedding_dim: 32

# ClearML remote execution (GPU worker)
remote: false

# ── HPO search ranges ────────────────────────────────────────────────
# Read by scripts/hpo.py — ignored by TrainerConfig during training.
# Every entry is named after a top-level key above, and each default
# above lies inside the range (or value list) given for it here.
# NOTE(review): entry schema looks like {min, max[, type]} for ranges
# and {values: [...]} for discrete choices — confirm in scripts/hpo.py.
hpo:
  learning_rate: {min: 0.00005, max: 0.001}
  clip_ratio: {min: 0.1, max: 0.3}
  discount_factor: {min: 0.98, max: 0.999}
  gae_lambda: {min: 0.9, max: 0.99}
  entropy_loss_scale: {min: 0.0001, max: 0.1}
  value_loss_scale: {min: 0.1, max: 1.0}
  learning_epochs: {min: 2, max: 8, type: int}
  mini_batches: {values: [2, 4, 8, 16]}