hidden_sizes: [128, 128]
total_timesteps: 5000000
rollout_steps: 1024
learning_epochs: 4
mini_batches: 4
discount_factor: 0.99
gae_lambda: 0.95
learning_rate: 0.0003
clip_ratio: 0.2
value_loss_scale: 0.5
entropy_loss_scale: 0.05
log_interval: 1000
checkpoint_interval: 50000
initial_log_std: 0.5
min_log_std: -2.0
max_log_std: 2.0
record_video_every: 10000

# ClearML remote execution (GPU worker)
remote: false

# ── HPO search ranges ────────────────────────────────────────────────
# Read by scripts/hpo.py — ignored by TrainerConfig during training.
hpo:
  learning_rate: {min: 0.00005, max: 0.001}
  clip_ratio: {min: 0.1, max: 0.3}
  discount_factor: {min: 0.98, max: 0.999}
  gae_lambda: {min: 0.9, max: 0.99}
  entropy_loss_scale: {min: 0.0001, max: 0.1}
  value_loss_scale: {min: 0.1, max: 1.0}
  learning_epochs: {min: 2, max: 8, type: int}
  mini_batches: {values: [2, 4, 8, 16]}