♻️ full agent refactor
This commit is contained in:
@@ -1,14 +1,18 @@
|
||||
# PPO defaults — sized for the CPU MuJoCo runner (64 parallel envs).
|
||||
# 128 rollout steps × 64 envs ≈ 8K samples per update.
|
||||
|
||||
hidden_sizes: [256, 256]
|
||||
total_timesteps: 5000000
|
||||
rollout_steps: 2048
|
||||
learning_epochs: 10
|
||||
mini_batches: 8
|
||||
total_timesteps: 500000 # 500K × 64 envs = 32M env steps
|
||||
rollout_steps: 128
|
||||
learning_epochs: 5
|
||||
mini_batches: 4
|
||||
discount_factor: 0.99
|
||||
gae_lambda: 0.95
|
||||
learning_rate: 0.0003
|
||||
clip_ratio: 0.2
|
||||
value_loss_scale: 0.5
|
||||
entropy_loss_scale: 0.01
|
||||
kl_threshold: 0.01 # KL-adaptive LR; 0 = fixed learning rate
|
||||
log_interval: 1000
|
||||
checkpoint_interval: 50000
|
||||
|
||||
@@ -18,13 +22,9 @@ max_log_std: 2.0
|
||||
|
||||
record_video_every: 10000
|
||||
|
||||
# RMA-style history encoder
|
||||
history_length: 10 # temporal window (must match runner)
|
||||
embedding_dim: 32 # history encoder output dimension
|
||||
|
||||
# RMA (Rapid Motor Adaptation)
|
||||
rma_mode: "none" # "none" | "teacher" | "deploy"
|
||||
latent_dim: 8 # env encoder / adaptation latent dimension
|
||||
# History encoder output dim — the window size itself comes from
|
||||
# runner.history_length (single source of truth).
|
||||
embedding_dim: 32
|
||||
|
||||
# ClearML remote execution (GPU worker)
|
||||
remote: false
|
||||
|
||||
@@ -1,15 +1,20 @@
|
||||
# PPO tuned for MJX (1024+ parallel envs on GPU).
|
||||
# PPO sized for MJX (1024+ parallel envs on GPU).
|
||||
# Inherits defaults + HPO ranges from ppo.yaml.
|
||||
# With 1024 envs, each timestep collects 1024 samples, so total_timesteps
|
||||
# can be much lower than the CPU config.
|
||||
#
|
||||
# Short rollouts × many envs is the GPU-PPO sweet spot:
|
||||
# 24 steps × 1024 envs ≈ 25K samples per update (~6K per mini-batch).
|
||||
# (The old rollout_steps=2048 inherited from the CPU config meant a
|
||||
# 2M-sample memory per update — GBs of VRAM and glacial updates.)
|
||||
|
||||
defaults:
|
||||
- ppo
|
||||
- _self_
|
||||
|
||||
total_timesteps: 300000 # 300K × 1024 envs ≈ 307M env steps
|
||||
mini_batches: 32 # keep mini-batch size similar (~32K)
|
||||
learning_rate: 0.001 # ~3x higher LR for 16x larger batch (sqrt scaling)
|
||||
rollout_steps: 24
|
||||
mini_batches: 4
|
||||
learning_epochs: 5
|
||||
learning_rate: 0.0003 # KL-adaptive scheduler handles the rest
|
||||
total_timesteps: 100000 # 100K × 1024 envs ≈ 102M env steps
|
||||
log_interval: 100
|
||||
checkpoint_interval: 10000
|
||||
|
||||
|
||||
Reference in New Issue
Block a user