♻️ full agent refactor

This commit is contained in:
2026-06-10 21:15:34 +02:00
parent a98e86ef66
commit 1e0836e1bc
49 changed files with 1309 additions and 829 deletions

View File

@@ -1,14 +1,18 @@
# PPO defaults — sized for the CPU MuJoCo runner (64 parallel envs).
# 128 rollout steps × 64 envs ≈ 8K samples per update.
hidden_sizes: [256, 256]
total_timesteps: 5000000
rollout_steps: 2048
learning_epochs: 10
mini_batches: 8
total_timesteps: 500000 # × 64 envs = 32M env steps
rollout_steps: 128
learning_epochs: 5
mini_batches: 4
discount_factor: 0.99
gae_lambda: 0.95
learning_rate: 0.0003
clip_ratio: 0.2
value_loss_scale: 0.5
entropy_loss_scale: 0.01
kl_threshold: 0.01 # KL-adaptive LR; 0 = fixed learning rate
log_interval: 1000
checkpoint_interval: 50000
@@ -18,13 +22,9 @@ max_log_std: 2.0
record_video_every: 10000
# RMA-style history encoder
history_length: 10 # temporal window (must match runner)
embedding_dim: 32 # history encoder output dimension
# RMA (Rapid Motor Adaptation)
rma_mode: "none" # "none" | "teacher" | "deploy"
latent_dim: 8 # env encoder / adaptation latent dimension
# History encoder output dim — the window size itself comes from
# runner.history_length (single source of truth).
embedding_dim: 32
# ClearML remote execution (GPU worker)
remote: false