♻️ full agent refactor

This commit is contained in:
2026-06-10 21:15:34 +02:00
parent a98e86ef66
commit 1e0836e1bc
49 changed files with 1309 additions and 829 deletions

View File

@@ -1,5 +1,5 @@
defaults:
- env: cartpole
- env: rotary_cartpole
- runner: mujoco
- training: ppo
- _self_
- _self_

View File

@@ -1,7 +0,0 @@
max_steps: 500
robot_path: assets/cartpole
angle_threshold: 0.418
cart_limit: 2.4
reward_alive: 1.0
reward_pole_upright_scale: 1.0
reward_action_penalty_scale: 0.01

View File

@@ -9,6 +9,7 @@ balance_vel_scale: 0.5 # how fast the balance bonus decays with pendul
motor_vel_penalty: 0.01 # penalise high motor angular velocity
motor_angle_penalty: 0.05 # penalise deviation from centre
action_penalty: 0.05 # penalise large actions (energy cost)
action_rate_penalty: 0.01 # penalise action changes (real-motor smoothness)
# ── Initial state randomisation ──────────────────────────────────────
pendulum_init_range_deg: 180.0 # pendulum starts in [-180°, +180°]
@@ -22,5 +23,6 @@ hpo:
motor_vel_penalty: {min: 0.001, max: 0.1}
motor_angle_penalty: {min: 0.01, max: 0.2}
action_penalty: {min: 0.01, max: 0.2}
action_rate_penalty: {min: 0.001, max: 0.1}
pendulum_init_range_deg: {min: 30.0, max: 180.0}
max_steps: {values: [500, 1000, 2000]}

View File

@@ -2,9 +2,7 @@ num_envs: 1024 # MJX shines with many parallel envs
device: auto # auto = cuda if available, else cpu
dt: 0.002
substeps: 10
history_length: 10 # RMA-style: 10-step window of (obs, action) pairs
rma_mode: "none" # "none" | "teacher" | "deploy"
history_length: 10 # (obs, action) window for implicit adaptation
# ── Domain randomization (sim-to-real) ──────────────────────────────
# Full DR on GPU: latency + sensor noise + per-env dynamics scales

View File

@@ -2,9 +2,7 @@ num_envs: 64
device: auto # auto = cuda if available, else cpu
dt: 0.002
substeps: 10
history_length: 10 # must match training.history_length (DR + embedding)
rma_mode: "none" # "none" | "teacher" | "deploy"
history_length: 10 # (obs, action) window for implicit adaptation
# ── Domain randomization (sim-to-real) ──────────────────────────────
# Noise/delay levels anchored to the real recordings (~50 Hz, ~0.5 rad/s

View File

@@ -7,8 +7,6 @@ dt: 0.002
substeps: 10
history_length: 10
rma_mode: "none" # "none" | "teacher" | "deploy"
# Clean by default (deterministic eval). Confirming-experiment example —
# re-eval an existing checkpoint in sim with a fixed 1-step action delay:
# mjpython scripts/eval.py env=rotary_cartpole runner=mujoco_single \

View File

@@ -9,5 +9,3 @@ baud: 115200
dt: 0.02 # control loop period (50 Hz, matches training)
no_data_timeout: 2.0 # seconds of silence before declaring disconnect
history_length: 10 # must match training runner
rma_mode: "none" # "none" | "teacher" | "deploy"

View File

@@ -1,14 +1,18 @@
# PPO defaults — sized for the CPU MuJoCo runner (64 parallel envs).
# 128 rollout steps × 64 envs ≈ 8K samples per update.
hidden_sizes: [256, 256]
total_timesteps: 5000000
rollout_steps: 2048
learning_epochs: 10
mini_batches: 8
total_timesteps: 500000 # × 64 envs = 32M env steps
rollout_steps: 128
learning_epochs: 5
mini_batches: 4
discount_factor: 0.99
gae_lambda: 0.95
learning_rate: 0.0003
clip_ratio: 0.2
value_loss_scale: 0.5
entropy_loss_scale: 0.01
kl_threshold: 0.01 # KL-adaptive LR; 0 = fixed learning rate
log_interval: 1000
checkpoint_interval: 50000
@@ -18,13 +22,9 @@ max_log_std: 2.0
record_video_every: 10000
# RMA-style history encoder
history_length: 10 # temporal window (must match runner)
embedding_dim: 32 # history encoder output dimension
# RMA (Rapid Motor Adaptation)
rma_mode: "none" # "none" | "teacher" | "deploy"
latent_dim: 8 # env encoder / adaptation latent dimension
# History encoder output dim — the window size itself comes from
# runner.history_length (single source of truth).
embedding_dim: 32
# ClearML remote execution (GPU worker)
remote: false

View File

@@ -1,15 +1,20 @@
# PPO tuned for MJX (1024+ parallel envs on GPU).
# PPO sized for MJX (1024+ parallel envs on GPU).
# Inherits defaults + HPO ranges from ppo.yaml.
# With 1024 envs, each timestep collects 1024 samples, so total_timesteps
# can be much lower than the CPU config.
#
# Short rollouts × many envs is the GPU-PPO sweet spot:
# 24 steps × 1024 envs ≈ 25K samples per update (~6K per mini-batch).
# (The old rollout_steps=2048 inherited from the CPU config meant a
# ~2M-sample memory per update (2048 steps × 1024 envs) — GBs of VRAM
# and glacial updates.)
defaults:
- ppo
- _self_
total_timesteps: 300000 # 300K × 1024 envs ≈ 307M env steps
mini_batches: 32 # keep mini-batch size similar (~32K)
learning_rate: 0.001 # ~3.3x higher LR for 16x larger batch (roughly sqrt scaling; exact sqrt would be 4x)
rollout_steps: 24
mini_batches: 4
learning_epochs: 5
learning_rate: 0.0003 # KL-adaptive scheduler handles the rest
total_timesteps: 100000 # × 1024 envs ≈ 100M env steps
log_interval: 100
checkpoint_interval: 10000