feat: sim2real domain randomization + reward fixes for rotary cartpole

Close the sim2real gap for the Furuta pendulum (swings up but can't
balance on hardware). Root causes were (a) no domain randomization, so
the policy overfit one deterministic sim instance, and (b) reward design
flaws that produced degenerate policies.

Domain randomization (runner-level, backend-agnostic):
- BaseRunner: domain_rand config; per-env action-delay buffer (latency),
  Gaussian qpos/qvel sensor noise, per-env dynamics-scale sampling
  (friction/damping/torque), resampled per episode. Sensor noise per step.
- privileged_obs/privileged_dim expose normalized DR factors (mu) for RMA.
- step() now uses clean state for reward/termination, noisy state for the
  observation the policy sees.
- MuJoCoRunner: applies per-env friction/damping/torque scales.
- robot.py: compute_motor_force gains friction/damping scale args.
- Configs: DR blocks for mujoco (full) and mjx (delay+noise); clean
  defaults for mujoco_single/serial; noise/delay anchored to recordings.

Reward fixes (rotary_cartpole):
- Shift upright reward to [0,1] (was [-1,1]) + alive_bonus, so surviving
  always beats ending early (kills the "suicide into the limit" policy).
- Add balance_bonus * upright * stillness so reward requires upright AND
  near-zero pendulum velocity (kills the "spin in full loops" policy).

Deploy:
- eval.py load_policy reconstructs the history/adaptation encoder
  (auto-detects its dim from the checkpoint) so DR+embedding policies load.

Fixes:
- MuJoCoRunner._sim_reset referenced self._env (typo) -> self.env, which
  was breaking every rotary-cartpole reset.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-09 20:48:25 +02:00
parent 8cc84d6a21
commit b37cd26690
22 changed files with 1219 additions and 781 deletions

View File

@@ -4,14 +4,21 @@ Minimises the trajectory-matching cost between a MuJoCo rollout and a
recorded real-robot sequence. Uses the ``cmaes`` package (pure-Python
CMA-ES with native box-constraint support).
All 28 parameters (motor + pendulum/arm) are optimised jointly from a
single full-system recording. Velocities are optionally preprocessed
with Savitzky-Golay differentiation for cleaner targets.
Motor parameters are **locked** from the motor-only sysid — only
pendulum/arm inertial parameters, joint dynamics, and ctrl_limit are
optimised. Velocities are optionally preprocessed with Savitzky-Golay
differentiation for cleaner targets.
Usage:
python -m src.sysid.optimize \
--robot-path assets/rotary_cartpole \
--recording assets/rotary_cartpole/recordings/capture_....npz
--recording assets/rotary_cartpole/recordings/capture_20260314_000435.npz
# Shorter run for testing:
python -m src.sysid.optimize \
--robot-path assets/rotary_cartpole \
--recording <file>.npz \
--max-generations 10 --population-size 8
"""
from __future__ import annotations
@@ -26,17 +33,13 @@ import numpy as np
import structlog
from src.sysid.rollout import (
LOCKED_MOTOR_PARAMS,
PARAM_SETS,
ROTARY_CARTPOLE_PARAMS,
ParamSpec,
_make_actuator,
_resolve_params,
bounds_arrays,
build_base_model,
defaults_dict,
defaults_vector,
params_to_dict,
patch_model,
rollout,
windowed_rollout,
)
@@ -45,8 +48,62 @@ log = structlog.get_logger()
# ── Velocity preprocessing ───────────────────────────────────────────
# Shared implementation in src.sysid.preprocess.
from src.sysid.preprocess import preprocess_recording as _preprocess_recording
def _preprocess_recording(
recording: dict[str, np.ndarray],
preprocess_vel: bool = True,
sg_window: int = 7,
sg_polyorder: int = 3,
) -> dict[str, np.ndarray]:
"""Optionally recompute velocities using Savitzky-Golay differentiation.
Applies SG filtering to both motor_vel and pendulum_vel, replacing
the noisy firmware finite-difference velocities with smooth
analytical derivatives of the (clean) angle signals.
"""
if not preprocess_vel:
return recording
from scipy.signal import savgol_filter
rec = dict(recording)
times = rec["time"]
dt = float(np.mean(np.diff(times)))
# Motor velocity.
rec["motor_vel_raw"] = rec["motor_vel"].copy()
rec["motor_vel"] = savgol_filter(
rec["motor_angle"],
window_length=sg_window,
polyorder=sg_polyorder,
deriv=1,
delta=dt,
)
# Pendulum velocity.
rec["pendulum_vel_raw"] = rec["pendulum_vel"].copy()
rec["pendulum_vel"] = savgol_filter(
rec["pendulum_angle"],
window_length=sg_window,
polyorder=sg_polyorder,
deriv=1,
delta=dt,
)
motor_noise = np.std(rec["motor_vel_raw"] - rec["motor_vel"])
pend_noise = np.std(rec["pendulum_vel_raw"] - rec["pendulum_vel"])
log.info(
"velocity_preprocessed",
method="savgol",
sg_window=sg_window,
sg_polyorder=sg_polyorder,
dt_ms=f"{dt*1000:.1f}",
motor_noise_std=f"{motor_noise:.3f} rad/s",
pend_noise_std=f"{pend_noise:.3f} rad/s",
)
return rec
# ── Cost function ────────────────────────────────────────────────────
@@ -115,6 +172,7 @@ def cost_function(
vel_weight: float = 0.1,
pendulum_scale: float = 3.0,
window_duration: float = 0.5,
motor_params: dict[str, float] | None = None,
) -> float:
"""Compute trajectory-matching cost for a candidate parameter vector.
@@ -140,6 +198,7 @@ def cost_function(
window_duration=window_duration,
sim_dt=sim_dt,
substeps=substeps,
motor_params=motor_params,
)
else:
sim = rollout(
@@ -148,6 +207,7 @@ def cost_function(
actions=recording["action"],
sim_dt=sim_dt,
substeps=substeps,
motor_params=motor_params,
)
except Exception as exc:
log.warning("rollout_failed", error=str(exc))
@@ -161,122 +221,6 @@ def cost_function(
return _compute_trajectory_cost(sim, recording, pos_weight, vel_weight, pendulum_scale)
# ── Fast cost (model-cached) ────────────────────────────────────────
def _fast_cost(
params_vec: np.ndarray,
recording: dict[str, np.ndarray],
model, # mujoco.MjModel (cached, mutated in-place)
body_ids: dict[str, int],
dof_ids: dict[str, int],
specs: list[ParamSpec],
sim_dt: float = 0.002,
substeps: int = 10,
pos_weight: float = 1.0,
vel_weight: float = 0.1,
pendulum_scale: float = 3.0,
window_duration: float = 0.5,
) -> float:
"""Like cost_function but patches a cached model instead of rebuilding."""
import mujoco as _mj
from src.runners.mujoco import ActuatorLimits
params = params_to_dict(params_vec, specs)
if not _check_inertia_valid(params):
return 1e6
# Patch the cached model in-place — no XML, no temp files.
patch_model(model, params, body_ids, dof_ids)
# Rebuild actuator from params (motor params change per candidate).
actuator = _make_actuator(params)
try:
data = _mj.MjData(model)
limits = ActuatorLimits(model)
ctrl_limit = params.get("ctrl_limit", 0.588)
times = recording["time"]
actions = recording["action"]
n = len(actions)
sim_motor_angle = np.zeros(n, dtype=np.float64)
sim_motor_vel = np.zeros(n, dtype=np.float64)
sim_pend_angle = np.zeros(n, dtype=np.float64)
sim_pend_vel = np.zeros(n, dtype=np.float64)
if window_duration > 0:
# Windowed (multiple shooting).
real_motor = recording["motor_angle"]
real_motor_vel = recording["motor_vel"]
real_pend = recording["pendulum_angle"]
real_pend_vel = recording["pendulum_vel"]
t0, t_end = times[0], times[-1]
window_starts: list[int] = []
current_t = t0
while current_t < t_end:
idx = int(np.searchsorted(times, current_t))
idx = min(idx, n - 1)
window_starts.append(idx)
current_t += window_duration
n_windows = len(window_starts)
for w, w_start in enumerate(window_starts):
w_end = window_starts[w + 1] if w + 1 < n_windows else n
_mj.mj_resetData(model, data)
data.qpos[0] = real_motor[w_start]
data.qpos[1] = real_pend[w_start]
data.qvel[0] = real_motor_vel[w_start]
data.qvel[1] = real_pend_vel[w_start]
data.ctrl[:] = 0.0
_mj.mj_forward(model, data)
for i in range(w_start, w_end):
action = max(-ctrl_limit, min(ctrl_limit, float(actions[i])))
ctrl = actuator.transform_ctrl(action)
data.ctrl[0] = ctrl
for _ in range(substeps):
limits.enforce(model, data)
data.qfrc_applied[0] = actuator.compute_motor_force(data.qvel[0], ctrl)
_mj.mj_step(model, data)
sim_motor_angle[i] = data.qpos[0]
sim_pend_angle[i] = data.qpos[1]
sim_motor_vel[i] = data.qvel[0]
sim_pend_vel[i] = data.qvel[1]
else:
# Open-loop single-shot.
_mj.mj_resetData(model, data)
for i in range(n):
action = max(-ctrl_limit, min(ctrl_limit, float(actions[i])))
ctrl = actuator.transform_ctrl(action)
data.ctrl[0] = ctrl
for _ in range(substeps):
limits.enforce(model, data)
data.qfrc_applied[0] = actuator.compute_motor_force(data.qvel[0], ctrl)
_mj.mj_step(model, data)
sim_motor_angle[i] = data.qpos[0]
sim_pend_angle[i] = data.qpos[1]
sim_motor_vel[i] = data.qvel[0]
sim_pend_vel[i] = data.qvel[1]
sim = {
"motor_angle": sim_motor_angle,
"motor_vel": sim_motor_vel,
"pendulum_angle": sim_pend_angle,
"pendulum_vel": sim_pend_vel,
}
except Exception:
return 1e6
for key in sim:
if np.any(~np.isfinite(sim[key])):
return 1e6
return _compute_trajectory_cost(sim, recording, pos_weight, vel_weight, pendulum_scale)
# ── Parallel evaluation helper (module-level for pickling) ───────────
# Shared state set by the parent process before spawning workers.
@@ -285,34 +229,22 @@ _par_robot_path: Path = Path(".")
_par_specs: list[ParamSpec] = []
_par_kwargs: dict = {}
# Cached model (built once per worker, patched per candidate).
_par_model: object = None # mujoco.MjModel
_par_body_ids: dict[str, int] = {}
_par_dof_ids: dict[str, int] = {}
def _init_worker(recording, robot_path, specs, kwargs):
"""Initialiser run once per worker process."""
global _par_recording, _par_robot_path, _par_specs, _par_kwargs
global _par_model, _par_body_ids, _par_dof_ids
_par_recording = recording
_par_robot_path = robot_path
_par_specs = specs
_par_kwargs = kwargs
# Build base model once — reused for all candidates in this worker.
_par_model, _, _par_body_ids, _par_dof_ids = build_base_model(robot_path)
_par_model.opt.timestep = kwargs.get("sim_dt", 0.002)
def _eval_candidate(x_natural: np.ndarray) -> float:
"""Evaluate a single candidate — called in worker processes."""
return _fast_cost(
return cost_function(
x_natural,
_par_recording,
_par_model,
_par_body_ids,
_par_dof_ids,
_par_robot_path,
_par_specs,
**_par_kwargs,
)
@@ -336,11 +268,13 @@ def optimize(
window_duration: float = 0.5,
seed: int = 42,
preprocess_vel: bool = True,
sg_window: int = 7,
sg_polyorder: int = 3,
) -> dict:
"""Run CMA-ES optimisation and return results.
All parameters (motor + pendulum/arm) are optimised jointly from a
single full-system recording.
Motor parameters are locked from the motor-only sysid.
Only pendulum/arm parameters are optimised.
Returns a dict with:
best_params: dict[str, float]
@@ -348,6 +282,7 @@ def optimize(
history: list of (generation, best_cost) tuples
recording: str (path used)
specs: list of param names
motor_params: dict of locked motor params
"""
from cmaes import CMA
@@ -357,11 +292,23 @@ def optimize(
if specs is None:
specs = ROTARY_CARTPOLE_PARAMS
motor_params = dict(LOCKED_MOTOR_PARAMS)
log.info(
"motor_params_locked",
n_params=len(motor_params),
gear_avg=f"{(motor_params['actuator_gear_pos'] + motor_params['actuator_gear_neg']) / 2:.4f}",
)
# Load recording.
recording = dict(np.load(recording_path))
# Preprocessing: SG velocity recomputation.
recording = _preprocess_recording(recording, preprocess_vel=preprocess_vel)
recording = _preprocess_recording(
recording,
preprocess_vel=preprocess_vel,
sg_window=sg_window,
sg_polyorder=sg_polyorder,
)
n_samples = len(recording["time"])
duration = recording["time"][-1] - recording["time"][0]
@@ -429,6 +376,7 @@ def optimize(
vel_weight=vel_weight,
pendulum_scale=pendulum_scale,
window_duration=window_duration,
motor_params=motor_params,
)
log.info("parallel_workers", n_workers=n_workers)
@@ -480,19 +428,6 @@ def optimize(
gen_best=f"{min(c for _, c in solutions):.6f}",
)
# Early stopping: no improvement in last 50 generations.
patience = 50
if len(history) > patience:
old_cost = history[-patience][1]
if old_cost - best_cost < old_cost * 0.001:
log.info(
"early_stopping",
gen=gen,
best_cost=f"{best_cost:.6f}",
stall_gens=patience,
)
break
total_time = time.monotonic() - t0
# Clean up process pool.
@@ -506,8 +441,7 @@ def optimize(
"cmaes_finished",
best_cost=f"{best_cost:.6f}",
total_time=f"{total_time:.1f}s",
generations=len(history),
evaluations=len(history) * population_size,
evaluations=max_generations * population_size,
)
# Log parameter comparison.
@@ -531,6 +465,7 @@ def optimize(
"recording": str(recording_path),
"param_names": [s.name for s in specs],
"defaults": {s.name: s.default for s in specs},
"motor_params": motor_params,
"preprocess_vel": preprocess_vel,
"timestamp": datetime.now().isoformat(),
}
@@ -581,12 +516,16 @@ def main() -> None:
action="store_true",
help="Skip Savitzky-Golay velocity preprocessing",
)
parser.add_argument("--sg-window", type=int, default=7,
help="Savitzky-Golay window length (odd, default 7)")
parser.add_argument("--sg-polyorder", type=int, default=3,
help="Savitzky-Golay polynomial order (default 3)")
parser.add_argument(
"--param-set",
type=str,
default="full",
choices=list(PARAM_SETS.keys()),
help="Parameter set to optimize: 'full' (28), 'motor' (13), or 'pendulum' (15)",
help="Parameter set to optimize: 'reduced' (6 params, fast) or 'full' (15 params)",
)
args = parser.parse_args()
@@ -607,6 +546,8 @@ def main() -> None:
window_duration=args.window_duration,
seed=args.seed,
preprocess_vel=not args.no_preprocess_vel,
sg_window=args.sg_window,
sg_polyorder=args.sg_polyorder,
)
# Save results JSON.
@@ -631,6 +572,7 @@ def main() -> None:
export_tuned_files(
robot_path=args.robot_path,
params=result["best_params"],
motor_params=result.get("motor_params"),
)