♻️ crazy refactor

2026-03-11 22:52:01 +01:00
parent 35223b3560
commit 4115447022
34 changed files with 4255 additions and 102 deletions
--- a/scripts/train.py
+++ b/scripts/train.py
@@ -0,0 +1,118 @@
+import os
+import pathlib
+import sys
+
+# Ensure project root is on sys.path so `src.*` imports work
+# regardless of which directory the script is invoked from.
+_PROJECT_ROOT = str(pathlib.Path(__file__).resolve().parent.parent)
+if _PROJECT_ROOT not in sys.path:
+    sys.path.insert(0, _PROJECT_ROOT)
+
+# Headless rendering: use OSMesa on Linux servers (must be set before mujoco import)
+if sys.platform == "linux" and "DISPLAY" not in os.environ:
+    os.environ.setdefault("MUJOCO_GL", "osmesa")
+
+import hydra
+import hydra.utils as hydra_utils
+import structlog
+from clearml import Task
+from hydra.core.hydra_config import HydraConfig
+from omegaconf import DictConfig, OmegaConf
+
+from src.core.env import BaseEnv
+from src.core.registry import build_env
+from src.core.runner import BaseRunner
+from src.training.trainer import Trainer, TrainerConfig
+
+logger = structlog.get_logger()
+
+
+# ── runner registry ───────────────────────────────────────────────────
+# Maps Hydra config-group name → (RunnerClass, ConfigClass)
+# Imports are deferred so JAX is only loaded when runner=mjx is chosen.
+
+RUNNER_REGISTRY: dict[str, tuple[str, str, str]] = {
+    "mujoco":        ("src.runners.mujoco", "MuJoCoRunner", "MuJoCoRunnerConfig"),
+    "mujoco_single": ("src.runners.mujoco", "MuJoCoRunner", "MuJoCoRunnerConfig"),
+    "mjx":           ("src.runners.mjx",    "MJXRunner",    "MJXRunnerConfig"),
+    "serial":        ("src.runners.serial", "SerialRunner", "SerialRunnerConfig"),
+}
+
+
+def _build_runner(runner_name: str, env: BaseEnv, cfg: DictConfig) -> BaseRunner:
+    """Instantiate the right runner from the Hydra config-group name."""
+    if runner_name not in RUNNER_REGISTRY:
+        raise ValueError(
+            f"Unknown runner '{runner_name}'. Registered: {list(RUNNER_REGISTRY)}"
+        )
+    module_path, cls_name, cfg_cls_name = RUNNER_REGISTRY[runner_name]
+
+    import importlib
+    mod = importlib.import_module(module_path)
+    runner_cls = getattr(mod, cls_name)
+    config_cls = getattr(mod, cfg_cls_name)
+
+    runner_config = config_cls(**OmegaConf.to_container(cfg.runner, resolve=True))
+    return runner_cls(env=env, config=runner_config)
+
+
+def _init_clearml(choices: dict[str, str], remote: bool = False) -> Task:
+    """Initialize ClearML task with project structure and tags."""
+    Task.ignore_requirements("torch")
+
+    env_name = choices.get("env", "cartpole")
+    runner_name = choices.get("runner", "mujoco")
+    training_name = choices.get("training", "ppo")
+
+    project = "RL-Framework"
+    task_name = f"{env_name}-{runner_name}-{training_name}"
+    tags = [env_name, runner_name, training_name]
+
+    task = Task.init(project_name=project, task_name=task_name, tags=tags)
+    task.set_base_docker(
+        "registry.kube.optimize/worker-image:latest",
+        docker_setup_bash_script=(
+            "apt-get update && apt-get install -y --no-install-recommends "
+            "libosmesa6 libgl1-mesa-glx libglfw3 && rm -rf /var/lib/apt/lists/* "
+            "&& pip install 'jax[cuda12]' mujoco-mjx"
+        ),
+    )
+
+    req_file = pathlib.Path(hydra_utils.get_original_cwd()) / "requirements.txt"
+    task.set_packages(str(req_file))
+
+    # Execute remotely if requested and running locally
+    if remote and task.running_locally():
+        logger.info("executing_task_remotely", queue="gpu-queue")
+        task.execute_remotely(queue_name="gpu-queue", exit_process=True)
+
+    return task
+
+
+@hydra.main(version_base=None, config_path="../configs", config_name="config")
+def main(cfg: DictConfig) -> None:
+    choices = HydraConfig.get().runtime.choices
+
+    # ClearML init — must happen before heavy work so remote execution
+    # can take over early.  The remote worker re-runs the full script;
+    # execute_remotely() is a no-op on the worker side.
+    training_dict = OmegaConf.to_container(cfg.training, resolve=True)
+    remote = training_dict.pop("remote", False)
+    training_dict.pop("hpo", None)  # HPO range metadata — not a TrainerConfig field
+    task = _init_clearml(choices, remote=remote)
+
+    env_name = choices.get("env", "cartpole")
+    env = build_env(env_name, cfg)
+    runner = _build_runner(choices.get("runner", "mujoco"), env, cfg)
+    trainer_config = TrainerConfig(**training_dict)
+    trainer = Trainer(runner=runner, config=trainer_config)
+
+    try:
+        trainer.train()
+    finally:
+        trainer.close()
+        task.close()
+
+
+if __name__ == "__main__":
+    main()