initial commit

This commit is contained in:
2026-03-06 22:19:44 +01:00
commit c8f28ffbcc
17 changed files with 811 additions and 0 deletions

3
.gitignore vendored Normal file
View File

@@ -0,0 +1,3 @@
outputs/
.vscode/
runs/

1
.python-version Normal file
View File

@@ -0,0 +1 @@
RL-Framework-7914bb

View File

@@ -0,0 +1,64 @@
<?xml version="1.0" encoding="utf-8"?>
<robot name="cartpole">
<!-- World link (fixed base) -->
<link name="world"/>
<!-- Cart (slides along x-axis) -->
<link name="cart">
<inertial>
<mass value="1.0"/>
<inertia ixx="0.001" ixy="0" ixz="0" iyy="0.001" iyz="0" izz="0.001"/>
</inertial>
<visual>
<geometry>
<box size="0.3 0.2 0.1"/>
</geometry>
</visual>
<collision>
<geometry>
<box size="0.3 0.2 0.1"/>
</geometry>
</collision>
</link>
<!-- Cart slides along x-axis -->
<joint name="cart_joint" type="prismatic">
<parent link="world"/>
<child link="cart"/>
<axis xyz="1 0 0"/>
<limit lower="-2.4" upper="2.4" effort="100" velocity="10"/>
</joint>
<!-- Pole (rotates around y-axis, attached on top of cart) -->
<link name="pole">
<inertial>
<origin xyz="0 0 0.3"/>
<mass value="0.1"/>
<inertia ixx="0.003" ixy="0" ixz="0" iyy="0.003" iyz="0" izz="0.0001"/>
</inertial>
<visual>
<origin xyz="0 0 0.3"/>
<geometry>
<cylinder radius="0.02" length="0.6"/>
</geometry>
</visual>
<collision>
<origin xyz="0 0 0.3"/>
<geometry>
<cylinder radius="0.02" length="0.6"/>
</geometry>
</collision>
</link>
<!-- Pole rotates freely (no motor) -->
<joint name="pole_joint" type="revolute">
<parent link="cart"/>
<child link="pole"/>
<origin xyz="0 0 0.05"/>
<axis xyz="0 1 0"/>
<limit lower="-6.28" upper="6.28" effort="0" velocity="100"/>
<dynamics damping="0.0" friction="0.0"/>
</joint>
</robot>

5
configs/config.yaml Normal file
View File

@@ -0,0 +1,5 @@
defaults:
- env: cartpole
- runner: mujoco
- training: ppo
- _self_

11
configs/env/cartpole.yaml vendored Normal file
View File

@@ -0,0 +1,11 @@
max_steps: 500
angle_threshold: 0.418
cart_limit: 2.4
reward_alive: 1.0
reward_pole_upright_scale: 1.0
reward_action_penalty_scale: 0.01
model_path: assets/cartpole/cartpole.urdf
actuators:
- joint: cart_joint
gear: 10.0
ctrl_range: [-1.0, 1.0]

View File

@@ -0,0 +1,4 @@
num_envs: 16
device: cpu
dt: 0.02
substeps: 2

13
configs/training/ppo.yaml Normal file
View File

@@ -0,0 +1,13 @@
hidden_sizes: [128, 128]
total_timesteps: 1000000
rollout_steps: 1024
learning_epochs: 4
mini_batches: 4
discount_factor: 0.99
gae_lambda: 0.95
learning_rate: 0.0003
clip_ratio: 0.2
value_loss_scale: 0.5
entropy_loss_scale: 0.01
log_interval: 10
clearml_project: RL-Framework

8
requirements.txt Normal file
View File

@@ -0,0 +1,8 @@
torch
gymnasium
hydra-core
omegaconf
mujoco
skrl[torch]
clearml
pytest

0
src/core/__init__.py Normal file
View File

59
src/core/env.py Normal file
View File

@@ -0,0 +1,59 @@
import abc
import dataclasses
from typing import TypeVar, Generic, Any
from gymnasium import spaces
import torch
import pathlib
T = TypeVar("T")
@dataclasses.dataclass
class ActuatorConfig:
    """Maps a single joint to a motor.

    Lives in the env config rather than the runner config: actuators define
    what the robot can do and therefore the action space, which is a
    task-level concept. Mirrors Isaac Lab's separation of actuator config
    from the robot description file.
    """
    # Name of the joint this motor drives (must match the model file).
    joint: str = ""
    # Gear (force/torque) ratio applied to the control signal.
    gear: float = 1.0
    # (lower, upper) bounds on the raw control input.
    ctrl_range: tuple[float, float] = (-1.0, 1.0)
@dataclasses.dataclass
class BaseEnvConfig:
    """Base configuration shared by all task environments."""
    # Episode length in steps before time-limit truncation.
    max_steps: int = 1000
    # Path to the robot model file (URDF/MJCF) consumed by the runner.
    model_path: pathlib.Path | None = None
    # Motors to attach to the model's joints (empty = no actuation injected).
    actuators: list[ActuatorConfig] = dataclasses.field(default_factory=list)
class BaseEnv(abc.ABC, Generic[T]):
    """Abstract task definition: spaces, state construction, rewards and
    terminations. Simulation stepping is a runner concern, not handled here."""

    def __init__(self, config: BaseEnvConfig):
        self.config = config

    @property
    @abc.abstractmethod
    def observation_space(self) -> spaces.Space:
        """Gymnasium space describing a single env's observation."""
        ...

    @property
    @abc.abstractmethod
    def action_space(self) -> spaces.Space:
        """Gymnasium space describing a single env's action."""
        ...

    @abc.abstractmethod
    def build_state(self, qpos: torch.Tensor, qvel: torch.Tensor) -> Any:
        """Convert batched generalized positions/velocities into a task state object."""
        ...

    @abc.abstractmethod
    def compute_observations(self, state: Any) -> torch.Tensor:
        """Return the batched observation tensor derived from `state`."""
        ...

    @abc.abstractmethod
    def compute_rewards(self, state: Any, actions: torch.Tensor) -> torch.Tensor:
        """Return the per-env reward tensor for `state` and the applied `actions`."""
        ...

    @abc.abstractmethod
    def compute_terminations(self, state: Any) -> torch.Tensor:
        """Return a per-env boolean tensor marking task-level terminations."""
        ...

    def compute_truncations(self, step_counts: torch.Tensor) -> torch.Tensor:
        """Per-env time-limit truncation: true once an env reaches max_steps."""
        return step_counts >= self.config.max_steps

97
src/core/runner.py Normal file
View File

@@ -0,0 +1,97 @@
import dataclasses
import abc
from typing import Any, Generic, TypeVar
from src.core.env import BaseEnv
import torch
T = TypeVar("T")
@dataclasses.dataclass
class BaseRunnerConfig:
    """Base configuration for simulation runners."""
    # Number of parallel environment instances.
    num_envs: int = 1
    # Torch device string on which runner tensors are allocated.
    device: str = "cpu"
class BaseRunner(abc.ABC, Generic[T]):
    """Vectorized simulation wrapper exposing a gymnasium-style reset/step API.

    Subclasses implement the `_sim_*` hooks against a concrete simulator;
    the task logic (observations, rewards, terminations) is delegated to
    the wrapped BaseEnv.
    """

    def __init__(self, env: BaseEnv, config: T) -> None:
        self.env = env
        self.config = config
        # Initialize the simulator backend before anything queries it.
        self._sim_initialize(config)
        self.observation_space = self.env.observation_space
        self.action_space = self.env.action_space
        self.num_agents: int = 1  # single-agent RL (required by skrl)
        # Per-env step counter used for time-limit truncation.
        self.step_counts = torch.zeros(
            self.config.num_envs, dtype=torch.long, device=self.config.device
        )

    @property
    @abc.abstractmethod
    def num_envs(self) -> int:
        """Number of parallel environment instances."""
        ...

    @property
    @abc.abstractmethod
    def device(self) -> torch.device:
        """Device on which observation/reward tensors are returned."""
        ...

    @abc.abstractmethod
    def _sim_initialize(self, config: T) -> None:
        """Create simulator resources; called once from __init__."""
        ...

    @abc.abstractmethod
    def _sim_step(self, actions: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
        """Advance every env one control step; return batched (qpos, qvel)."""
        ...

    @abc.abstractmethod
    def _sim_reset(self, env_ids: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
        """Reset the given envs; return their batched (qpos, qvel)."""
        ...

    @abc.abstractmethod
    def _sim_close(self) -> None:
        """Release simulator resources."""
        ...

    def reset(self) -> tuple[torch.Tensor, dict[str, Any]]:
        """Reset all envs and return (observations, info)."""
        all_ids = torch.arange(self.num_envs, device=self.device)
        qpos, qvel = self._sim_reset(all_ids)
        self.step_counts.zero_()
        state = self.env.build_state(qpos, qvel)
        obs = self.env.compute_observations(state)
        return obs, {}

    # NOTE: the return annotation is a 5-tuple — the previous 4-element
    # annotation did not match the five values actually returned.
    def step(
        self, actions: torch.Tensor
    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, dict[str, Any]]:
        """Step all envs, compute task outputs, and auto-reset finished envs.

        Returns (obs, rewards, terminated, truncated, info). Pre-reset
        observations of finished envs are exposed in the info dict so the
        agent can bootstrap from the true final state.
        """
        qpos, qvel = self._sim_step(actions)
        self.step_counts += 1
        state = self.env.build_state(qpos, qvel)
        obs = self.env.compute_observations(state)
        rewards = self.env.compute_rewards(state, actions)
        terminated = self.env.compute_terminations(state)
        truncated = self.env.compute_truncations(self.step_counts)
        info: dict[str, Any] = {}
        done = terminated | truncated
        done_ids = done.nonzero(as_tuple=False).squeeze(-1)
        if done_ids.numel() > 0:
            # Preserve the true final observations before overwriting with reset obs.
            info["final_observations"] = obs[done_ids].clone()
            info["final_env_ids"] = done_ids.clone()
            reset_qpos, reset_qvel = self._sim_reset(done_ids)
            self.step_counts[done_ids] = 0
            reset_state = self.env.build_state(reset_qpos, reset_qvel)
            obs[done_ids] = self.env.compute_observations(reset_state)
        # skrl expects (num_envs, 1) for rewards/terminated/truncated
        return obs, rewards.unsqueeze(-1), terminated.unsqueeze(-1), truncated.unsqueeze(-1), info

    def render(self, env_idx: int = 0, mode: str = "human") -> torch.Tensor | None:
        """Optional visualization hook; concrete runners may override."""
        raise NotImplementedError("Render method not implemented for this runner.")

    def close(self) -> None:
        """Shut down the underlying simulator."""
        self._sim_close()

53
src/envs/cartpole.py Normal file
View File

@@ -0,0 +1,53 @@
import dataclasses
import torch
from src.core.env import BaseEnv, BaseEnvConfig
from gymnasium import spaces
@dataclasses.dataclass
class CartPoleState:
    """Per-env cart-pole state; each field is a tensor of shape (num_envs,).

    Fixed defect: fields were annotated ``torch.float`` (a dtype object),
    not a type — they hold ``torch.Tensor`` values.
    """
    cart_pos: torch.Tensor    # cart position along the slide axis
    cart_vel: torch.Tensor    # cart linear velocity
    pole_angle: torch.Tensor  # pole hinge angle (rad); 0 is upright per the reward/termination logic
    pole_vel: torch.Tensor    # pole angular velocity
@dataclasses.dataclass
class CartPoleConfig(BaseEnvConfig):
    """CartPole task parameters; values are populated from the Hydra YAML."""
    # Terminate when |pole_angle| exceeds this (radians, ~24 degrees).
    angle_threshold: float = 0.418
    # Terminate when |cart_pos| exceeds this.
    cart_limit: float = 2.4
    # Constant reward granted every live step.
    reward_alive: float = 1.0
    # Scale on the cos(pole_angle) uprightness bonus.
    reward_pole_upright_scale: float = 1.0
    # Scale on the squared-action penalty.
    reward_action_penalty_scale: float = 0.01
class CartPoleEnv(BaseEnv[CartPoleConfig]):
    """Classic cart-pole balancing task on top of the BaseEnv interface.

    Fixed defect: the space properties were annotated ``-> torch.Tensor``
    although they return gymnasium ``spaces.Box`` instances.
    """

    def __init__(self, config: CartPoleConfig):
        super().__init__(config)

    @property
    def observation_space(self) -> spaces.Space:
        # Observation layout: [cart_pos, cart_vel, pole_angle, pole_vel]
        return spaces.Box(low=-torch.inf, high=torch.inf, shape=(4,))

    @property
    def action_space(self) -> spaces.Space:
        # Single normalized control command for the cart motor.
        return spaces.Box(low=-1.0, high=1.0, shape=(1,))

    def build_state(self, qpos: torch.Tensor, qvel: torch.Tensor) -> CartPoleState:
        """Slice batched sim tensors into named fields.

        Assumes DOF 0 is the cart slide joint and DOF 1 the pole hinge,
        matching the declaration order in the cartpole URDF.
        """
        return CartPoleState(
            cart_pos=qpos[:, 0],
            cart_vel=qvel[:, 0],
            pole_angle=qpos[:, 1],
            pole_vel=qvel[:, 1],
        )

    def compute_observations(self, state: CartPoleState) -> torch.Tensor:
        """Stack state fields into a (num_envs, 4) observation tensor."""
        return torch.stack(
            [state.cart_pos, state.cart_vel, state.pole_angle, state.pole_vel], dim=-1
        )

    def compute_rewards(self, state: CartPoleState, actions: torch.Tensor) -> torch.Tensor:
        """Alive bonus + uprightness (cos of angle) - squared-action penalty."""
        upright = self.config.reward_pole_upright_scale * torch.cos(state.pole_angle)
        action_penalty = self.config.reward_action_penalty_scale * torch.sum(actions**2, dim=-1)
        return self.config.reward_alive + upright - action_penalty

    def compute_terminations(self, state: CartPoleState) -> torch.Tensor:
        """Terminate when the pole falls past the threshold or the cart leaves the track."""
        pole_fallen = torch.abs(state.pole_angle) > self.config.angle_threshold
        cart_out_of_bounds = torch.abs(state.cart_pos) > self.config.cart_limit
        return pole_fallen | cart_out_of_bounds

0
src/models/__init__.py Normal file
View File

48
src/models/mlp.py Normal file
View File

@@ -0,0 +1,48 @@
import torch
import torch.nn as nn
from gymnasium import spaces
from skrl.models.torch import Model, GaussianMixin, DeterministicMixin
class SharedMLP(GaussianMixin, DeterministicMixin, Model):
    """Shared-backbone actor-critic: one MLP trunk feeding a Gaussian policy
    head and a deterministic value head (skrl shared-model pattern).

    Fixed defect: ``act``/``compute`` silently returned ``None`` for an
    unrecognized role, which surfaces later as an opaque TypeError; they now
    raise a descriptive ValueError.
    """

    def __init__(
        self,
        observation_space: spaces.Space,
        action_space: spaces.Space,
        device: torch.device,
        hidden_sizes: tuple[int, ...] = (32, 32),
        clip_actions: bool = False,
        clip_log_std: bool = True,
        min_log_std: float = -20.0,
        max_log_std: float = 2.0,
        initial_log_std: float = 0.0,
    ):
        Model.__init__(self, observation_space, action_space, device)
        GaussianMixin.__init__(self, clip_actions, clip_log_std, min_log_std, max_log_std)
        DeterministicMixin.__init__(self, clip_actions)
        # Shared trunk: Linear + ELU per hidden size.
        layers = []
        in_dim: int = self.num_observations
        for hidden_size in hidden_sizes:
            layers.append(nn.Linear(in_dim, hidden_size))
            layers.append(nn.ELU())
            in_dim = hidden_size
        self.net: nn.Sequential = nn.Sequential(*layers)
        # Policy head
        self.mean_layer = nn.Linear(in_dim, self.num_actions)
        self.log_std_parameter: nn.Parameter = nn.Parameter(
            torch.full((self.num_actions,), initial_log_std)
        )
        # Value head
        self.value_layer = nn.Linear(in_dim, 1)
        # Trunk output cached between the policy pass and the value pass so
        # the backbone only runs once when both roles are queried in sequence.
        self._shared_output: torch.Tensor | None = None

    def act(self, inputs: dict[str, torch.Tensor], role: str = "") -> tuple[torch.Tensor, ...]:
        """Dispatch to the Gaussian (policy) or deterministic (value) mixin."""
        if role == "policy":
            return GaussianMixin.act(self, inputs, role)
        elif role == "value":
            return DeterministicMixin.act(self, inputs, role)
        raise ValueError(f"Unsupported role: {role!r} (expected 'policy' or 'value')")

    def compute(
        self, inputs: dict[str, torch.Tensor], role: str = ""
    ) -> tuple[torch.Tensor, ...]:
        """Forward pass for the requested role, sharing the trunk output."""
        if role == "policy":
            self._shared_output = self.net(inputs["states"])
            return self.mean_layer(self._shared_output), self.log_std_parameter, {}
        elif role == "value":
            # Reuse the cached trunk output when the policy pass just ran.
            shared_output = (
                self._shared_output
                if self._shared_output is not None
                else self.net(inputs["states"])
            )
            self._shared_output = None
            return self.value_layer(shared_output), {}
        raise ValueError(f"Unsupported role: {role!r} (expected 'policy' or 'value')")

155
src/runners/mujoco.py Normal file
View File

@@ -0,0 +1,155 @@
import dataclasses
import tempfile
import xml.etree.ElementTree as ET
from src.core.env import BaseEnv, ActuatorConfig
from src.core.runner import BaseRunner, BaseRunnerConfig
import torch
import numpy as np
import mujoco
import mujoco.viewer
@dataclasses.dataclass
class MuJoCoRunnerConfig(BaseRunnerConfig):
    """Configuration for the CPU MuJoCo runner."""
    # Number of independent MjData instances stepped each frame.
    num_envs: int = 16
    # Device tensors are returned on (physics itself runs on CPU).
    device: str = "cpu"
    # Physics timestep (seconds) written into the MuJoCo model.
    dt: float = 0.02
    # mj_step calls performed per policy action.
    substeps: int = 2
class MuJoCoRunner(BaseRunner[MuJoCoRunnerConfig]):
    """BaseRunner backed by classic MuJoCo: one MjData per env, stepped serially on CPU."""

    def __init__(self, env: BaseEnv, config: MuJoCoRunnerConfig):
        super().__init__(env, config)

    @property
    def num_envs(self) -> int:
        return self.config.num_envs

    @property
    def device(self) -> torch.device:
        return torch.device(self.config.device)

    @staticmethod
    def _load_model_with_actuators(model_path: str, actuators: list[ActuatorConfig]) -> mujoco.MjModel:
        """Load a URDF (or MJCF) file and programmatically inject actuators.

        Two-step approach required because MuJoCo's URDF parser ignores
        <actuator> in the <mujoco> extension block:
        1. Load the URDF → MuJoCo converts it to internal MJCF
        2. Export the MJCF XML, add <actuator> elements, reload
        This keeps the URDF clean and standard — actuator config lives in
        the env config (Isaac Lab pattern), not in the robot file.
        """
        import os

        # Step 1: Load URDF/MJCF as-is (no actuators)
        model_raw = mujoco.MjModel.from_xml_path(model_path)
        if not actuators:
            return model_raw
        # Step 2: Export internal MJCF representation.
        # mkstemp creates the file atomically (unlike the deprecated,
        # race-prone mktemp); close our fd since mj_saveLastXML writes by path.
        fd, tmp_mjcf = tempfile.mkstemp(suffix=".xml")
        os.close(fd)
        try:
            mujoco.mj_saveLastXML(tmp_mjcf, model_raw)
            with open(tmp_mjcf) as f:
                mjcf_str = f.read()
        finally:
            os.unlink(tmp_mjcf)
        # Step 3: Inject actuators into the MJCF XML
        root = ET.fromstring(mjcf_str)
        act_elem = ET.SubElement(root, "actuator")
        for act in actuators:
            ET.SubElement(act_elem, "motor", attrib={
                "name": f"{act.joint}_motor",
                "joint": act.joint,
                "gear": str(act.gear),
                "ctrlrange": f"{act.ctrl_range[0]} {act.ctrl_range[1]}",
            })
        # Step 4: Reload from modified MJCF
        modified_xml = ET.tostring(root, encoding="unicode")
        return mujoco.MjModel.from_xml_string(modified_xml)

    def _sim_initialize(self, config: MuJoCoRunnerConfig) -> None:
        """Compile the model (with injected actuators) and allocate per-env MjData."""
        model_path = self.env.config.model_path
        if model_path is None:
            raise ValueError("model_path must be specified in the environment config")
        actuators = self.env.config.actuators
        self._model = self._load_model_with_actuators(str(model_path), actuators)
        self._model.opt.timestep = config.dt
        self._data: list[mujoco.MjData] = [mujoco.MjData(self._model) for _ in range(config.num_envs)]
        self._nq = self._model.nq
        self._nv = self._model.nv

    def _sim_step(self, actions: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
        """Apply actions and advance each env by `substeps` physics steps."""
        actions_np: np.ndarray = actions.cpu().numpy()
        qpos_batch = np.zeros((self.num_envs, self._nq), dtype=np.float32)
        qvel_batch = np.zeros((self.num_envs, self._nv), dtype=np.float32)
        for i, data in enumerate(self._data):
            data.ctrl[:] = actions_np[i]
            for _ in range(self.config.substeps):
                mujoco.mj_step(self._model, data)
            qpos_batch[i] = data.qpos
            qvel_batch[i] = data.qvel
        return (
            torch.from_numpy(qpos_batch).to(self.device),
            torch.from_numpy(qvel_batch).to(self.device),
        )

    def _sim_reset(self, env_ids: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
        """Reset the selected envs to the model default state plus small noise."""
        ids = env_ids.cpu().numpy()
        n = len(ids)
        qpos_batch = np.zeros((n, self._nq), dtype=np.float32)
        qvel_batch = np.zeros((n, self._nv), dtype=np.float32)
        for i, env_id in enumerate(ids):
            data = self._data[env_id]
            mujoco.mj_resetData(self._model, data)
            # Add small random perturbation so the pole doesn't start perfectly upright
            data.qpos[:] += np.random.uniform(-0.05, 0.05, size=self._nq)
            data.qvel[:] += np.random.uniform(-0.05, 0.05, size=self._nv)
            qpos_batch[i] = data.qpos
            qvel_batch[i] = data.qvel
        return (
            torch.from_numpy(qpos_batch).to(self.device),
            torch.from_numpy(qvel_batch).to(self.device),
        )

    def _sim_close(self) -> None:
        """Close viewer/renderer if created, then drop the MjData instances."""
        if hasattr(self, "_viewer") and self._viewer is not None:
            self._viewer.close()
            self._viewer = None
        if hasattr(self, "_offscreen_renderer") and self._offscreen_renderer is not None:
            self._offscreen_renderer.close()
            self._offscreen_renderer = None
        self._data.clear()

    def render(self, env_idx: int = 0, mode: str = "human") -> torch.Tensor | None:
        """Render one env: interactive viewer ("human") or RGB tensor ("rgb_array").

        Returns None for "human" mode and for unrecognized modes.
        """
        if mode == "human":
            if not hasattr(self, "_viewer") or self._viewer is None:
                self._viewer = mujoco.viewer.launch_passive(
                    self._model, self._data[env_idx]
                )
            # Update visual geometry from current physics state
            mujoco.mj_forward(self._model, self._data[env_idx])
            self._viewer.sync()
            return None
        elif mode == "rgb_array":
            # Cache the offscreen renderer to avoid create/destroy overhead
            if not hasattr(self, "_offscreen_renderer") or self._offscreen_renderer is None:
                self._offscreen_renderer = mujoco.Renderer(self._model, height=480, width=640)
            self._offscreen_renderer.update_scene(self._data[env_idx])
            pixels = self._offscreen_renderer.render().copy()  # copy since buffer is reused
            return torch.from_numpy(pixels)

243
src/training/trainer.py Normal file
View File

@@ -0,0 +1,243 @@
import dataclasses
import sys
import tempfile
from pathlib import Path
import numpy as np
import tqdm
from src.core.runner import BaseRunner
from clearml import Task, Logger
import torch
from gymnasium import spaces
from skrl.memories.torch import RandomMemory
from src.models.mlp import SharedMLP
from skrl.agents.torch.ppo import PPO, PPO_DEFAULT_CONFIG
from skrl.trainers.torch import SequentialTrainer
@dataclasses.dataclass
class TrainerConfig:
rollout_steps: int = 2048
learning_epochs: int = 8
mini_batches: int = 4
discount_factor: float = 0.99
gae_lambda: float = 0.95
learning_rate: float = 3e-4
clip_ratio: float = 0.2
value_loss_scale: float = 0.5
entropy_loss_scale: float = 0.01
hidden_sizes: tuple[int, ...] = (64, 64)
total_timesteps: int = 1_000_000
log_interval: int = 10
# Video recording
record_video_every: int = 10000 # record a video every N timesteps (0 = disabled)
record_video_min_seconds: float = 10.0 # minimum video duration in seconds
record_video_fps: int = 0 # 0 = auto-derive from simulation rate
clearml_project: str | None = None
clearml_task: str | None = None
class VideoRecordingTrainer(SequentialTrainer):
    """Subclass of skrl's SequentialTrainer that records videos periodically.

    The training loop mirrors skrl's single_agent_train and additionally
    triggers an evaluation-video recording every
    ``trainer_config.record_video_every`` timesteps.
    """

    def __init__(self, env, agents, cfg=None, trainer_config: TrainerConfig | None = None):
        super().__init__(env=env, agents=agents, cfg=cfg)
        self._trainer_config = trainer_config
        # Videos are written to a throwaway temp directory; ClearML uploads copies.
        self._video_dir = Path(tempfile.mkdtemp(prefix="rl_videos_"))

    def single_agent_train(self) -> None:
        """Override to add periodic video recording."""
        assert self.num_simultaneous_agents == 1
        assert self.env.num_agents == 1
        states, infos = self.env.reset()
        for timestep in tqdm.tqdm(
            range(self.initial_timestep, self.timesteps),
            disable=self.disable_progressbar,
            file=sys.stdout,
        ):
            # Pre-interaction
            self.agents.pre_interaction(timestep=timestep, timesteps=self.timesteps)
            with torch.no_grad():
                actions = self.agents.act(states, timestep=timestep, timesteps=self.timesteps)[0]
            next_states, rewards, terminated, truncated, infos = self.env.step(actions)
            if not self.headless:
                self.env.render()
            # Store the transition in the agent's memory for later PPO updates.
            self.agents.record_transition(
                states=states,
                actions=actions,
                rewards=rewards,
                next_states=next_states,
                terminated=terminated,
                truncated=truncated,
                infos=infos,
                timestep=timestep,
                timesteps=self.timesteps,
            )
            # Forward any scalar environment-info entries into the agent's tracking.
            if self.environment_info in infos:
                for k, v in infos[self.environment_info].items():
                    if isinstance(v, torch.Tensor) and v.numel() == 1:
                        self.agents.track_data(f"Info / {k}", v.item())
            # Post-interaction runs the learning update when a rollout completes.
            self.agents.post_interaction(timestep=timestep, timesteps=self.timesteps)
            # Reset environments
            # Vectorized runners auto-reset internally, so just carry next_states.
            if self.env.num_envs > 1:
                states = next_states
            else:
                if terminated.any() or truncated.any():
                    with torch.no_grad():
                        states, infos = self.env.reset()
                else:
                    states = next_states
            # Record video at intervals
            cfg = self._trainer_config
            if (
                cfg
                and cfg.record_video_every > 0
                and (timestep + 1) % cfg.record_video_every == 0
            ):
                self._record_video(timestep + 1)

    def _get_video_fps(self) -> int:
        """Derive video fps from the simulation rate, or use configured value."""
        cfg = self._trainer_config
        if cfg.record_video_fps > 0:
            return cfg.record_video_fps
        # Auto-derive from runner's simulation parameters
        # (one rendered frame per policy step = 1 / (dt * substeps) fps).
        runner = self.env
        dt = getattr(runner.config, "dt", 0.02)
        substeps = getattr(runner.config, "substeps", 1)
        return max(1, int(round(1.0 / (dt * substeps))))

    def _record_video(self, timestep: int) -> None:
        """Record evaluation episodes and upload to ClearML.

        Runs full episodes with the current policy until at least
        ``record_video_min_seconds`` worth of frames are collected (hard
        capped at 3x that amount), then writes an mp4 and reports it.
        Silently no-ops when imageio is not installed.
        """
        try:
            import imageio.v3 as iio
        except ImportError:
            try:
                import imageio as iio
            except ImportError:
                return
        cfg = self._trainer_config
        fps = self._get_video_fps()
        min_frames = int(cfg.record_video_min_seconds * fps)
        max_frames = min_frames * 3  # hard cap to prevent runaway recording
        frames: list[np.ndarray] = []
        # NOTE(review): the second condition is redundant (min_frames <= max_frames);
        # kept for clarity of intent.
        while len(frames) < min_frames and len(frames) < max_frames:
            obs, _ = self.env.reset()
            done = False
            steps = 0
            # self.env is the runner; self.env.env is the task (BaseEnv) holding max_steps.
            max_episode_steps = getattr(self.env.env.config, "max_steps", 500)
            while not done and steps < max_episode_steps:
                with torch.no_grad():
                    action = self.agents.act(obs, timestep=timestep, timesteps=self.timesteps)[0]
                obs, _, terminated, truncated, _ = self.env.step(action)
                frame = self.env.render(mode="rgb_array")
                if frame is not None:
                    frames.append(frame.cpu().numpy() if isinstance(frame, torch.Tensor) else frame)
                done = (terminated | truncated).any().item()
                steps += 1
                if len(frames) >= max_frames:
                    break
        if frames:
            video_path = str(self._video_dir / f"step_{timestep}.mp4")
            # NOTE(review): assumes the installed imageio accepts fps= for mp4
            # writing via imwrite — confirm against the pinned imageio version.
            iio.imwrite(video_path, frames, fps=fps)
            logger = Logger.current_logger()
            if logger:
                logger.report_media(
                    title="Training Video",
                    series=f"step_{timestep}",
                    local_path=video_path,
                    iteration=timestep,
                )
        # Reset back to training state after recording
        self.env.reset()
class Trainer:
    """Wires a runner, a shared actor-critic model and a skrl PPO agent together."""

    def __init__(self, runner: BaseRunner, config: TrainerConfig):
        self.runner = runner
        self.config = config
        self._init_clearml()
        self._init_agent()

    def _init_clearml(self) -> None:
        """Start a ClearML task when both project and task names are configured."""
        if not (self.config.clearml_project and self.config.clearml_task):
            self.clearml_task = None
            return
        self.clearml_task = Task.init(
            project_name=self.config.clearml_project,
            task_name=self.config.clearml_task,
        )

    def _init_agent(self) -> None:
        """Build rollout memory, shared model and PPO agent from the runner's spaces."""
        cfg = self.config
        device: torch.device = self.runner.device
        obs_space: spaces.Space = self.runner.observation_space
        act_space: spaces.Space = self.runner.action_space
        self.memory: RandomMemory = RandomMemory(
            memory_size=cfg.rollout_steps, num_envs=self.runner.num_envs, device=device
        )
        self.model: SharedMLP = SharedMLP(
            observation_space=obs_space,
            action_space=act_space,
            device=device,
            hidden_sizes=cfg.hidden_sizes,
        )
        agent_cfg = PPO_DEFAULT_CONFIG.copy()
        agent_cfg.update(
            {
                "rollouts": cfg.rollout_steps,
                "learning_epochs": cfg.learning_epochs,
                "mini_batches": cfg.mini_batches,
                "discount_factor": cfg.discount_factor,
                "lambda": cfg.gae_lambda,
                "learning_rate": cfg.learning_rate,
                "ratio_clip": cfg.clip_ratio,
                "value_loss_scale": cfg.value_loss_scale,
                "entropy_loss_scale": cfg.entropy_loss_scale,
            }
        )
        # Policy and value share the same SharedMLP instance (shared trunk).
        self.agent: PPO = PPO(
            models={"policy": self.model, "value": self.model},
            memory=self.memory,
            observation_space=obs_space,
            action_space=act_space,
            device=device,
            cfg=agent_cfg,
        )

    def train(self) -> None:
        """Run the full training loop with periodic video recording."""
        VideoRecordingTrainer(
            env=self.runner,
            agents=self.agent,
            cfg={"timesteps": self.config.total_timesteps},
            trainer_config=self.config,
        ).train()

    def close(self) -> None:
        """Release simulator resources and finish the ClearML task, if any."""
        self.runner.close()
        if self.clearml_task:
            self.clearml_task.close()

47
train.py Normal file
View File

@@ -0,0 +1,47 @@
import hydra
from hydra.core.hydra_config import HydraConfig
from omegaconf import DictConfig, OmegaConf
from src.envs.cartpole import CartPoleEnv, CartPoleConfig
from src.runners.mujoco import MuJoCoRunner, MuJoCoRunnerConfig
from src.training.trainer import Trainer, TrainerConfig
from src.core.env import ActuatorConfig
def _build_env_config(cfg: DictConfig) -> CartPoleConfig:
    """Convert the Hydra env config node into a typed CartPoleConfig."""
    env_dict = OmegaConf.to_container(cfg.env, resolve=True)
    if "actuators" in env_dict:
        # OmegaConf yields plain lists/dicts; coerce each entry into an
        # ActuatorConfig, with ctrl_range as the tuple the dataclass expects.
        parsed = []
        for spec in env_dict["actuators"]:
            if "ctrl_range" in spec:
                spec["ctrl_range"] = tuple(spec["ctrl_range"])
            parsed.append(ActuatorConfig(**spec))
        env_dict["actuators"] = parsed
    return CartPoleConfig(**env_dict)
@hydra.main(version_base=None, config_path="configs", config_name="config")
def main(cfg: DictConfig) -> None:
    """Compose Hydra configs, build env/runner/trainer, and run training."""
    env_config = _build_env_config(cfg)
    runner_config = MuJoCoRunnerConfig(**OmegaConf.to_container(cfg.runner, resolve=True))
    training_dict = OmegaConf.to_container(cfg.training, resolve=True)
    if not training_dict.get("clearml_task"):
        # Derive a ClearML task name from the selected Hydra config-group choices.
        choices = HydraConfig.get().runtime.choices
        name_parts = (
            choices.get("env", "env"),
            choices.get("runner", "runner"),
            choices.get("training", "algo"),
        )
        training_dict["clearml_task"] = "-".join(name_parts)
    trainer_config = TrainerConfig(**training_dict)
    env = CartPoleEnv(env_config)
    runner = MuJoCoRunner(env=env, config=runner_config)
    trainer = Trainer(runner=runner, config=trainer_config)
    try:
        trainer.train()
    finally:
        trainer.close()


if __name__ == "__main__":
    main()