♻️ cleanup

This commit is contained in:
2026-03-11 23:16:42 +01:00
parent 3db68255f0
commit 23801857f4
6 changed files with 69 additions and 258 deletions

View File

@@ -170,9 +170,12 @@ def _create_base_task(
"registry.kube.optimize/worker-image:latest",
docker_setup_bash_script=(
"apt-get update && apt-get install -y --no-install-recommends "
"libosmesa6 libgl1-mesa-glx libglfw3 && rm -rf /var/lib/apt/lists/* "
"&& pip install 'jax[cuda12]' mujoco-mjx"
"libosmesa6-dev libgl1-mesa-glx libglfw3 && rm -rf /var/lib/apt/lists/* "
"&& pip install 'jax[cuda12]' mujoco-mjx PyOpenGL PyOpenGL-accelerate"
),
docker_arguments=[
"-e", "MUJOCO_GL=osmesa",
],
)
req_file = Path(__file__).resolve().parent.parent / "requirements.txt"
@@ -214,6 +217,10 @@ def main() -> None:
help="Maximum budget (total_timesteps) for promoted trials",
)
parser.add_argument("--eta", type=int, default=3, help="Successive halving reduction factor")
parser.add_argument(
"--max-consecutive-failures", type=int, default=3,
help="Abort HPO after N consecutive trial failures (0 = never abort)",
)
parser.add_argument(
"--time-limit-hours", type=float, default=72,
help="Total wall-clock time limit in hours",
@@ -312,6 +319,7 @@ def main() -> None:
time_limit_per_job=240, # 4 hours per trial max
eta=args.eta,
budget_param_name="Hydra/training.total_timesteps",
max_consecutive_failures=args.max_consecutive_failures,
)
# Send this HPO controller to a remote services worker