♻️ Refactor: add SMAC3 + Successive Halving hyperparameter optimization (ClearML integration)

This commit is contained in:
2026-03-11 22:52:01 +01:00
parent 35223b3560
commit 4115447022
34 changed files with 4255 additions and 102 deletions

1
src/hpo/__init__.py Normal file
View File

@@ -0,0 +1 @@
"""Hyperparameter optimization — SMAC3 + ClearML Successive Halving."""

636
src/hpo/smac3.py Normal file
View File

@@ -0,0 +1,636 @@
# Requires: pip install smac==2.0.0 ConfigSpace==0.4.20
import contextlib
import time
from collections.abc import Sequence
from functools import wraps
from typing import Any
from clearml import Task
from clearml.automation.optimization import Objective, SearchStrategy
from clearml.automation.parameters import Parameter
from clearml.backend_interface.session import SendError
from ConfigSpace import (
CategoricalHyperparameter,
ConfigurationSpace,
UniformFloatHyperparameter,
UniformIntegerHyperparameter,
)
from smac import MultiFidelityFacade
from smac.intensifier.successive_halving import SuccessiveHalving
from smac.runhistory.dataclasses import TrialInfo, TrialValue
from smac.scenario import Scenario
def retry_on_error(max_retries=5, initial_delay=2.0, backoff=2.0, exceptions=(Exception,)):
    """Decorator factory: retry the wrapped callable with exponential backoff.

    Each caught exception triggers another attempt after a pause that is
    multiplied by ``backoff`` every round.  Once ``max_retries`` attempts
    have failed, the wrapper gives up and returns ``None`` — it never
    re-raises, by design (callers treat ``None`` as "operation failed").
    """
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            pause = initial_delay
            attempts_left = max_retries
            while attempts_left > 0:
                attempts_left -= 1
                try:
                    return func(*args, **kwargs)
                except exceptions:
                    if attempts_left == 0:
                        break  # exhausted — swallow the error, fall through
                    time.sleep(pause)
                    pause *= backoff
            return None
        return wrapper
    return decorator
def _encode_param_name(name: str) -> str:
"""Encode parameter name for ConfigSpace (replace / with __SLASH__)"""
return name.replace("/", "__SLASH__")
def _decode_param_name(name: str) -> str:
"""Decode parameter name back to original (replace __SLASH__ with /)"""
return name.replace("__SLASH__", "/")
def _convert_param_to_cs(param: Parameter):
    """
    Translate a ClearML ``Parameter`` into the matching ConfigSpace
    hyperparameter, targeting the ConfigSpace>=1.x API (which dropped
    the old 'q' quantization argument).

    Raises:
        ValueError: if the parameter exposes neither ``values`` nor a
            ``min_value``/``max_value`` pair.
    """
    # '/' in parameter names confuses ConfigSpace, so escape it up front.
    cs_name = _encode_param_name(param.name)

    # Discrete choice parameter.
    if hasattr(param, "values"):
        return CategoricalHyperparameter(name=cs_name, choices=list(param.values))

    # Numeric range parameter (integer or float).
    if hasattr(param, "min_value") and hasattr(param, "max_value"):
        lo = param.min_value
        hi = param.max_value
        use_log = getattr(param, "log_scale", False)
        if isinstance(lo, int) and isinstance(hi, int):
            # Integer range.  A step_size != 1 is emulated with an explicit
            # categorical list of values, since 'q' no longer exists.
            if hasattr(param, "step_size"):
                step = int(param.step_size)
                if step != 1:
                    return CategoricalHyperparameter(
                        name=cs_name, choices=list(range(lo, hi + 1, step))
                    )
            return UniformIntegerHyperparameter(name=cs_name, lower=lo, upper=hi, log=use_log)
        # Anything else is treated as a float range.
        return UniformFloatHyperparameter(
            name=cs_name, lower=float(lo), upper=float(hi), log=use_log
        )

    raise ValueError(f"Unsupported Parameter type: {type(param)}")
class OptimizerSMAC(SearchStrategy):
    """
    SMAC3-based hyperparameter optimizer, matching OptimizerBOHB interface.

    Runs a *manual* Successive Halving schedule on top of SMAC's suggestion
    engine: every configuration starts at ``min_iteration_per_job`` epochs,
    and only the top ``1/eta`` of each rung is promoted to the next
    (``eta``-times larger) budget, resuming from the previous trial's
    checkpoint via a ``resume_from_task_id`` parameter on the cloned task.
    """
    def __init__(
        self,
        base_task_id: str,
        hyper_parameters: Sequence[Parameter],
        objective_metric: Objective,
        execution_queue: str,
        num_concurrent_workers: int,
        min_iteration_per_job: int,
        max_iteration_per_job: int,
        total_max_jobs: int,
        pool_period_min: float = 2.0,
        time_limit_per_job: float | None = None,
        compute_time_limit: float | None = None,
        **smac_kwargs: Any,
    ):
        """Configure the search strategy, ConfigSpace, and SMAC facade.

        Args:
            base_task_id: ClearML task to clone for each trial.
            hyper_parameters: ClearML ``Parameter`` definitions to optimize.
            objective_metric: Objective (title/series/sign) to read from trials.
            execution_queue: ClearML queue trials are enqueued to.
            num_concurrent_workers: Max trials running at once.
            min_iteration_per_job: Minimum (starting) epoch budget.
            max_iteration_per_job: Maximum (final) epoch budget.
            total_max_jobs: Overall trial budget for the optimization.
            pool_period_min: Polling period, in minutes.
            time_limit_per_job: Optional per-trial wall-clock limit (minutes).
            compute_time_limit: Optional total wall-clock limit (minutes).
            **smac_kwargs: Extra kwargs for the SuccessiveHalving intensifier;
                ``eta`` (reduction factor, default 3) is popped from here.
        """
        # Initialize base SearchStrategy
        super().__init__(
            base_task_id=base_task_id,
            hyper_parameters=hyper_parameters,
            objective_metric=objective_metric,
            execution_queue=execution_queue,
            num_concurrent_workers=num_concurrent_workers,
            pool_period_min=pool_period_min,
            time_limit_per_job=time_limit_per_job,
            compute_time_limit=compute_time_limit,
            min_iteration_per_job=min_iteration_per_job,
            max_iteration_per_job=max_iteration_per_job,
            total_max_jobs=total_max_jobs,
        )
        # Expose for internal use (access private attributes from base class)
        self.execution_queue = self._execution_queue
        self.min_iterations = min_iteration_per_job
        self.max_iterations = max_iteration_per_job
        self.num_concurrent_workers = self._num_concurrent_workers  # Fix: access private attribute
        # Objective details
        # Handle both single objective (string) and multi-objective (list) cases
        if isinstance(self._objective_metric.title, list):
            self.metric_title = self._objective_metric.title[0]  # Use first objective
        else:
            self.metric_title = self._objective_metric.title
        if isinstance(self._objective_metric.series, list):
            self.metric_series = self._objective_metric.series[0]  # Use first series
        else:
            self.metric_series = self._objective_metric.series
        # ClearML Objective stores sign as a list, e.g., ['max'] or ['min']
        objective_sign = getattr(self._objective_metric, "sign", None) or getattr(self._objective_metric, "order", None)
        # Handle list case - extract first element
        if isinstance(objective_sign, list):
            objective_sign = objective_sign[0] if objective_sign else "max"
        # Default to max if nothing found
        if objective_sign is None:
            objective_sign = "max"
        self.maximize_metric = str(objective_sign).lower() in ("max", "max_global")
        # Build ConfigSpace
        self.config_space = ConfigurationSpace(seed=42)
        for p in self._hyper_parameters:  # Access private attribute correctly
            cs_hp = _convert_param_to_cs(p)
            self.config_space.add(cs_hp)
        # Configure SMAC Scenario
        # NOTE(review): walltime_limit=None when compute_time_limit is unset —
        # confirm the installed SMAC Scenario accepts None (newer versions
        # default to inf, not None).
        scenario = Scenario(
            configspace=self.config_space,
            n_trials=self.total_max_jobs,
            min_budget=float(self.min_iterations),
            max_budget=float(self.max_iterations),
            walltime_limit=(self.compute_time_limit * 60) if self.compute_time_limit else None,
            deterministic=True,
        )
        # build the Successive Halving intensifier (NOT Hyperband!)
        # Hyperband runs multiple brackets with different starting budgets - wasteful
        # Successive Halving: ALL configs start at min_budget, only best get promoted
        # eta controls the reduction factor (default 3 means keep top 1/3 each round)
        # eta can be overridden via smac_kwargs from HyperParameterOptimizer
        eta = smac_kwargs.pop("eta", 3)  # Default to 3 if not specified
        intensifier = SuccessiveHalving(scenario=scenario, eta=eta, **smac_kwargs)
        # now pass that intensifier instance into the facade
        # target_function is a dummy: we drive SMAC via ask/tell, never run it.
        self.smac = MultiFidelityFacade(
            scenario=scenario,
            target_function=lambda config, budget, seed: 0.0,
            intensifier=intensifier,
            overwrite=True,
        )
        # Bookkeeping
        self.running_tasks = {}  # task_id -> trial info
        self.task_start_times = {}  # task_id -> start time (for timeout)
        self.completed_results = []
        self.best_score_so_far = float("-inf") if self.maximize_metric else float("inf")
        self.time_limit_per_job = time_limit_per_job  # Store time limit (minutes)
        # Checkpoint continuation tracking: config_key -> {budget: task_id}
        # Used to find the previous task's checkpoint when promoting a config
        self.config_to_tasks = {}  # config_key -> {budget: task_id}
        # Manual Successive Halving control
        self.eta = eta
        self.current_budget = float(self.min_iterations)
        self.configs_at_budget = {}  # budget -> list of (config, score, trial)
        self.pending_configs = []  # configs waiting to be evaluated at current_budget - list of (trial, prev_task_id)
        self.evaluated_at_budget = []  # (config, score, trial, task_id) for current budget
        self.smac_asked_configs = set()  # track which configs SMAC has given us
        # Calculate initial rung size for proper Successive Halving
        # With eta=3: rung sizes are n, n/3, n/9, ...
        # Total trials = n * (1 + 1/eta + 1/eta^2 + ...) = n * eta/(eta-1) for infinite series
        # For finite rungs, calculate exactly
        num_rungs = 1
        b = float(self.min_iterations)
        while b * eta <= self.max_iterations:
            num_rungs += 1
            b *= eta
        # Sum of geometric series: 1 + 1/eta + 1/eta^2 + ... (num_rungs terms)
        series_sum = sum(1.0 / (eta**i) for i in range(num_rungs))
        self.initial_rung_size = int(self.total_max_jobs / series_sum)
        self.initial_rung_size = max(self.initial_rung_size, self.num_concurrent_workers)  # at least num_workers
        self.configs_needed_for_rung = self.initial_rung_size  # how many configs we still need for current rung
        self.rung_closed = False  # whether we've collected all configs for current rung

    @retry_on_error(max_retries=3, initial_delay=5.0, exceptions=(ValueError, SendError, ConnectionError))
    def _get_task_safe(self, task_id: str):
        """Safely get a task with retry logic. Returns None after repeated failures."""
        return Task.get_task(task_id=task_id)

    @retry_on_error(max_retries=3, initial_delay=5.0, exceptions=(ValueError, SendError, ConnectionError))
    def _launch_task(self, config: dict, budget: float, prev_task_id: str | None = None):
        """Launch a task with retry logic for robustness.

        Clones the base task, overrides its hyperparameters and epoch budget,
        optionally wires up checkpoint continuation, and enqueues it.

        Args:
            config: Hyperparameter configuration dict
            budget: Number of epochs to train
            prev_task_id: Optional task ID from previous budget to continue from (checkpoint)

        Returns:
            The cloned ClearML Task, or None if the base task could not be fetched.
        """
        base = self._get_task_safe(task_id=self._base_task_id)
        if base is None:
            return None
        clone = Task.clone(
            source_task=base,
            name=f"HPO Trial - {base.name}",
            parent=Task.current_task().id,  # Set the current HPO task as parent
        )
        # Override hyperparameters
        for k, v in config.items():
            # Decode parameter name back to original (with slashes)
            original_name = _decode_param_name(k)
            # Convert numpy types to Python built-in types
            if hasattr(v, "item"):  # numpy scalar
                param_value = v.item()
            elif isinstance(v, int | float | str | bool):
                param_value = type(v)(v)  # Ensure it's the built-in type
            else:
                param_value = v
            clone.set_parameter(original_name, param_value)
        # Override epochs budget if multi-fidelity
        if self.max_iterations != self.min_iterations:
            clone.set_parameter("Hydra/training.max_epochs", int(budget))
        else:
            clone.set_parameter("Hydra/training.max_epochs", int(self.max_iterations))
        # If we have a previous task, pass its ID so the worker can download the checkpoint
        if prev_task_id:
            clone.set_parameter("Hydra/training.resume_from_task_id", prev_task_id)
        Task.enqueue(task=clone, queue_name=self.execution_queue)
        # Track start time for timeout enforcement
        self.task_start_times[clone.id] = time.time()
        return clone

    def start(self):
        """Run the Successive Halving loop until the trial budget is exhausted.

        Main loop: close finished rungs and promote the top 1/eta configs,
        ask SMAC for new configs while the first rung is open, launch pending
        trials up to the concurrency limit, then poll ClearML for completed /
        timed-out / unreachable tasks and feed their costs back to SMAC.

        Returns:
            list[dict]: one record per finished trial (task_id, config,
            budget, value, plus failure/timeout flags where applicable).
        """
        controller = Task.current_task()
        total_launched = 0
        # Keep launching & collecting until budget exhausted
        while total_launched < self.total_max_jobs:
            # Check if current budget rung is complete BEFORE asking for new trials
            # (no running tasks, no pending configs, and we have results for this budget)
            if not self.running_tasks and not self.pending_configs and self.evaluated_at_budget:
                # Rung complete! Promote top performers to next budget
                # Store results for this budget
                self.configs_at_budget[self.current_budget] = self.evaluated_at_budget.copy()
                # Sort by score (best first)
                sorted_configs = sorted(
                    self.evaluated_at_budget,
                    key=lambda x: x[1],  # score
                    reverse=self.maximize_metric,
                )
                # Move to next budget?
                next_budget = self.current_budget * self.eta
                if next_budget <= self.max_iterations:
                    # How many to promote (top 1/eta)
                    n_promote = max(1, len(sorted_configs) // self.eta)
                    promoted = sorted_configs[:n_promote]
                    # Update budget and reset for next rung
                    self.current_budget = next_budget
                    self.evaluated_at_budget = []
                    self.configs_needed_for_rung = 0  # promoted configs are all we need
                    self.rung_closed = True  # rung is pre-filled with promoted configs
                    # Re-queue promoted configs with new budget
                    # Include the previous task ID for checkpoint continuation
                    for _cfg, _score, old_trial, prev_task_id in promoted:
                        new_trial = TrialInfo(
                            config=old_trial.config,
                            instance=old_trial.instance,
                            seed=old_trial.seed,
                            budget=self.current_budget,
                        )
                        # Store as tuple: (trial, prev_task_id)
                        self.pending_configs.append((new_trial, prev_task_id))
                else:
                    # All budgets complete
                    break
            # Fill pending_configs with new trials ONLY if we haven't closed this rung yet
            # For the first rung: ask SMAC for initial_rung_size configs total
            # For subsequent rungs: only use promoted configs (rung is already closed)
            while (
                not self.rung_closed
                and len(self.pending_configs) + len(self.evaluated_at_budget) + len(self.running_tasks)
                < self.initial_rung_size
                and total_launched < self.total_max_jobs
            ):
                trial = self.smac.ask()
                if trial is None:
                    self.rung_closed = True
                    break
                # Create new trial with forced budget (TrialInfo is frozen, can't modify)
                trial_with_budget = TrialInfo(
                    config=trial.config,
                    instance=trial.instance,
                    seed=trial.seed,
                    budget=self.current_budget,
                )
                cfg_key = str(sorted(trial.config.items()))
                if cfg_key not in self.smac_asked_configs:
                    self.smac_asked_configs.add(cfg_key)
                    # Store as tuple: (trial, None) - no previous task for new configs
                    self.pending_configs.append((trial_with_budget, None))
            # Check if we've collected enough configs for this rung
            if (
                not self.rung_closed
                and len(self.pending_configs) + len(self.evaluated_at_budget) + len(self.running_tasks)
                >= self.initial_rung_size
            ):
                self.rung_closed = True
            # Launch pending configs up to concurrent limit
            while self.pending_configs and len(self.running_tasks) < self.num_concurrent_workers:
                # Unpack tuple: (trial, prev_task_id)
                trial, prev_task_id = self.pending_configs.pop(0)
                t = self._launch_task(trial.config, self.current_budget, prev_task_id=prev_task_id)
                if t is None:
                    # Launch failed: report the worst possible result to SMAC.
                    # SMAC always *minimizes* cost, so +inf is the worst value in
                    # both directions (the previous `-inf` for the minimize case
                    # wrongly marked failed launches as the best result seen).
                    self.smac.tell(trial, TrialValue(cost=float("inf")))
                    total_launched += 1
                    continue
                self.running_tasks[t.id] = trial
                # Track which task ID was used for this config at this budget
                cfg_key = str(sorted(trial.config.items()))
                if cfg_key not in self.config_to_tasks:
                    self.config_to_tasks[cfg_key] = {}
                self.config_to_tasks[cfg_key][self.current_budget] = t.id
                total_launched += 1
            if not self.running_tasks and not self.pending_configs:
                break
            # Poll for finished or timed out
            done = []
            timed_out = []
            failed_to_check = []
            for tid, _tri in self.running_tasks.items():
                try:
                    task = self._get_task_safe(task_id=tid)
                    if task is None:
                        failed_to_check.append(tid)
                        continue
                    st = task.get_status()
                    # Check if task completed normally
                    if st == Task.TaskStatusEnum.completed or st in (
                        Task.TaskStatusEnum.failed,
                        Task.TaskStatusEnum.stopped,
                    ):
                        done.append(tid)
                    # Check for timeout (if time limit is set)
                    elif self.time_limit_per_job and tid in self.task_start_times:
                        elapsed_minutes = (time.time() - self.task_start_times[tid]) / 60.0
                        if elapsed_minutes > self.time_limit_per_job:
                            with contextlib.suppress(Exception):
                                task.mark_stopped(force=True)
                            timed_out.append(tid)
                except Exception:
                    # Don't mark as failed immediately, might be transient
                    # Only mark failed after multiple consecutive failures
                    if not hasattr(self, "_task_check_failures"):
                        self._task_check_failures = {}
                    self._task_check_failures[tid] = self._task_check_failures.get(tid, 0) + 1
                    if self._task_check_failures[tid] >= 5:  # 5 consecutive failures
                        failed_to_check.append(tid)
                        del self._task_check_failures[tid]
            # Process tasks that failed to check
            for tid in failed_to_check:
                tri = self.running_tasks.pop(tid)
                if tid in self.task_start_times:
                    del self.task_start_times[tid]
                # Tell SMAC this trial failed with worst possible score
                res = float("-inf") if self.maximize_metric else float("inf")
                cost = -res if self.maximize_metric else res  # +inf either way
                self.smac.tell(tri, TrialValue(cost=cost))
                self.completed_results.append(
                    {
                        "task_id": tid,
                        "config": tri.config,
                        "budget": tri.budget,
                        "value": res,
                        "failed": True,
                    }
                )
                # Store result with task_id for checkpoint tracking
                self.evaluated_at_budget.append((tri.config, res, tri, tid))
            # Process completed tasks
            for tid in done:
                tri = self.running_tasks.pop(tid)
                if tid in self.task_start_times:
                    del self.task_start_times[tid]
                # Clear any accumulated failures for this task
                if hasattr(self, "_task_check_failures") and tid in self._task_check_failures:
                    del self._task_check_failures[tid]
                task = self._get_task_safe(task_id=tid)
                if task is None:
                    res = float("-inf") if self.maximize_metric else float("inf")
                else:
                    res = self._get_objective(task)
                    if res is None or res == float("-inf") or res == float("inf"):
                        res = float("-inf") if self.maximize_metric else float("inf")
                cost = -res if self.maximize_metric else res
                self.smac.tell(tri, TrialValue(cost=cost))
                self.completed_results.append(
                    {
                        "task_id": tid,
                        "config": tri.config,
                        "budget": tri.budget,
                        "value": res,
                    }
                )
                # Store result for this budget rung with task_id for checkpoint tracking
                self.evaluated_at_budget.append((tri.config, res, tri, tid))
                iteration = len(self.completed_results)
                # Always report the trial score (even if it's bad)
                if res is not None and res != float("-inf") and res != float("inf"):
                    controller.get_logger().report_scalar(
                        title="Optimization", series="trial_score", value=res, iteration=iteration
                    )
                controller.get_logger().report_scalar(
                    title="Optimization",
                    series="trial_budget",
                    value=tri.budget or self.max_iterations,
                    iteration=iteration,
                )
                # Update best score tracking based on actual results
                if res is not None and res != float("-inf") and res != float("inf"):
                    if self.maximize_metric:
                        self.best_score_so_far = max(self.best_score_so_far, res)
                    elif res < self.best_score_so_far:
                        self.best_score_so_far = res
                # Always report best score so far (shows flat line when no improvement)
                if self.best_score_so_far != float("-inf") and self.best_score_so_far != float("inf"):
                    controller.get_logger().report_scalar(
                        title="Optimization", series="best_score", value=self.best_score_so_far, iteration=iteration
                    )
                # Report running statistics
                valid_scores = [
                    r["value"]
                    for r in self.completed_results
                    if r["value"] is not None and r["value"] != float("-inf") and r["value"] != float("inf")
                ]
                if valid_scores:
                    controller.get_logger().report_scalar(
                        title="Optimization",
                        series="mean_score",
                        value=sum(valid_scores) / len(valid_scores),
                        iteration=iteration,
                    )
                controller.get_logger().report_scalar(
                    title="Progress",
                    series="completed_trials",
                    value=len(self.completed_results),
                    iteration=iteration,
                )
                controller.get_logger().report_scalar(
                    title="Progress", series="running_tasks", value=len(self.running_tasks), iteration=iteration
                )
            # Process timed out tasks (treat as failed with current objective value)
            for tid in timed_out:
                tri = self.running_tasks.pop(tid)
                if tid in self.task_start_times:
                    del self.task_start_times[tid]
                # Clear any accumulated failures for this task
                if hasattr(self, "_task_check_failures") and tid in self._task_check_failures:
                    del self._task_check_failures[tid]
                # Try to get the last objective value before timeout
                task = self._get_task_safe(task_id=tid)
                if task is None:
                    res = float("-inf") if self.maximize_metric else float("inf")
                else:
                    res = self._get_objective(task)
                    if res is None:
                        res = float("-inf") if self.maximize_metric else float("inf")
                cost = -res if self.maximize_metric else res
                self.smac.tell(tri, TrialValue(cost=cost))
                self.completed_results.append(
                    {
                        "task_id": tid,
                        "config": tri.config,
                        "budget": tri.budget,
                        "value": res,
                        "timed_out": True,
                    }
                )
                # Store timed out result for this budget rung with task_id
                self.evaluated_at_budget.append((tri.config, res, tri, tid))
            time.sleep(self.pool_period_minutes * 60)  # Fix: use correct attribute name from base class
            # NOTE(review): assumes Task.get_runtime() returns elapsed seconds — confirm
            if self.compute_time_limit and controller.get_runtime() > self.compute_time_limit * 60:
                break
        # Finalize
        self._finalize()
        return self.completed_results

    @retry_on_error(max_retries=3, initial_delay=2.0, exceptions=(SendError, ConnectionError, KeyError))
    def _get_objective(self, task: Task):
        """Get objective metric value with retry logic for robustness.

        Returns the metric's 'max' ('min' when minimizing) over all logged
        iterations, falling back to 'last'; None when the metric is missing.
        """
        if task is None:
            return None
        try:
            m = task.get_last_scalar_metrics()
            if not m:
                return None
            metric_data = m[self.metric_title][self.metric_series]
            # ClearML returns dict with 'last', 'min', 'max' keys representing
            # the last/min/max values of this series over ALL logged iterations.
            # For snake_length/train_max: 'last' is the last logged train_max value,
            # 'max' is the highest train_max ever logged during training.
            # Use 'max' if maximizing (we want the best performance achieved),
            # 'min' if minimizing, fallback to 'last'
            if self.maximize_metric and "max" in metric_data:
                result = metric_data["max"]
            elif not self.maximize_metric and "min" in metric_data:
                result = metric_data["min"]
            else:
                result = metric_data["last"]
            return result
        except Exception:  # Exception already covers KeyError; missing metric -> None
            return None

    def _finalize(self):
        """Log the final best score and upload the best config as an artifact.

        Reports our own best-score tracking, then tries to cross-check it
        against SMAC's incumbent (best configuration in its runhistory).
        """
        controller = Task.current_task()
        # Report final best score
        controller.get_logger().report_text(f"Final best score: {self.best_score_so_far}")
        # Also try to get SMAC's incumbent for comparison
        try:
            incumbent = self.smac.intensifier.get_incumbent()
            if incumbent is not None:
                runhistory = self.smac.runhistory
                # Try different ways to get the cost
                incumbent_cost = None
                try:
                    incumbent_cost = runhistory.get_cost(incumbent)
                except Exception:
                    # Fallback: search through runhistory manually
                    for trial_key, trial_value in runhistory.items():
                        trial_config = runhistory.get_config(trial_key.config_id)
                        if trial_config == incumbent and (incumbent_cost is None or trial_value.cost < incumbent_cost):
                            incumbent_cost = trial_value.cost
                if incumbent_cost is not None:
                    # Costs were negated on the way in when maximizing; undo that here.
                    score = -incumbent_cost if self.maximize_metric else incumbent_cost
                    controller.get_logger().report_text(f"SMAC incumbent: {incumbent}, score: {score}")
                    controller.upload_artifact(
                        "best_config",
                        {"config": dict(incumbent), "score": score, "our_best_score": self.best_score_so_far},
                    )
            else:
                controller.upload_artifact("best_config", {"our_best_score": self.best_score_so_far})
        except Exception as e:
            controller.get_logger().report_text(f"Error getting SMAC incumbent: {e}")
            controller.upload_artifact("best_config", {"our_best_score": self.best_score_so_far})