♻️ Refactor HPO: add SMAC3 optimizer with manual Successive Halving and checkpoint continuation
This commit is contained in:
1
src/hpo/__init__.py
Normal file
1
src/hpo/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
"""Hyperparameter optimization — SMAC3 + ClearML Successive Halving."""
|
||||
636
src/hpo/smac3.py
Normal file
636
src/hpo/smac3.py
Normal file
@@ -0,0 +1,636 @@
|
||||
# Requires: pip install smac==2.0.0 ConfigSpace==0.4.20
|
||||
import contextlib
|
||||
import time
|
||||
from collections.abc import Sequence
|
||||
from functools import wraps
|
||||
from typing import Any
|
||||
|
||||
from clearml import Task
|
||||
from clearml.automation.optimization import Objective, SearchStrategy
|
||||
from clearml.automation.parameters import Parameter
|
||||
from clearml.backend_interface.session import SendError
|
||||
from ConfigSpace import (
|
||||
CategoricalHyperparameter,
|
||||
ConfigurationSpace,
|
||||
UniformFloatHyperparameter,
|
||||
UniformIntegerHyperparameter,
|
||||
)
|
||||
from smac import MultiFidelityFacade
|
||||
from smac.intensifier.successive_halving import SuccessiveHalving
|
||||
from smac.runhistory.dataclasses import TrialInfo, TrialValue
|
||||
from smac.scenario import Scenario
|
||||
|
||||
|
||||
def retry_on_error(max_retries=5, initial_delay=2.0, backoff=2.0, exceptions=(Exception,)):
    """Decorator that retries the wrapped callable with exponential backoff.

    The wrapped function is attempted up to ``max_retries`` times; between
    attempts the wrapper sleeps ``initial_delay`` seconds, multiplying the
    wait by ``backoff`` each round. Once every attempt has failed (or if
    ``max_retries`` is 0) the wrapper returns ``None`` instead of
    propagating the exception — callers must treat ``None`` as failure.
    """

    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            wait = initial_delay
            attempt = 0
            while attempt < max_retries:
                try:
                    return func(*args, **kwargs)
                except exceptions:
                    attempt += 1
                    if attempt >= max_retries:
                        break  # give up silently; caller checks for None
                    time.sleep(wait)
                    wait *= backoff
            return None

        return wrapper

    return decorator
||||
|
||||
|
||||
def _encode_param_name(name: str) -> str:
|
||||
"""Encode parameter name for ConfigSpace (replace / with __SLASH__)"""
|
||||
return name.replace("/", "__SLASH__")
|
||||
|
||||
|
||||
def _decode_param_name(name: str) -> str:
|
||||
"""Decode parameter name back to original (replace __SLASH__ with /)"""
|
||||
return name.replace("__SLASH__", "/")
|
||||
|
||||
|
||||
def _convert_param_to_cs(param: Parameter):
    """
    Convert a ClearML Parameter into a ConfigSpace hyperparameter,
    adapted to ConfigSpace>=1.x (no more 'q' argument).

    Mapping:
      - parameter with ``values``          -> CategoricalHyperparameter
      - int min/max with step_size != 1    -> CategoricalHyperparameter over the
                                              quantized grid (emulates the removed
                                              'q' argument)
      - int min/max otherwise              -> UniformIntegerHyperparameter
      - float min/max                      -> UniformFloatHyperparameter

    Raises:
        ValueError: if the parameter exposes neither ``values`` nor a
            min/max range.
    """
    # Encode the name to avoid ConfigSpace issues with special chars like '/'
    name = _encode_param_name(param.name)

    # Categorical / discrete list
    if hasattr(param, "values"):
        return CategoricalHyperparameter(name=name, choices=list(param.values))

    # Numeric range (float or int)
    if hasattr(param, "min_value") and hasattr(param, "max_value"):
        min_val = param.min_value
        max_val = param.max_value

        # Check if this should be treated as integer
        if isinstance(min_val, int) and isinstance(max_val, int):
            log = getattr(param, "log_scale", False)

            # step_size may exist but be None on some ClearML parameter types
            # (e.g. UniformParameterRange defaults it to None); the previous
            # hasattr check then crashed on int(None). Treat None/missing as 1.
            step = getattr(param, "step_size", None)
            sv = int(step) if step is not None else 1
            if sv != 1:
                # emulate quantization by explicit list of values
                choices = list(range(min_val, max_val + 1, sv))
                return CategoricalHyperparameter(name=name, choices=choices)

            # Simple uniform integer range
            return UniformIntegerHyperparameter(name=name, lower=min_val, upper=max_val, log=log)
        else:
            # Treat as float
            lower, upper = float(min_val), float(max_val)
            log = getattr(param, "log_scale", False)
            return UniformFloatHyperparameter(name=name, lower=lower, upper=upper, log=log)

    raise ValueError(f"Unsupported Parameter type: {type(param)}")
||||
|
||||
|
||||
class OptimizerSMAC(SearchStrategy):
    """
    SMAC3-based hyperparameter optimizer, matching OptimizerBOHB interface.

    Runs a manual Successive Halving schedule on top of SMAC's ask/tell API:
    all configurations start at ``min_iteration_per_job`` epochs, and only
    the top ``1/eta`` fraction of each rung is promoted to the next
    (eta-times larger) budget, resuming from the previous task's checkpoint
    via the ``Hydra/training.resume_from_task_id`` parameter.
    """

    def __init__(
        self,
        base_task_id: str,
        hyper_parameters: Sequence[Parameter],
        objective_metric: Objective,
        execution_queue: str,
        num_concurrent_workers: int,
        min_iteration_per_job: int,
        max_iteration_per_job: int,
        total_max_jobs: int,
        pool_period_min: float = 2.0,
        time_limit_per_job: float | None = None,
        compute_time_limit: float | None = None,
        **smac_kwargs: Any,
    ):
        # Initialize base SearchStrategy
        super().__init__(
            base_task_id=base_task_id,
            hyper_parameters=hyper_parameters,
            objective_metric=objective_metric,
            execution_queue=execution_queue,
            num_concurrent_workers=num_concurrent_workers,
            pool_period_min=pool_period_min,
            time_limit_per_job=time_limit_per_job,
            compute_time_limit=compute_time_limit,
            min_iteration_per_job=min_iteration_per_job,
            max_iteration_per_job=max_iteration_per_job,
            total_max_jobs=total_max_jobs,
        )

        # Expose for internal use (access private attributes from base class)
        self.execution_queue = self._execution_queue
        self.min_iterations = min_iteration_per_job
        self.max_iterations = max_iteration_per_job
        self.num_concurrent_workers = self._num_concurrent_workers  # Fix: access private attribute

        # Objective details
        # Handle both single objective (string) and multi-objective (list) cases
        if isinstance(self._objective_metric.title, list):
            self.metric_title = self._objective_metric.title[0]  # Use first objective
        else:
            self.metric_title = self._objective_metric.title

        if isinstance(self._objective_metric.series, list):
            self.metric_series = self._objective_metric.series[0]  # Use first series
        else:
            self.metric_series = self._objective_metric.series

        # ClearML Objective stores sign as a list, e.g., ['max'] or ['min']
        objective_sign = getattr(self._objective_metric, "sign", None) or getattr(self._objective_metric, "order", None)

        # Handle list case - extract first element
        if isinstance(objective_sign, list):
            objective_sign = objective_sign[0] if objective_sign else "max"

        # Default to max if nothing found
        if objective_sign is None:
            objective_sign = "max"

        self.maximize_metric = str(objective_sign).lower() in ("max", "max_global")

        # Build ConfigSpace
        self.config_space = ConfigurationSpace(seed=42)
        for p in self._hyper_parameters:  # Access private attribute correctly
            cs_hp = _convert_param_to_cs(p)
            self.config_space.add(cs_hp)

        # Configure SMAC Scenario
        scenario = Scenario(
            configspace=self.config_space,
            n_trials=self.total_max_jobs,
            min_budget=float(self.min_iterations),
            max_budget=float(self.max_iterations),
            walltime_limit=(self.compute_time_limit * 60) if self.compute_time_limit else None,
            deterministic=True,
        )

        # build the Successive Halving intensifier (NOT Hyperband!)
        # Hyperband runs multiple brackets with different starting budgets - wasteful
        # Successive Halving: ALL configs start at min_budget, only best get promoted
        # eta controls the reduction factor (default 3 means keep top 1/3 each round)
        # eta can be overridden via smac_kwargs from HyperParameterOptimizer
        eta = smac_kwargs.pop("eta", 3)  # Default to 3 if not specified
        intensifier = SuccessiveHalving(scenario=scenario, eta=eta, **smac_kwargs)

        # now pass that intensifier instance into the facade
        # target_function is a dummy - we drive SMAC via ask/tell, never via optimize()
        self.smac = MultiFidelityFacade(
            scenario=scenario,
            target_function=lambda config, budget, seed: 0.0,
            intensifier=intensifier,
            overwrite=True,
        )

        # Bookkeeping
        self.running_tasks = {}  # task_id -> trial info
        self.task_start_times = {}  # task_id -> start time (for timeout)
        self.completed_results = []
        self.best_score_so_far = float("-inf") if self.maximize_metric else float("inf")
        self.time_limit_per_job = time_limit_per_job  # Store time limit (minutes)

        # Checkpoint continuation tracking: config_key -> {budget: task_id}
        # Used to find the previous task's checkpoint when promoting a config
        self.config_to_tasks = {}  # config_key -> {budget: task_id}

        # Manual Successive Halving control
        self.eta = eta
        self.current_budget = float(self.min_iterations)
        self.configs_at_budget = {}  # budget -> list of (config, score, trial)
        self.pending_configs = []  # configs waiting to be evaluated at current_budget - list of (trial, prev_task_id)
        self.evaluated_at_budget = []  # (config, score, trial, task_id) for current budget
        self.smac_asked_configs = set()  # track which configs SMAC has given us

        # Calculate initial rung size for proper Successive Halving
        # With eta=3: rung sizes are n, n/3, n/9, ...
        # Total trials = n * (1 + 1/eta + 1/eta^2 + ...) = n * eta/(eta-1) for infinite series
        # For finite rungs, calculate exactly
        num_rungs = 1
        b = float(self.min_iterations)
        while b * eta <= self.max_iterations:
            num_rungs += 1
            b *= eta

        # Sum of geometric series: 1 + 1/eta + 1/eta^2 + ... (num_rungs terms)
        series_sum = sum(1.0 / (eta**i) for i in range(num_rungs))
        self.initial_rung_size = int(self.total_max_jobs / series_sum)
        self.initial_rung_size = max(self.initial_rung_size, self.num_concurrent_workers)  # at least num_workers
        self.configs_needed_for_rung = self.initial_rung_size  # how many configs we still need for current rung
        self.rung_closed = False  # whether we've collected all configs for current rung

    @retry_on_error(max_retries=3, initial_delay=5.0, exceptions=(ValueError, SendError, ConnectionError))
    def _get_task_safe(self, task_id: str):
        """Safely get a task with retry logic. Returns None after exhausted retries."""
        return Task.get_task(task_id=task_id)

    @retry_on_error(max_retries=3, initial_delay=5.0, exceptions=(ValueError, SendError, ConnectionError))
    def _launch_task(self, config: dict, budget: float, prev_task_id: str | None = None):
        """Launch a task with retry logic for robustness.

        Clones the base task, overrides its hyperparameters and epoch budget,
        and enqueues it on the execution queue. Returns the cloned Task, or
        None if cloning/launching failed after all retries.

        Args:
            config: Hyperparameter configuration dict
            budget: Number of epochs to train
            prev_task_id: Optional task ID from previous budget to continue from (checkpoint)
        """
        base = self._get_task_safe(task_id=self._base_task_id)
        if base is None:
            return None

        clone = Task.clone(
            source_task=base,
            name=f"HPO Trial - {base.name}",
            parent=Task.current_task().id,  # Set the current HPO task as parent
        )
        # Override hyperparameters
        for k, v in config.items():
            # Decode parameter name back to original (with slashes)
            original_name = _decode_param_name(k)
            # Convert numpy types to Python built-in types
            if hasattr(v, "item"):  # numpy scalar
                param_value = v.item()
            elif isinstance(v, int | float | str | bool):
                param_value = type(v)(v)  # Ensure it's the built-in type
            else:
                param_value = v
            clone.set_parameter(original_name, param_value)
        # Override epochs budget if multi-fidelity
        if self.max_iterations != self.min_iterations:
            clone.set_parameter("Hydra/training.max_epochs", int(budget))
        else:
            clone.set_parameter("Hydra/training.max_epochs", int(self.max_iterations))

        # If we have a previous task, pass its ID so the worker can download the checkpoint
        if prev_task_id:
            clone.set_parameter("Hydra/training.resume_from_task_id", prev_task_id)

        Task.enqueue(task=clone, queue_name=self.execution_queue)
        # Track start time for timeout enforcement
        self.task_start_times[clone.id] = time.time()
        return clone

    def start(self):
        """Run the optimization loop until the trial budget is exhausted.

        Repeatedly: close out a finished Successive Halving rung (promoting
        the top 1/eta configs to the next budget with checkpoint task IDs),
        ask SMAC for new configs to fill the first rung, launch pending
        trials up to the concurrency limit, and poll running tasks for
        completion/timeout, reporting scores back to SMAC and to the
        controller task's scalars.

        Returns:
            The list of completed trial result dicts.
        """
        controller = Task.current_task()
        total_launched = 0

        # Keep launching & collecting until budget exhausted
        while total_launched < self.total_max_jobs:
            # Check if current budget rung is complete BEFORE asking for new trials
            # (no running tasks, no pending configs, and we have results for this budget)
            if not self.running_tasks and not self.pending_configs and self.evaluated_at_budget:
                # Rung complete! Promote top performers to next budget

                # Store results for this budget
                self.configs_at_budget[self.current_budget] = self.evaluated_at_budget.copy()

                # Sort by score (best first)
                sorted_configs = sorted(
                    self.evaluated_at_budget,
                    key=lambda x: x[1],  # score
                    reverse=self.maximize_metric,
                )

                # Move to next budget?
                next_budget = self.current_budget * self.eta
                if next_budget <= self.max_iterations:
                    # How many to promote (top 1/eta)
                    n_promote = max(1, len(sorted_configs) // self.eta)
                    promoted = sorted_configs[:n_promote]

                    # Update budget and reset for next rung
                    self.current_budget = next_budget
                    self.evaluated_at_budget = []
                    self.configs_needed_for_rung = 0  # promoted configs are all we need
                    self.rung_closed = True  # rung is pre-filled with promoted configs

                    # Re-queue promoted configs with new budget
                    # Include the previous task ID for checkpoint continuation
                    for _cfg, _score, old_trial, prev_task_id in promoted:
                        # TrialInfo is frozen, so build a new one with the larger budget
                        new_trial = TrialInfo(
                            config=old_trial.config,
                            instance=old_trial.instance,
                            seed=old_trial.seed,
                            budget=self.current_budget,
                        )
                        # Store as tuple: (trial, prev_task_id)
                        self.pending_configs.append((new_trial, prev_task_id))
                else:
                    # All budgets complete
                    break

            # Fill pending_configs with new trials ONLY if we haven't closed this rung yet
            # For the first rung: ask SMAC for initial_rung_size configs total
            # For subsequent rungs: only use promoted configs (rung is already closed)
            while (
                not self.rung_closed
                and len(self.pending_configs) + len(self.evaluated_at_budget) + len(self.running_tasks)
                < self.initial_rung_size
                and total_launched < self.total_max_jobs
            ):
                trial = self.smac.ask()
                if trial is None:
                    self.rung_closed = True
                    break
                # Create new trial with forced budget (TrialInfo is frozen, can't modify)
                trial_with_budget = TrialInfo(
                    config=trial.config,
                    instance=trial.instance,
                    seed=trial.seed,
                    budget=self.current_budget,
                )
                cfg_key = str(sorted(trial.config.items()))
                if cfg_key not in self.smac_asked_configs:
                    self.smac_asked_configs.add(cfg_key)
                    # Store as tuple: (trial, None) - no previous task for new configs
                    self.pending_configs.append((trial_with_budget, None))

            # Check if we've collected enough configs for this rung
            if (
                not self.rung_closed
                and len(self.pending_configs) + len(self.evaluated_at_budget) + len(self.running_tasks)
                >= self.initial_rung_size
            ):
                self.rung_closed = True

            # Launch pending configs up to concurrent limit
            while self.pending_configs and len(self.running_tasks) < self.num_concurrent_workers:
                # Unpack tuple: (trial, prev_task_id)
                trial, prev_task_id = self.pending_configs.pop(0)
                t = self._launch_task(trial.config, self.current_budget, prev_task_id=prev_task_id)
                if t is None:
                    # Launch failed: report the worst possible result to SMAC.
                    # SMAC always *minimizes* cost, so +inf is the worst value
                    # regardless of metric direction. (The previous
                    # maximize-dependent sign handed -inf to SMAC when
                    # minimizing, which would have ranked a failed trial best.)
                    self.smac.tell(trial, TrialValue(cost=float("inf")))
                    total_launched += 1
                    continue
                self.running_tasks[t.id] = trial

                # Track which task ID was used for this config at this budget
                cfg_key = str(sorted(trial.config.items()))
                if cfg_key not in self.config_to_tasks:
                    self.config_to_tasks[cfg_key] = {}
                self.config_to_tasks[cfg_key][self.current_budget] = t.id

                total_launched += 1

            if not self.running_tasks and not self.pending_configs:
                break

            # Poll for finished or timed out
            done = []
            timed_out = []
            failed_to_check = []
            for tid, _tri in self.running_tasks.items():
                try:
                    task = self._get_task_safe(task_id=tid)
                    if task is None:
                        failed_to_check.append(tid)
                        continue

                    st = task.get_status()

                    # Check if task completed normally
                    if st == Task.TaskStatusEnum.completed or st in (
                        Task.TaskStatusEnum.failed,
                        Task.TaskStatusEnum.stopped,
                    ):
                        done.append(tid)
                    # Check for timeout (if time limit is set)
                    elif self.time_limit_per_job and tid in self.task_start_times:
                        elapsed_minutes = (time.time() - self.task_start_times[tid]) / 60.0
                        if elapsed_minutes > self.time_limit_per_job:
                            with contextlib.suppress(Exception):
                                task.mark_stopped(force=True)
                            timed_out.append(tid)
                except Exception:
                    # Don't mark as failed immediately, might be transient
                    # Only mark failed after multiple consecutive failures
                    if not hasattr(self, "_task_check_failures"):
                        self._task_check_failures = {}
                    self._task_check_failures[tid] = self._task_check_failures.get(tid, 0) + 1
                    if self._task_check_failures[tid] >= 5:  # 5 consecutive failures
                        failed_to_check.append(tid)
                        del self._task_check_failures[tid]

            # Process tasks that failed to check
            for tid in failed_to_check:
                tri = self.running_tasks.pop(tid)
                if tid in self.task_start_times:
                    del self.task_start_times[tid]
                # Tell SMAC this trial failed with worst possible score
                res = float("-inf") if self.maximize_metric else float("inf")
                cost = -res if self.maximize_metric else res
                self.smac.tell(tri, TrialValue(cost=cost))
                self.completed_results.append(
                    {
                        "task_id": tid,
                        "config": tri.config,
                        "budget": tri.budget,
                        "value": res,
                        "failed": True,
                    }
                )
                # Store result with task_id for checkpoint tracking
                self.evaluated_at_budget.append((tri.config, res, tri, tid))

            # Process completed tasks
            for tid in done:
                tri = self.running_tasks.pop(tid)
                if tid in self.task_start_times:
                    del self.task_start_times[tid]

                # Clear any accumulated failures for this task
                if hasattr(self, "_task_check_failures") and tid in self._task_check_failures:
                    del self._task_check_failures[tid]

                task = self._get_task_safe(task_id=tid)
                if task is None:
                    res = float("-inf") if self.maximize_metric else float("inf")
                else:
                    res = self._get_objective(task)

                if res is None or res == float("-inf") or res == float("inf"):
                    res = float("-inf") if self.maximize_metric else float("inf")

                cost = -res if self.maximize_metric else res
                self.smac.tell(tri, TrialValue(cost=cost))
                self.completed_results.append(
                    {
                        "task_id": tid,
                        "config": tri.config,
                        "budget": tri.budget,
                        "value": res,
                    }
                )

                # Store result for this budget rung with task_id for checkpoint tracking
                self.evaluated_at_budget.append((tri.config, res, tri, tid))

                iteration = len(self.completed_results)

                # Always report the trial score (even if it's bad)
                if res is not None and res != float("-inf") and res != float("inf"):
                    controller.get_logger().report_scalar(
                        title="Optimization", series="trial_score", value=res, iteration=iteration
                    )
                controller.get_logger().report_scalar(
                    title="Optimization",
                    series="trial_budget",
                    value=tri.budget or self.max_iterations,
                    iteration=iteration,
                )

                # Update best score tracking based on actual results
                if res is not None and res != float("-inf") and res != float("inf"):
                    if self.maximize_metric:
                        self.best_score_so_far = max(self.best_score_so_far, res)
                    elif res < self.best_score_so_far:
                        self.best_score_so_far = res

                # Always report best score so far (shows flat line when no improvement)
                if self.best_score_so_far != float("-inf") and self.best_score_so_far != float("inf"):
                    controller.get_logger().report_scalar(
                        title="Optimization", series="best_score", value=self.best_score_so_far, iteration=iteration
                    )

                # Report running statistics
                valid_scores = [
                    r["value"]
                    for r in self.completed_results
                    if r["value"] is not None and r["value"] != float("-inf") and r["value"] != float("inf")
                ]
                if valid_scores:
                    controller.get_logger().report_scalar(
                        title="Optimization",
                        series="mean_score",
                        value=sum(valid_scores) / len(valid_scores),
                        iteration=iteration,
                    )
                controller.get_logger().report_scalar(
                    title="Progress",
                    series="completed_trials",
                    value=len(self.completed_results),
                    iteration=iteration,
                )
                controller.get_logger().report_scalar(
                    title="Progress", series="running_tasks", value=len(self.running_tasks), iteration=iteration
                )

            # Process timed out tasks (treat as failed with current objective value)
            for tid in timed_out:
                tri = self.running_tasks.pop(tid)
                if tid in self.task_start_times:
                    del self.task_start_times[tid]

                # Clear any accumulated failures for this task
                if hasattr(self, "_task_check_failures") and tid in self._task_check_failures:
                    del self._task_check_failures[tid]

                # Try to get the last objective value before timeout
                task = self._get_task_safe(task_id=tid)
                if task is None:
                    res = float("-inf") if self.maximize_metric else float("inf")
                else:
                    res = self._get_objective(task)

                if res is None:
                    res = float("-inf") if self.maximize_metric else float("inf")
                cost = -res if self.maximize_metric else res
                self.smac.tell(tri, TrialValue(cost=cost))
                self.completed_results.append(
                    {
                        "task_id": tid,
                        "config": tri.config,
                        "budget": tri.budget,
                        "value": res,
                        "timed_out": True,
                    }
                )

                # Store timed out result for this budget rung with task_id
                self.evaluated_at_budget.append((tri.config, res, tri, tid))

            time.sleep(self.pool_period_minutes * 60)  # Fix: use correct attribute name from base class
            if self.compute_time_limit and controller.get_runtime() > self.compute_time_limit * 60:
                break

        # Finalize
        self._finalize()
        return self.completed_results

    @retry_on_error(max_retries=3, initial_delay=2.0, exceptions=(SendError, ConnectionError, KeyError))
    def _get_objective(self, task: Task):
        """Get objective metric value with retry logic for robustness.

        Reads the task's last scalar metrics and returns the best value seen
        ('max' when maximizing, 'min' when minimizing, 'last' as fallback),
        or None if the metric is missing/unreadable.
        """
        if task is None:
            return None

        try:
            m = task.get_last_scalar_metrics()
            if not m:
                return None

            metric_data = m[self.metric_title][self.metric_series]

            # ClearML returns dict with 'last', 'min', 'max' keys representing
            # the last/min/max values of this series over ALL logged iterations.
            # For snake_length/train_max: 'last' is the last logged train_max value,
            # 'max' is the highest train_max ever logged during training.

            # Use 'max' if maximizing (we want the best performance achieved),
            # 'min' if minimizing, fallback to 'last'
            if self.maximize_metric and "max" in metric_data:
                result = metric_data["max"]
            elif not self.maximize_metric and "min" in metric_data:
                result = metric_data["min"]
            else:
                result = metric_data["last"]
            return result
        except Exception:
            # Covers missing title/series keys (KeyError) and any transient
            # backend error; callers treat None as "no metric available".
            # (The previous `except (KeyError, Exception)` was redundant -
            # Exception already subsumes KeyError.)
            return None

    def _finalize(self):
        """Report the final best score and upload the best config as an artifact."""
        controller = Task.current_task()
        # Report final best score
        controller.get_logger().report_text(f"Final best score: {self.best_score_so_far}")

        # Also try to get SMAC's incumbent for comparison
        try:
            incumbent = self.smac.intensifier.get_incumbent()
            if incumbent is not None:
                runhistory = self.smac.runhistory
                # Try different ways to get the cost
                incumbent_cost = None
                try:
                    incumbent_cost = runhistory.get_cost(incumbent)
                except Exception:
                    # Fallback: search through runhistory manually
                    for trial_key, trial_value in runhistory.items():
                        trial_config = runhistory.get_config(trial_key.config_id)
                        if trial_config == incumbent and (incumbent_cost is None or trial_value.cost < incumbent_cost):
                            incumbent_cost = trial_value.cost

                if incumbent_cost is not None:
                    score = -incumbent_cost if self.maximize_metric else incumbent_cost
                    controller.get_logger().report_text(f"SMAC incumbent: {incumbent}, score: {score}")
                    controller.upload_artifact(
                        "best_config",
                        {"config": dict(incumbent), "score": score, "our_best_score": self.best_score_so_far},
                    )
                else:
                    controller.upload_artifact("best_config", {"our_best_score": self.best_score_so_far})
        except Exception as e:
            controller.get_logger().report_text(f"Error getting SMAC incumbent: {e}")
            controller.upload_artifact("best_config", {"our_best_score": self.best_score_so_far})
||||
Reference in New Issue
Block a user