feat: add guidance rescale options for Adaptive Projected Guidance in inference

2026-04-08 06:28:48 +00:00 · 2025-09-21 13:03:14 +09:00
parent 9621d9d637
commit 040d976597
3 changed files with 27 additions and 5 deletions
--- a/docs/hunyuan_image_train_network.md
+++ b/docs/hunyuan_image_train_network.md
@@ -454,6 +454,9 @@ python hunyuan_image_minimal_inference.py \
 - `--flow_shift`: Flow matching shift parameter (default: 5.0)
 - `--text_encoder_cpu`: Run the text encoders on CPU to reduce VRAM usage
 - `--vae_chunk_size`: Chunk size for VAE decoding to reduce memory usage (default: None, no chunking). 16 is recommended if enabled.
+- `--apg_start_step_general` and `--apg_start_step_ocr`: Start steps for APG (Adaptive Projected Guidance) if using APG during inference. `5` and `38` are the official recommended values for 50 steps. If this value exceeds `--infer_steps`, APG will not be applied.
+- `--guidance_rescale`: Rescales the guidance for steps before APG starts. Default is `0.0` (no rescaling). If you use this option, a value around `0.5` might be good starting point.
+- `--guidance_rescale_apg`: Rescales the guidance for APG. Default is `0.0` (no rescaling). This option doesn't seem to have a large effect, but if you use it, a value around `0.5` might be a good starting point.

 `--split_attn` is not supported (since inference is done one at a time). `--fp8_vl` is not supported, please use CPU for the text encoder if VRAM is insufficient.

@@ -470,6 +473,9 @@ python hunyuan_image_minimal_inference.py \
 - `--flow_shift`: Flow Matchingシフトパラメータ（デフォルト: 5.0）
 - `--text_encoder_cpu`: テキストエンコーダをCPUで実行してVRAM使用量削減
 - `--vae_chunk_size`: VAEデコーディングのチャンクサイズ（デフォルト: None、チャンク処理なし）。有効にする場合は16を推奨。
+- `--apg_start_step_general` と `--apg_start_step_ocr`: 推論中にAPGを使用する場合の開始ステップ。50ステップの場合、公式推奨値はそれぞれ5と38です。この値が`--infer_steps`を超えると、APGは適用されません。
+- `--guidance_rescale`: APG開始前のステップに対するガイダンスのリスケーリング。デフォルトは0.0（リスケーリングなし）。使用する場合、0.5程度から始めて調整してください。
+- `--guidance_rescale_apg`: APGに対するガイダンスのリスケーリング。デフォルトは0.0（リスケーリングなし）。このオプションは大きな効果はないようですが、使用する場合は0.5程度から始めて調整してください。

 `--split_attn`はサポートされていません（1件ずつ推論するため）。`--fp8_vl`もサポートされていません。VRAMが不足する場合はテキストエンコーダをCPUで実行してください。

--- a/hunyuan_image_minimal_inference.py
+++ b/hunyuan_image_minimal_inference.py
@@ -85,7 +85,13 @@ def parse_args() -> argparse.Namespace:
        "--guidance_rescale",
        type=float,
        default=0.0,
-        help="Guidance rescale factor for steps without APG, 0.0 to 1.0. Default is 0.0 (no rescale)."
+        help="Guidance rescale factor for steps without APG, 0.0 to 1.0. Default is 0.0 (no rescale).",
+    )
+    parser.add_argument(
+        "--guidance_rescale_apg",
+        type=float,
+        default=0.0,
+        help="Guidance rescale factor for steps with APG, 0.0 to 1.0. Default is 0.0 (no rescale).",
    )
    parser.add_argument("--prompt", type=str, default=None, help="prompt for generation")
    parser.add_argument("--negative_prompt", type=str, default="", help="negative prompt for generation, default is empty string")
@@ -695,10 +701,18 @@ def generate_body(

    # Prepare Guider
    cfg_guider_ocr = hunyuan_image_utils.AdaptiveProjectedGuidance(
-        guidance_scale=10.0, eta=0.0, adaptive_projected_guidance_rescale=10.0, adaptive_projected_guidance_momentum=-0.5
+        guidance_scale=10.0,
+        eta=0.0,
+        adaptive_projected_guidance_rescale=10.0,
+        adaptive_projected_guidance_momentum=-0.5,
+        guidance_rescale=args.guidance_rescale_apg,
    )
    cfg_guider_general = hunyuan_image_utils.AdaptiveProjectedGuidance(
-        guidance_scale=10.0, eta=0.0, adaptive_projected_guidance_rescale=10.0, adaptive_projected_guidance_momentum=-0.5
+        guidance_scale=10.0,
+        eta=0.0,
+        adaptive_projected_guidance_rescale=10.0,
+        adaptive_projected_guidance_momentum=-0.5,
+        guidance_rescale=args.guidance_rescale_apg,
    )

    # Denoising loop
--- a/library/hunyuan_image_utils.py
+++ b/library/hunyuan_image_utils.py
@@ -401,8 +401,6 @@ class AdaptiveProjectedGuidance:
        guidance_rescale: float = 0.0,
        use_original_formulation: bool = False,
    ):
-        assert guidance_rescale == 0.0, "guidance_rescale > 0.0 not supported."
-
        self.guidance_scale = guidance_scale
        self.adaptive_projected_guidance_momentum = adaptive_projected_guidance_momentum
        self.adaptive_projected_guidance_rescale = adaptive_projected_guidance_rescale
@@ -425,6 +423,10 @@ class AdaptiveProjectedGuidance:
            self.use_original_formulation,
        )

+        if self.guidance_rescale > 0.0:
+            print(f"Applying guidance rescale with factor {self.guidance_rescale} at step {step}")
+            pred = rescale_noise_cfg(pred, pred_cond, self.guidance_rescale)
+
        return pred