Merge pull request #714 from kohya-ss/dev

pool output fix, v_pred loss like etc.
This commit is contained in:
Kohya S
2023-08-04 22:36:25 +09:00
committed by GitHub
17 changed files with 877 additions and 700 deletions

View File

@@ -22,6 +22,17 @@ __Stable Diffusion web UI now seems to support LoRA trained by ``sd-scripts``.__
SDXL training is now available in the sdxl branch as an experimental feature.
Aug 4, 2023: The feature will be merged into the main branch soon. The following are the changes from the previous version.
- `bitsandbytes` is now optional. Please install it if you want to use it. The instructions are in a later section.
- `albumentations` is not required anymore.
- An issue with the pooled output for Textual Inversion training is fixed.
- `--v_pred_like_loss ratio` option is added. This option adds a loss similar to the v-prediction loss in SDXL training. `0.1` means that 10% of the v-prediction-like loss is added to the original loss. The default value is None (disabled).
- In v-prediction, the loss is higher in the early timesteps (near the noise). This option can be used to increase the loss in those timesteps; a sketch of the computation is shown after this list.
- Arbitrary options can be used for Diffusers' schedulers. For example `--lr_scheduler_args "lr_end=1e-8"`.
- `sdxl_gen_imgs.py` supports batch size > 1.
- Fix ControlNet to work with attention couple and regional LoRA in `gen_img_diffusers.py`.
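A minimal sketch of what `--v_pred_like_loss` computes, following `add_v_prediction_like_loss` in `library/custom_train_functions.py` from this PR (the per-sample loss and SNR values below are illustrative):

```python
import torch

def add_v_prediction_like_loss(loss, snr_t, v_pred_like_loss=0.1):
    # v-prediction loss relates to noise-prediction loss by scale = SNR / (SNR + 1);
    # dividing by the scale makes the added term largest at low-SNR timesteps (near pure noise).
    # The PR's get_snr_scale also clamps SNR to 1000 to avoid inf at timestep 0.
    scale = snr_t / (snr_t + 1)
    return loss + loss / scale * v_pred_like_loss

loss = torch.tensor([0.2, 0.3, 0.1, 0.4])       # per-sample loss for a batch of 4
snr_t = torch.tensor([0.05, 1.0, 10.0, 100.0])  # SNR of each sample's timestep
print(add_v_prediction_like_loss(loss, snr_t))
```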
Summary of the feature:
- `tools/cache_latents.py` is added. This script can be used to cache the latents to disk in advance.
@@ -65,12 +76,17 @@ Summary of the feature:
### Tips for SDXL training
- The default resolution of SDXL is 1024x1024.
- The fine-tuning can be done with 24GB GPU memory with a batch size of 1. For a 24GB GPU, the following options are recommended:
- The fine-tuning can be done with 24GB GPU memory with a batch size of 1. The following options are recommended __for the fine-tuning with 24GB GPU memory__:
- Train U-Net only.
- Use gradient checkpointing.
- Use `--cache_text_encoder_outputs` option and caching latents.
- Use Adafactor optimizer. RMSprop 8bit or Adagrad 8bit may work. AdamW 8bit doesn't seem to work.
- The LoRA training can be done with 12GB GPU memory.
- The LoRA training can be done with 8GB GPU memory (10GB recommended). To reduce GPU memory usage, the following options are recommended (a sample command is shown after this list):
- Train U-Net only.
- Use gradient checkpointing.
- Use `--cache_text_encoder_outputs` option and caching latents.
- Use one of the 8-bit optimizers or the Adafactor optimizer.
- Use a lower dim (4 to 8 for 8GB GPU).
- `--network_train_unet_only` option is highly recommended for SDXL LoRA. Because SDXL has two text encoders, training them may produce unexpected results.
- PyTorch 2 seems to use slightly less GPU memory than PyTorch 1.
- `--bucket_reso_steps` can be set to 32 instead of the default value 64. Values smaller than 32 will not work for SDXL training.
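As referenced in the list above, here is a hypothetical SDXL LoRA command combining the memory-saving options (script and option names as they appear elsewhere in this repository; the paths and dataset arguments are placeholders):

```
accelerate launch sdxl_train_network.py \
    --pretrained_model_name_or_path sd_xl_base_1.0.safetensors \
    --train_data_dir train_images --output_dir output \
    --network_module networks.lora --network_dim 8 \
    --network_train_unet_only \
    --gradient_checkpointing \
    --cache_latents --cache_text_encoder_outputs \
    --optimizer_type AdamW8bit
```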
@@ -93,19 +109,11 @@ state_dict = {"clip_g": embs_for_text_encoder_1280, "clip_l": embs_for_text_encoder_768}
save_file(state_dict, file)
```
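A self-contained sketch of creating and saving an embedding in this format (the vector count and file name are illustrative; 1280 and 768 are the hidden sizes of SDXL's two text encoders):

```python
import torch
from safetensors.torch import save_file

num_vectors = 2
embs_for_text_encoder_1280 = torch.zeros(num_vectors, 1280)  # OpenCLIP ViT-bigG (clip_g)
embs_for_text_encoder_768 = torch.zeros(num_vectors, 768)    # CLIP ViT-L (clip_l)

state_dict = {"clip_g": embs_for_text_encoder_1280, "clip_l": embs_for_text_encoder_768}
save_file(state_dict, "my_embedding.safetensors")
```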
### TODO
- [ ] Support conversion of Diffusers SDXL models.
- [ ] Support `--weighted_captions` option.
- [ ] Change `--output_config` option to continue the training.
- [ ] Extend `--full_bf16` for all the scripts.
- [x] Support Textual Inversion training.
## About requirements.txt
These files do not contain requirements for PyTorch, because the required version depends on your environment. Please install PyTorch first (see the installation guide below).
The scripts are tested with PyTorch 1.12.1 and 2.0.1, Diffusers 0.17.1.
The scripts are tested with PyTorch 1.12.1 and 2.0.1, Diffusers 0.18.2.
## Links to how-to-use documents
@@ -151,13 +159,16 @@ pip install torch==1.12.1+cu116 torchvision==0.13.1+cu116 --extra-index-url http
pip install --upgrade -r requirements.txt
pip install -U -I --no-deps https://github.com/C43H66N12O12S2/stable-diffusion-webui/releases/download/f/xformers-0.0.14.dev0-cp310-cp310-win_amd64.whl
cp .\bitsandbytes_windows\*.dll .\venv\Lib\site-packages\bitsandbytes\
cp .\bitsandbytes_windows\cextension.py .\venv\Lib\site-packages\bitsandbytes\cextension.py
cp .\bitsandbytes_windows\main.py .\venv\Lib\site-packages\bitsandbytes\cuda_setup\main.py
accelerate config
```
__Note:__ `bitsandbytes` is now optional. Please install any version of `bitsandbytes` as needed. Installation instructions are in the following section.
<!--
cp .\bitsandbytes_windows\*.dll .\venv\Lib\site-packages\bitsandbytes\
cp .\bitsandbytes_windows\cextension.py .\venv\Lib\site-packages\bitsandbytes\cextension.py
cp .\bitsandbytes_windows\main.py .\venv\Lib\site-packages\bitsandbytes\cuda_setup\main.py
-->
Answers to accelerate config:
```txt
@@ -190,10 +201,6 @@ pip install torch==2.0.1+cu118 torchvision==0.15.2+cu118 --index-url https://dow
pip install --upgrade -r requirements.txt
pip install xformers==0.0.20
cp .\bitsandbytes_windows\*.dll .\venv\Lib\site-packages\bitsandbytes\
cp .\bitsandbytes_windows\cextension.py .\venv\Lib\site-packages\bitsandbytes\cextension.py
cp .\bitsandbytes_windows\main.py .\venv\Lib\site-packages\bitsandbytes\cuda_setup\main.py
accelerate config
```
@@ -204,26 +211,43 @@ Answers to accelerate config should be the same as above.
Other versions of PyTorch and xformers seem to have problems with training.
Unless you have another reason, please install the specified versions.
### Optional: Use Lion8bit
### Optional: Use `bitsandbytes` (8bit optimizer)
For Lion8bit, you need to upgrade `bitsandbytes` to 0.38.0 or later. Uninstall `bitsandbytes`, and for Windows, install the Windows version whl file from [here](https://github.com/jllllll/bitsandbytes-windows-webui) or other sources, like:
For the 8-bit optimizers, you need to install `bitsandbytes`. For Linux, please install `bitsandbytes` as usual (0.41.1 or later is recommended).
For Windows, there are several versions of `bitsandbytes`:
- `bitsandbytes` 0.35.0: Stable version. AdamW8bit is available. `full_bf16` is not available.
- `bitsandbytes` 0.39.1: Lion8bit, PagedAdamW8bit and PagedLion8bit are available. `full_bf16` is available.
Note: `bitsandbytes` versions above 0.35.0 and up to 0.41.0 seem to have an issue: https://github.com/TimDettmers/bitsandbytes/issues/659
Follow the instructions below to install `bitsandbytes` for Windows.
### bitsandbytes 0.35.0 for Windows
Open a regular PowerShell terminal and type the following inside:
```powershell
cd sd-scripts
.\venv\Scripts\activate
pip install bitsandbytes==0.35.0
cp .\bitsandbytes_windows\*.dll .\venv\Lib\site-packages\bitsandbytes\
cp .\bitsandbytes_windows\cextension.py .\venv\Lib\site-packages\bitsandbytes\cextension.py
cp .\bitsandbytes_windows\main.py .\venv\Lib\site-packages\bitsandbytes\cuda_setup\main.py
```
This will install `bitsandbytes` 0.35.0 and copy the necessary files to the `bitsandbytes` directory.
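To confirm which version is active inside the venv (a generic `pip` check, not specific to this repo):

```powershell
pip show bitsandbytes
```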
### bitsandbytes 0.39.1 for Windows
Install the Windows version whl file from [here](https://github.com/jllllll/bitsandbytes-windows-webui) or other sources, like:
```powershell
pip install https://github.com/jllllll/bitsandbytes-windows-webui/raw/main/bitsandbytes-0.38.1-py3-none-any.whl
```
For upgrading, upgrade this repo with `pip install .`, and upgrade necessary packages manually.
### Optional: Use PagedAdamW8bit and PagedLion8bit
For PagedAdamW8bit and PagedLion8bit, you need to upgrade `bitsandbytes` to 0.39.0 or later. Uninstall `bitsandbytes`, and for Windows, install the Windows version whl file from [here](https://github.com/jllllll/bitsandbytes-windows-webui) or other sources, like:
```powershell
pip install https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.39.1-py3-none-win_amd64.whl
```
For upgrading, upgrade this repo with `pip install .`, and upgrade necessary packages manually.
## Upgrade
When a new release comes out you can upgrade your repo with the following command:

Binary file not shown.

View File

@@ -1,166 +1,166 @@
"""
extract factors the build is dependent on:
[X] compute capability
[ ] TODO: Q - What if we have multiple GPUs of different makes?
- CUDA version
- Software:
- CPU-only: only CPU quantization functions (no optimizer, no matrix multiple)
- CuBLAS-LT: full-build 8-bit optimizer
- no CuBLAS-LT: no 8-bit matrix multiplication (`nomatmul`)
evaluation:
- if paths faulty, return meaningful error
- else:
- determine CUDA version
- determine capabilities
- based on that set the default path
"""
import ctypes
from .paths import determine_cuda_runtime_lib_path
def check_cuda_result(cuda, result_val):
# 3. Check for CUDA errors
if result_val != 0:
error_str = ctypes.c_char_p()
cuda.cuGetErrorString(result_val, ctypes.byref(error_str))
print(f"CUDA exception! Error code: {error_str.value.decode()}")
def get_cuda_version(cuda, cudart_path):
# https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART____VERSION.html#group__CUDART____VERSION
try:
cudart = ctypes.CDLL(cudart_path)
except OSError:
# TODO: shouldn't we error or at least warn here?
print(f'ERROR: libcudart.so could not be read from path: {cudart_path}!')
return None
version = ctypes.c_int()
check_cuda_result(cuda, cudart.cudaRuntimeGetVersion(ctypes.byref(version)))
version = int(version.value)
major = version//1000
minor = (version-(major*1000))//10
if major < 11:
print('CUDA SETUP: CUDA version lower than 11 are currently not supported for LLM.int8(). You will be only to use 8-bit optimizers and quantization routines!!')
return f'{major}{minor}'
def get_cuda_lib_handle():
# 1. find libcuda.so library (GPU driver) (/usr/lib)
try:
cuda = ctypes.CDLL("libcuda.so")
except OSError:
# TODO: shouldn't we error or at least warn here?
print('CUDA SETUP: WARNING! libcuda.so not found! Do you have a CUDA driver installed? If you are on a cluster, make sure you are on a CUDA machine!')
return None
check_cuda_result(cuda, cuda.cuInit(0))
return cuda
def get_compute_capabilities(cuda):
"""
1. find libcuda.so library (GPU driver) (/usr/lib)
init_device -> init variables -> call function by reference
2. call extern C function to determine CC
(https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE__DEPRECATED.html)
3. Check for CUDA errors
https://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api
# bits taken from https://gist.github.com/f0k/63a664160d016a491b2cbea15913d549
"""
nGpus = ctypes.c_int()
cc_major = ctypes.c_int()
cc_minor = ctypes.c_int()
device = ctypes.c_int()
check_cuda_result(cuda, cuda.cuDeviceGetCount(ctypes.byref(nGpus)))
ccs = []
for i in range(nGpus.value):
check_cuda_result(cuda, cuda.cuDeviceGet(ctypes.byref(device), i))
ref_major = ctypes.byref(cc_major)
ref_minor = ctypes.byref(cc_minor)
# 2. call extern C function to determine CC
check_cuda_result(
cuda, cuda.cuDeviceComputeCapability(ref_major, ref_minor, device)
)
ccs.append(f"{cc_major.value}.{cc_minor.value}")
return ccs
# def get_compute_capability()-> Union[List[str, ...], None]: # FIXME: error
def get_compute_capability(cuda):
"""
Extracts the highest compute capbility from all available GPUs, as compute
capabilities are downwards compatible. If no GPUs are detected, it returns
None.
"""
ccs = get_compute_capabilities(cuda)
if ccs is not None:
# TODO: handle different compute capabilities; for now, take the max
return ccs[-1]
return None
def evaluate_cuda_setup():
print('')
print('='*35 + 'BUG REPORT' + '='*35)
print('Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues')
print('For effortless bug reporting copy-paste your error into this form: https://docs.google.com/forms/d/e/1FAIpQLScPB8emS3Thkp66nvqwmjTEgxp8Y9ufuWTzFyr9kJ5AoI47dQ/viewform?usp=sf_link')
print('='*80)
return "libbitsandbytes_cuda116.dll" # $$$
binary_name = "libbitsandbytes_cpu.so"
#if not torch.cuda.is_available():
#print('No GPU detected. Loading CPU library...')
#return binary_name
cudart_path = determine_cuda_runtime_lib_path()
if cudart_path is None:
print(
"WARNING: No libcudart.so found! Install CUDA or the cudatoolkit package (anaconda)!"
)
return binary_name
print(f"CUDA SETUP: CUDA runtime path found: {cudart_path}")
cuda = get_cuda_lib_handle()
cc = get_compute_capability(cuda)
print(f"CUDA SETUP: Highest compute capability among GPUs detected: {cc}")
cuda_version_string = get_cuda_version(cuda, cudart_path)
if cc == '':
print(
"WARNING: No GPU detected! Check your CUDA paths. Processing to load CPU-only library..."
)
return binary_name
# 7.5 is the minimum CC vor cublaslt
has_cublaslt = cc in ["7.5", "8.0", "8.6"]
# TODO:
# (1) CUDA missing cases (no CUDA installed by CUDA driver (nvidia-smi accessible)
# (2) Multiple CUDA versions installed
# we use ls -l instead of nvcc to determine the cuda version
# since most installations will have the libcudart.so installed, but not the compiler
print(f'CUDA SETUP: Detected CUDA version {cuda_version_string}')
def get_binary_name():
"if not has_cublaslt (CC < 7.5), then we have to choose _nocublaslt.so"
bin_base_name = "libbitsandbytes_cuda"
if has_cublaslt:
return f"{bin_base_name}{cuda_version_string}.so"
else:
return f"{bin_base_name}{cuda_version_string}_nocublaslt.so"
binary_name = get_binary_name()
return binary_name
"""
extract factors the build is dependent on:
[X] compute capability
[ ] TODO: Q - What if we have multiple GPUs of different makes?
- CUDA version
- Software:
- CPU-only: only CPU quantization functions (no optimizer, no matrix multiple)
- CuBLAS-LT: full-build 8-bit optimizer
- no CuBLAS-LT: no 8-bit matrix multiplication (`nomatmul`)
evaluation:
- if paths faulty, return meaningful error
- else:
- determine CUDA version
- determine capabilities
- based on that set the default path
"""
import ctypes
from .paths import determine_cuda_runtime_lib_path
def check_cuda_result(cuda, result_val):
# 3. Check for CUDA errors
if result_val != 0:
error_str = ctypes.c_char_p()
cuda.cuGetErrorString(result_val, ctypes.byref(error_str))
print(f"CUDA exception! Error code: {error_str.value.decode()}")
def get_cuda_version(cuda, cudart_path):
# https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART____VERSION.html#group__CUDART____VERSION
try:
cudart = ctypes.CDLL(cudart_path)
except OSError:
# TODO: shouldn't we error or at least warn here?
print(f'ERROR: libcudart.so could not be read from path: {cudart_path}!')
return None
version = ctypes.c_int()
check_cuda_result(cuda, cudart.cudaRuntimeGetVersion(ctypes.byref(version)))
version = int(version.value)
major = version//1000
minor = (version-(major*1000))//10
if major < 11:
print('CUDA SETUP: CUDA version lower than 11 are currently not supported for LLM.int8(). You will be only to use 8-bit optimizers and quantization routines!!')
return f'{major}{minor}'
def get_cuda_lib_handle():
# 1. find libcuda.so library (GPU driver) (/usr/lib)
try:
cuda = ctypes.CDLL("libcuda.so")
except OSError:
# TODO: shouldn't we error or at least warn here?
print('CUDA SETUP: WARNING! libcuda.so not found! Do you have a CUDA driver installed? If you are on a cluster, make sure you are on a CUDA machine!')
return None
check_cuda_result(cuda, cuda.cuInit(0))
return cuda
def get_compute_capabilities(cuda):
"""
1. find libcuda.so library (GPU driver) (/usr/lib)
init_device -> init variables -> call function by reference
2. call extern C function to determine CC
(https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE__DEPRECATED.html)
3. Check for CUDA errors
https://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api
# bits taken from https://gist.github.com/f0k/63a664160d016a491b2cbea15913d549
"""
nGpus = ctypes.c_int()
cc_major = ctypes.c_int()
cc_minor = ctypes.c_int()
device = ctypes.c_int()
check_cuda_result(cuda, cuda.cuDeviceGetCount(ctypes.byref(nGpus)))
ccs = []
for i in range(nGpus.value):
check_cuda_result(cuda, cuda.cuDeviceGet(ctypes.byref(device), i))
ref_major = ctypes.byref(cc_major)
ref_minor = ctypes.byref(cc_minor)
# 2. call extern C function to determine CC
check_cuda_result(
cuda, cuda.cuDeviceComputeCapability(ref_major, ref_minor, device)
)
ccs.append(f"{cc_major.value}.{cc_minor.value}")
return ccs
# def get_compute_capability()-> Union[List[str, ...], None]: # FIXME: error
def get_compute_capability(cuda):
"""
Extracts the highest compute capbility from all available GPUs, as compute
capabilities are downwards compatible. If no GPUs are detected, it returns
None.
"""
ccs = get_compute_capabilities(cuda)
if ccs is not None:
# TODO: handle different compute capabilities; for now, take the max
return ccs[-1]
return None
def evaluate_cuda_setup():
print('')
print('='*35 + 'BUG REPORT' + '='*35)
print('Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues')
print('For effortless bug reporting copy-paste your error into this form: https://docs.google.com/forms/d/e/1FAIpQLScPB8emS3Thkp66nvqwmjTEgxp8Y9ufuWTzFyr9kJ5AoI47dQ/viewform?usp=sf_link')
print('='*80)
return "libbitsandbytes_cuda116.dll" # $$$
binary_name = "libbitsandbytes_cpu.so"
#if not torch.cuda.is_available():
#print('No GPU detected. Loading CPU library...')
#return binary_name
cudart_path = determine_cuda_runtime_lib_path()
if cudart_path is None:
print(
"WARNING: No libcudart.so found! Install CUDA or the cudatoolkit package (anaconda)!"
)
return binary_name
print(f"CUDA SETUP: CUDA runtime path found: {cudart_path}")
cuda = get_cuda_lib_handle()
cc = get_compute_capability(cuda)
print(f"CUDA SETUP: Highest compute capability among GPUs detected: {cc}")
cuda_version_string = get_cuda_version(cuda, cudart_path)
if cc == '':
print(
"WARNING: No GPU detected! Check your CUDA paths. Processing to load CPU-only library..."
)
return binary_name
# 7.5 is the minimum CC vor cublaslt
has_cublaslt = cc in ["7.5", "8.0", "8.6"]
# TODO:
# (1) CUDA missing cases (no CUDA installed by CUDA driver (nvidia-smi accessible)
# (2) Multiple CUDA versions installed
# we use ls -l instead of nvcc to determine the cuda version
# since most installations will have the libcudart.so installed, but not the compiler
print(f'CUDA SETUP: Detected CUDA version {cuda_version_string}')
def get_binary_name():
"if not has_cublaslt (CC < 7.5), then we have to choose _nocublaslt.so"
bin_base_name = "libbitsandbytes_cuda"
if has_cublaslt:
return f"{bin_base_name}{cuda_version_string}.so"
else:
return f"{bin_base_name}{cuda_version_string}_nocublaslt.so"
binary_name = get_binary_name()
return binary_name

View File

@@ -937,6 +937,17 @@ class PipelineLike:
if self.control_nets:
guided_hints = original_control_net.get_guided_hints(self.control_nets, num_latent_input, batch_size, clip_guide_images)
if regional_network:
num_sub_and_neg_prompts = len(text_embeddings) // batch_size
# last subprompt and negative prompt
text_emb_last = []
for j in range(batch_size):
text_emb_last.append(text_embeddings[(j + 1) * num_sub_and_neg_prompts - 2])
text_emb_last.append(text_embeddings[(j + 1) * num_sub_and_neg_prompts - 1])
text_emb_last = torch.stack(text_emb_last)
else:
text_emb_last = text_embeddings
for i, t in enumerate(tqdm(timesteps)):
# expand the latents if we are doing classifier free guidance
latent_model_input = latents.repeat((num_latent_input, 1, 1, 1))
@@ -944,11 +955,6 @@ class PipelineLike:
# predict the noise residual
if self.control_nets and self.control_net_enabled:
if reginonal_network:
num_sub_and_neg_prompts = len(text_embeddings) // batch_size
text_emb_last = text_embeddings[num_sub_and_neg_prompts - 2 :: num_sub_and_neg_prompts] # last subprompt
else:
text_emb_last = text_embeddings
noise_pred = original_control_net.call_unet_and_control_net(
i,
num_latent_input,
@@ -958,6 +964,7 @@ class PipelineLike:
i / len(timesteps),
latent_model_input,
t,
text_embeddings,
text_emb_last,
).sample
else:
@@ -2746,6 +2753,10 @@ def main(args):
print(f"iteration {gen_iter+1}/{args.n_iter}")
iter_seed = random.randint(0, 0x7FFFFFFF)
# shuffle prompt list
if args.shuffle_prompts:
random.shuffle(prompt_list)
# function to process a batch
def process_batch(batch: List[BatchData], highres_fix, highres_1st=False):
batch_size = len(batch)
@@ -2963,6 +2974,8 @@ def main(args):
for i, (image, prompt, negative_prompts, seed, clip_prompt) in enumerate(
zip(images, prompts, negative_prompts, seeds, clip_prompts)
):
if highres_fix:
seed -= 1 # record original seed
metadata = PngInfo()
metadata.add_text("prompt", prompt)
metadata.add_text("seed", str(seed))
@@ -3319,6 +3332,11 @@ def setup_parser() -> argparse.ArgumentParser:
action="store_true",
help="use same seed for all prompts in iteration if no seed specified / 乱数seedの指定がないとき繰り返し内はすべて同じseedを使うプロンプト間の差異の比較用",
)
parser.add_argument(
"--shuffle_prompts",
action="store_true",
help="shuffle prompts in iteration / 繰り返し内のプロンプトをシャッフルする",
)
parser.add_argument("--fp16", action="store_true", help="use fp16 / fp16を指定し省メモリ化する")
parser.add_argument("--bf16", action="store_true", help="use bfloat16 / bfloat16を指定し省メモリ化する")
parser.add_argument("--xformers", action="store_true", help="use xformers / xformersを使用し高速化する")

View File

@@ -18,6 +18,7 @@ def prepare_scheduler_for_custom_training(noise_scheduler, device):
noise_scheduler.all_snr = all_snr.to(device)
def fix_noise_scheduler_betas_for_zero_terminal_snr(noise_scheduler):
# fix beta: zero terminal SNR
print(f"fix noise scheduler betas: https://arxiv.org/abs/2305.08891")
@@ -55,6 +56,7 @@ def fix_noise_scheduler_betas_for_zero_terminal_snr(noise_scheduler):
noise_scheduler.alphas = alphas
noise_scheduler.alphas_cumprod = alphas_cumprod
def apply_snr_weight(loss, timesteps, noise_scheduler, gamma):
snr = torch.stack([noise_scheduler.all_snr[t] for t in timesteps])
gamma_over_snr = torch.div(torch.ones_like(snr) * gamma, snr)
@@ -64,11 +66,24 @@ def apply_snr_weight(loss, timesteps, noise_scheduler, gamma):
def scale_v_prediction_loss_like_noise_prediction(loss, timesteps, noise_scheduler):
scale = get_snr_scale(timesteps, noise_scheduler)
loss = loss * scale
return loss
def get_snr_scale(timesteps, noise_scheduler):
snr_t = torch.stack([noise_scheduler.all_snr[t] for t in timesteps]) # batch_size
snr_t = torch.minimum(snr_t, torch.ones_like(snr_t) * 1000) # if timestep is 0, snr_t is inf, so limit it to 1000
scale = snr_t / (snr_t + 1)
# # show debug info
# print(f"timesteps: {timesteps}, snr_t: {snr_t}, scale: {scale}")
return scale
def add_v_prediction_like_loss(loss, timesteps, noise_scheduler, v_pred_like_loss):
scale = get_snr_scale(timesteps, noise_scheduler)
# print(f"add v-prediction like loss: {v_pred_like_loss}, scale: {scale}, loss: {loss}, time: {timesteps}")
loss = loss + loss / scale * v_pred_like_loss
return loss
@@ -87,6 +102,12 @@ def add_custom_train_arguments(parser: argparse.ArgumentParser, support_weighted
action="store_true",
help="scale v-prediction loss like noise prediction loss / v-prediction lossをnoise prediction lossと同じようにスケーリングする",
)
parser.add_argument(
"--v_pred_like_loss",
type=float,
default=None,
help="add v-prediction like loss multiplied by this value / v-prediction lossをこの値をかけたものをlossに加算する",
)
if support_weighted_captions:
parser.add_argument(
"--weighted_captions",

View File

@@ -18,7 +18,7 @@ from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput,
from diffusers.utils import logging
from PIL import Image
from library import sdxl_model_util, sdxl_train_util
from library import sdxl_model_util, sdxl_train_util, train_util
try:
@@ -210,7 +210,7 @@ def pad_tokens_and_weights(tokens, weights, max_length, bos, eos, pad, no_boseos
return tokens, weights
def get_hidden_states(text_encoder, input_ids, is_sdxl_text_encoder2: bool, device):
def get_hidden_states(text_encoder, input_ids, is_sdxl_text_encoder2: bool, eos_token_id, device):
if not is_sdxl_text_encoder2:
# text_encoder1: same as SD1/2
enc_out = text_encoder(input_ids.to(text_encoder.device), output_hidden_states=True, return_dict=True)
@@ -220,7 +220,8 @@ def get_hidden_states(text_encoder, input_ids, is_sdxl_text_encoder2: bool, devi
# text_encoder2
enc_out = text_encoder(input_ids.to(text_encoder.device), output_hidden_states=True, return_dict=True)
hidden_states = enc_out["hidden_states"][-2] # penultimate layer
pool = enc_out["text_embeds"]
# pool = enc_out["text_embeds"]
pool = train_util.pool_workaround(text_encoder, enc_out["last_hidden_state"], input_ids, eos_token_id)
hidden_states = hidden_states.to(device)
if pool is not None:
pool = pool.to(device)
@@ -261,7 +262,7 @@ def get_unweighted_text_embeddings(
text_input_chunk[j, 1] = eos
text_embedding, current_text_pool = get_hidden_states(
pipe.text_encoder, text_input_chunk, is_sdxl_text_encoder2, pipe.device
pipe.text_encoder, text_input_chunk, is_sdxl_text_encoder2, eos, pipe.device
)
if text_pool is None:
text_pool = current_text_pool
@@ -280,7 +281,7 @@ def get_unweighted_text_embeddings(
text_embeddings.append(text_embedding)
text_embeddings = torch.concat(text_embeddings, axis=1)
else:
text_embeddings, text_pool = get_hidden_states(pipe.text_encoder, text_input, is_sdxl_text_encoder2, pipe.device)
text_embeddings, text_pool = get_hidden_states(pipe.text_encoder, text_input, is_sdxl_text_encoder2, eos, pipe.device)
return text_embeddings, text_pool

View File

@@ -1,6 +1,9 @@
import torch
from accelerate import init_empty_weights
from accelerate.utils.modeling import set_module_tensor_to_device
from safetensors.torch import load_file, save_file
from transformers import CLIPTextModel, CLIPTextConfig, CLIPTextModelWithProjection, CLIPTokenizer
from typing import List
from diffusers import AutoencoderKL, EulerDiscreteScheduler, UNet2DConditionModel
from library import model_util
from library import sdxl_original_unet
@@ -133,13 +136,39 @@ def convert_sdxl_text_encoder_2_checkpoint(checkpoint, max_length):
return new_sd, logit_scale
def load_models_from_sdxl_checkpoint(model_version, ckpt_path, map_location):
# load state_dict without allocating new tensors
def _load_state_dict_on_device(model, state_dict, device, dtype=None):
# dtype will use fp32 as default
missing_keys = list(model.state_dict().keys() - state_dict.keys())
unexpected_keys = list(state_dict.keys() - model.state_dict().keys())
# similar to model.load_state_dict()
if not missing_keys and not unexpected_keys:
for k in list(state_dict.keys()):
set_module_tensor_to_device(model, k, device, value=state_dict.pop(k), dtype=dtype)
return "<All keys matched successfully>"
# error_msgs
error_msgs: List[str] = []
if missing_keys:
error_msgs.insert(0, "Missing key(s) in state_dict: {}. ".format(", ".join('"{}"'.format(k) for k in missing_keys)))
if unexpected_keys:
error_msgs.insert(0, "Unexpected key(s) in state_dict: {}. ".format(", ".join('"{}"'.format(k) for k in unexpected_keys)))
raise RuntimeError("Error(s) in loading state_dict for {}:\n\t{}".format(model.__class__.__name__, "\n\t".join(error_msgs)))
def load_models_from_sdxl_checkpoint(model_version, ckpt_path, map_location, dtype=None):
# model_version is reserved for future use
# dtype is reserved for full_fp16/bf16 integration. Text Encoder will remain fp32, because it runs on CPU when caching
# Load the state dict
if model_util.is_safetensors(ckpt_path):
checkpoint = None
state_dict = load_file(ckpt_path, device=map_location)
try:
state_dict = load_file(ckpt_path, device=map_location)
except:
state_dict = load_file(ckpt_path) # prevent device invalid Error
epoch = None
global_step = None
else:
@@ -156,16 +185,16 @@ def load_models_from_sdxl_checkpoint(model_version, ckpt_path, map_location):
# U-Net
print("building U-Net")
unet = sdxl_original_unet.SdxlUNet2DConditionModel()
with init_empty_weights():
unet = sdxl_original_unet.SdxlUNet2DConditionModel()
print("loading U-Net from checkpoint")
unet_sd = {}
for k in list(state_dict.keys()):
if k.startswith("model.diffusion_model."):
unet_sd[k.replace("model.diffusion_model.", "")] = state_dict.pop(k)
info = unet.load_state_dict(unet_sd)
info = _load_state_dict_on_device(unet, unet_sd, device=map_location)
print("U-Net: ", info)
del unet_sd
# Text Encoders
print("building text encoders")

View File

@@ -4,6 +4,7 @@ import math
import os
from typing import Optional
import torch
from accelerate import init_empty_weights
from tqdm import tqdm
from transformers import CLIPTokenizer
from library import model_util, sdxl_model_util, train_util, sdxl_original_unet
@@ -66,7 +67,7 @@ def _load_target_model(name_or_path: str, vae_path: Optional[str], model_version
unet,
logit_scale,
ckpt_info,
) = sdxl_model_util.load_models_from_sdxl_checkpoint(model_version, name_or_path, device)
) = sdxl_model_util.load_models_from_sdxl_checkpoint(model_version, name_or_path, device, weight_dtype)
else:
# Diffusers model is loaded to CPU
from diffusers import StableDiffusionXLPipeline
@@ -75,7 +76,7 @@ def _load_target_model(name_or_path: str, vae_path: Optional[str], model_version
print(f"load Diffusers pretrained models: {name_or_path}, variant={variant}")
try:
try:
pipe = StableDiffusionXLPipeline.from_pretrained(name_or_path, variant=variant, tokenizer=None)
pipe = StableDiffusionXLPipeline.from_pretrained(name_or_path, torch_dtype=weight_dtype, variant=variant, tokenizer=None)
except EnvironmentError as ex:
if variant is not None:
print("try to load fp32 model")
@@ -95,10 +96,10 @@ def _load_target_model(name_or_path: str, vae_path: Optional[str], model_version
del pipe
# Diffusers U-Net to original U-Net
original_unet = sdxl_original_unet.SdxlUNet2DConditionModel()
state_dict = sdxl_model_util.convert_diffusers_unet_state_dict_to_sdxl(unet.state_dict())
original_unet.load_state_dict(state_dict)
unet = original_unet
with init_empty_weights():
unet = sdxl_original_unet.SdxlUNet2DConditionModel() # overwrite unet
sdxl_model_util._load_state_dict_on_device(unet, state_dict, device=device)
print("U-Net converted to original U-Net")
logit_scale = None

View File

@@ -34,7 +34,7 @@ import torch
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.optim import Optimizer
from torchvision import transforms
from transformers import CLIPTokenizer
from transformers import CLIPTokenizer, CLIPTextModel, CLIPTextModelWithProjection
import transformers
from diffusers.optimization import SchedulerType, TYPE_TO_SCHEDULER_FUNCTION
from diffusers import (
@@ -55,7 +55,6 @@ from diffusers import (
from library import custom_train_functions
from library.original_unet import UNet2DConditionModel
from huggingface_hub import hf_hub_download
import albumentations as albu
import numpy as np
from PIL import Image
import cv2
@@ -65,6 +64,7 @@ import safetensors.torch
from library.lpw_stable_diffusion import StableDiffusionLongPromptWeightingPipeline
import library.model_util as model_util
import library.huggingface_util as huggingface_util
# from library.attention_processors import FlashAttnProcessor
# from library.hypernetwork import replace_attentions_for_hypernetwork
from library.original_unet import UNet2DConditionModel
@@ -284,42 +284,40 @@ class BucketBatchIndex(NamedTuple):
class AugHelper:
# dependency on albumentations is removed, but the same interface is kept for now
def __init__(self):
# prepare all possible augmentators
self.color_aug_method = albu.OneOf(
[
albu.HueSaturationValue(8, 0, 0, p=0.5),
albu.RandomGamma((95, 105), p=0.5),
],
p=0.33,
)
pass
# key: (use_color_aug, use_flip_aug)
# self.augmentors = {
# (True, True): albu.Compose(
# [
# color_aug_method,
# flip_aug_method,
# ],
# p=1.0,
# ),
# (True, False): albu.Compose(
# [
# color_aug_method,
# ],
# p=1.0,
# ),
# (False, True): albu.Compose(
# [
# flip_aug_method,
# ],
# p=1.0,
# ),
# (False, False): None,
# }
def color_aug(self, image: np.ndarray):
# self.color_aug_method = albu.OneOf(
# [
# albu.HueSaturationValue(8, 0, 0, p=0.5),
# albu.RandomGamma((95, 105), p=0.5),
# ],
# p=0.33,
# )
hue_shift_limit = 8
def get_augmentor(self, use_color_aug: bool) -> Optional[albu.Compose]:
return self.color_aug_method if use_color_aug else None
# remove dependency on albumentations
if random.random() <= 0.33:
if random.random() > 0.5:
# hue shift
hsv_img = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
hue_shift = random.uniform(-hue_shift_limit, hue_shift_limit)
if hue_shift < 0:
hue_shift = 180 + hue_shift
hsv_img[:, :, 0] = (hsv_img[:, :, 0] + hue_shift) % 180
image = cv2.cvtColor(hsv_img, cv2.COLOR_HSV2BGR)
else:
# random gamma
gamma = random.uniform(0.95, 1.05)
image = np.clip(image**gamma, 0, 255).astype(np.uint8)
return {"image": image}
def get_augmentor(self, use_color_aug: bool): # -> Optional[Callable[[np.ndarray], Dict[str, np.ndarray]]]:
return self.color_aug if use_color_aug else None
class BaseSubset:
@@ -2166,6 +2164,10 @@ def cache_batch_latents(
if flip_aug:
info.latents_flipped = flipped_latent
# FIXME this slows down caching a lot, specify this as an option
if torch.cuda.is_available():
torch.cuda.empty_cache()
def cache_batch_text_encoder_outputs(
image_infos, tokenizers, text_encoders, max_token_length, cache_to_disk, input_ids1, input_ids2, dtype
@@ -2868,6 +2870,11 @@ def verify_training_args(args: argparse.Namespace):
"scale_v_pred_loss_like_noise_pred can be enabled only with v_parameterization / scale_v_pred_loss_like_noise_predはv_parameterizationが有効なときのみ有効にできます"
)
if args.v_pred_like_loss and args.v_parameterization:
raise ValueError(
"v_pred_like_loss cannot be enabled with v_parameterization / v_pred_like_lossはv_parameterizationが有効なときには有効にできません"
)
if args.zero_terminal_snr and not args.v_parameterization:
print(
f"zero_terminal_snr is enabled, but v_parameterization is not enabled. training will be unexpected"
@@ -3181,32 +3188,9 @@ def get_optimizer(args, trainable_params):
# print("optkwargs:", optimizer_kwargs)
lr = args.learning_rate
optimizer = None
if optimizer_type == "AdamW8bit".lower():
try:
import bitsandbytes as bnb
except ImportError:
raise ImportError("No bitsand bytes / bitsandbytesがインストールされていないようです")
print(f"use 8-bit AdamW optimizer | {optimizer_kwargs}")
optimizer_class = bnb.optim.AdamW8bit
optimizer = optimizer_class(trainable_params, lr=lr, **optimizer_kwargs)
elif optimizer_type == "SGDNesterov8bit".lower():
try:
import bitsandbytes as bnb
except ImportError:
raise ImportError("No bitsand bytes / bitsandbytesがインストールされていないようです")
print(f"use 8-bit SGD with Nesterov optimizer | {optimizer_kwargs}")
if "momentum" not in optimizer_kwargs:
print(
f"8-bit SGD with Nesterov must be with momentum, set momentum to 0.9 / 8-bit SGD with Nesterovはmomentum指定が必須のため0.9に設定します"
)
optimizer_kwargs["momentum"] = 0.9
optimizer_class = bnb.optim.SGD8bit
optimizer = optimizer_class(trainable_params, lr=lr, nesterov=True, **optimizer_kwargs)
elif optimizer_type == "Lion".lower():
if optimizer_type == "Lion".lower():
try:
import lion_pytorch
except ImportError:
@@ -3214,37 +3198,53 @@ def get_optimizer(args, trainable_params):
print(f"use Lion optimizer | {optimizer_kwargs}")
optimizer_class = lion_pytorch.Lion
optimizer = optimizer_class(trainable_params, lr=lr, **optimizer_kwargs)
elif optimizer_type.endswith("8bit".lower()):
try:
import bitsandbytes as bnb
except ImportError:
raise ImportError("No bitsandbytes / bitsandbytesがインストールされていないようです")
if optimizer_type == "Lion8bit".lower():
print(f"use 8-bit Lion optimizer | {optimizer_kwargs}")
try:
optimizer_class = bnb.optim.Lion8bit
except AttributeError:
raise AttributeError(
"No Lion8bit. The version of bitsandbytes installed seems to be old. Please install 0.38.0 or later. / Lion8bitが定義されていません。インストールされているbitsandbytesのバージョンが古いようです。0.38.0以上をインストールしてください"
)
if optimizer_type == "AdamW8bit".lower():
print(f"use 8-bit AdamW optimizer | {optimizer_kwargs}")
optimizer_class = bnb.optim.AdamW8bit
optimizer = optimizer_class(trainable_params, lr=lr, **optimizer_kwargs)
elif optimizer_type == "SGDNesterov8bit".lower():
print(f"use 8-bit SGD with Nesterov optimizer | {optimizer_kwargs}")
if "momentum" not in optimizer_kwargs:
print(
f"8-bit SGD with Nesterov must be with momentum, set momentum to 0.9 / 8-bit SGD with Nesterovはmomentum指定が必須のため0.9に設定します"
)
optimizer_kwargs["momentum"] = 0.9
optimizer_class = bnb.optim.SGD8bit
optimizer = optimizer_class(trainable_params, lr=lr, nesterov=True, **optimizer_kwargs)
elif optimizer_type == "Lion8bit".lower():
print(f"use 8-bit Lion optimizer | {optimizer_kwargs}")
try:
optimizer_class = bnb.optim.Lion8bit
except AttributeError:
raise AttributeError(
"No Lion8bit. The version of bitsandbytes installed seems to be old. Please install 0.38.0 or later. / Lion8bitが定義されていません。インストールされているbitsandbytesのバージョンが古いようです。0.38.0以上をインストールしてください"
)
elif optimizer_type == "PagedAdamW8bit".lower():
print(f"use 8-bit PagedAdamW optimizer | {optimizer_kwargs}")
try:
optimizer_class = bnb.optim.PagedAdamW8bit
except AttributeError:
raise AttributeError(
"No PagedAdamW8bit. The version of bitsandbytes installed seems to be old. Please install 0.39.0 or later. / PagedAdamW8bitが定義されていません。インストールされているbitsandbytesのバージョンが古いようです。0.39.0以上をインストールしてください"
)
print(f"use 8-bit PagedAdamW optimizer | {optimizer_kwargs}")
try:
optimizer_class = bnb.optim.PagedAdamW8bit
except AttributeError:
raise AttributeError(
"No PagedAdamW8bit. The version of bitsandbytes installed seems to be old. Please install 0.39.0 or later. / PagedAdamW8bitが定義されていません。インストールされているbitsandbytesのバージョンが古いようです。0.39.0以上をインストールしてください"
)
elif optimizer_type == "PagedLion8bit".lower():
print(f"use 8-bit Paged Lion optimizer | {optimizer_kwargs}")
try:
optimizer_class = bnb.optim.PagedLion8bit
except AttributeError:
raise AttributeError(
"No PagedLion8bit. The version of bitsandbytes installed seems to be old. Please install 0.39.0 or later. / PagedLion8bitが定義されていません。インストールされているbitsandbytesのバージョンが古いようです。0.39.0以上をインストールしてください"
)
print(f"use 8-bit Paged Lion optimizer | {optimizer_kwargs}")
try:
optimizer_class = bnb.optim.PagedLion8bit
except AttributeError:
raise AttributeError(
"No PagedLion8bit. The version of bitsandbytes installed seems to be old. Please install 0.39.0 or later. / PagedLion8bitが定義されていません。インストールされているbitsandbytesのバージョンが古いようです。0.39.0以上をインストールしてください"
)
optimizer = optimizer_class(trainable_params, lr=lr, **optimizer_kwargs)
@@ -3376,7 +3376,7 @@ def get_optimizer(args, trainable_params):
optimizer_class = torch.optim.AdamW
optimizer = optimizer_class(trainable_params, lr=lr, **optimizer_kwargs)
else:
if optimizer is None:
# use an arbitrary optimizer
optimizer_type = args.optimizer_type  # the non-lowercased name; slightly awkward
print(f"use {optimizer_type} | {optimizer_kwargs}")
@@ -3396,10 +3396,8 @@ def get_optimizer(args, trainable_params):
return optimizer_name, optimizer_args, optimizer
# Monkeypatch newer get_scheduler() function overridng current version of diffusers.optimizer.get_scheduler
# code is taken from https://github.com/huggingface/diffusers diffusers.optimizer, commit d87cc15977b87160c30abaace3894e802ad9e1e6
# Which is a newer release of diffusers than currently packaged with sd-scripts
# This code can be removed when newer diffusers version (v0.12.1 or greater) is tested and implemented to sd-scripts
# Modified version of get_scheduler() function from diffusers.optimizer.get_scheduler
# Add some checking and features to the original function.
def get_scheduler_fix(args, optimizer: Optimizer, num_processes: int):
@@ -3416,19 +3414,7 @@ def get_scheduler_fix(args, optimizer: Optimizer, num_processes: int):
if args.lr_scheduler_args is not None and len(args.lr_scheduler_args) > 0:
for arg in args.lr_scheduler_args:
key, value = arg.split("=")
value = ast.literal_eval(value)
# value = value.split(",")
# for i in range(len(value)):
# if value[i].lower() == "true" or value[i].lower() == "false":
# value[i] = value[i].lower() == "true"
# else:
# value[i] = ast.literal_eval(value[i])
# if len(value) == 1:
# value = value[0]
# else:
# value = list(value) # some may use list?
lr_scheduler_kwargs[key] = value
def wrap_check_needless_num_warmup_steps(return_vals):
@@ -3460,15 +3446,19 @@ def get_scheduler_fix(args, optimizer: Optimizer, num_processes: int):
name = SchedulerType(name)
schedule_func = TYPE_TO_SCHEDULER_FUNCTION[name]
if name == SchedulerType.CONSTANT:
return wrap_check_needless_num_warmup_steps(schedule_func(optimizer))
return wrap_check_needless_num_warmup_steps(schedule_func(optimizer, **lr_scheduler_kwargs))
if name == SchedulerType.PIECEWISE_CONSTANT:
return schedule_func(optimizer, **lr_scheduler_kwargs) # step_rules and last_epoch are given as kwargs
# All other schedulers require `num_warmup_steps`
if num_warmup_steps is None:
raise ValueError(f"{name} requires `num_warmup_steps`, please provide that argument.")
if name == SchedulerType.CONSTANT_WITH_WARMUP:
return schedule_func(optimizer, num_warmup_steps=num_warmup_steps)
return schedule_func(optimizer, num_warmup_steps=num_warmup_steps, **lr_scheduler_kwargs)
# All other schedulers require `num_training_steps`
if num_training_steps is None:
@@ -3476,13 +3466,19 @@ def get_scheduler_fix(args, optimizer: Optimizer, num_processes: int):
if name == SchedulerType.COSINE_WITH_RESTARTS:
return schedule_func(
optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps, num_cycles=num_cycles
optimizer,
num_warmup_steps=num_warmup_steps,
num_training_steps=num_training_steps,
num_cycles=num_cycles,
**lr_scheduler_kwargs,
)
if name == SchedulerType.POLYNOMIAL:
return schedule_func(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps, power=power)
return schedule_func(
optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps, power=power, **lr_scheduler_kwargs
)
return schedule_func(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)
return schedule_func(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps, **lr_scheduler_kwargs)
def prepare_dataset_args(args: argparse.Namespace, support_metadata: bool):
@@ -3738,8 +3734,48 @@ def get_hidden_states(args: argparse.Namespace, input_ids, tokenizer, text_encod
return encoder_hidden_states
def pool_workaround(
text_encoder: CLIPTextModelWithProjection, last_hidden_state: torch.Tensor, input_ids: torch.Tensor, eos_token_id: int
):
r"""
workaround for CLIP's pooling bug: it returns the hidden states for the max token id as the pooled output
instead of the hidden states for the EOS token
If we use Textual Inversion, we need to use the hidden states for the EOS token as the pooled output
Original code from CLIP's pooling function:
# text_embeds.shape = [batch_size, sequence_length, transformer.width]
# take features from the eot embedding (eot_token is the highest number in each sequence)
# casting to torch.int for onnx compatibility: argmax doesn't support int64 inputs with opset 14
pooled_output = last_hidden_state[
torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device),
input_ids.to(dtype=torch.int, device=last_hidden_state.device).argmax(dim=-1),
]
"""
# input_ids: b*n,77
# find index for EOS token
eos_token_index = torch.where(input_ids == eos_token_id)[1]
eos_token_index = eos_token_index.to(device=last_hidden_state.device)
# get hidden states for EOS token
pooled_output = last_hidden_state[torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device), eos_token_index]
# apply projection
pooled_output = text_encoder.text_projection(pooled_output)
return pooled_output
def get_hidden_states_sdxl(
max_token_length, input_ids1, input_ids2, tokenizer1, tokenizer2, text_encoder1, text_encoder2, weight_dtype=None
max_token_length: int,
input_ids1: torch.Tensor,
input_ids2: torch.Tensor,
tokenizer1: CLIPTokenizer,
tokenizer2: CLIPTokenizer,
text_encoder1: CLIPTextModel,
text_encoder2: CLIPTextModelWithProjection,
weight_dtype: Optional[str] = None,
):
# input_ids: b,n,77 -> b*n, 77
b_size = input_ids1.size()[0]
@@ -3753,7 +3789,9 @@ def get_hidden_states_sdxl(
# text_encoder2
enc_out = text_encoder2(input_ids2, output_hidden_states=True, return_dict=True)
hidden_states2 = enc_out["hidden_states"][-2] # penultimate layer
pool2 = enc_out["text_embeds"]
# pool2 = enc_out["text_embeds"]
pool2 = pool_workaround(text_encoder2, enc_out["last_hidden_state"], input_ids2, tokenizer2.eos_token_id)
# b*n, 77, 768 or 1280 -> b, n*77, 768 or 1280
n_size = 1 if max_token_length is None else max_token_length // 75
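A small illustration of the pooling bug `pool_workaround` addresses above: CLIP's pooled output takes the hidden state at `argmax(input_ids)`, which is the EOS token only as long as no token id exceeds it. Textual Inversion adds tokens with higher ids, so `argmax` picks the wrong position; the workaround locates the EOS token explicitly (the token ids below are illustrative):

```python
import torch

eos_token_id = 49407
# BOS, "a", newly added TI token (id > EOS), EOS, padding
input_ids = torch.tensor([[49406, 320, 49408, 49407, 49407]])

buggy_index = input_ids.to(torch.int).argmax(dim=-1)        # tensor([2]): the TI token (wrong)
fixed_index = torch.where(input_ids == eos_token_id)[1][0]  # tensor(3): the EOS token
```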

View File

@@ -1,4 +1,3 @@
import math
import argparse
import os
@@ -13,180 +12,198 @@ CLAMP_QUANTILE = 0.99
def load_state_dict(file_name, dtype):
if os.path.splitext(file_name)[1] == '.safetensors':
sd = load_file(file_name)
else:
sd = torch.load(file_name, map_location='cpu')
for key in list(sd.keys()):
if type(sd[key]) == torch.Tensor:
sd[key] = sd[key].to(dtype)
return sd
if os.path.splitext(file_name)[1] == ".safetensors":
sd = load_file(file_name)
else:
sd = torch.load(file_name, map_location="cpu")
for key in list(sd.keys()):
if type(sd[key]) == torch.Tensor:
sd[key] = sd[key].to(dtype)
return sd
def save_to_file(file_name, state_dict, dtype):
if dtype is not None:
for key in list(state_dict.keys()):
if type(state_dict[key]) == torch.Tensor:
state_dict[key] = state_dict[key].to(dtype)
if dtype is not None:
for key in list(state_dict.keys()):
if type(state_dict[key]) == torch.Tensor:
state_dict[key] = state_dict[key].to(dtype)
if os.path.splitext(file_name)[1] == '.safetensors':
save_file(state_dict, file_name)
else:
torch.save(state_dict, file_name)
if os.path.splitext(file_name)[1] == ".safetensors":
save_file(state_dict, file_name)
else:
torch.save(state_dict, file_name)
def merge_lora_models(models, ratios, new_rank, new_conv_rank, device, merge_dtype):
print(f"new rank: {new_rank}, new conv rank: {new_conv_rank}")
merged_sd = {}
for model, ratio in zip(models, ratios):
print(f"loading: {model}")
lora_sd = load_state_dict(model, merge_dtype)
print(f"new rank: {new_rank}, new conv rank: {new_conv_rank}")
merged_sd = {}
for model, ratio in zip(models, ratios):
print(f"loading: {model}")
lora_sd = load_state_dict(model, merge_dtype)
# merge
print(f"merging...")
for key in tqdm(list(lora_sd.keys())):
if 'lora_down' not in key:
continue
# merge
print(f"merging...")
for key in tqdm(list(lora_sd.keys())):
if "lora_down" not in key:
continue
lora_module_name = key[:key.rfind(".lora_down")]
lora_module_name = key[: key.rfind(".lora_down")]
down_weight = lora_sd[key]
network_dim = down_weight.size()[0]
down_weight = lora_sd[key]
network_dim = down_weight.size()[0]
up_weight = lora_sd[lora_module_name + '.lora_up.weight']
alpha = lora_sd.get(lora_module_name + '.alpha', network_dim)
up_weight = lora_sd[lora_module_name + ".lora_up.weight"]
alpha = lora_sd.get(lora_module_name + ".alpha", network_dim)
in_dim = down_weight.size()[1]
out_dim = up_weight.size()[0]
conv2d = len(down_weight.size()) == 4
kernel_size = None if not conv2d else down_weight.size()[2:4]
# print(lora_module_name, network_dim, alpha, in_dim, out_dim, kernel_size)
in_dim = down_weight.size()[1]
out_dim = up_weight.size()[0]
conv2d = len(down_weight.size()) == 4
kernel_size = None if not conv2d else down_weight.size()[2:4]
# print(lora_module_name, network_dim, alpha, in_dim, out_dim, kernel_size)
# make original weight if not exist
if lora_module_name not in merged_sd:
weight = torch.zeros((out_dim, in_dim, *kernel_size) if conv2d else (out_dim, in_dim), dtype=merge_dtype)
if device:
weight = weight.to(device)
else:
weight = merged_sd[lora_module_name]
# make original weight if not exist
if lora_module_name not in merged_sd:
weight = torch.zeros((out_dim, in_dim, *kernel_size) if conv2d else (out_dim, in_dim), dtype=merge_dtype)
if device:
weight = weight.to(device)
else:
weight = merged_sd[lora_module_name]
# merge to weight
if device:
up_weight = up_weight.to(device)
down_weight = down_weight.to(device)
# merge to weight
if device:
up_weight = up_weight.to(device)
down_weight = down_weight.to(device)
# W <- W + U * D
scale = (alpha / network_dim)
# W <- W + U * D
scale = alpha / network_dim
if device: # and isinstance(scale, torch.Tensor):
scale = scale.to(device)
if device: # and isinstance(scale, torch.Tensor):
scale = scale.to(device)
if not conv2d: # linear
weight = weight + ratio * (up_weight @ down_weight) * scale
elif kernel_size == (1, 1):
weight = weight + ratio * (up_weight.squeeze(3).squeeze(2) @ down_weight.squeeze(3).squeeze(2)
).unsqueeze(2).unsqueeze(3) * scale
else:
conved = torch.nn.functional.conv2d(down_weight.permute(1, 0, 2, 3), up_weight).permute(1, 0, 2, 3)
weight = weight + ratio * conved * scale
if not conv2d: # linear
weight = weight + ratio * (up_weight @ down_weight) * scale
elif kernel_size == (1, 1):
weight = (
weight
+ ratio
* (up_weight.squeeze(3).squeeze(2) @ down_weight.squeeze(3).squeeze(2)).unsqueeze(2).unsqueeze(3)
* scale
)
else:
conved = torch.nn.functional.conv2d(down_weight.permute(1, 0, 2, 3), up_weight).permute(1, 0, 2, 3)
weight = weight + ratio * conved * scale
merged_sd[lora_module_name] = weight
merged_sd[lora_module_name] = weight
# extract from merged weights
print("extract new lora...")
merged_lora_sd = {}
with torch.no_grad():
for lora_module_name, mat in tqdm(list(merged_sd.items())):
conv2d = (len(mat.size()) == 4)
kernel_size = None if not conv2d else mat.size()[2:4]
conv2d_3x3 = conv2d and kernel_size != (1, 1)
out_dim, in_dim = mat.size()[0:2]
# extract from merged weights
print("extract new lora...")
merged_lora_sd = {}
with torch.no_grad():
for lora_module_name, mat in tqdm(list(merged_sd.items())):
conv2d = len(mat.size()) == 4
kernel_size = None if not conv2d else mat.size()[2:4]
conv2d_3x3 = conv2d and kernel_size != (1, 1)
out_dim, in_dim = mat.size()[0:2]
if conv2d:
if conv2d_3x3:
mat = mat.flatten(start_dim=1)
else:
mat = mat.squeeze()
if conv2d:
if conv2d_3x3:
mat = mat.flatten(start_dim=1)
else:
mat = mat.squeeze()
module_new_rank = new_conv_rank if conv2d_3x3 else new_rank
module_new_rank = min(module_new_rank, in_dim, out_dim) # LoRA rank cannot exceed the original dim
module_new_rank = new_conv_rank if conv2d_3x3 else new_rank
module_new_rank = min(module_new_rank, in_dim, out_dim) # LoRA rank cannot exceed the original dim
U, S, Vh = torch.linalg.svd(mat)
U, S, Vh = torch.linalg.svd(mat)
U = U[:, :module_new_rank]
S = S[:module_new_rank]
U = U @ torch.diag(S)
U = U[:, :module_new_rank]
S = S[:module_new_rank]
U = U @ torch.diag(S)
Vh = Vh[:module_new_rank, :]
Vh = Vh[:module_new_rank, :]
dist = torch.cat([U.flatten(), Vh.flatten()])
hi_val = torch.quantile(dist, CLAMP_QUANTILE)
low_val = -hi_val
dist = torch.cat([U.flatten(), Vh.flatten()])
hi_val = torch.quantile(dist, CLAMP_QUANTILE)
low_val = -hi_val
U = U.clamp(low_val, hi_val)
Vh = Vh.clamp(low_val, hi_val)
U = U.clamp(low_val, hi_val)
Vh = Vh.clamp(low_val, hi_val)
if conv2d:
U = U.reshape(out_dim, module_new_rank, 1, 1)
Vh = Vh.reshape(module_new_rank, in_dim, kernel_size[0], kernel_size[1])
if conv2d:
U = U.reshape(out_dim, module_new_rank, 1, 1)
Vh = Vh.reshape(module_new_rank, in_dim, kernel_size[0], kernel_size[1])
up_weight = U
down_weight = Vh
up_weight = U
down_weight = Vh
merged_lora_sd[lora_module_name + '.lora_up.weight'] = up_weight.to("cpu").contiguous()
merged_lora_sd[lora_module_name + '.lora_down.weight'] = down_weight.to("cpu").contiguous()
merged_lora_sd[lora_module_name + '.alpha'] = torch.tensor(module_new_rank)
merged_lora_sd[lora_module_name + ".lora_up.weight"] = up_weight.to("cpu").contiguous()
merged_lora_sd[lora_module_name + ".lora_down.weight"] = down_weight.to("cpu").contiguous()
merged_lora_sd[lora_module_name + ".alpha"] = torch.tensor(module_new_rank)
return merged_lora_sd
return merged_lora_sd
def merge(args):
assert len(args.models) == len(args.ratios), f"number of models must be equal to number of ratios / モデルの数と重みの数は合わせてください"
assert len(args.models) == len(args.ratios), f"number of models must be equal to number of ratios / モデルの数と重みの数は合わせてください"
def str_to_dtype(p):
if p == 'float':
return torch.float
if p == 'fp16':
return torch.float16
if p == 'bf16':
return torch.bfloat16
return None
def str_to_dtype(p):
if p == "float":
return torch.float
if p == "fp16":
return torch.float16
if p == "bf16":
return torch.bfloat16
return None
merge_dtype = str_to_dtype(args.precision)
save_dtype = str_to_dtype(args.save_precision)
if save_dtype is None:
save_dtype = merge_dtype
merge_dtype = str_to_dtype(args.precision)
save_dtype = str_to_dtype(args.save_precision)
if save_dtype is None:
save_dtype = merge_dtype
new_conv_rank = args.new_conv_rank if args.new_conv_rank is not None else args.new_rank
state_dict = merge_lora_models(args.models, args.ratios, args.new_rank, new_conv_rank, args.device, merge_dtype)
new_conv_rank = args.new_conv_rank if args.new_conv_rank is not None else args.new_rank
state_dict = merge_lora_models(args.models, args.ratios, args.new_rank, new_conv_rank, args.device, merge_dtype)
print(f"saving model to: {args.save_to}")
save_to_file(args.save_to, state_dict, save_dtype)
print(f"saving model to: {args.save_to}")
save_to_file(args.save_to, state_dict, save_dtype)
def setup_parser() -> argparse.ArgumentParser:
parser = argparse.ArgumentParser()
parser.add_argument("--save_precision", type=str, default=None,
choices=[None, "float", "fp16", "bf16"], help="precision in saving, same to merging if omitted / 保存時に精度を変更して保存する、省略時はマージ時の精度と同じ")
parser.add_argument("--precision", type=str, default="float",
choices=["float", "fp16", "bf16"], help="precision in merging (float is recommended) / マージの計算時の精度floatを推奨")
parser.add_argument("--save_to", type=str, default=None,
help="destination file name: ckpt or safetensors file / 保存先のファイル名、ckptまたはsafetensors")
parser.add_argument("--models", type=str, nargs='*',
help="LoRA models to merge: ckpt or safetensors file / マージするLoRAモデル、ckptまたはsafetensors")
parser.add_argument("--ratios", type=float, nargs='*',
help="ratios for each model / それぞれのLoRAモデルの比率")
parser.add_argument("--new_rank", type=int, default=4,
help="Specify rank of output LoRA / 出力するLoRAのrank (dim)")
parser.add_argument("--new_conv_rank", type=int, default=None,
help="Specify rank of output LoRA for Conv2d 3x3, None for same as new_rank / 出力するConv2D 3x3 LoRAのrank (dim)、Noneでnew_rankと同じ")
parser.add_argument("--device", type=str, default=None, help="device to use, cuda for GPU / 計算を行うデバイス、cuda でGPUを使う")
parser = argparse.ArgumentParser()
parser.add_argument(
"--save_precision",
type=str,
default=None,
choices=[None, "float", "fp16", "bf16"],
help="precision in saving, same to merging if omitted / 保存時に精度を変更して保存する、省略時はマージ時の精度と同じ",
)
parser.add_argument(
"--precision",
type=str,
default="float",
choices=["float", "fp16", "bf16"],
help="precision in merging (float is recommended) / マージの計算時の精度floatを推奨",
)
parser.add_argument(
"--save_to", type=str, default=None, help="destination file name: ckpt or safetensors file / 保存先のファイル名、ckptまたはsafetensors"
)
parser.add_argument(
"--models", type=str, nargs="*", help="LoRA models to merge: ckpt or safetensors file / マージするLoRAモデル、ckptまたはsafetensors"
)
parser.add_argument("--ratios", type=float, nargs="*", help="ratios for each model / それぞれのLoRAモデルの比率")
parser.add_argument("--new_rank", type=int, default=4, help="Specify rank of output LoRA / 出力するLoRAのrank (dim)")
parser.add_argument(
"--new_conv_rank",
type=int,
default=None,
help="Specify rank of output LoRA for Conv2d 3x3, None for same as new_rank / 出力するConv2D 3x3 LoRAのrank (dim)、Noneでnew_rankと同じ",
)
parser.add_argument("--device", type=str, default=None, help="device to use, cuda for GPU / 計算を行うデバイス、cuda でGPUを使う")
return parser
return parser
if __name__ == '__main__':
parser = setup_parser()
if __name__ == "__main__":
parser = setup_parser()
args = parser.parse_args()
merge(args)
args = parser.parse_args()
merge(args)
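A compact sketch of the rank-reduction step `merge_lora_models` performs after merging: approximate the merged weight delta with a rank-`r` factorization via truncated SVD, which gives the best rank-`r` approximation in the least-squares sense (the matrix size and rank are illustrative):

```python
import torch

mat = torch.randn(64, 32)  # merged delta weight (out_dim x in_dim), illustrative
r = 4

U, S, Vh = torch.linalg.svd(mat)
up_weight = U[:, :r] @ torch.diag(S[:r])  # lora_up:   out_dim x r
down_weight = Vh[:r, :]                   # lora_down: r x in_dim
approx = up_weight @ down_weight          # best rank-r approximation of mat
```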

View File

@@ -2,11 +2,11 @@ accelerate==0.19.0
transformers==4.30.2
diffusers[torch]==0.18.2
ftfy==6.1.1
albumentations==1.3.0
# albumentations==1.3.0
opencv-python==4.7.0.68
einops==0.6.0
pytorch-lightning==1.9.0
bitsandbytes==0.35.0
# bitsandbytes==0.39.1
tensorboard==2.10.1
safetensors==0.3.1
# gradio==3.16.2

View File

@@ -93,7 +93,8 @@ def replace_vae_modules(vae: diffusers.models.AutoencoderKL, mem_eff_attn, xform
if mem_eff_attn:
replace_vae_attn_to_memory_efficient()
elif xformers:
replace_vae_attn_to_xformers()
# replace_vae_attn_to_xformers() # xformers raises an error depending on the resolution
vae.set_use_memory_efficient_attention_xformers(True) # use this one for now
elif sdpa:
replace_vae_attn_to_sdpa()
@@ -511,7 +512,7 @@ class PipelineLike:
emb1 = sdxl_train_util.get_timestep_embedding(torch.FloatTensor([original_height, original_width]).unsqueeze(0), 256)
emb2 = sdxl_train_util.get_timestep_embedding(torch.FloatTensor([crop_top, crop_left]).unsqueeze(0), 256)
emb3 = sdxl_train_util.get_timestep_embedding(torch.FloatTensor([height, width]).unsqueeze(0), 256)
c_vector = torch.cat([emb1, emb2, emb3], dim=1).to(self.device, dtype=text_embeddings.dtype)
c_vector = torch.cat([emb1, emb2, emb3], dim=1).to(self.device, dtype=text_embeddings.dtype).repeat(batch_size, 1)
uc_vector = c_vector.clone().to(self.device, dtype=text_embeddings.dtype)
c_vector = torch.cat([text_pool, c_vector], dim=1)
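The appended `.repeat(batch_size, 1)` is the batch size > 1 fix for `sdxl_gen_imgs.py`: the three size/crop embeddings are computed once and must be tiled per batch item before being concatenated with the pooled text embedding. A shape-only sketch with stand-in tensors (dimensions follow the 256-per-value embedding above and SDXL's 1280-dim pooled output):

```python
import torch

batch_size = 4
# stand-ins for the three conditioning embeddings: original size, crop offsets,
# and target size, each two values embedded to 256 dims -> (1, 512)
emb1 = torch.randn(1, 512)
emb2 = torch.randn(1, 512)
emb3 = torch.randn(1, 512)

c_vector = torch.cat([emb1, emb2, emb3], dim=1)  # (1, 1536)
c_vector = c_vector.repeat(batch_size, 1)        # (4, 1536), one row per batch item
text_pool = torch.randn(batch_size, 1280)        # pooled text-encoder output per item
c_vector = torch.cat([text_pool, c_vector], dim=1)  # (4, 2816)
```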
@@ -959,6 +960,8 @@ def get_unweighted_text_embeddings(
text_embedding = enc_out["hidden_states"][-2]
if pool is None:
pool = enc_out.get("text_embeds", None) # use 1st chunk, if provided
if pool is not None:
pool = train_util.pool_workaround(text_encoder, enc_out["last_hidden_state"], text_input_chunk, eos)
if no_boseos_middle:
if i == 0:
@@ -977,6 +980,8 @@ def get_unweighted_text_embeddings(
enc_out = text_encoder(text_input, output_hidden_states=True, return_dict=True)
text_embeddings = enc_out["hidden_states"][-2]
pool = enc_out.get("text_embeds", None) # text encoder 1 doesn't return this
if pool is not None:
pool = train_util.pool_workaround(text_encoder, enc_out["last_hidden_state"], text_input_chunk, eos)
return text_embeddings, pool
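Background for the pooled-output fix: `CLIPTextModelWithProjection` pools the hidden state at `argmax(input_ids)`, which stops pointing at the EOS token once Textual Inversion adds tokens whose ids are larger than the EOS id. A minimal sketch of the idea behind `train_util.pool_workaround`, pooling at the first explicit EOS position instead (the actual implementation may differ in detail):

```python
import torch

def pool_at_eos(text_encoder, last_hidden_state, input_ids, eos_token_id):
    # find the first EOS token per sequence instead of trusting argmax(input_ids)
    eos_positions = (input_ids == eos_token_id).int().argmax(dim=1).to(last_hidden_state.device)
    batch_indices = torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device)
    pooled = last_hidden_state[batch_indices, eos_positions]
    # apply the same projection CLIP applies to its pooled output
    return text_encoder.text_projection(pooled)
```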
@@ -2019,6 +2024,8 @@ def main(args):
for i, (image, prompt, negative_prompts, seed, clip_prompt) in enumerate(
zip(images, prompts, negative_prompts, seeds, clip_prompts)
):
if highres_fix:
seed -= 1 # record original seed
metadata = PngInfo()
metadata.add_text("prompt", prompt)
metadata.add_text("seed", str(seed))

View File

@@ -213,7 +213,7 @@ if __name__ == "__main__":
enc_out = text_model2(tokens, output_hidden_states=True, return_dict=True)
text_embedding2_penu = enc_out["hidden_states"][-2]
# print("hidden_states2", text_embedding2_penu.shape)
text_embedding2_pool = enc_out["text_embeds"]
text_embedding2_pool = enc_out["text_embeds"] # do not support Textual Inversion
# concat and finish
text_embedding = torch.cat([text_embedding1, text_embedding2_penu], dim=2)

View File

@@ -24,9 +24,8 @@ import library.custom_train_functions as custom_train_functions
from library.custom_train_functions import (
apply_snr_weight,
prepare_scheduler_for_custom_training,
pyramid_noise_like,
apply_noise_offset,
scale_v_prediction_loss_like_noise_prediction,
add_v_prediction_like_loss,
)
from library.sdxl_original_unet import SdxlUNet2DConditionModel
@@ -175,7 +174,7 @@ def train(args):
# Windows builds of xformers sometimes fail to train in float, so an option to disable xformers needs to be available
accelerator.print("Disable Diffusers' xformers")
train_util.replace_unet_modules(unet, args.mem_eff_attn, args.xformers, args.sdpa)
if torch.__version__ >= "2.0.0": # PyTorch 2.0.0 以上対応のxformersなら以下が使える
if torch.__version__ >= "2.0.0": # PyTorch 2.0.0 以上対応のxformersなら以下が使える
vae.set_use_memory_efficient_attention_xformers(args.xformers)
# prepare for training
@@ -338,9 +337,7 @@ def train(args):
accelerator.print(f" num examples / サンプル数: {train_dataset_group.num_train_images}")
accelerator.print(f" num batches per epoch / 1epochのバッチ数: {len(train_dataloader)}")
accelerator.print(f" num epochs / epoch数: {num_train_epochs}")
accelerator.print(f" batch size per device / バッチサイズ: {', '.join([str(d.batch_size) for d in train_dataset_group.datasets])}")
# accelerator.print(
# f" total train batch size (with parallel & distributed & accumulation) / 総バッチサイズ(並列学習、勾配合計含む): {total_batch_size}"
# )
@@ -459,13 +456,17 @@ def train(args):
target = noise
if args.min_snr_gamma:
if args.min_snr_gamma or args.scale_v_pred_loss_like_noise_pred or args.v_pred_like_loss:
# do not mean over batch dimension for snr weight or scale v-pred loss
loss = torch.nn.functional.mse_loss(noise_pred.float(), target.float(), reduction="none")
loss = loss.mean([1, 2, 3])
if args.min_snr_gamma:
loss = apply_snr_weight(loss, timesteps, noise_scheduler, args.min_snr_gamma)
if args.scale_v_pred_loss_like_noise_pred:
loss = scale_v_prediction_loss_like_noise_prediction(loss, timesteps, noise_scheduler)
if args.v_pred_like_loss:
loss = add_v_prediction_like_loss(loss, timesteps, noise_scheduler, args.v_pred_like_loss)
loss = loss.mean() # mean over batch dimension
else:
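How `--v_pred_like_loss` slots in: once the per-sample MSE has been reduced over the spatial dimensions, a fraction of a v-prediction-style loss is added on top of the epsilon-prediction loss, which raises the loss at early (high-noise, low-SNR) timesteps. A sketch of the idea, assuming the scheduler has per-timestep SNR values precomputed (as `prepare_scheduler_for_custom_training` arranges); the real `add_v_prediction_like_loss` may differ in detail:

```python
import torch

def add_v_pred_like_loss_sketch(loss, timesteps, noise_scheduler, ratio):
    # loss: per-sample epsilon-prediction MSE, shape (batch,)
    # SNR/(SNR+1) relates the epsilon and v objectives; dividing by it
    # up-weights low-SNR (early, noisy) timesteps
    snr_t = torch.stack([noise_scheduler.all_snr[t] for t in timesteps])  # assumes all_snr was precomputed
    snr_t = torch.minimum(snr_t, torch.full_like(snr_t, 1000.0))  # clamp for stability
    scale = snr_t / (snr_t + 1.0)
    return loss + loss / scale * ratio
```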

View File

@@ -4,176 +4,187 @@ import cv2
import torch
from safetensors.torch import load_file
from diffusers import UNet2DConditionModel
from diffusers.models.unet_2d_condition import UNet2DConditionOutput
from library.original_unet import UNet2DConditionModel, SampleOutput
import library.model_util as model_util
class ControlNetInfo(NamedTuple):
    unet: Any
    net: Any
    prep: Any
    weight: float
    ratio: float
class ControlNet(torch.nn.Module):
    def __init__(self) -> None:
        super().__init__()

        # make control model
        self.control_model = torch.nn.Module()

        dims = [320, 320, 320, 320, 640, 640, 640, 1280, 1280, 1280, 1280, 1280]
        zero_convs = torch.nn.ModuleList()
        for i, dim in enumerate(dims):
            sub_list = torch.nn.ModuleList([torch.nn.Conv2d(dim, dim, 1)])
            zero_convs.append(sub_list)
        self.control_model.add_module("zero_convs", zero_convs)

        middle_block_out = torch.nn.Conv2d(1280, 1280, 1)
        self.control_model.add_module("middle_block_out", torch.nn.ModuleList([middle_block_out]))

        dims = [16, 16, 32, 32, 96, 96, 256, 320]
        strides = [1, 1, 2, 1, 2, 1, 2, 1]
        prev_dim = 3
        input_hint_block = torch.nn.Sequential()
        for i, (dim, stride) in enumerate(zip(dims, strides)):
            input_hint_block.append(torch.nn.Conv2d(prev_dim, dim, 3, stride, 1))
            if i < len(dims) - 1:
                input_hint_block.append(torch.nn.SiLU())
            prev_dim = dim
        self.control_model.add_module("input_hint_block", input_hint_block)
def load_control_net(v2, unet, model):
    device = unet.device

    # convert keys from the control SD model, extract only the parts matching the U-Net, and load them into the Diffusers U-Net
    # load the state dict
    print(f"ControlNet: loading control SD model : {model}")

    if model_util.is_safetensors(model):
        ctrl_sd_sd = load_file(model)
    else:
        ctrl_sd_sd = torch.load(model, map_location="cpu")
        ctrl_sd_sd = ctrl_sd_sd.pop("state_dict", ctrl_sd_sd)

    # make the weights loadable into the U-Net; the ControlNet file is an SD-format state dict, so load it as such
    is_difference = "difference" in ctrl_sd_sd
    print("ControlNet: loading difference:", is_difference)

    # some keys do not exist in the ControlNet, so first build all SD-format keys from the current U-Net
    # these also become the base weights for Transfer Control
    ctrl_unet_sd_sd = model_util.convert_unet_state_dict_to_sd(v2, unet.state_dict())

    # copy so the original U-Net is unaffected; the prefix is missing, so add it
    for key in list(ctrl_unet_sd_sd.keys()):
        ctrl_unet_sd_sd["model.diffusion_model." + key] = ctrl_unet_sd_sd.pop(key).clone()

    zero_conv_sd = {}
    for key in list(ctrl_sd_sd.keys()):
        if key.startswith("control_"):
            unet_key = "model.diffusion_" + key[len("control_") :]
            if unet_key not in ctrl_unet_sd_sd:  # zero conv
                zero_conv_sd[key] = ctrl_sd_sd[key]
                continue
            if is_difference:  # Transfer Control
                ctrl_unet_sd_sd[unet_key] += ctrl_sd_sd[key].to(device, dtype=unet.dtype)
            else:
                ctrl_unet_sd_sd[unet_key] = ctrl_sd_sd[key].to(device, dtype=unet.dtype)

    unet_config = model_util.create_unet_diffusers_config(v2)
    ctrl_unet_du_sd = model_util.convert_ldm_unet_checkpoint(v2, ctrl_unet_sd_sd, unet_config)  # state dict of the Diffusers-format ControlNet

    # create the ControlNet's U-Net
    ctrl_unet = UNet2DConditionModel(**unet_config)
    info = ctrl_unet.load_state_dict(ctrl_unet_du_sd)
    print("ControlNet: loading Control U-Net:", info)

    # create the non-U-Net part of the ControlNet
    # TODO support middle only
    ctrl_net = ControlNet()
    info = ctrl_net.load_state_dict(zero_conv_sd)
    print("ControlNet: loading ControlNet:", info)

    ctrl_unet.to(unet.device, dtype=unet.dtype)
    ctrl_net.to(unet.device, dtype=unet.dtype)

    return ctrl_unet, ctrl_net
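A brief usage sketch for the loader above, assuming a Diffusers-format SD 1.x U-Net is already instantiated and a ControlNet checkpoint (full weights or a "difference"/Transfer Control file) exists on disk; the file name is illustrative:

```python
# v2=False for SD 1.x; `unet` is an already-loaded Diffusers U-Net
ctrl_unet, ctrl_net = load_control_net(False, unet, "control_sd15_canny.safetensors")
# ctrl_unet: the ControlNet's copy of the U-Net; ctrl_net: zero convs + hint encoder
```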
def load_preprocess(prep_type: str):
    if prep_type is None or prep_type.lower() == "none":
        return None

    if prep_type.startswith("canny"):
        args = prep_type.split("_")
        th1 = int(args[1]) if len(args) >= 2 else 63
        th2 = int(args[2]) if len(args) >= 3 else 191

        def canny(img):
            img = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
            return cv2.Canny(img, th1, th2)

        return canny

    print("Unsupported prep type:", prep_type)
    return None
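Usage sketch for the preprocessor factory above: the prep type string encodes the method and its thresholds, so `canny_63_191` selects Canny with thresholds 63 and 191 (also the defaults when the numbers are omitted):

```python
import numpy as np

prep = load_preprocess("canny_63_191")
hint = (np.random.rand(512, 512, 3) * 255).astype(np.uint8)  # stand-in for a real RGB hint image
edges = prep(hint)  # single-channel uint8 edge map from cv2.Canny
```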
def preprocess_ctrl_net_hint_image(image):
    image = np.array(image).astype(np.float32) / 255.0
    # the ControlNet sample code uses cv2, but images here are loaded via Gradio, so they are actually RGB
    # image = image[:, :, ::-1].copy()  # rgb to bgr
    image = image[None].transpose(0, 3, 1, 2)  # nchw
    image = torch.from_numpy(image)
    return image  # 0 to 1
def get_guided_hints(control_nets: List[ControlNetInfo], num_latent_input, b_size, hints):
    guided_hints = []
    for i, cnet_info in enumerate(control_nets):
        # hints must be ordered: cnet1 for image 1, cnet2 for image 1, cnet3 for image 1, cnet1 for image 2, cnet2 for image 2 ...
        b_hints = []
        if len(hints) == 1:  # use the same image as the hint for all
            hint = hints[0]
            if cnet_info.prep is not None:
                hint = cnet_info.prep(hint)
            hint = preprocess_ctrl_net_hint_image(hint)
            b_hints = [hint for _ in range(b_size)]
        else:
            for bi in range(b_size):
                hint = hints[(bi * len(control_nets) + i) % len(hints)]
                if cnet_info.prep is not None:
                    hint = cnet_info.prep(hint)
                hint = preprocess_ctrl_net_hint_image(hint)
                b_hints.append(hint)
        b_hints = torch.cat(b_hints, dim=0)
        b_hints = b_hints.to(cnet_info.unet.device, dtype=cnet_info.unet.dtype)

        guided_hint = cnet_info.net.control_model.input_hint_block(b_hints)
        guided_hints.append(guided_hint)
    return guided_hints
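The hint ordering required above (all ControlNets for image 1, then all for image 2, and so on) is what the index expression `hints[(bi * len(control_nets) + i) % len(hints)]` assumes; a quick check with two ControlNets and batch size 2:

```python
control_nets = ["cnet0", "cnet1"]  # placeholders for ControlNetInfo entries
hints = ["img0_cnet0", "img0_cnet1", "img1_cnet0", "img1_cnet1"]

for i in range(len(control_nets)):  # ControlNet index
    for bi in range(2):  # batch index
        print(i, bi, hints[(bi * len(control_nets) + i) % len(hints)])
# cnet 0 gets img0_cnet0 / img1_cnet0; cnet 1 gets img0_cnet1 / img1_cnet1
```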
def call_unet_and_control_net(step, num_latent_input, original_unet, control_nets: List[ControlNetInfo], guided_hints, current_ratio, sample, timestep, encoder_hidden_states):
def call_unet_and_control_net(
    step,
    num_latent_input,
    original_unet,
    control_nets: List[ControlNetInfo],
    guided_hints,
    current_ratio,
    sample,
    timestep,
    encoder_hidden_states,
    encoder_hidden_states_for_control_net,
):
    # ControlNet
    # with multiple ControlNets, apply them alternately instead of merging their outputs
    cnet_cnt = len(control_nets)
    cnet_idx = step % cnet_cnt
    cnet_info = control_nets[cnet_idx]

    # print(current_ratio, cnet_info.prep, cnet_info.weight, cnet_info.ratio)
    if cnet_info.ratio < current_ratio:
        return original_unet(sample, timestep, encoder_hidden_states)

    guided_hint = guided_hints[cnet_idx]
    guided_hint = guided_hint.repeat((num_latent_input, 1, 1, 1))
    outs = unet_forward(True, cnet_info.net, cnet_info.unet, guided_hint, None, sample, timestep, encoder_hidden_states_for_control_net)
    outs = [o * cnet_info.weight for o in outs]

    # U-Net
    return unet_forward(False, cnet_info.net, original_unet, None, outs, sample, timestep, encoder_hidden_states)
"""
@@ -204,118 +215,123 @@ def call_unet_and_control_net(step, num_latent_input, original_unet, control_net
"""
def unet_forward(
    is_control_net,
    control_net: ControlNet,
    unet: UNet2DConditionModel,
    guided_hint,
    ctrl_outs,
    sample,
    timestep,
    encoder_hidden_states,
):
    # copy from UNet2DConditionModel
    default_overall_up_factor = 2**unet.num_upsamplers

    forward_upsample_size = False
    upsample_size = None

    if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]):
        print("Forward upsample size to force interpolation output size.")
        forward_upsample_size = True

    # 0. center input if necessary
    if unet.config.center_input_sample:
        sample = 2 * sample - 1.0

    # 1. time
    timesteps = timestep
    if not torch.is_tensor(timesteps):
        # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
        # This would be a good case for the `match` statement (Python 3.10+)
        is_mps = sample.device.type == "mps"
        if isinstance(timestep, float):
            dtype = torch.float32 if is_mps else torch.float64
        else:
            dtype = torch.int32 if is_mps else torch.int64
        timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
    elif len(timesteps.shape) == 0:
        timesteps = timesteps[None].to(sample.device)

    # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
    timesteps = timesteps.expand(sample.shape[0])

    t_emb = unet.time_proj(timesteps)

    # timesteps does not contain any weights and will always return f32 tensors
    # but time_embedding might actually be running in fp16. so we need to cast here.
    # there might be better ways to encapsulate this.
    t_emb = t_emb.to(dtype=unet.dtype)
    emb = unet.time_embedding(t_emb)

    outs = []  # output of ControlNet
    zc_idx = 0

    # 2. pre-process
    sample = unet.conv_in(sample)
    if is_control_net:
        sample += guided_hint
        outs.append(control_net.control_model.zero_convs[zc_idx][0](sample))  # , emb, encoder_hidden_states))
        zc_idx += 1

    # 3. down
    down_block_res_samples = (sample,)
    for downsample_block in unet.down_blocks:
        if downsample_block.has_cross_attention:
            sample, res_samples = downsample_block(
                hidden_states=sample,
                temb=emb,
                encoder_hidden_states=encoder_hidden_states,
            )
        else:
            sample, res_samples = downsample_block(hidden_states=sample, temb=emb)
        if is_control_net:
            for rs in res_samples:
                outs.append(control_net.control_model.zero_convs[zc_idx][0](rs))  # , emb, encoder_hidden_states))
                zc_idx += 1

        down_block_res_samples += res_samples

    # 4. mid
    sample = unet.mid_block(sample, emb, encoder_hidden_states=encoder_hidden_states)
    if is_control_net:
        outs.append(control_net.control_model.middle_block_out[0](sample))
        return outs

    if not is_control_net:
        sample += ctrl_outs.pop()

    # 5. up
    for i, upsample_block in enumerate(unet.up_blocks):
        is_final_block = i == len(unet.up_blocks) - 1

        res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
        down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]

        if not is_control_net and len(ctrl_outs) > 0:
            res_samples = list(res_samples)
            apply_ctrl_outs = ctrl_outs[-len(res_samples) :]
            ctrl_outs = ctrl_outs[: -len(res_samples)]
            for j in range(len(res_samples)):
                res_samples[j] = res_samples[j] + apply_ctrl_outs[j]
            res_samples = tuple(res_samples)

        # if we have not reached the final block and need to forward the
        # upsample size, we do it here
        if not is_final_block and forward_upsample_size:
            upsample_size = down_block_res_samples[-1].shape[2:]

        if upsample_block.has_cross_attention:
            sample = upsample_block(
                hidden_states=sample,
                temb=emb,
                res_hidden_states_tuple=res_samples,
                encoder_hidden_states=encoder_hidden_states,
                upsample_size=upsample_size,
            )
        else:
            sample = upsample_block(
                hidden_states=sample, temb=emb, res_hidden_states_tuple=res_samples, upsample_size=upsample_size
            )

    # 6. post-process
    sample = unet.conv_norm_out(sample)
    sample = unet.conv_act(sample)
    sample = unet.conv_out(sample)

    return SampleOutput(sample=sample)

View File

@@ -31,9 +31,8 @@ from library.custom_train_functions import (
apply_snr_weight,
get_weighted_text_embeddings,
prepare_scheduler_for_custom_training,
pyramid_noise_like,
apply_noise_offset,
scale_v_prediction_loss_like_noise_prediction,
add_v_prediction_like_loss,
)
@@ -792,6 +791,8 @@ class NetworkTrainer:
loss = apply_snr_weight(loss, timesteps, noise_scheduler, args.min_snr_gamma)
if args.scale_v_pred_loss_like_noise_pred:
loss = scale_v_prediction_loss_like_noise_prediction(loss, timesteps, noise_scheduler)
if args.v_pred_like_loss:
loss = add_v_prediction_like_loss(loss, timesteps, noise_scheduler, args.v_pred_like_loss)
loss = loss.mean()  # this is already a mean, so no need to divide by batch_size

View File

@@ -24,6 +24,7 @@ from library.custom_train_functions import (
apply_snr_weight,
prepare_scheduler_for_custom_training,
scale_v_prediction_loss_like_noise_prediction,
add_v_prediction_like_loss,
)
imagenet_templates_small = [
@@ -566,6 +567,8 @@ class TextualInversionTrainer:
loss = apply_snr_weight(loss, timesteps, noise_scheduler, args.min_snr_gamma)
if args.scale_v_pred_loss_like_noise_pred:
loss = scale_v_prediction_loss_like_noise_prediction(loss, timesteps, noise_scheduler)
if args.v_pred_like_loss:
loss = add_v_prediction_like_loss(loss, timesteps, noise_scheduler, args.v_pred_like_loss)
loss = loss.mean()  # this is already a mean, so no need to divide by batch_size