diff --git a/flux_train.py b/flux_train.py
index 91ae3af5..79c44d7b 100644
--- a/flux_train.py
+++ b/flux_train.py
@@ -29,7 +29,7 @@ init_ipex()
 
 from accelerate.utils import set_seed
 from library import deepspeed_utils, flux_train_utils, flux_utils, strategy_base, strategy_flux
-from library.sd3_train_utils import load_prompts, FlowMatchEulerDiscreteScheduler
+from library.sd3_train_utils import FlowMatchEulerDiscreteScheduler
 
 import library.train_util as train_util
 
@@ -241,7 +241,7 @@ def train(args):
 
             text_encoding_strategy: strategy_flux.FluxTextEncodingStrategy = strategy_base.TextEncodingStrategy.get_strategy()
 
-            prompts = load_prompts(args.sample_prompts)
+            prompts = train_util.load_prompts(args.sample_prompts)
             sample_prompts_te_outputs = {}  # key: prompt, value: text encoder outputs
             with accelerator.autocast(), torch.no_grad():
                 for prompt_dict in prompts:
diff --git a/flux_train_network.py b/flux_train_network.py
index 9cc8811b..cffeb3b1 100644
--- a/flux_train_network.py
+++ b/flux_train_network.py
@@ -231,7 +231,7 @@ class FluxNetworkTrainer(train_network.NetworkTrainer):
                 tokenize_strategy: strategy_flux.FluxTokenizeStrategy = strategy_base.TokenizeStrategy.get_strategy()
                 text_encoding_strategy: strategy_flux.FluxTextEncodingStrategy = strategy_base.TextEncodingStrategy.get_strategy()
 
-                prompts = sd3_train_utils.load_prompts(args.sample_prompts)
+                prompts = train_util.load_prompts(args.sample_prompts)
                 sample_prompts_te_outputs = {}  # key: prompt, value: text encoder outputs
                 with accelerator.autocast(), torch.no_grad():
                     for prompt_dict in prompts:
diff --git a/library/flux_train_utils.py b/library/flux_train_utils.py
index b3c9184f..fa673a2f 100644
--- a/library/flux_train_utils.py
+++ b/library/flux_train_utils.py
@@ -15,7 +15,6 @@ from PIL import Image
 from safetensors.torch import save_file
 
 from library import flux_models, flux_utils, strategy_base, train_util
-from library.sd3_train_utils import load_prompts
 from library.device_utils import init_ipex, clean_memory_on_device
 
 init_ipex()
@@ -70,7 +69,7 @@ def sample_images(
         text_encoders = [accelerator.unwrap_model(te) for te in text_encoders]
     # print([(te.parameters().__next__().device if te is not None else None) for te in text_encoders])
 
-    prompts = load_prompts(args.sample_prompts)
+    prompts = train_util.load_prompts(args.sample_prompts)
 
     save_dir = args.output_dir + "/sample"
     os.makedirs(save_dir, exist_ok=True)
diff --git a/library/flux_utils.py b/library/flux_utils.py
index 7a1ec37b..86a2ec60 100644
--- a/library/flux_utils.py
+++ b/library/flux_utils.py
@@ -10,40 +10,21 @@ from safetensors import safe_open
 from accelerate import init_empty_weights
 from transformers import CLIPTextModel, CLIPConfig, T5EncoderModel, T5Config
 
-from library import flux_models
-
-from library.utils import setup_logging, MemoryEfficientSafeOpen
+from library.utils import setup_logging
 
 setup_logging()
 import logging
 
 logger = logging.getLogger(__name__)
 
+from library import flux_models
+from library.utils import load_safetensors
+
 MODEL_VERSION_FLUX_V1 = "flux1"
 MODEL_NAME_DEV = "dev"
 MODEL_NAME_SCHNELL = "schnell"
 
 
-# temporary copy from sd3_utils TODO refactor
-def load_safetensors(
-    path: str, device: Union[str, torch.device], disable_mmap: bool = False, dtype: Optional[torch.dtype] = torch.float32
-):
-    if disable_mmap:
-        # return safetensors.torch.load(open(path, "rb").read())
-        # use experimental loader
-        logger.info(f"Loading without mmap (experimental)")
-        state_dict = {}
-        with MemoryEfficientSafeOpen(path) as f:
-            for key in f.keys():
-                state_dict[key] = f.get_tensor(key).to(device, dtype=dtype)
-        return state_dict
-    else:
-        try:
-            return load_file(path, device=device)
-        except:
-            return load_file(path)  # prevent device invalid Error
-
-
 def analyze_checkpoint_state(ckpt_path: str) -> Tuple[bool, bool, Tuple[int, int], List[str]]:
     """
     チェックポイントの状態を分析し、DiffusersかBFLか、devかschnellか、ブロック数を計算して返す。
@@ -161,8 +142,14 @@ def load_ae(
     return ae
 
 
-def load_clip_l(ckpt_path: str, dtype: torch.dtype, device: Union[str, torch.device], disable_mmap: bool = False) -> CLIPTextModel:
-    logger.info("Building CLIP")
+def load_clip_l(
+    ckpt_path: Optional[str],
+    dtype: torch.dtype,
+    device: Union[str, torch.device],
+    disable_mmap: bool = False,
+    state_dict: Optional[dict] = None,
+) -> CLIPTextModel:
+    logger.info("Building CLIP-L")
     CLIPL_CONFIG = {
         "_name_or_path": "clip-vit-large-patch14/",
         "architectures": ["CLIPModel"],
@@ -255,15 +242,22 @@ def load_clip_l(ckpt_path: str, dtype: torch.dtype, device: Union[str, torch.dev
     with init_empty_weights():
         clip = CLIPTextModel._from_config(config)
 
-    logger.info(f"Loading state dict from {ckpt_path}")
-    sd = load_safetensors(ckpt_path, device=str(device), disable_mmap=disable_mmap, dtype=dtype)
+    if state_dict is not None:
+        sd = state_dict
+    else:
+        logger.info(f"Loading state dict from {ckpt_path}")
+        sd = load_safetensors(ckpt_path, device=str(device), disable_mmap=disable_mmap, dtype=dtype)
     info = clip.load_state_dict(sd, strict=False, assign=True)
-    logger.info(f"Loaded CLIP: {info}")
+    logger.info(f"Loaded CLIP-L: {info}")
     return clip
 
 
 def load_t5xxl(
-    ckpt_path: str, dtype: Optional[torch.dtype], device: Union[str, torch.device], disable_mmap: bool = False
+    ckpt_path: str,
+    dtype: Optional[torch.dtype],
+    device: Union[str, torch.device],
+    disable_mmap: bool = False,
+    state_dict: Optional[dict] = None,
 ) -> T5EncoderModel:
     T5_CONFIG_JSON = """
 {
@@ -303,8 +297,11 @@ def load_t5xxl(
     with init_empty_weights():
         t5xxl = T5EncoderModel._from_config(config)
 
-    logger.info(f"Loading state dict from {ckpt_path}")
-    sd = load_safetensors(ckpt_path, device=str(device), disable_mmap=disable_mmap, dtype=dtype)
+    if state_dict is not None:
+        sd = state_dict
+    else:
+        logger.info(f"Loading state dict from {ckpt_path}")
+        sd = load_safetensors(ckpt_path, device=str(device), disable_mmap=disable_mmap, dtype=dtype)
     info = t5xxl.load_state_dict(sd, strict=False, assign=True)
     logger.info(f"Loaded T5xxl: {info}")
     return t5xxl
diff --git a/library/sai_model_spec.py b/library/sai_model_spec.py
index ad72ec00..8896c047 100644
--- a/library/sai_model_spec.py
+++ b/library/sai_model_spec.py
@@ -57,8 +57,8 @@ ARCH_SD_V1 = "stable-diffusion-v1"
 ARCH_SD_V2_512 = "stable-diffusion-v2-512"
 ARCH_SD_V2_768_V = "stable-diffusion-v2-768-v"
 ARCH_SD_XL_V1_BASE = "stable-diffusion-xl-v1-base"
-ARCH_SD3_M = "stable-diffusion-3-medium"
-ARCH_SD3_UNKNOWN = "stable-diffusion-3"
+ARCH_SD3_M = "stable-diffusion-3"  # may be followed by "-m" or "-5-large" etc.
+# ARCH_SD3_UNKNOWN = "stable-diffusion-3"
 ARCH_FLUX_1_DEV = "flux-1-dev"
 ARCH_FLUX_1_UNKNOWN = "flux-1"
 
@@ -140,10 +140,7 @@ def build_metadata(
     if sdxl:
         arch = ARCH_SD_XL_V1_BASE
     elif sd3 is not None:
-        if sd3 == "m":
-            arch = ARCH_SD3_M
-        else:
-            arch = ARCH_SD3_UNKNOWN
+        arch = ARCH_SD3_M + "-" + sd3
     elif flux is not None:
         if flux == "dev":
             arch = ARCH_FLUX_1_DEV
diff --git a/library/sd3_models.py b/library/sd3_models.py
index ec704dcb..c81aa479 100644
--- a/library/sd3_models.py
+++ b/library/sd3_models.py
@@ -4,6 +4,7 @@
 # and some module/classes are contributed from KohakuBlueleaf. Thanks for the contribution!
 
 from ast import Tuple
+from dataclasses import dataclass
 from functools import partial
 import math
 from types import SimpleNamespace
@@ -15,6 +16,7 @@ import torch.nn as nn
 import torch.nn.functional as F
 from torch.utils.checkpoint import checkpoint
 from transformers import CLIPTokenizer, T5TokenizerFast
+
 from .utils import setup_logging
 
 setup_logging()
@@ -35,141 +37,23 @@ except:
     memory_efficient_attention = None
 
 
-# region tokenizer
-class SDTokenizer:
-    def __init__(
-        self, max_length=77, pad_with_end=True, tokenizer=None, has_start_token=True, pad_to_max_length=True, min_length=None
-    ):
-        """
-        サブクラスで各種の設定を行ってる。このクラスはその設定に基づき重み付きのトークン化を行うようだ。
-        Some settings are done in subclasses. This class seems to perform tokenization with weights based on those settings.
-        """
-        self.tokenizer: CLIPTokenizer = tokenizer
-        self.max_length = max_length
-        self.min_length = min_length
-        empty = self.tokenizer("")["input_ids"]
-        if has_start_token:
-            self.tokens_start = 1
-            self.start_token = empty[0]
-            self.end_token = empty[1]
-        else:
-            self.tokens_start = 0
-            self.start_token = None
-            self.end_token = empty[0]
-        self.pad_with_end = pad_with_end
-        self.pad_to_max_length = pad_to_max_length
-        vocab = self.tokenizer.get_vocab()
-        self.inv_vocab = {v: k for k, v in vocab.items()}
-        self.max_word_length = 8
-
-    def tokenize(self, text: Union[str, List[str]]) -> List[torch.Tensor]:
-        """
-        Tokenize the text without weights.
-        """
-        if type(text) == str:
-            text = [text]
-        batch_tokens = self.tokenizer(text, padding="max_length", truncation=True, max_length=self.max_length, return_tensors="pt")
-        # return tokens["input_ids"]
-
-        pad_token = self.end_token if self.pad_with_end else 0
-        for tokens in batch_tokens["input_ids"]:
-            assert tokens[0] == self.start_token, f"tokens[0]: {tokens[0]}, start_token: {self.start_token}"
-
-    def tokenize_with_weights(self, text: str, truncate_to_max_length=True, truncate_length=None):
-        """Tokenize the text, with weight values - presume 1.0 for all and ignore other features here.
-        The details aren't relevant for a reference impl, and weights themselves has weak effect on SD3."""
-        """
-        ja: テキストをトークン化し、重み値を持ちます - すべての値に1.0を仮定し、他の機能を無視します。
-        詳細は参考実装には関係なく、重み自体はSD3に対して弱い影響しかありません。へぇ～
-        """
-        if self.pad_with_end:
-            pad_token = self.end_token
-        else:
-            pad_token = 0
-        batch = []
-        if self.start_token is not None:
-            batch.append((self.start_token, 1.0))
-        to_tokenize = text.replace("\n", " ").split(" ")
-        to_tokenize = [x for x in to_tokenize if x != ""]
-        for word in to_tokenize:
-            batch.extend([(t, 1) for t in self.tokenizer(word)["input_ids"][self.tokens_start : -1]])
-        batch.append((self.end_token, 1.0))
-        print(len(batch), self.max_length, self.min_length)
-        if self.pad_to_max_length:
-            batch.extend([(pad_token, 1.0)] * (self.max_length - len(batch)))
-        if self.min_length is not None and len(batch) < self.min_length:
-            batch.extend([(pad_token, 1.0)] * (self.min_length - len(batch)))
-
-        # truncate to max_length
-        print(
-            f"batch: {batch}, max_length: {self.max_length}, truncate: {truncate_to_max_length}, truncate_length: {truncate_length}"
-        )
-        if truncate_to_max_length and len(batch) > self.max_length:
-            batch = batch[: self.max_length]
-        if truncate_length is not None and len(batch) > truncate_length:
-            batch = batch[:truncate_length]
-
-        return [batch]
-
-
-class T5XXLTokenizer(SDTokenizer):
-    """Wraps the T5 Tokenizer from HF into the SDTokenizer interface"""
-
-    def __init__(self):
-        super().__init__(
-            pad_with_end=False,
-            tokenizer=T5TokenizerFast.from_pretrained("google/t5-v1_1-xxl"),
-            has_start_token=False,
-            pad_to_max_length=False,
-            max_length=99999999,
-            min_length=77,
-        )
-
-
-class SDXLClipGTokenizer(SDTokenizer):
-    def __init__(self, tokenizer):
-        super().__init__(pad_with_end=False, tokenizer=tokenizer)
-
-
-class SD3Tokenizer:
-    def __init__(self, t5xxl=True, t5xxl_max_length: Optional[int] = 256):
-        if t5xxl_max_length is None:
-            t5xxl_max_length = 256
-
-        # TODO cache tokenizer settings locally or hold them in the repo like ComfyUI
-        clip_tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
-        self.clip_l = SDTokenizer(tokenizer=clip_tokenizer)
-        self.clip_g = SDXLClipGTokenizer(clip_tokenizer)
-        # self.clip_l = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
-        # self.clip_g = CLIPTokenizer.from_pretrained("laion/CLIP-ViT-bigG-14-laion2B-39B-b160k")
-        self.t5xxl = T5XXLTokenizer() if t5xxl else None
-        # t5xxl has 99999999 max length, clip has 77
-        self.t5xxl_max_length = t5xxl_max_length
-
-    def tokenize_with_weights(self, text: str):
-        return (
-            self.clip_l.tokenize_with_weights(text),
-            self.clip_g.tokenize_with_weights(text),
-            (
-                self.t5xxl.tokenize_with_weights(text, truncate_to_max_length=False, truncate_length=self.t5xxl_max_length)
-                if self.t5xxl is not None
-                else None
-            ),
-        )
-
-    def tokenize(self, text: str):
-        return (
-            self.clip_l.tokenize(text),
-            self.clip_g.tokenize(text),
-            (self.t5xxl.tokenize(text) if self.t5xxl is not None else None),
-        )
-
-
-# endregion
-
 # region mmdit
 
 
+@dataclass
+class SD3Params:
+    patch_size: int
+    depth: int
+    num_patches: int
+    pos_embed_max_size: int
+    adm_in_channels: int
+    qk_norm: Optional[str]
+    x_block_self_attn_layers: List[int]
+    context_embedder_in_features: int
+    context_embedder_out_features: int
+    model_type: str
+
+
 def get_2d_sincos_pos_embed(
     embed_dim,
     grid_size,
@@ -286,10 +170,6 @@ def timestep_embedding(t, dim, max_period=10000):
     return embedding
 
 
-def rmsnorm(x, eps=1e-6):
-    return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps)
-
-
 class PatchEmbed(nn.Module):
     def __init__(
         self,
@@ -301,8 +181,9 @@ class PatchEmbed(nn.Module):
         flatten=True,
         bias=True,
         strict_img_size=True,
-        dynamic_img_pad=True,
+        dynamic_img_pad=False,
     ):
+        # dynamic_img_pad and norm is omitted in SD3.5
         super().__init__()
         self.patch_size = patch_size
         self.flatten = flatten
@@ -432,6 +313,10 @@ class Embedder(nn.Module):
         return self.mlp(x)
 
 
+def rmsnorm(x, eps=1e-6):
+    return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps)
+
+
 class RMSNorm(torch.nn.Module):
     def __init__(
         self,
@@ -604,53 +489,6 @@ def attention(q, k, v, head_dim, mask=None, scale=None, mode="xformers"):
     return scores
 
 
-class SelfAttention(AttentionLinears):
-    def __init__(self, dim, num_heads=8, mode="xformers"):
-        super().__init__(dim, num_heads, qkv_bias=True, pre_only=False)
-        assert mode in MEMORY_LAYOUTS
-        self.head_dim = dim // num_heads
-        self.attn_mode = mode
-
-    def set_attn_mode(self, mode):
-        self.attn_mode = mode
-
-    def forward(self, x):
-        q, k, v = self.pre_attention(x)
-        attn_score = attention(q, k, v, self.head_dim, mode=self.attn_mode)
-        return self.post_attention(attn_score)
-
-
-class TransformerBlock(nn.Module):
-    def __init__(self, context_size, mode="xformers"):
-        super().__init__()
-        self.context_size = context_size
-        self.norm1 = nn.LayerNorm(context_size, elementwise_affine=False, eps=1e-6)
-        self.attn = SelfAttention(context_size, mode=mode)
-        self.norm2 = nn.LayerNorm(context_size, elementwise_affine=False, eps=1e-6)
-        self.mlp = MLP(
-            in_features=context_size,
-            hidden_features=context_size * 4,
-            act_layer=lambda: nn.GELU(approximate="tanh"),
-        )
-
-    def forward(self, x):
-        x = x + self.attn(self.norm1(x))
-        x = x + self.mlp(self.norm2(x))
-        return x
-
-
-class Transformer(nn.Module):
-    def __init__(self, context_size, num_layers, mode="xformers"):
-        super().__init__()
-        self.layers = nn.ModuleList([TransformerBlock(context_size, mode) for _ in range(num_layers)])
-        self.norm = nn.LayerNorm(context_size, elementwise_affine=False, eps=1e-6)
-
-    def forward(self, x):
-        for layer in self.layers:
-            x = layer(x)
-        return self.norm(x)
-
-
 # DismantledBlock in mmdit.py
 class SingleDiTBlock(nn.Module):
     """
@@ -823,7 +661,8 @@ class MMDiT(nn.Module):
         mlp_ratio: float = 4.0,
         learn_sigma: bool = False,
         adm_in_channels: Optional[int] = None,
-        context_embedder_config: Optional[Dict] = None,
+        context_embedder_in_features: Optional[int] = None,
+        context_embedder_out_features: Optional[int] = None,
         use_checkpoint: bool = False,
         register_length: int = 0,
         attn_mode: str = "torch",
@@ -837,10 +676,10 @@ class MMDiT(nn.Module):
         num_patches=None,
         qk_norm: Optional[str] = None,
         qkv_bias: bool = True,
-        context_processor_layers=None,
-        context_size=4096,
+        model_type: str = "sd3m",
     ):
         super().__init__()
+        self._model_type = model_type
         self.learn_sigma = learn_sigma
         self.in_channels = in_channels
         default_out_channels = in_channels * 2 if learn_sigma else in_channels
@@ -875,12 +714,11 @@ class MMDiT(nn.Module):
             assert isinstance(adm_in_channels, int)
             self.y_embedder = Embedder(adm_in_channels, self.hidden_size)
 
-        if context_processor_layers is not None:
-            self.context_processor = Transformer(context_size, context_processor_layers, attn_mode)
+        if context_embedder_in_features is not None:
+            self.context_embedder = nn.Linear(context_embedder_in_features, context_embedder_out_features)
         else:
-            self.context_processor = None
+            self.context_embedder = nn.Identity()
 
-        self.context_embedder = nn.Linear(context_size, self.hidden_size)
         self.register_length = register_length
         if self.register_length > 0:
             self.register = nn.Parameter(torch.randn(1, register_length, self.hidden_size))
@@ -922,7 +760,7 @@ class MMDiT(nn.Module):
 
     @property
     def model_type(self):
-        return "m"  # only support medium
+        return self._model_type
 
     @property
     def device(self):
@@ -1024,9 +862,6 @@ class MMDiT(nn.Module):
         y: (N, D) tensor of class labels
         """
 
-        if self.context_processor is not None:
-            context = self.context_processor(context)
-
         B, C, H, W = x.shape
         x = self.x_embedder(x) + self.cropped_pos_embed(H, W, device=x.device).to(dtype=x.dtype)
         c = self.t_embedder(t, dtype=x.dtype)  # (N, D)
@@ -1052,22 +887,21 @@ class MMDiT(nn.Module):
         return x[:, :, :H, :W]
 
 
-def create_mmdit_sd3_medium_configs(attn_mode: str):
-    # {'patch_size': 2, 'depth': 24, 'num_patches': 36864,
-    # 'pos_embed_max_size': 192, 'adm_in_channels': 2048, 'context_embedder':
-    # {'target': 'torch.nn.Linear', 'params': {'in_features': 4096, 'out_features': 1536}}}
+def create_sd3_mmdit(params: SD3Params, attn_mode: str = "torch") -> MMDiT:
     mmdit = MMDiT(
         input_size=None,
-        pos_embed_max_size=192,
-        patch_size=2,
+        pos_embed_max_size=params.pos_embed_max_size,
+        patch_size=params.patch_size,
         in_channels=16,
-        adm_in_channels=2048,
-        depth=24,
+        adm_in_channels=params.adm_in_channels,
+        context_embedder_in_features=params.context_embedder_in_features,
+        context_embedder_out_features=params.context_embedder_out_features,
+        depth=params.depth,
         mlp_ratio=4,
-        qk_norm=None,
-        num_patches=36864,
-        context_size=4096,
+        qk_norm=params.qk_norm,
+        num_patches=params.num_patches,
         attn_mode=attn_mode,
+        model_type=params.model_type,
     )
     return mmdit
 
@@ -1075,7 +909,6 @@ def create_mmdit_sd3_medium_configs(attn_mode: str):
 # endregion
 
 # region VAE
-# TODO support xformers
 
 VAE_SCALE_FACTOR = 1.5305
 VAE_SHIFT_FACTOR = 0.0609
@@ -1322,759 +1155,4 @@ class SDVAE(torch.nn.Module):
         return (latent / VAE_SCALE_FACTOR) + VAE_SHIFT_FACTOR
 
 
-class VAEOutput:
-    def __init__(self, latent):
-        self.latent = latent
-
-    @property
-    def latent_dist(self):
-        return self
-
-    def sample(self):
-        return self.latent
-
-
-class VAEWrapper:
-    def __init__(self, vae):
-        self.vae = vae
-
-    @property
-    def device(self):
-        return self.vae.device
-
-    @property
-    def dtype(self):
-        return self.vae.dtype
-
-    # latents = vae.encode(img_tensors).latent_dist.sample().to("cpu")
-    def encode(self, image):
-        return VAEOutput(self.vae.encode(image))
-
-
-# endregion
-
-
-# region Text Encoder
-class CLIPAttention(torch.nn.Module):
-    def __init__(self, embed_dim, heads, dtype, device, mode="xformers"):
-        super().__init__()
-        self.heads = heads
-        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=True, dtype=dtype, device=device)
-        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=True, dtype=dtype, device=device)
-        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=True, dtype=dtype, device=device)
-        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=True, dtype=dtype, device=device)
-        self.attn_mode = mode
-
-    def set_attn_mode(self, mode):
-        self.attn_mode = mode
-
-    def forward(self, x, mask=None):
-        q = self.q_proj(x)
-        k = self.k_proj(x)
-        v = self.v_proj(x)
-        out = attention(q, k, v, self.heads, mask, mode=self.attn_mode)
-        return self.out_proj(out)
-
-
-ACTIVATIONS = {
-    "quick_gelu": lambda: (lambda a: a * torch.sigmoid(1.702 * a)),
-    # "gelu": torch.nn.functional.gelu,
-    "gelu": lambda: nn.GELU(),
-}
-
-
-class CLIPLayer(torch.nn.Module):
-    def __init__(self, embed_dim, heads, intermediate_size, intermediate_activation, dtype, device):
-        super().__init__()
-        self.layer_norm1 = nn.LayerNorm(embed_dim, dtype=dtype, device=device)
-        self.self_attn = CLIPAttention(embed_dim, heads, dtype, device)
-        self.layer_norm2 = nn.LayerNorm(embed_dim, dtype=dtype, device=device)
-        # # self.mlp = CLIPMLP(embed_dim, intermediate_size, intermediate_activation, dtype, device)
-        # self.mlp = Mlp(
-        #     embed_dim, intermediate_size, embed_dim, act_layer=ACTIVATIONS[intermediate_activation], dtype=dtype, device=device
-        # )
-        self.mlp = MLP(embed_dim, intermediate_size, embed_dim, act_layer=ACTIVATIONS[intermediate_activation])
-        self.mlp.to(device=device, dtype=dtype)
-
-    def forward(self, x, mask=None):
-        x += self.self_attn(self.layer_norm1(x), mask)
-        x += self.mlp(self.layer_norm2(x))
-        return x
-
-
-class CLIPEncoder(torch.nn.Module):
-    def __init__(self, num_layers, embed_dim, heads, intermediate_size, intermediate_activation, dtype, device):
-        super().__init__()
-        self.layers = torch.nn.ModuleList(
-            [CLIPLayer(embed_dim, heads, intermediate_size, intermediate_activation, dtype, device) for i in range(num_layers)]
-        )
-
-    def forward(self, x, mask=None, intermediate_output=None):
-        if intermediate_output is not None:
-            if intermediate_output < 0:
-                intermediate_output = len(self.layers) + intermediate_output
-        intermediate = None
-        for i, l in enumerate(self.layers):
-            x = l(x, mask)
-            if i == intermediate_output:
-                intermediate = x.clone()
-        return x, intermediate
-
-
-class CLIPEmbeddings(torch.nn.Module):
-    def __init__(self, embed_dim, vocab_size=49408, num_positions=77, dtype=None, device=None):
-        super().__init__()
-        self.token_embedding = torch.nn.Embedding(vocab_size, embed_dim, dtype=dtype, device=device)
-        self.position_embedding = torch.nn.Embedding(num_positions, embed_dim, dtype=dtype, device=device)
-
-    def forward(self, input_tokens):
-        return self.token_embedding(input_tokens) + self.position_embedding.weight
-
-
-class CLIPTextModel_(torch.nn.Module):
-    def __init__(self, config_dict, dtype, device):
-        num_layers = config_dict["num_hidden_layers"]
-        embed_dim = config_dict["hidden_size"]
-        heads = config_dict["num_attention_heads"]
-        intermediate_size = config_dict["intermediate_size"]
-        intermediate_activation = config_dict["hidden_act"]
-        super().__init__()
-        self.embeddings = CLIPEmbeddings(embed_dim, dtype=torch.float32, device=device)
-        self.encoder = CLIPEncoder(num_layers, embed_dim, heads, intermediate_size, intermediate_activation, dtype, device)
-        self.final_layer_norm = nn.LayerNorm(embed_dim, dtype=dtype, device=device)
-
-    def forward(self, input_tokens, intermediate_output=None, final_layer_norm_intermediate=True):
-        x = self.embeddings(input_tokens)
-
-        if x.dtype == torch.bfloat16:
-            causal_mask = torch.empty(x.shape[1], x.shape[1], dtype=torch.float32, device=x.device).fill_(float("-inf")).triu_(1)
-            causal_mask = causal_mask.to(dtype=x.dtype)
-        else:
-            causal_mask = torch.empty(x.shape[1], x.shape[1], dtype=x.dtype, device=x.device).fill_(float("-inf")).triu_(1)
-
-        x, i = self.encoder(x, mask=causal_mask, intermediate_output=intermediate_output)
-        x = self.final_layer_norm(x)
-        if i is not None and final_layer_norm_intermediate:
-            i = self.final_layer_norm(i)
-        pooled_output = x[
-            torch.arange(x.shape[0], device=x.device),
-            input_tokens.to(dtype=torch.int, device=x.device).argmax(dim=-1),
-        ]
-        return x, i, pooled_output
-
-
-class CLIPTextModel(torch.nn.Module):
-    def __init__(self, config_dict, dtype, device):
-        super().__init__()
-        self.num_layers = config_dict["num_hidden_layers"]
-        self.text_model = CLIPTextModel_(config_dict, dtype, device)
-        embed_dim = config_dict["hidden_size"]
-        self.text_projection = nn.Linear(embed_dim, embed_dim, bias=False, dtype=dtype, device=device)
-        self.text_projection.weight.copy_(torch.eye(embed_dim))
-        self.dtype = dtype
-
-    def get_input_embeddings(self):
-        return self.text_model.embeddings.token_embedding
-
-    def set_input_embeddings(self, embeddings):
-        self.text_model.embeddings.token_embedding = embeddings
-
-    def forward(self, *args, **kwargs):
-        x = self.text_model(*args, **kwargs)
-        out = self.text_projection(x[2])
-        return (x[0], x[1], out, x[2])
-
-
-class ClipTokenWeightEncoder:
-    # def encode_token_weights(self, token_weight_pairs):
-    #     tokens = list(map(lambda a: a[0], token_weight_pairs[0]))
-    #     out, pooled = self([tokens])
-    #     if pooled is not None:
-    #         first_pooled = pooled[0:1]
-    #     else:
-    #         first_pooled = pooled
-    #     output = [out[0:1]]
-    #     return torch.cat(output, dim=-2), first_pooled
-
-    # fix to support batched inputs
-    # : Union[List[Tuple[torch.Tensor, torch.Tensor]], List[List[Tuple[torch.Tensor, torch.Tensor]]]]
-    def encode_token_weights(self, list_of_token_weight_pairs):
-        has_batch = isinstance(list_of_token_weight_pairs[0][0], list)
-
-        if has_batch:
-            list_of_tokens = []
-            for pairs in list_of_token_weight_pairs:
-                tokens = [a[0] for a in pairs[0]]  # I'm not sure why this is [0]
-                list_of_tokens.append(tokens)
-        else:
-            if isinstance(list_of_token_weight_pairs[0], torch.Tensor):
-                list_of_tokens = [list(list_of_token_weight_pairs[0])]
-            else:
-                list_of_tokens = [[a[0] for a in list_of_token_weight_pairs[0]]]
-
-        out, pooled = self(list_of_tokens)
-        if has_batch:
-            return out, pooled
-        else:
-            if pooled is not None:
-                first_pooled = pooled[0:1]
-            else:
-                first_pooled = pooled
-            output = [out[0:1]]
-            return torch.cat(output, dim=-2), first_pooled
-
-
-class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder):
-    """Uses the CLIP transformer encoder for text (from huggingface)"""
-
-    LAYERS = ["last", "pooled", "hidden"]
-
-    def __init__(
-        self,
-        device="cpu",
-        max_length=77,
-        layer="last",
-        layer_idx=None,
-        textmodel_json_config=None,
-        dtype=None,
-        model_class=CLIPTextModel,
-        special_tokens={"start": 49406, "end": 49407, "pad": 49407},
-        layer_norm_hidden_state=True,
-        return_projected_pooled=True,
-    ):
-        super().__init__()
-        assert layer in self.LAYERS
-        self.transformer = model_class(textmodel_json_config, dtype, device)
-        self.num_layers = self.transformer.num_layers
-        self.max_length = max_length
-        self.transformer = self.transformer.eval()
-        for param in self.parameters():
-            param.requires_grad = False
-        self.layer = layer
-        self.layer_idx = None
-        self.special_tokens = special_tokens
-        self.logit_scale = torch.nn.Parameter(torch.tensor(4.6055))
-        self.layer_norm_hidden_state = layer_norm_hidden_state
-        self.return_projected_pooled = return_projected_pooled
-        if layer == "hidden":
-            assert layer_idx is not None
-            assert abs(layer_idx) < self.num_layers
-            self.set_clip_options({"layer": layer_idx})
-        self.options_default = (self.layer, self.layer_idx, self.return_projected_pooled)
-
-    @property
-    def device(self):
-        return next(self.parameters()).device
-
-    @property
-    def dtype(self):
-        return next(self.parameters()).dtype
-
-    def gradient_checkpointing_enable(self):
-        logger.warning("Gradient checkpointing is not supported for this model")
-
-    def set_attn_mode(self, mode):
-        raise NotImplementedError("This model does not support setting the attention mode")
-
-    def set_clip_options(self, options):
-        layer_idx = options.get("layer", self.layer_idx)
-        self.return_projected_pooled = options.get("projected_pooled", self.return_projected_pooled)
-        if layer_idx is None or abs(layer_idx) > self.num_layers:
-            self.layer = "last"
-        else:
-            self.layer = "hidden"
-            self.layer_idx = layer_idx
-
-    def forward(self, tokens):
-        backup_embeds = self.transformer.get_input_embeddings()
-        device = backup_embeds.weight.device
-        tokens = torch.LongTensor(tokens).to(device)
-        outputs = self.transformer(
-            tokens, intermediate_output=self.layer_idx, final_layer_norm_intermediate=self.layer_norm_hidden_state
-        )
-        self.transformer.set_input_embeddings(backup_embeds)
-        if self.layer == "last":
-            z = outputs[0]
-        else:
-            z = outputs[1]
-        pooled_output = None
-        if len(outputs) >= 3:
-            if not self.return_projected_pooled and len(outputs) >= 4 and outputs[3] is not None:
-                pooled_output = outputs[3].float()
-            elif outputs[2] is not None:
-                pooled_output = outputs[2].float()
-        return z.float(), pooled_output
-
-    def set_attn_mode(self, mode):
-        clip_text_model = self.transformer.text_model
-        for layer in clip_text_model.encoder.layers:
-            layer.self_attn.set_attn_mode(mode)
-
-
-class SDXLClipG(SDClipModel):
-    """Wraps the CLIP-G model into the SD-CLIP-Model interface"""
-
-    def __init__(self, config, device="cpu", layer="penultimate", layer_idx=None, dtype=None):
-        if layer == "penultimate":
-            layer = "hidden"
-            layer_idx = -2
-        super().__init__(
-            device=device,
-            layer=layer,
-            layer_idx=layer_idx,
-            textmodel_json_config=config,
-            dtype=dtype,
-            special_tokens={"start": 49406, "end": 49407, "pad": 0},
-            layer_norm_hidden_state=False,
-        )
-
-    def set_attn_mode(self, mode):
-        clip_text_model = self.transformer.text_model
-        for layer in clip_text_model.encoder.layers:
-            layer.self_attn.set_attn_mode(mode)
-
-
-class T5XXLModel(SDClipModel):
-    """Wraps the T5-XXL model into the SD-CLIP-Model interface for convenience"""
-
-    def __init__(self, config, device="cpu", layer="last", layer_idx=None, dtype=None):
-        super().__init__(
-            device=device,
-            layer=layer,
-            layer_idx=layer_idx,
-            textmodel_json_config=config,
-            dtype=dtype,
-            special_tokens={"end": 1, "pad": 0},
-            model_class=T5,
-        )
-
-    def set_attn_mode(self, mode):
-        t5: T5 = self.transformer
-        for t5block in t5.encoder.block:
-            t5block: T5Block
-            t5layer: T5LayerSelfAttention = t5block.layer[0]
-            t5SaSa: T5Attention = t5layer.SelfAttention
-            t5SaSa.set_attn_mode(mode)
-
-
-#################################################################################################
-### T5 implementation, for the T5-XXL text encoder portion, largely pulled from upstream impl
-#################################################################################################
-
-"""
-class T5XXLTokenizer(SDTokenizer):
-    ""Wraps the T5 Tokenizer from HF into the SDTokenizer interface""
-
-    def __init__(self):
-        super().__init__(
-            pad_with_end=False,
-            tokenizer=T5TokenizerFast.from_pretrained("google/t5-v1_1-xxl"),
-            has_start_token=False,
-            pad_to_max_length=False,
-            max_length=99999999,
-            min_length=77,
-        )
-"""
-
-
-class T5LayerNorm(torch.nn.Module):
-    def __init__(self, hidden_size, eps=1e-6, dtype=None, device=None):
-        super().__init__()
-        self.weight = torch.nn.Parameter(torch.ones(hidden_size, dtype=dtype, device=device))
-        self.variance_epsilon = eps
-
-    # def forward(self, x):
-    #     variance = x.pow(2).mean(-1, keepdim=True)
-    #     x = x * torch.rsqrt(variance + self.variance_epsilon)
-    #     return self.weight.to(device=x.device, dtype=x.dtype) * x
-
-    # copy from transformers' T5LayerNorm
-    def forward(self, hidden_states):
-        # T5 uses a layer_norm which only scales and doesn't shift, which is also known as Root Mean
-        # Square Layer Normalization https://arxiv.org/abs/1910.07467 thus variance is calculated
-        # w/o mean and there is no bias. Additionally we want to make sure that the accumulation for
-        # half-precision inputs is done in fp32
-        variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
-        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
-
-        # convert into half-precision if necessary
-        if self.weight.dtype in [torch.float16, torch.bfloat16]:
-            hidden_states = hidden_states.to(self.weight.dtype)
-
-        return self.weight * hidden_states
-
-
-class T5DenseGatedActDense(torch.nn.Module):
-    def __init__(self, model_dim, ff_dim, dtype, device):
-        super().__init__()
-        self.wi_0 = nn.Linear(model_dim, ff_dim, bias=False, dtype=dtype, device=device)
-        self.wi_1 = nn.Linear(model_dim, ff_dim, bias=False, dtype=dtype, device=device)
-        self.wo = nn.Linear(ff_dim, model_dim, bias=False, dtype=dtype, device=device)
-
-    def forward(self, x):
-        hidden_gelu = torch.nn.functional.gelu(self.wi_0(x), approximate="tanh")
-        hidden_linear = self.wi_1(x)
-        x = hidden_gelu * hidden_linear
-        x = self.wo(x)
-        return x
-
-
-class T5LayerFF(torch.nn.Module):
-    def __init__(self, model_dim, ff_dim, dtype, device):
-        super().__init__()
-        self.DenseReluDense = T5DenseGatedActDense(model_dim, ff_dim, dtype, device)
-        self.layer_norm = T5LayerNorm(model_dim, dtype=dtype, device=device)
-
-    def forward(self, x):
-        forwarded_states = self.layer_norm(x)
-        forwarded_states = self.DenseReluDense(forwarded_states)
-        x += forwarded_states
-        return x
-
-
-class T5Attention(torch.nn.Module):
-    def __init__(self, model_dim, inner_dim, num_heads, relative_attention_bias, dtype, device):
-        super().__init__()
-        # Mesh TensorFlow initialization to avoid scaling before softmax
-        self.q = nn.Linear(model_dim, inner_dim, bias=False, dtype=dtype, device=device)
-        self.k = nn.Linear(model_dim, inner_dim, bias=False, dtype=dtype, device=device)
-        self.v = nn.Linear(model_dim, inner_dim, bias=False, dtype=dtype, device=device)
-        self.o = nn.Linear(inner_dim, model_dim, bias=False, dtype=dtype, device=device)
-        self.num_heads = num_heads
-        self.relative_attention_bias = None
-        if relative_attention_bias:
-            self.relative_attention_num_buckets = 32
-            self.relative_attention_max_distance = 128
-            self.relative_attention_bias = torch.nn.Embedding(self.relative_attention_num_buckets, self.num_heads, device=device)
-
-        self.attn_mode = "xformers"  # TODO 何とかする
-
-    def set_attn_mode(self, mode):
-        self.attn_mode = mode
-
-    @staticmethod
-    def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128):
-        """
-        Adapted from Mesh Tensorflow:
-        https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593
-
-        Translate relative position to a bucket number for relative attention. The relative position is defined as
-        memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
-        position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
-        small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
-        positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
-        This should allow for more graceful generalization to longer sequences than the model has been trained on
-
-        Args:
-            relative_position: an int32 Tensor
-            bidirectional: a boolean - whether the attention is bidirectional
-            num_buckets: an integer
-            max_distance: an integer
-
-        Returns:
-            a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
-        """
-        relative_buckets = 0
-        if bidirectional:
-            num_buckets //= 2
-            relative_buckets += (relative_position > 0).to(torch.long) * num_buckets
-            relative_position = torch.abs(relative_position)
-        else:
-            relative_position = -torch.min(relative_position, torch.zeros_like(relative_position))
-        # now relative_position is in the range [0, inf)
-        # half of the buckets are for exact increments in positions
-        max_exact = num_buckets // 2
-        is_small = relative_position < max_exact
-        # The other half of the buckets are for logarithmically bigger bins in positions up to max_distance
-        relative_position_if_large = max_exact + (
-            torch.log(relative_position.float() / max_exact) / math.log(max_distance / max_exact) * (num_buckets - max_exact)
-        ).to(torch.long)
-        relative_position_if_large = torch.min(
-            relative_position_if_large, torch.full_like(relative_position_if_large, num_buckets - 1)
-        )
-        relative_buckets += torch.where(is_small, relative_position, relative_position_if_large)
-        return relative_buckets
-
-    def compute_bias(self, query_length, key_length, device):
-        """Compute binned relative position bias"""
-        context_position = torch.arange(query_length, dtype=torch.long, device=device)[:, None]
-        memory_position = torch.arange(key_length, dtype=torch.long, device=device)[None, :]
-        relative_position = memory_position - context_position  # shape (query_length, key_length)
-        relative_position_bucket = self._relative_position_bucket(
-            relative_position,  # shape (query_length, key_length)
-            bidirectional=True,
-            num_buckets=self.relative_attention_num_buckets,
-            max_distance=self.relative_attention_max_distance,
-        )
-        values = self.relative_attention_bias(relative_position_bucket)  # shape (query_length, key_length, num_heads)
-        values = values.permute([2, 0, 1]).unsqueeze(0)  # shape (1, num_heads, query_length, key_length)
-        return values
-
-    def forward(self, x, past_bias=None):
-        q = self.q(x)
-        k = self.k(x)
-        v = self.v(x)
-        if self.relative_attention_bias is not None:
-            past_bias = self.compute_bias(x.shape[1], x.shape[1], x.device)
-        if past_bias is not None:
-            mask = past_bias
-        out = attention(q, k * ((k.shape[-1] / self.num_heads) ** 0.5), v, self.num_heads, mask, mode=self.attn_mode)
-        return self.o(out), past_bias
-
-
-class T5LayerSelfAttention(torch.nn.Module):
-    def __init__(self, model_dim, inner_dim, ff_dim, num_heads, relative_attention_bias, dtype, device):
-        super().__init__()
-        self.SelfAttention = T5Attention(model_dim, inner_dim, num_heads, relative_attention_bias, dtype, device)
-        self.layer_norm = T5LayerNorm(model_dim, dtype=dtype, device=device)
-
-    def forward(self, x, past_bias=None):
-        output, past_bias = self.SelfAttention(self.layer_norm(x), past_bias=past_bias)
-        x += output
-        return x, past_bias
-
-
-class T5Block(torch.nn.Module):
-    def __init__(self, model_dim, inner_dim, ff_dim, num_heads, relative_attention_bias, dtype, device):
-        super().__init__()
-        self.layer = torch.nn.ModuleList()
-        self.layer.append(T5LayerSelfAttention(model_dim, inner_dim, ff_dim, num_heads, relative_attention_bias, dtype, device))
-        self.layer.append(T5LayerFF(model_dim, ff_dim, dtype, device))
-
-    def forward(self, x, past_bias=None):
-        x, past_bias = self.layer[0](x, past_bias)
-
-        # copy from transformers' T5Block
-        # clamp inf values to enable fp16 training
-        if x.dtype == torch.float16:
-            clamp_value = torch.where(
-                torch.isinf(x).any(),
-                torch.finfo(x.dtype).max - 1000,
-                torch.finfo(x.dtype).max,
-            )
-            x = torch.clamp(x, min=-clamp_value, max=clamp_value)
-
-        x = self.layer[-1](x)
-        # clamp inf values to enable fp16 training
-        if x.dtype == torch.float16:
-            clamp_value = torch.where(
-                torch.isinf(x).any(),
-                torch.finfo(x.dtype).max - 1000,
-                torch.finfo(x.dtype).max,
-            )
-            x = torch.clamp(x, min=-clamp_value, max=clamp_value)
-
-        return x, past_bias
-
-
-class T5Stack(torch.nn.Module):
-    def __init__(self, num_layers, model_dim, inner_dim, ff_dim, num_heads, vocab_size, dtype, device):
-        super().__init__()
-        self.embed_tokens = torch.nn.Embedding(vocab_size, model_dim, device=device)
-        self.block = torch.nn.ModuleList(
-            [
-                T5Block(model_dim, inner_dim, ff_dim, num_heads, relative_attention_bias=(i == 0), dtype=dtype, device=device)
-                for i in range(num_layers)
-            ]
-        )
-        self.final_layer_norm = T5LayerNorm(model_dim, dtype=dtype, device=device)
-
-    def forward(self, input_ids, intermediate_output=None, final_layer_norm_intermediate=True):
-        intermediate = None
-        x = self.embed_tokens(input_ids)
-        past_bias = None
-        for i, l in enumerate(self.block):
-            # uncomment to debug layerwise output: fp16 may cause issues
-            # print(i, x.mean(), x.std())
-            x, past_bias = l(x, past_bias)
-            if i == intermediate_output:
-                intermediate = x.clone()
-        # print(x.mean(), x.std())
-        x = self.final_layer_norm(x)
-        if intermediate is not None and final_layer_norm_intermediate:
-            intermediate = self.final_layer_norm(intermediate)
-        # print(x.mean(), x.std())
-        return x, intermediate
-
-
-class T5(torch.nn.Module):
-    def __init__(self, config_dict, dtype, device):
-        super().__init__()
-        self.num_layers = config_dict["num_layers"]
-        self.encoder = T5Stack(
-            self.num_layers,
-            config_dict["d_model"],
-            config_dict["d_model"],
-            config_dict["d_ff"],
-            config_dict["num_heads"],
-            config_dict["vocab_size"],
-            dtype,
-            device,
-        )
-        self.dtype = dtype
-
-    def get_input_embeddings(self):
-        return self.encoder.embed_tokens
-
-    def set_input_embeddings(self, embeddings):
-        self.encoder.embed_tokens = embeddings
-
-    def forward(self, *args, **kwargs):
-        return self.encoder(*args, **kwargs)
-
-
-def create_clip_l(device="cpu", dtype=torch.float32, state_dict: Optional[Dict[str, torch.Tensor]] = None):
-    r"""
-    state_dict is not loaded, but updated with missing keys
-    """
-    CLIPL_CONFIG = {
-        "hidden_act": "quick_gelu",
-        "hidden_size": 768,
-        "intermediate_size": 3072,
-        "num_attention_heads": 12,
-        "num_hidden_layers": 12,
-    }
-    with torch.no_grad():
-        clip_l = SDClipModel(
-            layer="hidden",
-            layer_idx=-2,
-            device=device,
-            dtype=dtype,
-            layer_norm_hidden_state=False,
-            return_projected_pooled=False,
-            textmodel_json_config=CLIPL_CONFIG,
-        )
-        clip_l.gradient_checkpointing_enable()
-    if state_dict is not None:
-        # update state_dict if provided to include logit_scale and text_projection.weight avoid errors
-        if "logit_scale" not in state_dict:
-            state_dict["logit_scale"] = clip_l.logit_scale
-        if "transformer.text_projection.weight" not in state_dict:
-            state_dict["transformer.text_projection.weight"] = clip_l.transformer.text_projection.weight
-    return clip_l
-
-
-def create_clip_g(device="cpu", dtype=torch.float32, state_dict: Optional[Dict[str, torch.Tensor]] = None):
-    r"""
-    state_dict is not loaded, but updated with missing keys
-    """
-    CLIPG_CONFIG = {
-        "hidden_act": "gelu",
-        "hidden_size": 1280,
-        "intermediate_size": 5120,
-        "num_attention_heads": 20,
-        "num_hidden_layers": 32,
-    }
-    with torch.no_grad():
-        clip_g = SDXLClipG(CLIPG_CONFIG, device=device, dtype=dtype)
-    if state_dict is not None:
-        if "logit_scale" not in state_dict:
-            state_dict["logit_scale"] = clip_g.logit_scale
-    return clip_g
-
-
-def create_t5xxl(device="cpu", dtype=torch.float32, state_dict: Optional[Dict[str, torch.Tensor]] = None) -> T5XXLModel:
-    T5_CONFIG = {"d_ff": 10240, "d_model": 4096, "num_heads": 64, "num_layers": 24, "vocab_size": 32128}
-    with torch.no_grad():
-        t5 = T5XXLModel(T5_CONFIG, dtype=dtype, device=device)
-    if state_dict is not None:
-        if "logit_scale" not in state_dict:
-            state_dict["logit_scale"] = t5.logit_scale
-        if "transformer.shared.weight" in state_dict:
-            state_dict.pop("transformer.shared.weight")
-    return t5
-
-
-"""
-    # snippet for using the T5 model from transformers
-    
-    from transformers import T5EncoderModel, T5Config
-    import accelerate
-    import json
-
-    T5_CONFIG_JSON = ""
-{
-  "architectures": [
-    "T5EncoderModel"
-  ],
-  "classifier_dropout": 0.0,
-  "d_ff": 10240,
-  "d_kv": 64,
-  "d_model": 4096,
-  "decoder_start_token_id": 0,
-  "dense_act_fn": "gelu_new",
-  "dropout_rate": 0.1,
-  "eos_token_id": 1,
-  "feed_forward_proj": "gated-gelu",
-  "initializer_factor": 1.0,
-  "is_encoder_decoder": true,
-  "is_gated_act": true,
-  "layer_norm_epsilon": 1e-06,
-  "model_type": "t5",
-  "num_decoder_layers": 24,
-  "num_heads": 64,
-  "num_layers": 24,
-  "output_past": true,
-  "pad_token_id": 0,
-  "relative_attention_max_distance": 128,
-  "relative_attention_num_buckets": 32,
-  "tie_word_embeddings": false,
-  "torch_dtype": "float16",
-  "transformers_version": "4.41.2",
-  "use_cache": true,
-  "vocab_size": 32128
-}
-""
-    config = json.loads(T5_CONFIG_JSON)
-    config = T5Config(**config)
-
-    # model = T5EncoderModel.from_pretrained("stabilityai/stable-diffusion-3-medium-diffusers", subfolder="text_encoder_3")
-    # print(model.config)
-    # # model(**load_model.config)
-
-    # with accelerate.init_empty_weights():
-    model = T5EncoderModel._from_config(config)  # , torch_dtype=dtype)
-    for key in list(state_dict.keys()):
-        if key.startswith("transformer."):
-            new_key = key[len("transformer.") :]
-            state_dict[new_key] = state_dict.pop(key)
-
-    info = model.load_state_dict(state_dict)
-    print(info)
-    model.set_attn_mode = lambda x: None
-    # model.to("cpu")
-
-    _self = model
-
-    def enc(list_of_token_weight_pairs):
-        has_batch = isinstance(list_of_token_weight_pairs[0][0], list)
-
-        if has_batch:
-            list_of_tokens = []
-            for pairs in list_of_token_weight_pairs:
-                tokens = [a[0] for a in pairs[0]]  # I'm not sure why this is [0]
-                list_of_tokens.append(tokens)
-        else:
-            list_of_tokens = [[a[0] for a in list_of_token_weight_pairs[0]]]
-
-        list_of_tokens = np.array(list_of_tokens)
-        list_of_tokens = torch.from_numpy(list_of_tokens).to("cuda", dtype=torch.long)
-        out = _self(list_of_tokens)
-        pooled = None
-        if has_batch:
-            return out, pooled
-        else:
-            if pooled is not None:
-                first_pooled = pooled[0:1]
-            else:
-                first_pooled = pooled
-            return out[0], first_pooled
-            # output = [out[0:1]]
-            # return torch.cat(output, dim=-2), first_pooled
-
-    model.encode_token_weights = enc
-
-    return model
-"""
-
 # endregion
diff --git a/library/sd3_train_utils.py b/library/sd3_train_utils.py
index e819d440..9282482d 100644
--- a/library/sd3_train_utils.py
+++ b/library/sd3_train_utils.py
@@ -11,8 +11,8 @@ from safetensors.torch import save_file
 from accelerate import Accelerator, PartialState
 from tqdm import tqdm
 from PIL import Image
+from transformers import CLIPTextModelWithProjection, T5EncoderModel
 
-from library import sd3_models, sd3_utils, strategy_base, train_util
 from library.device_utils import init_ipex, clean_memory_on_device
 
 init_ipex()
@@ -28,60 +28,16 @@ import logging
 
 logger = logging.getLogger(__name__)
 
-from .sdxl_train_util import match_mixed_precision
-
-
-def load_target_model(
-    model_type: str,
-    args: argparse.Namespace,
-    state_dict: dict,
-    accelerator: Accelerator,
-    attn_mode: str,
-    model_dtype: Optional[torch.dtype],
-    device: Optional[torch.device],
-) -> Union[
-    sd3_models.MMDiT,
-    Optional[sd3_models.SDClipModel],
-    Optional[sd3_models.SDXLClipG],
-    Optional[sd3_models.T5XXLModel],
-    sd3_models.SDVAE,
-]:
-    loading_device = device if device is not None else (accelerator.device if args.lowram else "cpu")
-
-    for pi in range(accelerator.state.num_processes):
-        if pi == accelerator.state.local_process_index:
-            logger.info(f"loading model for process {accelerator.state.local_process_index}/{accelerator.state.num_processes}")
-
-            if model_type == "mmdit":
-                model = sd3_utils.load_mmdit(state_dict, attn_mode, model_dtype, loading_device)
-            elif model_type == "clip_l":
-                model = sd3_utils.load_clip_l(state_dict, args.clip_l, attn_mode, model_dtype, loading_device)
-            elif model_type == "clip_g":
-                model = sd3_utils.load_clip_g(state_dict, args.clip_g, attn_mode, model_dtype, loading_device)
-            elif model_type == "t5xxl":
-                model = sd3_utils.load_t5xxl(state_dict, args.t5xxl, attn_mode, model_dtype, loading_device)
-            elif model_type == "vae":
-                model = sd3_utils.load_vae(state_dict, args.vae, model_dtype, loading_device)
-            else:
-                raise ValueError(f"Unknown model type: {model_type}")
-
-            # work on low-ram device: models are already loaded on accelerator.device, but we ensure they are on device
-            if args.lowram:
-                model = model.to(accelerator.device)
-
-            clean_memory_on_device(accelerator.device)
-        accelerator.wait_for_everyone()
-
-    return model
+from library import sd3_models, sd3_utils, strategy_base, train_util
 
 
 def save_models(
     ckpt_path: str,
-    mmdit: sd3_models.MMDiT,
-    vae: sd3_models.SDVAE,
-    clip_l: sd3_models.SDClipModel,
-    clip_g: sd3_models.SDXLClipG,
-    t5xxl: Optional[sd3_models.T5XXLModel],
+    mmdit: Optional[sd3_models.MMDiT],
+    vae: Optional[sd3_models.SDVAE],
+    clip_l: Optional[CLIPTextModelWithProjection],
+    clip_g: Optional[CLIPTextModelWithProjection],
+    t5xxl: Optional[T5EncoderModel],
     sai_metadata: Optional[dict],
     save_dtype: Optional[torch.dtype] = None,
 ):
@@ -101,24 +57,35 @@ def save_models(
     update_sd("model.diffusion_model.", mmdit.state_dict())
     update_sd("first_stage_model.", vae.state_dict())
 
-    if clip_l is not None:
-        update_sd("text_encoders.clip_l.", clip_l.state_dict())
-    if clip_g is not None:
-        update_sd("text_encoders.clip_g.", clip_g.state_dict())
-    if t5xxl is not None:
-        update_sd("text_encoders.t5xxl.", t5xxl.state_dict())
+    # do not support unified checkpoint format for now
+    # if clip_l is not None:
+    #     update_sd("text_encoders.clip_l.", clip_l.state_dict())
+    # if clip_g is not None:
+    #     update_sd("text_encoders.clip_g.", clip_g.state_dict())
+    # if t5xxl is not None:
+    #     update_sd("text_encoders.t5xxl.", t5xxl.state_dict())
 
     save_file(state_dict, ckpt_path, metadata=sai_metadata)
 
+    if clip_l is not None:
+        clip_l_path = ckpt_path.replace(".safetensors", "_clip_l.safetensors")
+        save_file(clip_l.state_dict(), clip_l_path)
+    if clip_g is not None:
+        clip_g_path = ckpt_path.replace(".safetensors", "_clip_g.safetensors")
+        save_file(clip_g.state_dict(), clip_g_path)
+    if t5xxl is not None:
+        t5xxl_path = ckpt_path.replace(".safetensors", "_t5xxl.safetensors")
+        save_file(t5xxl.state_dict(), t5xxl_path)
+
 
 def save_sd3_model_on_train_end(
     args: argparse.Namespace,
     save_dtype: torch.dtype,
     epoch: int,
     global_step: int,
-    clip_l: sd3_models.SDClipModel,
-    clip_g: sd3_models.SDXLClipG,
-    t5xxl: Optional[sd3_models.T5XXLModel],
+    clip_l: Optional[CLIPTextModelWithProjection],
+    clip_g: Optional[CLIPTextModelWithProjection],
+    t5xxl: Optional[T5EncoderModel],
     mmdit: sd3_models.MMDiT,
     vae: sd3_models.SDVAE,
 ):
@@ -141,9 +108,9 @@ def save_sd3_model_on_epoch_end_or_stepwise(
     epoch: int,
     num_train_epochs: int,
     global_step: int,
-    clip_l: sd3_models.SDClipModel,
-    clip_g: sd3_models.SDXLClipG,
-    t5xxl: Optional[sd3_models.T5XXLModel],
+    clip_l: Optional[CLIPTextModelWithProjection],
+    clip_g: Optional[CLIPTextModelWithProjection],
+    t5xxl: Optional[T5EncoderModel],
     mmdit: sd3_models.MMDiT,
     vae: sd3_models.SDVAE,
 ):
@@ -208,23 +175,27 @@ def add_sd3_training_arguments(parser: argparse.ArgumentParser):
         help="T5-XXL model path. if not specified, use ckpt's state_dict / T5-XXLモデルのパス。指定しない場合はckptのstate_dictを使用",
     )
     parser.add_argument(
-        "--save_clip", action="store_true", help="save CLIP models to checkpoint / CLIPモデルをチェックポイントに保存する"
+        "--save_clip",
+        action="store_true",
+        help="[DOES NOT WORK] unified checkpoint is not supported / 統合チェックポイントはまだサポートされていません",
     )
     parser.add_argument(
-        "--save_t5xxl", action="store_true", help="save T5-XXL model to checkpoint / T5-XXLモデルをチェックポイントに保存する"
+        "--save_t5xxl",
+        action="store_true",
+        help="[DOES NOT WORK] unified checkpoint is not supported / 統合チェックポイントはまだサポートされていません",
     )
 
     parser.add_argument(
         "--t5xxl_device",
         type=str,
         default=None,
-        help="T5-XXL device. if not specified, use accelerator's device / T5-XXLデバイス。指定しない場合はacceleratorのデバイスを使用",
+        help="[DOES NOT WORK] not supported yet. T5-XXL device. if not specified, use accelerator's device / T5-XXLデバイス。指定しない場合はacceleratorのデバイスを使用",
     )
     parser.add_argument(
         "--t5xxl_dtype",
         type=str,
         default=None,
-        help="T5-XXL dtype. if not specified, use default dtype (from mixed precision) / T5-XXL dtype。指定しない場合はデフォルトのdtype（mixed precisionから）を使用",
+        help="[DOES NOT WORK] not supported yet. T5-XXL dtype. if not specified, use default dtype (from mixed precision) / T5-XXL dtype。指定しない場合はデフォルトのdtype（mixed precisionから）を使用",
     )
 
     # copy from Diffusers
@@ -233,16 +204,25 @@ def add_sd3_training_arguments(parser: argparse.ArgumentParser):
         type=str,
         default="logit_normal",
         choices=["sigma_sqrt", "logit_normal", "mode", "cosmap"],
+        help="weighting scheme for timestep distribution and loss / タイムステップ分布と損失のための重み付けスキーム",
     )
     parser.add_argument(
-        "--logit_mean", type=float, default=0.0, help="mean to use when using the `'logit_normal'` weighting scheme."
+        "--logit_mean",
+        type=float,
+        default=0.0,
+        help="mean to use when using the `'logit_normal'` weighting scheme for timestep distribution. / タイムステップ分布のために`'logit_normal'`重み付けスキームを使用する場合の平均",
+    )
+    parser.add_argument(
+        "--logit_std",
+        type=float,
+        default=1.0,
+        help="std to use when using the `'logit_normal'` weighting scheme for timestep distribution. / タイムステップ分布のために`'logit_normal'`重み付けスキームを使用する場合のstd",
     )
-    parser.add_argument("--logit_std", type=float, default=1.0, help="std to use when using the `'logit_normal'` weighting scheme.")
     parser.add_argument(
         "--mode_scale",
         type=float,
         default=1.29,
-        help="Scale of mode weighting scheme. Only effective when using the `'mode'` as the `weighting_scheme`.",
+        help="Scale of mode weighting scheme. Only effective when using the `'mode'` as the `weighting_scheme`. / モード重み付けスキームのスケール。`'mode'`を`weighting_scheme`として使用する場合のみ有効",
     )
 
 
@@ -283,7 +263,7 @@ def verify_sdxl_training_args(args: argparse.Namespace, supportTextEncoderCachin
 # temporary copied from sd3_minimal_inferece.py
 
 
-def get_sigmas(sampling: sd3_utils.ModelSamplingDiscreteFlow, steps):
+def get_all_sigmas(sampling: sd3_utils.ModelSamplingDiscreteFlow, steps):
     start = sampling.timestep(sampling.sigma_max)
     end = sampling.timestep(sampling.sigma_min)
     timesteps = torch.linspace(start, end, steps)
@@ -327,7 +307,7 @@ def do_sample(
 
     model_sampling = sd3_utils.ModelSamplingDiscreteFlow(shift=3.0)  # 3.0 is for SD3
 
-    sigmas = get_sigmas(model_sampling, steps).to(device)
+    sigmas = get_all_sigmas(model_sampling, steps).to(device)
 
     noise_scaled = model_sampling.noise_scaling(sigmas[0], noise, latent, max_denoise(model_sampling, sigmas))
 
@@ -371,37 +351,6 @@ def do_sample(
     return x
 
 
-def load_prompts(prompt_file: str) -> List[Dict]:
-    # read prompts
-    if prompt_file.endswith(".txt"):
-        with open(prompt_file, "r", encoding="utf-8") as f:
-            lines = f.readlines()
-        prompts = [line.strip() for line in lines if len(line.strip()) > 0 and line[0] != "#"]
-    elif prompt_file.endswith(".toml"):
-        with open(prompt_file, "r", encoding="utf-8") as f:
-            data = toml.load(f)
-        prompts = [dict(**data["prompt"], **subset) for subset in data["prompt"]["subset"]]
-    elif prompt_file.endswith(".json"):
-        with open(prompt_file, "r", encoding="utf-8") as f:
-            prompts = json.load(f)
-
-    # preprocess prompts
-    for i in range(len(prompts)):
-        prompt_dict = prompts[i]
-        if isinstance(prompt_dict, str):
-            from library.train_util import line_to_prompt_dict
-
-            prompt_dict = line_to_prompt_dict(prompt_dict)
-            prompts[i] = prompt_dict
-        assert isinstance(prompt_dict, dict)
-
-        # Adds an enumerator to the dict based on prompt position. Used later to name image files. Also cleanup of extra data in original prompt dict.
-        prompt_dict["enum"] = i
-        prompt_dict.pop("subset", None)
-
-    return prompts
-
-
 def sample_images(
     accelerator: Accelerator,
     args: argparse.Namespace,
@@ -440,7 +389,7 @@ def sample_images(
     text_encoders = [accelerator.unwrap_model(te) for te in text_encoders]
     # print([(te.parameters().__next__().device if te is not None else None) for te in text_encoders])
 
-    prompts = load_prompts(args.sample_prompts)
+    prompts = train_util.load_prompts(args.sample_prompts)
 
     save_dir = args.output_dir + "/sample"
     os.makedirs(save_dir, exist_ok=True)
@@ -510,7 +459,7 @@ def sample_image_inference(
     accelerator: Accelerator,
     args: argparse.Namespace,
     mmdit: sd3_models.MMDiT,
-    text_encoders: List[Union[sd3_models.SDClipModel, sd3_models.SDXLClipG, sd3_models.T5XXLModel]],
+    text_encoders: List[Union[CLIPTextModelWithProjection, T5EncoderModel]],
     vae: sd3_models.SDVAE,
     save_dir,
     prompt_dict,
@@ -568,7 +517,7 @@ def sample_image_inference(
         l_tokens, g_tokens, t5_tokens = tokenize_strategy.tokenize(prompt)
         te_outputs = encoding_strategy.encode_tokens(tokenize_strategy, text_encoders, [l_tokens, g_tokens, t5_tokens])
 
-    lg_out, t5_out, pooled = te_outputs
+    lg_out, t5_out, pooled, l_attn_mask, g_attn_mask, t5_attn_mask = te_outputs
     cond = encoding_strategy.concat_encodings(lg_out, t5_out, pooled)
 
     # encode negative prompts
@@ -578,7 +527,7 @@ def sample_image_inference(
         l_tokens, g_tokens, t5_tokens = tokenize_strategy.tokenize(negative_prompt)
         neg_te_outputs = encoding_strategy.encode_tokens(tokenize_strategy, text_encoders, [l_tokens, g_tokens, t5_tokens])
 
-    lg_out, t5_out, pooled = neg_te_outputs
+    lg_out, t5_out, pooled, l_attn_mask, g_attn_mask, t5_attn_mask = neg_te_outputs
     neg_cond = encoding_strategy.concat_encodings(lg_out, t5_out, pooled)
 
     # sample image
@@ -609,14 +558,9 @@ def sample_image_inference(
         wandb_tracker = accelerator.get_tracker("wandb")
 
         import wandb
+
         # not to commit images to avoid inconsistency between training and logging steps
-        wandb_tracker.log(
-            {f"sample_{i}": wandb.Image(
-                image,
-                caption=prompt # positive prompt as a caption
-            )}, 
-            commit=False
-        )
+        wandb_tracker.log({f"sample_{i}": wandb.Image(image, caption=prompt)}, commit=False)  # positive prompt as a caption
 
 
 # region Diffusers
@@ -886,4 +830,78 @@ class FlowMatchEulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
         return self.config.num_train_timesteps
 
 
+def get_sigmas(noise_scheduler, timesteps, device, n_dim=4, dtype=torch.float32):
+    sigmas = noise_scheduler.sigmas.to(device=device, dtype=dtype)
+    schedule_timesteps = noise_scheduler.timesteps.to(device)
+    timesteps = timesteps.to(device)
+    step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps]
+
+    sigma = sigmas[step_indices].flatten()
+    while len(sigma.shape) < n_dim:
+        sigma = sigma.unsqueeze(-1)
+    return sigma
+
+
+def compute_density_for_timestep_sampling(
+    weighting_scheme: str, batch_size: int, logit_mean: float = None, logit_std: float = None, mode_scale: float = None
+):
+    """Compute the density for sampling the timesteps when doing SD3 training.
+
+    Courtesy: This was contributed by Rafie Walker in https://github.com/huggingface/diffusers/pull/8528.
+
+    SD3 paper reference: https://arxiv.org/abs/2403.03206v1.
+    """
+    if weighting_scheme == "logit_normal":
+        # See 3.1 in the SD3 paper ($rf/lognorm(0.00,1.00)$).
+        u = torch.normal(mean=logit_mean, std=logit_std, size=(batch_size,), device="cpu")
+        u = torch.nn.functional.sigmoid(u)
+    elif weighting_scheme == "mode":
+        u = torch.rand(size=(batch_size,), device="cpu")
+        u = 1 - u - mode_scale * (torch.cos(math.pi * u / 2) ** 2 - 1 + u)
+    else:
+        u = torch.rand(size=(batch_size,), device="cpu")
+    return u
+
+
+def compute_loss_weighting_for_sd3(weighting_scheme: str, sigmas=None):
+    """Computes loss weighting scheme for SD3 training.
+
+    Courtesy: This was contributed by Rafie Walker in https://github.com/huggingface/diffusers/pull/8528.
+
+    SD3 paper reference: https://arxiv.org/abs/2403.03206v1.
+    """
+    if weighting_scheme == "sigma_sqrt":
+        weighting = (sigmas**-2.0).float()
+    elif weighting_scheme == "cosmap":
+        bot = 1 - 2 * sigmas + 2 * sigmas**2
+        weighting = 2 / (math.pi * bot)
+    else:
+        weighting = torch.ones_like(sigmas)
+    return weighting
+
+
+def get_noisy_model_input_and_timesteps(
+    args, noise_scheduler, latents, noise, device, dtype
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    bsz = latents.shape[0]
+
+    # Sample a random timestep for each image
+    # for weighting schemes where we sample timesteps non-uniformly
+    u = compute_density_for_timestep_sampling(
+        weighting_scheme=args.weighting_scheme,
+        batch_size=bsz,
+        logit_mean=args.logit_mean,
+        logit_std=args.logit_std,
+        mode_scale=args.mode_scale,
+    )
+    indices = (u * noise_scheduler.config.num_train_timesteps).long()
+    timesteps = noise_scheduler.timesteps[indices].to(device=device)
+
+    # Add noise according to flow matching.
+    sigmas = get_sigmas(noise_scheduler, timesteps, device, n_dim=latents.ndim, dtype=dtype)
+    noisy_model_input = sigmas * noise + (1.0 - sigmas) * latents
+
+    return noisy_model_input, timesteps, sigmas
+
+
 # endregion
diff --git a/library/sd3_utils.py b/library/sd3_utils.py
index 5849518f..9ad995d8 100644
--- a/library/sd3_utils.py
+++ b/library/sd3_utils.py
@@ -1,9 +1,12 @@
+from dataclasses import dataclass
 import math
-from typing import Dict, Optional, Union
+import re
+from typing import Dict, List, Optional, Union
 import torch
 import safetensors
 from safetensors.torch import load_file
 from accelerate import init_empty_weights
+from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPConfig, CLIPTextConfig
 
 from .utils import setup_logging
 
@@ -19,18 +22,61 @@ from library import sdxl_model_util
 
 # region models
 
+# TODO remove dependency on flux_utils
+from library.utils import load_safetensors
+from library.flux_utils import load_t5xxl as flux_utils_load_t5xxl
 
-def load_safetensors(path: str, dvc: Union[str, torch.device], disable_mmap: bool = False):
-    if disable_mmap:
-        return safetensors.torch.load(open(path, "rb").read())
+
+def analyze_state_dict_state(state_dict: Dict, prefix: str = ""):
+    logger.info(f"Analyzing state dict state...")
+
+    # analyze configs
+    patch_size = state_dict[f"{prefix}x_embedder.proj.weight"].shape[2]
+    depth = state_dict[f"{prefix}x_embedder.proj.weight"].shape[0] // 64
+    num_patches = state_dict[f"{prefix}pos_embed"].shape[1]
+    pos_embed_max_size = round(math.sqrt(num_patches))
+    adm_in_channels = state_dict[f"{prefix}y_embedder.mlp.0.weight"].shape[1]
+    context_shape = state_dict[f"{prefix}context_embedder.weight"].shape
+    qk_norm = "rms" if f"{prefix}joint_blocks.0.context_block.attn.ln_k.weight" in state_dict.keys() else None
+
+    #  x_block_self_attn_layers.append(int(key.split(".x_block.attn2.ln_k.weight")[0].split(".")[-1]))
+    x_block_self_attn_layers = []
+    re_attn = re.compile(r".(\d+).x_block.attn2.ln_k.weight")
+    for key in list(state_dict.keys()):
+        m = re_attn.match(key)
+        if m:
+            x_block_self_attn_layers.append(int(m.group(1)))
+
+    assert len(x_block_self_attn_layers) == 0, "x_block_self_attn_layers is not supported"
+
+    context_embedder_in_features = context_shape[1]
+    context_embedder_out_features = context_shape[0]
+
+    # only supports 3-5-large and 3-medium
+    if qk_norm is not None:
+        model_type = "3-5-large"
     else:
-        try:
-            return load_file(path, device=dvc)
-        except:
-            return load_file(path)  # prevent device invalid Error
+        model_type = "3-medium"
+
+    params = sd3_models.SD3Params(
+        patch_size=patch_size,
+        depth=depth,
+        num_patches=num_patches,
+        pos_embed_max_size=pos_embed_max_size,
+        adm_in_channels=adm_in_channels,
+        qk_norm=qk_norm,
+        x_block_self_attn_layers=x_block_self_attn_layers,
+        context_embedder_in_features=context_embedder_in_features,
+        context_embedder_out_features=context_embedder_out_features,
+        model_type=model_type,
+    )
+    logger.info(f"Analyzed state dict state: {params}")
+    return params
 
 
-def load_mmdit(state_dict: Dict, attn_mode: str, dtype: Optional[Union[str, torch.dtype]], device: Union[str, torch.device]):
+def load_mmdit(
+    state_dict: Dict, dtype: Optional[Union[str, torch.dtype]], device: Union[str, torch.device], attn_mode: str = "torch"
+) -> sd3_models.MMDiT:
     mmdit_sd = {}
 
     mmdit_prefix = "model.diffusion_model."
@@ -40,8 +86,9 @@ def load_mmdit(state_dict: Dict, attn_mode: str, dtype: Optional[Union[str, torc
 
     # load MMDiT
     logger.info("Building MMDit")
+    params = analyze_state_dict_state(mmdit_sd)
     with init_empty_weights():
-        mmdit = sd3_models.create_mmdit_sd3_medium_configs(attn_mode)
+        mmdit = sd3_models.create_sd3_mmdit(params, attn_mode)
 
     logger.info("Loading state dict...")
     info = sdxl_model_util._load_state_dict_on_device(mmdit, mmdit_sd, device, dtype)
@@ -50,20 +97,14 @@ def load_mmdit(state_dict: Dict, attn_mode: str, dtype: Optional[Union[str, torc
 
 
 def load_clip_l(
-    state_dict: Dict,
     clip_l_path: Optional[str],
-    attn_mode: str,
-    clip_dtype: Optional[Union[str, torch.dtype]],
+    dtype: Optional[Union[str, torch.dtype]],
     device: Union[str, torch.device],
     disable_mmap: bool = False,
+    state_dict: Optional[Dict] = None,
 ):
     clip_l_sd = None
-    if clip_l_path:
-        logger.info(f"Loading clip_l from {clip_l_path}...")
-        clip_l_sd = load_safetensors(clip_l_path, device, disable_mmap)
-        for key in list(clip_l_sd.keys()):
-            clip_l_sd["transformer." + key] = clip_l_sd.pop(key)
-    else:
+    if clip_l_path is None:
         if "text_encoders.clip_l.transformer.text_model.embeddings.position_embedding.weight" in state_dict:
             # found clip_l: remove prefix "text_encoders.clip_l."
             logger.info("clip_l is included in the checkpoint")
@@ -72,34 +113,58 @@ def load_clip_l(
             for k in list(state_dict.keys()):
                 if k.startswith(prefix):
                     clip_l_sd[k[len(prefix) :]] = state_dict.pop(k)
+        elif clip_l_path is None:
+            logger.info("clip_l is not included in the checkpoint and clip_l_path is not provided")
+            return None
+
+    # load clip_l
+    logger.info("Building CLIP-L")
+    config = CLIPTextConfig(
+        vocab_size=49408,
+        hidden_size=768,
+        intermediate_size=3072,
+        num_hidden_layers=12,
+        num_attention_heads=12,
+        max_position_embeddings=77,
+        hidden_act="quick_gelu",
+        layer_norm_eps=1e-05,
+        dropout=0.0,
+        attention_dropout=0.0,
+        initializer_range=0.02,
+        initializer_factor=1.0,
+        pad_token_id=1,
+        bos_token_id=0,
+        eos_token_id=2,
+        model_type="clip_text_model",
+        projection_dim=768,
+        # torch_dtype="float32",
+        # transformers_version="4.25.0.dev0",
+    )
+    with init_empty_weights():
+        clip = CLIPTextModelWithProjection(config)
 
     if clip_l_sd is None:
-        clip_l = None
-    else:
-        logger.info("Building ClipL")
-        clip_l = sd3_models.create_clip_l(device, clip_dtype, clip_l_sd)
-        logger.info("Loading state dict...")
-        info = clip_l.load_state_dict(clip_l_sd)
-        logger.info(f"Loaded ClipL: {info}")
-        clip_l.set_attn_mode(attn_mode)
-    return clip_l
+        logger.info(f"Loading state dict from {clip_l_path}")
+        clip_l_sd = load_safetensors(clip_l_path, device=str(device), disable_mmap=disable_mmap, dtype=dtype)
+
+    if "text_projection.weight" not in clip_l_sd:
+        logger.info("Adding text_projection.weight to clip_l_sd")
+        clip_l_sd["text_projection.weight"] = torch.eye(768, dtype=dtype, device=device)
+
+    info = clip.load_state_dict(clip_l_sd, strict=False, assign=True)
+    logger.info(f"Loaded CLIP-L: {info}")
+    return clip
 
 
 def load_clip_g(
-    state_dict: Dict,
     clip_g_path: Optional[str],
-    attn_mode: str,
-    clip_dtype: Optional[Union[str, torch.dtype]],
+    dtype: Optional[Union[str, torch.dtype]],
     device: Union[str, torch.device],
     disable_mmap: bool = False,
+    state_dict: Optional[Dict] = None,
 ):
     clip_g_sd = None
-    if clip_g_path:
-        logger.info(f"Loading clip_g from {clip_g_path}...")
-        clip_g_sd = load_safetensors(clip_g_path, device, disable_mmap)
-        for key in list(clip_g_sd.keys()):
-            clip_g_sd["transformer." + key] = clip_g_sd.pop(key)
-    else:
+    if state_dict is not None:
         if "text_encoders.clip_g.transformer.text_model.embeddings.position_embedding.weight" in state_dict:
             # found clip_g: remove prefix "text_encoders.clip_g."
             logger.info("clip_g is included in the checkpoint")
@@ -108,34 +173,53 @@ def load_clip_g(
             for k in list(state_dict.keys()):
                 if k.startswith(prefix):
                     clip_g_sd[k[len(prefix) :]] = state_dict.pop(k)
+        elif clip_g_path is None:
+            logger.info("clip_g is not included in the checkpoint and clip_g_path is not provided")
+            return None
+
+    # load clip_g
+    logger.info("Building CLIP-G")
+    config = CLIPTextConfig(
+        vocab_size=49408,
+        hidden_size=1280,
+        intermediate_size=5120,
+        num_hidden_layers=32,
+        num_attention_heads=20,
+        max_position_embeddings=77,
+        hidden_act="gelu",
+        layer_norm_eps=1e-05,
+        dropout=0.0,
+        attention_dropout=0.0,
+        initializer_range=0.02,
+        initializer_factor=1.0,
+        pad_token_id=1,
+        bos_token_id=0,
+        eos_token_id=2,
+        model_type="clip_text_model",
+        projection_dim=1280,
+        # torch_dtype="float32",
+        # transformers_version="4.25.0.dev0",
+    )
+    with init_empty_weights():
+        clip = CLIPTextModelWithProjection(config)
 
     if clip_g_sd is None:
-        clip_g = None
-    else:
-        logger.info("Building ClipG")
-        clip_g = sd3_models.create_clip_g(device, clip_dtype, clip_g_sd)
-        logger.info("Loading state dict...")
-        info = clip_g.load_state_dict(clip_g_sd)
-        logger.info(f"Loaded ClipG: {info}")
-        clip_g.set_attn_mode(attn_mode)
-    return clip_g
+        logger.info(f"Loading state dict from {clip_g_path}")
+        clip_g_sd = load_safetensors(clip_g_path, device=str(device), disable_mmap=disable_mmap, dtype=dtype)
+    info = clip.load_state_dict(clip_g_sd, strict=False, assign=True)
+    logger.info(f"Loaded CLIP-G: {info}")
+    return clip
 
 
 def load_t5xxl(
-    state_dict: Dict,
     t5xxl_path: Optional[str],
-    attn_mode: str,
     dtype: Optional[Union[str, torch.dtype]],
     device: Union[str, torch.device],
     disable_mmap: bool = False,
+    state_dict: Optional[Dict] = None,
 ):
     t5xxl_sd = None
-    if t5xxl_path:
-        logger.info(f"Loading t5xxl from {t5xxl_path}...")
-        t5xxl_sd = load_safetensors(t5xxl_path, device, disable_mmap)
-        for key in list(t5xxl_sd.keys()):
-            t5xxl_sd["transformer." + key] = t5xxl_sd.pop(key)
-    else:
+    if state_dict is not None:
         if "text_encoders.t5xxl.transformer.encoder.block.0.layer.0.SelfAttention.k.weight" in state_dict:
             # found t5xxl: remove prefix "text_encoders.t5xxl."
             logger.info("t5xxl is included in the checkpoint")
@@ -144,29 +228,19 @@ def load_t5xxl(
             for k in list(state_dict.keys()):
                 if k.startswith(prefix):
                     t5xxl_sd[k[len(prefix) :]] = state_dict.pop(k)
+        elif t5xxl_path is None:
+            logger.info("t5xxl is not included in the checkpoint and t5xxl_path is not provided")
+            return None
 
-    if t5xxl_sd is None:
-        t5xxl = None
-    else:
-        logger.info("Building T5XXL")
-
-        # workaround for T5XXL model creation: create with fp16 takes too long TODO support virtual device
-        t5xxl = sd3_models.create_t5xxl(device, torch.float32, t5xxl_sd)
-        t5xxl.to(dtype=dtype)
-
-        logger.info("Loading state dict...")
-        info = t5xxl.load_state_dict(t5xxl_sd)
-        logger.info(f"Loaded T5XXL: {info}")
-        t5xxl.set_attn_mode(attn_mode)
-    return t5xxl
+    return flux_utils_load_t5xxl(t5xxl_path, dtype, device, disable_mmap, state_dict=t5xxl_sd)
 
 
 def load_vae(
-    state_dict: Dict,
     vae_path: Optional[str],
     vae_dtype: Optional[Union[str, torch.dtype]],
     device: Optional[Union[str, torch.device]],
     disable_mmap: bool = False,
+    state_dict: Optional[Dict] = None,
 ):
     vae_sd = {}
     if vae_path:
@@ -181,299 +255,15 @@ def load_vae(
                 vae_sd[k[len(vae_prefix) :]] = state_dict.pop(k)
 
     logger.info("Building VAE")
-    vae = sd3_models.SDVAE()
+    vae = sd3_models.SDVAE(vae_dtype, device)
     logger.info("Loading state dict...")
     info = vae.load_state_dict(vae_sd)
     logger.info(f"Loaded VAE: {info}")
-    vae.to(device=device, dtype=vae_dtype)
+    vae.to(device=device, dtype=vae_dtype)  # make sure it's in the right device and dtype
     return vae
 
 
-def load_models(
-    ckpt_path: str,
-    clip_l_path: str,
-    clip_g_path: str,
-    t5xxl_path: str,
-    vae_path: str,
-    attn_mode: str,
-    device: Union[str, torch.device],
-    weight_dtype: Optional[Union[str, torch.dtype]] = None,
-    disable_mmap: bool = False,
-    clip_dtype: Optional[Union[str, torch.dtype]] = None,
-    t5xxl_device: Optional[Union[str, torch.device]] = None,
-    t5xxl_dtype: Optional[Union[str, torch.dtype]] = None,
-    vae_dtype: Optional[Union[str, torch.dtype]] = None,
-):
-    """
-    Load SD3 models from checkpoint files.
-
-    Args:
-        ckpt_path: Path to the SD3 checkpoint file.
-        clip_l_path: Path to the clip_l checkpoint file.
-        clip_g_path: Path to the clip_g checkpoint file.
-        t5xxl_path: Path to the t5xxl checkpoint file.
-        vae_path: Path to the VAE checkpoint file.
-        attn_mode: Attention mode for MMDiT model.
-        device: Device for MMDiT model.
-        weight_dtype: Default dtype of weights for all models. This is weight dtype, so the model dtype may be different.
-        disable_mmap: Disable memory mapping when loading state dict.
-        clip_dtype: Dtype for Clip models, or None to use default dtype.
-        t5xxl_device: Device for T5XXL model to load T5XXL in another device (eg. gpu). Default is None to use device.
-        t5xxl_dtype: Dtype for T5XXL model, or None to use default dtype.
-        vae_dtype: Dtype for VAE model, or None to use default dtype.
-
-    Returns:
-        Tuple of MMDiT, ClipL, ClipG, T5XXL, and VAE models.
-    """
-
-    # In SD1/2 and SDXL, the model is created with empty weights and then loaded with state dict.
-    # However, in SD3, Clip and T5XXL models are created with dtype, so we need to set dtype before loading state dict.
-    # Therefore, we need clip_dtype and t5xxl_dtype.
-
-    def load_state_dict(path: str, dvc: Union[str, torch.device] = device):
-        if disable_mmap:
-            return safetensors.torch.load(open(path, "rb").read())
-        else:
-            try:
-                return load_file(path, device=dvc)
-            except:
-                return load_file(path)  # prevent device invalid Error
-
-    t5xxl_device = t5xxl_device or device
-    clip_dtype = clip_dtype or weight_dtype or torch.float32
-    t5xxl_dtype = t5xxl_dtype or weight_dtype or torch.float32
-    vae_dtype = vae_dtype or weight_dtype or torch.float32
-
-    logger.info(f"Loading SD3 models from {ckpt_path}...")
-    state_dict = load_state_dict(ckpt_path)
-
-    # load clip_l
-    clip_l_sd = None
-    if clip_l_path:
-        logger.info(f"Loading clip_l from {clip_l_path}...")
-        clip_l_sd = load_state_dict(clip_l_path)
-        for key in list(clip_l_sd.keys()):
-            clip_l_sd["transformer." + key] = clip_l_sd.pop(key)
-    else:
-        if "text_encoders.clip_l.transformer.text_model.embeddings.position_embedding.weight" in state_dict:
-            # found clip_l: remove prefix "text_encoders.clip_l."
-            logger.info("clip_l is included in the checkpoint")
-            clip_l_sd = {}
-            prefix = "text_encoders.clip_l."
-            for k in list(state_dict.keys()):
-                if k.startswith(prefix):
-                    clip_l_sd[k[len(prefix) :]] = state_dict.pop(k)
-
-    # load clip_g
-    clip_g_sd = None
-    if clip_g_path:
-        logger.info(f"Loading clip_g from {clip_g_path}...")
-        clip_g_sd = load_state_dict(clip_g_path)
-        for key in list(clip_g_sd.keys()):
-            clip_g_sd["transformer." + key] = clip_g_sd.pop(key)
-    else:
-        if "text_encoders.clip_g.transformer.text_model.embeddings.position_embedding.weight" in state_dict:
-            # found clip_g: remove prefix "text_encoders.clip_g."
-            logger.info("clip_g is included in the checkpoint")
-            clip_g_sd = {}
-            prefix = "text_encoders.clip_g."
-            for k in list(state_dict.keys()):
-                if k.startswith(prefix):
-                    clip_g_sd[k[len(prefix) :]] = state_dict.pop(k)
-
-    # load t5xxl
-    t5xxl_sd = None
-    if t5xxl_path:
-        logger.info(f"Loading t5xxl from {t5xxl_path}...")
-        t5xxl_sd = load_state_dict(t5xxl_path, t5xxl_device)
-        for key in list(t5xxl_sd.keys()):
-            t5xxl_sd["transformer." + key] = t5xxl_sd.pop(key)
-    else:
-        if "text_encoders.t5xxl.transformer.encoder.block.0.layer.0.SelfAttention.k.weight" in state_dict:
-            # found t5xxl: remove prefix "text_encoders.t5xxl."
-            logger.info("t5xxl is included in the checkpoint")
-            t5xxl_sd = {}
-            prefix = "text_encoders.t5xxl."
-            for k in list(state_dict.keys()):
-                if k.startswith(prefix):
-                    t5xxl_sd[k[len(prefix) :]] = state_dict.pop(k)
-
-    # MMDiT and VAE
-    vae_sd = {}
-    if vae_path:
-        logger.info(f"Loading VAE from {vae_path}...")
-        vae_sd = load_state_dict(vae_path)
-    else:
-        # remove prefix "first_stage_model."
-        vae_sd = {}
-        vae_prefix = "first_stage_model."
-        for k in list(state_dict.keys()):
-            if k.startswith(vae_prefix):
-                vae_sd[k[len(vae_prefix) :]] = state_dict.pop(k)
-
-    mmdit_prefix = "model.diffusion_model."
-    for k in list(state_dict.keys()):
-        if k.startswith(mmdit_prefix):
-            state_dict[k[len(mmdit_prefix) :]] = state_dict.pop(k)
-        else:
-            state_dict.pop(k)  # remove other keys
-
-    # load MMDiT
-    logger.info("Building MMDit")
-    with init_empty_weights():
-        mmdit = sd3_models.create_mmdit_sd3_medium_configs(attn_mode)
-
-    logger.info("Loading state dict...")
-    info = sdxl_model_util._load_state_dict_on_device(mmdit, state_dict, device, weight_dtype)
-    logger.info(f"Loaded MMDiT: {info}")
-
-    # load ClipG and ClipL
-    if clip_l_sd is None:
-        clip_l = None
-    else:
-        logger.info("Building ClipL")
-        clip_l = sd3_models.create_clip_l(device, clip_dtype, clip_l_sd)
-        logger.info("Loading state dict...")
-        info = clip_l.load_state_dict(clip_l_sd)
-        logger.info(f"Loaded ClipL: {info}")
-        clip_l.set_attn_mode(attn_mode)
-
-    if clip_g_sd is None:
-        clip_g = None
-    else:
-        logger.info("Building ClipG")
-        clip_g = sd3_models.create_clip_g(device, clip_dtype, clip_g_sd)
-        logger.info("Loading state dict...")
-        info = clip_g.load_state_dict(clip_g_sd)
-        logger.info(f"Loaded ClipG: {info}")
-        clip_g.set_attn_mode(attn_mode)
-
-    # load T5XXL
-    if t5xxl_sd is None:
-        t5xxl = None
-    else:
-        logger.info("Building T5XXL")
-        t5xxl = sd3_models.create_t5xxl(t5xxl_device, t5xxl_dtype, t5xxl_sd)
-        logger.info("Loading state dict...")
-        info = t5xxl.load_state_dict(t5xxl_sd)
-        logger.info(f"Loaded T5XXL: {info}")
-        t5xxl.set_attn_mode(attn_mode)
-
-    # load VAE
-    logger.info("Building VAE")
-    vae = sd3_models.SDVAE()
-    logger.info("Loading state dict...")
-    info = vae.load_state_dict(vae_sd)
-    logger.info(f"Loaded VAE: {info}")
-    vae.to(device=device, dtype=vae_dtype)
-
-    return mmdit, clip_l, clip_g, t5xxl, vae
-
-
 # endregion
-# region utils
-
-
-def get_cond(
-    prompt: str,
-    tokenizer: sd3_models.SD3Tokenizer,
-    clip_l: sd3_models.SDClipModel,
-    clip_g: sd3_models.SDXLClipG,
-    t5xxl: Optional[sd3_models.T5XXLModel] = None,
-    device: Optional[torch.device] = None,
-    dtype: Optional[torch.dtype] = None,
-):
-    l_tokens, g_tokens, t5_tokens = tokenizer.tokenize_with_weights(prompt)
-    print(t5_tokens)
-    return get_cond_from_tokens(l_tokens, g_tokens, t5_tokens, clip_l, clip_g, t5xxl, device=device, dtype=dtype)
-
-
-def get_cond_from_tokens(
-    l_tokens,
-    g_tokens,
-    t5_tokens,
-    clip_l: sd3_models.SDClipModel,
-    clip_g: sd3_models.SDXLClipG,
-    t5xxl: Optional[sd3_models.T5XXLModel] = None,
-    device: Optional[torch.device] = None,
-    dtype: Optional[torch.dtype] = None,
-):
-    l_out, l_pooled = clip_l.encode_token_weights(l_tokens)
-    g_out, g_pooled = clip_g.encode_token_weights(g_tokens)
-    lg_out = torch.cat([l_out, g_out], dim=-1)
-    lg_out = torch.nn.functional.pad(lg_out, (0, 4096 - lg_out.shape[-1]))
-    if device is not None:
-        lg_out = lg_out.to(device=device)
-        l_pooled = l_pooled.to(device=device)
-        g_pooled = g_pooled.to(device=device)
-    if dtype is not None:
-        lg_out = lg_out.to(dtype=dtype)
-        l_pooled = l_pooled.to(dtype=dtype)
-        g_pooled = g_pooled.to(dtype=dtype)
-
-    # t5xxl may be in another device (eg. cpu)
-    if t5_tokens is None:
-        t5_out = torch.zeros((lg_out.shape[0], 77, 4096), device=lg_out.device, dtype=lg_out.dtype)
-    else:
-        t5_out, _ = t5xxl.encode_token_weights(t5_tokens)  # t5_out is [1, 77, 4096], t5_pooled is None
-        if device is not None:
-            t5_out = t5_out.to(device=device)
-        if dtype is not None:
-            t5_out = t5_out.to(dtype=dtype)
-
-    # return torch.cat([lg_out, t5_out], dim=-2), torch.cat((l_pooled, g_pooled), dim=-1)
-    return lg_out, t5_out, torch.cat((l_pooled, g_pooled), dim=-1)
-
-
-# used if other sd3 models is available
-r"""
-def get_sd3_configs(state_dict: Dict):
-    # Important configuration values can be quickly determined by checking shapes in the source file
-    # Some of these will vary between models (eg 2B vs 8B primarily differ in their depth, but also other details change)
-    # prefix = "model.diffusion_model."
-    prefix = ""
-
-    patch_size = state_dict[prefix + "x_embedder.proj.weight"].shape[2]
-    depth = state_dict[prefix + "x_embedder.proj.weight"].shape[0] // 64
-    num_patches = state_dict[prefix + "pos_embed"].shape[1]
-    pos_embed_max_size = round(math.sqrt(num_patches))
-    adm_in_channels = state_dict[prefix + "y_embedder.mlp.0.weight"].shape[1]
-    context_shape = state_dict[prefix + "context_embedder.weight"].shape
-    context_embedder_config = {
-        "target": "torch.nn.Linear",
-        "params": {"in_features": context_shape[1], "out_features": context_shape[0]},
-    }
-    return {
-        "patch_size": patch_size,
-        "depth": depth,
-        "num_patches": num_patches,
-        "pos_embed_max_size": pos_embed_max_size,
-        "adm_in_channels": adm_in_channels,
-        "context_embedder": context_embedder_config,
-    }
-
-
-def create_mmdit_from_sd3_checkpoint(state_dict: Dict, attn_mode: str = "xformers"):
-    ""
-    Doesn't load state dict.
-    ""
-    sd3_configs = get_sd3_configs(state_dict)
-
-    mmdit = sd3_models.MMDiT(
-        input_size=None,
-        pos_embed_max_size=sd3_configs["pos_embed_max_size"],
-        patch_size=sd3_configs["patch_size"],
-        in_channels=16,
-        adm_in_channels=sd3_configs["adm_in_channels"],
-        depth=sd3_configs["depth"],
-        mlp_ratio=4,
-        qk_norm=None,
-        num_patches=sd3_configs["num_patches"],
-        context_size=4096,
-        attn_mode=attn_mode,
-    )
-    return mmdit
-"""
 
 
 class ModelSamplingDiscreteFlow:
@@ -509,6 +299,3 @@ class ModelSamplingDiscreteFlow:
         # assert max_denoise is False, "max_denoise not implemented"
         # max_denoise is always True, I'm not sure why it's there
         return sigma * noise + (1.0 - sigma) * latent_image
-
-
-# endregion
diff --git a/library/strategy_sd3.py b/library/strategy_sd3.py
index 9fde0208..dd08cf00 100644
--- a/library/strategy_sd3.py
+++ b/library/strategy_sd3.py
@@ -3,7 +3,7 @@ import glob
 from typing import Any, List, Optional, Tuple, Union
 import torch
 import numpy as np
-from transformers import CLIPTokenizer, T5TokenizerFast
+from transformers import CLIPTokenizer, T5TokenizerFast, CLIPTextModel, CLIPTextModelWithProjection, T5EncoderModel
 
 from library import sd3_utils, train_util
 from library import sd3_models
@@ -48,45 +48,79 @@ class Sd3TokenizeStrategy(TokenizeStrategy):
 
 
 class Sd3TextEncodingStrategy(TextEncodingStrategy):
-    def __init__(self) -> None:
-        pass
+    def __init__(self, apply_lg_attn_mask: Optional[bool] = None, apply_t5_attn_mask: Optional[bool] = None) -> None:
+        """
+        Args:
+            apply_t5_attn_mask: Default value for apply_t5_attn_mask.
+        """
+        self.apply_lg_attn_mask = apply_lg_attn_mask
+        self.apply_t5_attn_mask = apply_t5_attn_mask
 
     def encode_tokens(
         self,
         tokenize_strategy: TokenizeStrategy,
         models: List[Any],
         tokens: List[torch.Tensor],
-        apply_lg_attn_mask: bool = False,
-        apply_t5_attn_mask: bool = False,
+        apply_lg_attn_mask: Optional[bool] = False,
+        apply_t5_attn_mask: Optional[bool] = False,
     ) -> List[torch.Tensor]:
         """
         returned embeddings are not masked
         """
         clip_l, clip_g, t5xxl = models
+        clip_l: CLIPTextModel
+        clip_g: CLIPTextModelWithProjection
+        t5xxl: T5EncoderModel
+
+        if apply_lg_attn_mask is None:
+            apply_lg_attn_mask = self.apply_lg_attn_mask
+        if apply_t5_attn_mask is None:
+            apply_t5_attn_mask = self.apply_t5_attn_mask
 
         l_tokens, g_tokens, t5_tokens = tokens[:3]
-        l_attn_mask, g_attn_mask, t5_attn_mask = tokens[3:] if len(tokens) > 3 else [None, None, None]
+
+        if len(tokens) > 3:
+            l_attn_mask, g_attn_mask, t5_attn_mask = tokens[3:]
+            if not apply_lg_attn_mask:
+                l_attn_mask = None
+                g_attn_mask = None
+            else:
+                l_attn_mask = l_attn_mask.to(clip_l.device)
+                g_attn_mask = g_attn_mask.to(clip_g.device)
+            if not apply_t5_attn_mask:
+                t5_attn_mask = None
+            else:
+                t5_attn_mask = t5_attn_mask.to(t5xxl.device)
+        else:
+            l_attn_mask = None
+            g_attn_mask = None
+            t5_attn_mask = None
+
         if l_tokens is None:
             assert g_tokens is None, "g_tokens must be None if l_tokens is None"
             lg_out = None
+            lg_pooled = None
         else:
-            assert g_tokens is not None, "g_tokens must not be None if l_tokens is not None"
-            l_out, l_pooled = clip_l(l_tokens)
-            g_out, g_pooled = clip_g(g_tokens)
-            if apply_lg_attn_mask:
-                l_out = l_out * l_attn_mask.to(l_out.device).unsqueeze(-1)
-                g_out = g_out * g_attn_mask.to(g_out.device).unsqueeze(-1)
-            lg_out = torch.cat([l_out, g_out], dim=-1)
+            with torch.no_grad():
+                assert g_tokens is not None, "g_tokens must not be None if l_tokens is not None"
+                prompt_embeds = clip_l(l_tokens.to(clip_l.device), l_attn_mask, output_hidden_states=True)
+                l_pooled = prompt_embeds[0]
+                l_out = prompt_embeds.hidden_states[-2]
+
+                prompt_embeds = clip_g(g_tokens.to(clip_g.device), g_attn_mask, output_hidden_states=True)
+                g_pooled = prompt_embeds[0]
+                g_out = prompt_embeds.hidden_states[-2]
+
+                lg_pooled = torch.cat((l_pooled, g_pooled), dim=-1) if l_tokens is not None else None
+                lg_out = torch.cat([l_out, g_out], dim=-1)
 
         if t5xxl is not None and t5_tokens is not None:
-            t5_out, _ = t5xxl(t5_tokens)  # t5_out is [1, max length, 4096]
-            if apply_t5_attn_mask:
-                t5_out = t5_out * t5_attn_mask.to(t5_out.device).unsqueeze(-1)
+            with torch.no_grad():
+                t5_out, _ = t5xxl(t5_tokens.to(t5xxl.device), t5_attn_mask, return_dict=False, output_hidden_states=True)
         else:
             t5_out = None
 
-        lg_pooled = torch.cat((l_pooled, g_pooled), dim=-1) if l_tokens is not None else None
-        return [lg_out, t5_out, lg_pooled]
+        return [lg_out, t5_out, lg_pooled, l_attn_mask, g_attn_mask, t5_attn_mask] # masks are used for attention masking in transformer
 
     def concat_encodings(
         self, lg_out: torch.Tensor, t5_out: Optional[torch.Tensor], lg_pooled: torch.Tensor
@@ -132,39 +166,38 @@ class Sd3TextEncoderOutputsCachingStrategy(TextEncoderOutputsCachingStrategy):
                 return False
             if "clip_l_attn_mask" not in npz or "clip_g_attn_mask" not in npz:  # necessary even if not used
                 return False
-            # t5xxl is optional
+            if "apply_lg_attn_mask" not in npz:
+                return False
+            if "t5_out" not in npz:
+                return False
+            if "t5_attn_mask" not in npz:
+                return False
+            npz_apply_lg_attn_mask = npz["apply_lg_attn_mask"]
+            if npz_apply_lg_attn_mask != self.apply_lg_attn_mask:
+                return False
+            if "apply_t5_attn_mask" not in npz:
+                return False
+            npz_apply_t5_attn_mask = npz["apply_t5_attn_mask"]
+            if npz_apply_t5_attn_mask != self.apply_t5_attn_mask:
+                return False
         except Exception as e:
             logger.error(f"Error loading file: {npz_path}")
             raise e
 
         return True
 
-    def mask_lg_attn(self, lg_out: np.ndarray, l_attn_mask: np.ndarray, g_attn_mask: np.ndarray) -> np.ndarray:
-        l_out = lg_out[..., :768]
-        g_out = lg_out[..., 768:]  # 1280
-        l_out = l_out * np.expand_dims(l_attn_mask, -1)  # l_out = l_out * l_attn_mask.
-        g_out = g_out * np.expand_dims(g_attn_mask, -1)  # g_out = g_out * g_attn_mask.
-        return np.concatenate([l_out, g_out], axis=-1)
-
-    def mask_t5_attn(self, t5_out: np.ndarray, t5_attn_mask: np.ndarray) -> np.ndarray:
-        return t5_out * np.expand_dims(t5_attn_mask, -1)
-
     def load_outputs_npz(self, npz_path: str) -> List[np.ndarray]:
         data = np.load(npz_path)
         lg_out = data["lg_out"]
         lg_pooled = data["lg_pooled"]
-        t5_out = data["t5_out"] if "t5_out" in data else None
+        t5_out = data["t5_out"]
 
-        if self.apply_lg_attn_mask:
-            l_attn_mask = data["clip_l_attn_mask"]
-            g_attn_mask = data["clip_g_attn_mask"]
-            lg_out = self.mask_lg_attn(lg_out, l_attn_mask, g_attn_mask)
+        l_attn_mask = data["clip_l_attn_mask"]
+        g_attn_mask = data["clip_g_attn_mask"]
+        t5_attn_mask = data["t5_attn_mask"]
 
-        if self.apply_t5_attn_mask and t5_out is not None:
-            t5_attn_mask = data["t5_attn_mask"]
-            t5_out = self.mask_t5_attn(t5_out, t5_attn_mask)
-
-        return [lg_out, t5_out, lg_pooled]
+        # apply_t5_attn_mask and apply_lg_attn_mask are same as self.apply_t5_attn_mask and self.apply_lg_attn_mask
+        return [lg_out, t5_out, lg_pooled, l_attn_mask, g_attn_mask, t5_attn_mask]
 
     def cache_batch_outputs(
         self, tokenize_strategy: TokenizeStrategy, models: List[Any], text_encoding_strategy: TextEncodingStrategy, infos: List
@@ -174,7 +207,7 @@ class Sd3TextEncoderOutputsCachingStrategy(TextEncoderOutputsCachingStrategy):
 
         tokens_and_masks = tokenize_strategy.tokenize(captions)
         with torch.no_grad():
-            lg_out, t5_out, lg_pooled = sd3_text_encoding_strategy.encode_tokens(
+            lg_out, t5_out, lg_pooled, l_attn_mask, g_attn_mask, t5_attn_mask = sd3_text_encoding_strategy.encode_tokens(
                 tokenize_strategy, models, tokens_and_masks, self.apply_lg_attn_mask, self.apply_t5_attn_mask
             )
 
@@ -182,38 +215,41 @@ class Sd3TextEncoderOutputsCachingStrategy(TextEncoderOutputsCachingStrategy):
             lg_out = lg_out.float()
         if lg_pooled.dtype == torch.bfloat16:
             lg_pooled = lg_pooled.float()
-        if t5_out is not None and t5_out.dtype == torch.bfloat16:
+        if t5_out.dtype == torch.bfloat16:
             t5_out = t5_out.float()
 
         lg_out = lg_out.cpu().numpy()
         lg_pooled = lg_pooled.cpu().numpy()
-        if t5_out is not None:
-            t5_out = t5_out.cpu().numpy()
+        t5_out = t5_out.cpu().numpy()
+
+        l_attn_mask = tokens_and_masks[3].cpu().numpy()
+        g_attn_mask = tokens_and_masks[4].cpu().numpy()
+        t5_attn_mask = tokens_and_masks[5].cpu().numpy()
 
         for i, info in enumerate(infos):
             lg_out_i = lg_out[i]
-            t5_out_i = t5_out[i] if t5_out is not None else None
+            t5_out_i = t5_out[i]
             lg_pooled_i = lg_pooled[i]
+            l_attn_mask_i = l_attn_mask[i]
+            g_attn_mask_i = g_attn_mask[i]
+            t5_attn_mask_i = t5_attn_mask[i]
+            apply_lg_attn_mask = self.apply_lg_attn_mask
+            apply_t5_attn_mask = self.apply_t5_attn_mask
 
             if self.cache_to_disk:
-                clip_l_attn_mask, clip_g_attn_mask, t5_attn_mask = tokens_and_masks[3:6]
-                clip_l_attn_mask_i = clip_l_attn_mask[i].cpu().numpy()
-                clip_g_attn_mask_i = clip_g_attn_mask[i].cpu().numpy()
-                t5_attn_mask_i = t5_attn_mask[i].cpu().numpy() if t5_attn_mask is not None else None  # shouldn't be None
-                kwargs = {}
-                if t5_out is not None:
-                    kwargs["t5_out"] = t5_out_i
                 np.savez(
                     info.text_encoder_outputs_npz,
                     lg_out=lg_out_i,
                     lg_pooled=lg_pooled_i,
-                    clip_l_attn_mask=clip_l_attn_mask_i,
-                    clip_g_attn_mask=clip_g_attn_mask_i,
+                    t5_out=t5_out_i,
+                    clip_l_attn_mask=l_attn_mask_i,
+                    clip_g_attn_mask=g_attn_mask_i,
                     t5_attn_mask=t5_attn_mask_i,
-                    **kwargs,
+                    apply_lg_attn_mask=apply_lg_attn_mask,
+                    apply_t5_attn_mask=apply_t5_attn_mask,
                 )
             else:
-                info.text_encoder_outputs = (lg_out_i, t5_out_i, lg_pooled_i)
+                info.text_encoder_outputs = (lg_out_i, t5_out_i, lg_pooled_i, l_attn_mask_i, g_attn_mask_i, t5_attn_mask_i)
 
 
 class Sd3LatentsCachingStrategy(LatentsCachingStrategy):
@@ -246,41 +282,3 @@ class Sd3LatentsCachingStrategy(LatentsCachingStrategy):
 
         if not train_util.HIGH_VRAM:
             train_util.clean_memory_on_device(vae.device)
-
-
-if __name__ == "__main__":
-    # test code for Sd3TokenizeStrategy
-    # tokenizer = sd3_models.SD3Tokenizer()
-    strategy = Sd3TokenizeStrategy(256)
-    text = "hello world"
-
-    l_tokens, g_tokens, t5_tokens = strategy.tokenize(text)
-    # print(l_tokens.shape)
-    print(l_tokens)
-    print(g_tokens)
-    print(t5_tokens)
-
-    texts = ["hello world", "the quick brown fox jumps over the lazy dog"]
-    l_tokens_2 = strategy.clip_l(texts, max_length=77, padding="max_length", truncation=True, return_tensors="pt")
-    g_tokens_2 = strategy.clip_g(texts, max_length=77, padding="max_length", truncation=True, return_tensors="pt")
-    t5_tokens_2 = strategy.t5xxl(
-        texts, max_length=strategy.t5xxl_max_length, padding="max_length", truncation=True, return_tensors="pt"
-    )
-    print(l_tokens_2)
-    print(g_tokens_2)
-    print(t5_tokens_2)
-
-    # compare
-    print(torch.allclose(l_tokens, l_tokens_2["input_ids"][0]))
-    print(torch.allclose(g_tokens, g_tokens_2["input_ids"][0]))
-    print(torch.allclose(t5_tokens, t5_tokens_2["input_ids"][0]))
-
-    text = ",".join(["hello world! this is long text"] * 50)
-    l_tokens, g_tokens, t5_tokens = strategy.tokenize(text)
-    print(l_tokens)
-    print(g_tokens)
-    print(t5_tokens)
-
-    print(f"model max length l: {strategy.clip_l.model_max_length}")
-    print(f"model max length g: {strategy.clip_g.model_max_length}")
-    print(f"model max length t5: {strategy.t5xxl.model_max_length}")
diff --git a/library/train_util.py b/library/train_util.py
index 462c7a9a..9ea1eec0 100644
--- a/library/train_util.py
+++ b/library/train_util.py
@@ -5967,6 +5967,37 @@ def line_to_prompt_dict(line: str) -> dict:
     return prompt_dict
 
 
+def load_prompts(prompt_file: str) -> List[Dict]:
+    # read prompts
+    if prompt_file.endswith(".txt"):
+        with open(prompt_file, "r", encoding="utf-8") as f:
+            lines = f.readlines()
+        prompts = [line.strip() for line in lines if len(line.strip()) > 0 and line[0] != "#"]
+    elif prompt_file.endswith(".toml"):
+        with open(prompt_file, "r", encoding="utf-8") as f:
+            data = toml.load(f)
+        prompts = [dict(**data["prompt"], **subset) for subset in data["prompt"]["subset"]]
+    elif prompt_file.endswith(".json"):
+        with open(prompt_file, "r", encoding="utf-8") as f:
+            prompts = json.load(f)
+
+    # preprocess prompts
+    for i in range(len(prompts)):
+        prompt_dict = prompts[i]
+        if isinstance(prompt_dict, str):
+            from library.train_util import line_to_prompt_dict
+
+            prompt_dict = line_to_prompt_dict(prompt_dict)
+            prompts[i] = prompt_dict
+        assert isinstance(prompt_dict, dict)
+
+        # Adds an enumerator to the dict based on prompt position. Used later to name image files. Also cleanup of extra data in original prompt dict.
+        prompt_dict["enum"] = i
+        prompt_dict.pop("subset", None)
+
+    return prompts
+
+
 def sample_images_common(
     pipe_class,
     accelerator: Accelerator,
diff --git a/library/utils.py b/library/utils.py
index 8a0c782c..ca0f904d 100644
--- a/library/utils.py
+++ b/library/utils.py
@@ -13,12 +13,16 @@ from diffusers.schedulers.scheduling_euler_ancestral_discrete import EulerAncest
 import cv2
 from PIL import Image
 import numpy as np
+from safetensors.torch import load_file
 
 
 def fire_in_thread(f, *args, **kwargs):
     threading.Thread(target=f, args=args, kwargs=kwargs).start()
 
 
+# region Logging
+
+
 def add_logging_arguments(parser):
     parser.add_argument(
         "--console_log_level",
@@ -85,6 +89,11 @@ def setup_logging(args=None, log_level=None, reset=False):
         logger.info(msg_init)
 
 
+# endregion
+
+# region PyTorch utils
+
+
 def str_to_dtype(s: Optional[str], default_dtype: Optional[torch.dtype] = None) -> torch.dtype:
     """
     Convert a string to a torch.dtype
@@ -304,6 +313,35 @@ class MemoryEfficientSafeOpen:
             # return byte_tensor.view(torch.uint8).to(torch.float16).reshape(shape)
             raise ValueError(f"Unsupported float8 type: {dtype_str} (upgrade PyTorch to support float8 types)")
 
+def load_safetensors(
+    path: str, device: Union[str, torch.device], disable_mmap: bool = False, dtype: Optional[torch.dtype] = torch.float32
+) -> dict[str, torch.Tensor]:
+    if disable_mmap:
+        # return safetensors.torch.load(open(path, "rb").read())
+        # use experimental loader
+        # logger.info(f"Loading without mmap (experimental)")
+        state_dict = {}
+        with MemoryEfficientSafeOpen(path) as f:
+            for key in f.keys():
+                state_dict[key] = f.get_tensor(key).to(device, dtype=dtype)
+        return state_dict
+    else:
+        try:
+            state_dict = load_file(path, device=device)
+        except:
+            state_dict = load_file(path)  # prevent device invalid Error
+        if dtype is not None:
+            for key in state_dict.keys():
+                state_dict[key] = state_dict[key].to(dtype=dtype)
+        return state_dict
+
+
+
+# endregion
+
+# region Image utils
+
+
 def pil_resize(image, size, interpolation=Image.LANCZOS):
     has_alpha = image.shape[2] == 4 if len(image.shape) == 3 else False
 
@@ -323,9 +361,9 @@ def pil_resize(image, size, interpolation=Image.LANCZOS):
     return resized_cv2
 
 
+# endregion
+
 # TODO make inf_utils.py
-
-
 # region Gradual Latent hires fix
 
 
diff --git a/sd3_minimal_inference.py b/sd3_minimal_inference.py
index 630da7e0..d099fe18 100644
--- a/sd3_minimal_inference.py
+++ b/sd3_minimal_inference.py
@@ -12,6 +12,7 @@ import torch
 from safetensors.torch import safe_open, load_file
 from tqdm import tqdm
 from PIL import Image
+from transformers import CLIPTextModelWithProjection, T5EncoderModel
 
 from library.device_utils import init_ipex, get_preferred_device
 
@@ -25,11 +26,14 @@ import logging
 logger = logging.getLogger(__name__)
 
 from library import sd3_models, sd3_utils, strategy_sd3
+from library.utils import load_safetensors
 
 
-def get_noise(seed, latent):
-    generator = torch.manual_seed(seed)
-    return torch.randn(latent.size(), dtype=torch.float32, layout=latent.layout, generator=generator, device="cpu").to(latent.dtype)
+def get_noise(seed, latent, device="cpu"):
+    # generator = torch.manual_seed(seed)
+    generator = torch.Generator(device)
+    generator.manual_seed(seed)
+    return torch.randn(latent.size(), dtype=latent.dtype, layout=latent.layout, generator=generator, device=device)
 
 
 def get_sigmas(sampling: sd3_utils.ModelSamplingDiscreteFlow, steps):
@@ -59,7 +63,7 @@ def do_sample(
     neg_cond: Tuple[torch.Tensor, torch.Tensor],
     mmdit: sd3_models.MMDiT,
     steps: int,
-    guidance_scale: float,
+    cfg_scale: float,
     dtype: torch.dtype,
     device: str,
 ):
@@ -71,7 +75,7 @@ def do_sample(
 
     latent = latent.to(dtype).to(device)
 
-    noise = get_noise(seed, latent).to(device)
+    noise = get_noise(seed, latent, device)
 
     model_sampling = sd3_utils.ModelSamplingDiscreteFlow(shift=3.0)  # 3.0 is for SD3
 
@@ -105,7 +109,7 @@ def do_sample(
             batched = model_sampling.calculate_denoised(sigma_hat, model_output, x)
 
             pos_out, neg_out = batched.chunk(2)
-            denoised = neg_out + (pos_out - neg_out) * guidance_scale
+            denoised = neg_out + (pos_out - neg_out) * cfg_scale
             # print(denoised.shape)
 
             # d = to_d(x, sigma_hat, denoised)
@@ -122,230 +126,68 @@ def do_sample(
             x = x.to(dtype)
 
     latent = x
-    scale_factor = 1.5305
-    shift_factor = 0.0609
-    # def process_out(self, latent):
-    #     return (latent / self.scale_factor) + self.shift_factor
-    latent = (latent / scale_factor) + shift_factor
+    latent = vae.process_out(latent)
     return latent
 
 
-if __name__ == "__main__":
-    target_height = 1024
-    target_width = 1024
-
-    # steps = 50  # 28  # 50
-    guidance_scale = 5
-    # seed = 1  # None  # 1
-
-    device = get_preferred_device()
-
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--ckpt_path", type=str, required=True)
-    parser.add_argument("--clip_g", type=str, required=False)
-    parser.add_argument("--clip_l", type=str, required=False)
-    parser.add_argument("--t5xxl", type=str, required=False)
-    parser.add_argument("--t5xxl_token_length", type=int, default=77, help="t5xxl token length, default: 77")
-    parser.add_argument("--apply_lg_attn_mask", action="store_true")
-    parser.add_argument("--apply_t5_attn_mask", action="store_true")
-    parser.add_argument("--prompt", type=str, default="A photo of a cat")
-    # parser.add_argument("--prompt2", type=str, default=None)  # do not support different prompts for text encoders
-    parser.add_argument("--negative_prompt", type=str, default="")
-    parser.add_argument("--output_dir", type=str, default=".")
-    parser.add_argument("--do_not_use_t5xxl", action="store_true")
-    parser.add_argument("--attn_mode", type=str, default="torch", help="torch (SDPA) or xformers. default: torch")
-    parser.add_argument("--fp16", action="store_true")
-    parser.add_argument("--bf16", action="store_true")
-    parser.add_argument("--seed", type=int, default=1)
-    parser.add_argument("--steps", type=int, default=50)
-    # parser.add_argument(
-    #     "--lora_weights",
-    #     type=str,
-    #     nargs="*",
-    #     default=[],
-    #     help="LoRA weights, only supports networks.lora, each argument is a `path;multiplier` (semi-colon separated)",
-    # )
-    # parser.add_argument("--interactive", action="store_true")
-    args = parser.parse_args()
-
-    seed = args.seed
-    steps = args.steps
-
-    sd3_dtype = torch.float32
-    if args.fp16:
-        sd3_dtype = torch.float16
-    elif args.bf16:
-        sd3_dtype = torch.bfloat16
-
-    # TODO test with separated safetenors files for each model
-
-    # load state dict
-    logger.info(f"Loading SD3 models from {args.ckpt_path}...")
-    state_dict = load_file(args.ckpt_path)
-
-    if "text_encoders.clip_g.transformer.text_model.embeddings.position_embedding.weight" in state_dict:
-        # found clip_g: remove prefix "text_encoders.clip_g."
-        logger.info("clip_g is included in the checkpoint")
-        clip_g_sd = {}
-        prefix = "text_encoders.clip_g."
-        for k, v in list(state_dict.items()):
-            if k.startswith(prefix):
-                clip_g_sd[k[len(prefix) :]] = state_dict.pop(k)
-    else:
-        logger.info(f"Lodaing clip_g from {args.clip_g}...")
-        clip_g_sd = load_file(args.clip_g)
-        for key in list(clip_g_sd.keys()):
-            clip_g_sd["transformer." + key] = clip_g_sd.pop(key)
-
-    if "text_encoders.clip_l.transformer.text_model.embeddings.position_embedding.weight" in state_dict:
-        # found clip_l: remove prefix "text_encoders.clip_l."
-        logger.info("clip_l is included in the checkpoint")
-        clip_l_sd = {}
-        prefix = "text_encoders.clip_l."
-        for k, v in list(state_dict.items()):
-            if k.startswith(prefix):
-                clip_l_sd[k[len(prefix) :]] = state_dict.pop(k)
-    else:
-        logger.info(f"Lodaing clip_l from {args.clip_l}...")
-        clip_l_sd = load_file(args.clip_l)
-        for key in list(clip_l_sd.keys()):
-            clip_l_sd["transformer." + key] = clip_l_sd.pop(key)
-
-    if "text_encoders.t5xxl.transformer.encoder.block.0.layer.0.SelfAttention.k.weight" in state_dict:
-        # found t5xxl: remove prefix "text_encoders.t5xxl."
-        logger.info("t5xxl is included in the checkpoint")
-        if not args.do_not_use_t5xxl:
-            t5xxl_sd = {}
-            prefix = "text_encoders.t5xxl."
-            for k, v in list(state_dict.items()):
-                if k.startswith(prefix):
-                    t5xxl_sd[k[len(prefix) :]] = state_dict.pop(k)
-        else:
-            logger.info("but not used")
-            for key in list(state_dict.keys()):
-                if key.startswith("text_encoders.t5xxl."):
-                    state_dict.pop(key)
-            t5xxl_sd = None
-    elif args.t5xxl:
-        assert not args.do_not_use_t5xxl, "t5xxl is not used but specified"
-        logger.info(f"Lodaing t5xxl from {args.t5xxl}...")
-        t5xxl_sd = load_file(args.t5xxl)
-        for key in list(t5xxl_sd.keys()):
-            t5xxl_sd["transformer." + key] = t5xxl_sd.pop(key)
-    else:
-        logger.info("t5xxl is not used")
-        t5xxl_sd = None
-
-    use_t5xxl = t5xxl_sd is not None
-
-    # MMDiT and VAE
-    vae_sd = {}
-    vae_prefix = "first_stage_model."
-    mmdit_prefix = "model.diffusion_model."
-    for k, v in list(state_dict.items()):
-        if k.startswith(vae_prefix):
-            vae_sd[k[len(vae_prefix) :]] = state_dict.pop(k)
-        elif k.startswith(mmdit_prefix):
-            state_dict[k[len(mmdit_prefix) :]] = state_dict.pop(k)
-
-    # load tokenizers
-    logger.info("Loading tokenizers...")
-    tokenize_strategy = strategy_sd3.Sd3TokenizeStrategy(args.t5xxl_token_length)
-
-    # load models
-    # logger.info("Create MMDiT from SD3 checkpoint...")
-    # mmdit = sd3_utils.create_mmdit_from_sd3_checkpoint(state_dict)
-    logger.info("Create MMDiT")
-    mmdit = sd3_models.create_mmdit_sd3_medium_configs(args.attn_mode)
-
-    logger.info("Loading state dict...")
-    info = mmdit.load_state_dict(state_dict)
-    logger.info(f"Loaded MMDiT: {info}")
-
-    logger.info(f"Move MMDiT to {device} and {sd3_dtype}...")
-    mmdit.to(device, dtype=sd3_dtype)
-    mmdit.eval()
-
-    # load VAE
-    logger.info("Create VAE")
-    vae = sd3_models.SDVAE()
-    logger.info("Loading state dict...")
-    info = vae.load_state_dict(vae_sd)
-    logger.info(f"Loaded VAE: {info}")
-
-    logger.info(f"Move VAE to {device} and {sd3_dtype}...")
-    vae.to(device, dtype=sd3_dtype)
-    vae.eval()
-
-    # load text encoders
-    logger.info("Create clip_l")
-    clip_l = sd3_models.create_clip_l(device, sd3_dtype, clip_l_sd)
-
-    logger.info("Loading state dict...")
-    info = clip_l.load_state_dict(clip_l_sd)
-    logger.info(f"Loaded clip_l: {info}")
-
-    logger.info(f"Move clip_l to {device} and {sd3_dtype}...")
-    clip_l.to(device, dtype=sd3_dtype)
-    clip_l.eval()
-    logger.info(f"Set attn_mode to {args.attn_mode}...")
-    clip_l.set_attn_mode(args.attn_mode)
-
-    logger.info("Create clip_g")
-    clip_g = sd3_models.create_clip_g(device, sd3_dtype, clip_g_sd)
-
-    logger.info("Loading state dict...")
-    info = clip_g.load_state_dict(clip_g_sd)
-    logger.info(f"Loaded clip_g: {info}")
-
-    logger.info(f"Move clip_g to {device} and {sd3_dtype}...")
-    clip_g.to(device, dtype=sd3_dtype)
-    clip_g.eval()
-    logger.info(f"Set attn_mode to {args.attn_mode}...")
-    clip_g.set_attn_mode(args.attn_mode)
-
-    if use_t5xxl:
-        logger.info("Create t5xxl")
-        t5xxl = sd3_models.create_t5xxl(device, sd3_dtype, t5xxl_sd)
-
-        logger.info("Loading state dict...")
-        info = t5xxl.load_state_dict(t5xxl_sd)
-        logger.info(f"Loaded t5xxl: {info}")
-
-        logger.info(f"Move t5xxl to {device} and {sd3_dtype}...")
-        t5xxl.to(device, dtype=sd3_dtype)
-        # t5xxl.to("cpu", dtype=torch.float32) # run on CPU
-        t5xxl.eval()
-        logger.info(f"Set attn_mode to {args.attn_mode}...")
-        t5xxl.set_attn_mode(args.attn_mode)
-    else:
-        t5xxl = None
-
+def generate_image(
+    mmdit: sd3_models.MMDiT,
+    vae: sd3_models.SDVAE,
+    clip_l: CLIPTextModelWithProjection,
+    clip_g: CLIPTextModelWithProjection,
+    t5xxl: T5EncoderModel,
+    steps: int,
+    prompt: str,
+    seed: int,
+    target_width: int,
+    target_height: int,
+    device: str,
+    negative_prompt: str,
+    cfg_scale: float,
+):
     # prepare embeddings
     logger.info("Encoding prompts...")
-    encoding_strategy = strategy_sd3.Sd3TextEncodingStrategy()
 
-    tokens_and_masks = tokenize_strategy.tokenize(args.prompt)
-    lg_out, t5_out, pooled = encoding_strategy.encode_tokens(
-        tokenize_strategy, [clip_l, clip_g, t5xxl], tokens_and_masks, args.apply_lg_attn_mask, args.apply_t5_attn_mask
-    )
-    cond = encoding_strategy.concat_encodings(lg_out, t5_out, pooled)
+    # TODO support one-by-one offloading
+    clip_l.to(device)
+    clip_g.to(device)
+    t5xxl.to(device)
 
-    tokens_and_masks = tokenize_strategy.tokenize(args.negative_prompt)
-    lg_out, t5_out, pooled = encoding_strategy.encode_tokens(
-        tokenize_strategy, [clip_l, clip_g, t5xxl], tokens_and_masks, args.apply_lg_attn_mask, args.apply_t5_attn_mask
-    )
-    neg_cond = encoding_strategy.concat_encodings(lg_out, t5_out, pooled)
+    with torch.no_grad():
+        tokens_and_masks = tokenize_strategy.tokenize(prompt)
+        lg_out, t5_out, pooled, l_attn_mask, g_attn_mask, t5_attn_mask = encoding_strategy.encode_tokens(
+            tokenize_strategy, [clip_l, clip_g, t5xxl], tokens_and_masks, args.apply_lg_attn_mask, args.apply_t5_attn_mask
+        )
+        cond = encoding_strategy.concat_encodings(lg_out, t5_out, pooled)
+
+        tokens_and_masks = tokenize_strategy.tokenize(negative_prompt)
+        lg_out, t5_out, pooled, neg_l_attn_mask, neg_g_attn_mask, neg_t5_attn_mask = encoding_strategy.encode_tokens(
+            tokenize_strategy, [clip_l, clip_g, t5xxl], tokens_and_masks, args.apply_lg_attn_mask, args.apply_t5_attn_mask
+        )
+        neg_cond = encoding_strategy.concat_encodings(lg_out, t5_out, pooled)
+
+    # attn masks are not used currently
+
+    if args.offload:
+        clip_l.to("cpu")
+        clip_g.to("cpu")
+        t5xxl.to("cpu")
 
     # generate image
     logger.info("Generating image...")
-    latent_sampled = do_sample(
-        target_height, target_width, None, seed, cond, neg_cond, mmdit, steps, guidance_scale, sd3_dtype, device
-    )
+    mmdit.to(device)
+    latent_sampled = do_sample(target_height, target_width, None, seed, cond, neg_cond, mmdit, steps, cfg_scale, sd3_dtype, device)
+    if args.offload:
+        mmdit.to("cpu")
 
     # latent to image
+    vae.to(device)
     with torch.no_grad():
         image = vae.decode(latent_sampled)
+
+    if args.offload:
+        vae.to("cpu")
+
     image = image.float()
     image = torch.clamp((image + 1.0) / 2.0, min=0.0, max=1.0)[0]
     decoded_np = 255.0 * np.moveaxis(image.cpu().numpy(), 0, 2)
@@ -359,3 +201,179 @@ if __name__ == "__main__":
     out_image.save(output_path)
 
     logger.info(f"Saved image to {output_path}")
+
+
+if __name__ == "__main__":
+    target_height = 1024
+    target_width = 1024
+
+    # steps = 50  # 28  # 50
+    # cfg_scale = 5
+    # seed = 1  # None  # 1
+
+    device = get_preferred_device()
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--ckpt_path", type=str, required=True)
+    parser.add_argument("--clip_g", type=str, required=False)
+    parser.add_argument("--clip_l", type=str, required=False)
+    parser.add_argument("--t5xxl", type=str, required=False)
+    parser.add_argument("--t5xxl_token_length", type=int, default=256, help="t5xxl token length, default: 256")
+    parser.add_argument("--apply_lg_attn_mask", action="store_true")
+    parser.add_argument("--apply_t5_attn_mask", action="store_true")
+    parser.add_argument("--prompt", type=str, default="A photo of a cat")
+    # parser.add_argument("--prompt2", type=str, default=None)  # do not support different prompts for text encoders
+    parser.add_argument("--negative_prompt", type=str, default="")
+    parser.add_argument("--cfg_scale", type=float, default=5.0)
+    parser.add_argument("--offload", action="store_true", help="Offload to CPU")
+    parser.add_argument("--output_dir", type=str, default=".")
+    # parser.add_argument("--do_not_use_t5xxl", action="store_true")
+    # parser.add_argument("--attn_mode", type=str, default="torch", help="torch (SDPA) or xformers. default: torch")
+    parser.add_argument("--fp16", action="store_true")
+    parser.add_argument("--bf16", action="store_true")
+    parser.add_argument("--seed", type=int, default=1)
+    parser.add_argument("--steps", type=int, default=50)
+    # parser.add_argument(
+    #     "--lora_weights",
+    #     type=str,
+    #     nargs="*",
+    #     default=[],
+    #     help="LoRA weights, only supports networks.lora, each argument is a `path;multiplier` (semi-colon separated)",
+    # )
+    parser.add_argument("--width", type=int, default=target_width)
+    parser.add_argument("--height", type=int, default=target_height)
+    parser.add_argument("--interactive", action="store_true")
+    args = parser.parse_args()
+
+    seed = args.seed
+    steps = args.steps
+
+    sd3_dtype = torch.float32
+    if args.fp16:
+        sd3_dtype = torch.float16
+    elif args.bf16:
+        sd3_dtype = torch.bfloat16
+
+    loading_device = "cpu" if args.offload else device
+
+    # load state dict
+    logger.info(f"Loading SD3 models from {args.ckpt_path}...")
+    # state_dict = load_file(args.ckpt_path)
+    state_dict = load_safetensors(args.ckpt_path, loading_device, disable_mmap=True, dtype=sd3_dtype)
+
+    # load text encoders
+    clip_l = sd3_utils.load_clip_l(args.clip_l, sd3_dtype, loading_device, state_dict=state_dict)
+    clip_g = sd3_utils.load_clip_g(args.clip_g, sd3_dtype, loading_device, state_dict=state_dict)
+    t5xxl = sd3_utils.load_t5xxl(args.t5xxl, sd3_dtype, loading_device, state_dict=state_dict)
+
+    # MMDiT and VAE
+    vae = sd3_utils.load_vae(None, sd3_dtype, loading_device, state_dict=state_dict)
+    mmdit = sd3_utils.load_mmdit(state_dict, sd3_dtype, loading_device)
+
+    clip_l.to(sd3_dtype)
+    clip_g.to(sd3_dtype)
+    t5xxl.to(sd3_dtype)
+    vae.to(sd3_dtype)
+    mmdit.to(sd3_dtype)
+    if not args.offload:
+        # make sure to move to the device: some tensors are created in the constructor on the CPU
+        clip_l.to(device)
+        clip_g.to(device)
+        t5xxl.to(device)
+        vae.to(device)
+        mmdit.to(device)
+
+    clip_l.eval()
+    clip_g.eval()
+    t5xxl.eval()
+    mmdit.eval()
+    vae.eval()
+
+    # load tokenizers
+    logger.info("Loading tokenizers...")
+    tokenize_strategy = strategy_sd3.Sd3TokenizeStrategy(args.t5xxl_token_length)
+    encoding_strategy = strategy_sd3.Sd3TextEncodingStrategy()
+
+    if not args.interactive:
+        generate_image(
+            mmdit,
+            vae,
+            clip_l,
+            clip_g,
+            t5xxl,
+            args.steps,
+            args.prompt,
+            args.seed,
+            args.width,
+            args.height,
+            device,
+            args.negative_prompt,
+            args.cfg_scale,
+        )
+    else:
+        # loop for interactive
+        width = args.width
+        height = args.height
+        steps = None
+        cfg_scale = args.cfg_scale
+
+        while True:
+            print(
+                "Enter prompt (empty to exit). Options: --w <width> --h <height> --s <steps> --d <seed>"
+                " --n <negative prompt>, `--n -` for empty negative prompt"
+                "Options are kept for the next prompt. Current options:"
+                f" width={width}, height={height}, steps={steps}, seed={seed}, cfg_scale={cfg_scale}"
+            )
+            prompt = input()
+            if prompt == "":
+                break
+
+            # parse options
+            options = prompt.split("--")
+            prompt = options[0].strip()
+            seed = None
+            negative_prompt = None
+            for opt in options[1:]:
+                try:
+                    opt = opt.strip()
+                    if opt.startswith("w"):
+                        width = int(opt[1:].strip())
+                    elif opt.startswith("h"):
+                        height = int(opt[1:].strip())
+                    elif opt.startswith("s"):
+                        steps = int(opt[1:].strip())
+                    elif opt.startswith("d"):
+                        seed = int(opt[1:].strip())
+                    # elif opt.startswith("m"):
+                    #     mutipliers = opt[1:].strip().split(",")
+                    #     if len(mutipliers) != len(lora_models):
+                    #         logger.error(f"Invalid number of multipliers, expected {len(lora_models)}")
+                    #         continue
+                    #     for i, lora_model in enumerate(lora_models):
+                    #         lora_model.set_multiplier(float(mutipliers[i]))
+                    elif opt.startswith("n"):
+                        negative_prompt = opt[1:].strip()
+                        if negative_prompt == "-":
+                            negative_prompt = ""
+                    elif opt.startswith("c"):
+                        cfg_scale = float(opt[1:].strip())
+                except ValueError as e:
+                    logger.error(f"Invalid option: {opt}, {e}")
+
+            generate_image(
+                mmdit,
+                vae,
+                clip_l,
+                clip_g,
+                t5xxl,
+                steps if steps is not None else args.steps,
+                prompt,
+                seed if seed is not None else args.seed,
+                width,
+                height,
+                device,
+                negative_prompt if negative_prompt is not None else args.negative_prompt,
+                cfg_scale,
+            )
+
+    logger.info("Done!")
diff --git a/sd3_train.py b/sd3_train.py
index ef18c32c..6336b4cf 100644
--- a/sd3_train.py
+++ b/sd3_train.py
@@ -1,6 +1,7 @@
 # training with captions
 
 import argparse
+from concurrent.futures import ThreadPoolExecutor
 import copy
 import math
 import os
@@ -11,6 +12,7 @@ import toml
 from tqdm import tqdm
 
 import torch
+from library import utils
 from library.device_utils import init_ipex, clean_memory_on_device
 
 init_ipex()
@@ -38,7 +40,7 @@ from library.config_util import (
     ConfigSanitizer,
     BlueprintGenerator,
 )
-import library.custom_train_functions as custom_train_functions
+from library.custom_train_functions import apply_masked_loss, add_custom_train_arguments
 
 # from library.custom_train_functions import (
 #     apply_snr_weight,
@@ -61,23 +63,13 @@ def train(args):
     if not args.skip_cache_check:
         args.skip_cache_check = args.skip_latents_validity_check
 
-    assert (
-        not args.weighted_captions
-    ), "weighted_captions is not supported currently / weighted_captionsは現在サポートされていません"
+    # assert (
+    #     not args.weighted_captions
+    # ), "weighted_captions is not supported currently / weighted_captionsは現在サポートされていません"
     # assert (
     #     not args.train_text_encoder or not args.cache_text_encoder_outputs
     # ), "cache_text_encoder_outputs is not supported when training text encoder / text encoderを学習するときはcache_text_encoder_outputsはサポートされていません"
 
-    # # training text encoder is not supported
-    # assert (
-    #     not args.train_text_encoder
-    # ), "training text encoder is not supported currently / text encoderの学習は現在サポートされていません"
-
-    # # training without text encoder cache is not supported: because T5XXL must be cached
-    # assert (
-    #     args.cache_text_encoder_outputs
-    # ), "training without text encoder cache is not supported currently / text encoderのキャッシュなしの学習は現在サポートされていません"
-
     assert not args.train_text_encoder or (args.use_t5xxl_cache_only or not args.cache_text_encoder_outputs), (
         "when training text encoder, text encoder outputs must not be cached (except for T5XXL)"
         + " / text encoderの学習時はtext encoderの出力はキャッシュできません（t5xxlのみキャッシュすることは可能です）"
@@ -90,13 +82,13 @@ def train(args):
         )
         args.cache_text_encoder_outputs = True
 
-    # if args.block_lr:
-    #     block_lrs = [float(lr) for lr in args.block_lr.split(",")]
-    #     assert (
-    #         len(block_lrs) == UNET_NUM_BLOCKS_FOR_BLOCK_LR
-    #     ), f"block_lr must have {UNET_NUM_BLOCKS_FOR_BLOCK_LR} values / block_lrは{UNET_NUM_BLOCKS_FOR_BLOCK_LR}個の値を指定してください"
-    # else:
-    #     block_lrs = None
+    if args.train_t5xxl:
+        assert (
+            args.train_text_encoder
+        ), "when training T5XXL, text encoder (CLIP-L/G) must be trained / T5XXLを学習するときはtext encoder (CLIP-L/G)も学習する必要があります"
+        assert (
+            not args.cache_text_encoder_outputs
+        ), "when training T5XXL, t5xxl output must not be cached / T5XXLを学習するときはt5xxlの出力をキャッシュできません"
 
     cache_latents = args.cache_latents
     use_dreambooth_method = args.in_json is None
@@ -111,11 +103,6 @@ def train(args):
         )
         strategy_base.LatentsCachingStrategy.set_strategy(latents_caching_strategy)
 
-    # load tokenizer and prepare tokenize strategy
-    sd3_tokenizer = sd3_models.SD3Tokenizer(t5xxl_max_length=args.t5xxl_max_token_length)
-    sd3_tokenize_strategy = strategy_sd3.Sd3TokenizeStrategy(args.t5xxl_max_token_length)
-    strategy_base.TokenizeStrategy.set_strategy(sd3_tokenize_strategy)
-
     # データセットを準備する
     if args.dataset_class is None:
         blueprint_generator = BlueprintGenerator(ConfigSanitizer(True, True, args.masked_loss, True))
@@ -156,10 +143,10 @@ def train(args):
                     ]
                 }
 
-        blueprint = blueprint_generator.generate(user_config, args, tokenizer=[sd3_tokenizer])
+        blueprint = blueprint_generator.generate(user_config, args)
         train_dataset_group = config_util.generate_dataset_group_by_blueprint(blueprint.dataset_group)
     else:
-        train_dataset_group = train_util.load_arbitrary_dataset(args, [sd3_tokenizer])
+        train_dataset_group = train_util.load_arbitrary_dataset(args)
 
     current_epoch = Value("i", 0)
     current_step = Value("i", 0)
@@ -205,41 +192,152 @@ def train(args):
 
     # mixed precisionに対応した型を用意しておき適宜castする
     weight_dtype, save_dtype = train_util.prepare_dtype(args)
-    vae_dtype = weight_dtype  # torch.float32 if args.no_half_vae else weight_dtype # SD3 VAE works with fp16
-
-    t5xxl_dtype = weight_dtype
-    if args.t5xxl_dtype is not None:
-        if args.t5xxl_dtype == "fp16":
-            t5xxl_dtype = torch.float16
-        elif args.t5xxl_dtype == "bf16":
-            t5xxl_dtype = torch.bfloat16
-        elif args.t5xxl_dtype == "fp32" or args.t5xxl_dtype == "float":
-            t5xxl_dtype = torch.float32
-        else:
-            raise ValueError(f"unexpected t5xxl_dtype: {args.t5xxl_dtype}")
-    t5xxl_device = accelerator.device if args.t5xxl_device is None else args.t5xxl_device
-
-    clip_dtype = weight_dtype  # if not args.train_text_encoder else None
 
     # モデルを読み込む
-    attn_mode = "xformers" if args.xformers else "torch"
 
-    assert (
-        attn_mode == "torch"
-    ), f"attn_mode {attn_mode} is not supported yet. Please use `--sdpa` instead of `--xformers`. / attn_mode {attn_mode} はサポートされていません。`--xformers`の代わりに`--sdpa`を使ってください。"
+    # t5xxl_dtype = weight_dtype
+    # if args.t5xxl_dtype is not None:
+    #     if args.t5xxl_dtype == "fp16":
+    #         t5xxl_dtype = torch.float16
+    #     elif args.t5xxl_dtype == "bf16":
+    #         t5xxl_dtype = torch.bfloat16
+    #     elif args.t5xxl_dtype == "fp32" or args.t5xxl_dtype == "float":
+    #         t5xxl_dtype = torch.float32
+    #     else:
+    #         raise ValueError(f"unexpected t5xxl_dtype: {args.t5xxl_dtype}")
+    # t5xxl_device = accelerator.device if args.t5xxl_device is None else args.t5xxl_device
+    # clip_dtype = weight_dtype  # if not args.train_text_encoder else None
 
-    # SD3 state dict may contain multiple models, so we need to load it and extract one by one. annoying.
-    logger.info(f"Loading SD3 models from {args.pretrained_model_name_or_path}")
-    device_to_load = accelerator.device if args.lowram else "cpu"
-    sd3_state_dict = sd3_utils.load_safetensors(
-        args.pretrained_model_name_or_path, device_to_load, args.disable_mmap_load_safetensors
-    )
+    # if clip_l is not specified, the checkpoint must contain clip_l, so we load state dict here
+    # if full_fp16/bf16, model_dtype is casted to fp16/bf16. If not, model_dtype is None (float32).
+    # by loading with model_dtype, we can reduce memory usage.
+    model_dtype = match_mixed_precision(args, weight_dtype)  # None (default) or fp16/bf16 (full_xxxx)
+    if args.clip_l is None:
+        sd3_state_dict = utils.load_safetensors(
+            args.pretrained_model_name_or_path, "cpu", args.disable_mmap_load_safetensors, model_dtype
+        )
+    else:
+        sd3_state_dict = None
+
+    # load tokenizer and prepare tokenize strategy
+    if args.t5xxl_max_token_length is None:
+        t5xxl_max_token_length = 256  # default value for T5XXL
+    else:
+        t5xxl_max_token_length = args.t5xxl_max_token_length
+
+    sd3_tokenize_strategy = strategy_sd3.Sd3TokenizeStrategy(t5xxl_max_token_length)
+    strategy_base.TokenizeStrategy.set_strategy(sd3_tokenize_strategy)
+
+    # load clip_l, clip_g, t5xxl for caching text encoder outputs
+    # clip_l = sd3_train_utils.load_target_model("clip_l", args, sd3_state_dict, accelerator, attn_mode, clip_dtype, device_to_load)
+    # clip_g = sd3_train_utils.load_target_model("clip_g", args, sd3_state_dict, accelerator, attn_mode, clip_dtype, device_to_load)
+    clip_l = sd3_utils.load_clip_l(args.clip_l, weight_dtype, "cpu", args.disable_mmap_load_safetensors, state_dict=sd3_state_dict)
+    clip_g = sd3_utils.load_clip_g(args.clip_g, weight_dtype, "cpu", args.disable_mmap_load_safetensors, state_dict=sd3_state_dict)
+    t5xxl = sd3_utils.load_t5xxl(args.t5xxl, weight_dtype, "cpu", args.disable_mmap_load_safetensors, state_dict=sd3_state_dict)
+    assert clip_l is not None and clip_g is not None and t5xxl is not None, "clip_l, clip_g, t5xxl must be specified"
+
+    # prepare text encoding strategy
+    text_encoding_strategy = strategy_sd3.Sd3TextEncodingStrategy(args.apply_lg_attn_mask, args.apply_t5_attn_mask)
+    strategy_base.TextEncodingStrategy.set_strategy(text_encoding_strategy)
+
+    # 学習を準備する：モデルを適切な状態にする
+    train_clip = False
+    train_t5xxl = False
+
+    if args.train_text_encoder:
+        accelerator.print("enable text encoder training")
+        if args.gradient_checkpointing:
+            clip_l.gradient_checkpointing_enable()
+            clip_g.gradient_checkpointing_enable()
+            if args.train_t5xxl:
+                t5xxl.gradient_checkpointing_enable()
+
+        lr_te1 = args.learning_rate_te1 if args.learning_rate_te1 is not None else args.learning_rate  # 0 means not train
+        lr_te2 = args.learning_rate_te2 if args.learning_rate_te2 is not None else args.learning_rate  # 0 means not train
+        lr_t5xxl = args.learning_rate_te3 if args.learning_rate_te3 is not None else args.learning_rate  # 0 means not train
+        train_clip = lr_te1 != 0 or lr_te2 != 0
+        train_t5xxl = lr_t5xxl != 0 and args.train_t5xxl
+
+        clip_l.to(weight_dtype)
+        clip_g.to(weight_dtype)
+        t5xxl.to(weight_dtype)
+        clip_l.requires_grad_(train_clip)
+        clip_g.requires_grad_(train_clip)
+        t5xxl.requires_grad_(train_t5xxl)
+    else:
+        print("disable text encoder training")
+        clip_l.to(weight_dtype)
+        clip_g.to(weight_dtype)
+        t5xxl.to(weight_dtype)
+        clip_l.requires_grad_(False)
+        clip_g.requires_grad_(False)
+        t5xxl.requires_grad_(False)
+        lr_te1 = 0
+        lr_te2 = 0
+        lr_t5xxl = 0
+
+    # cache text encoder outputs
+    sample_prompts_te_outputs = None
+    if args.cache_text_encoder_outputs:
+        clip_l.to(accelerator.device)
+        clip_g.to(accelerator.device)
+        t5xxl.to(accelerator.device)
+        clip_l.eval()
+        clip_g.eval()
+        t5xxl.eval()
+
+        text_encoder_caching_strategy = strategy_sd3.Sd3TextEncoderOutputsCachingStrategy(
+            args.cache_text_encoder_outputs_to_disk,
+            args.text_encoder_batch_size,
+            args.skip_cache_check,
+            train_clip or args.use_t5xxl_cache_only,  # if clip is trained or t5xxl is cached, caching is partial
+            args.apply_lg_attn_mask,
+            args.apply_t5_attn_mask,
+        )
+        strategy_base.TextEncoderOutputsCachingStrategy.set_strategy(text_encoder_caching_strategy)
+
+        with accelerator.autocast():
+            train_dataset_group.new_cache_text_encoder_outputs([clip_l, clip_g, t5xxl], accelerator)
+
+        # cache sample prompt's embeddings to free text encoder's memory
+        if args.sample_prompts is not None:
+            logger.info(f"cache Text Encoder outputs for sample prompt: {args.sample_prompts}")
+            prompts = train_util.load_prompts(args.sample_prompts)
+            sample_prompts_te_outputs = {}  # key: prompt, value: text encoder outputs
+            with accelerator.autocast(), torch.no_grad():
+                for prompt_dict in prompts:
+                    for p in [prompt_dict.get("prompt", ""), prompt_dict.get("negative_prompt", "")]:
+                        if p not in sample_prompts_te_outputs:
+                            logger.info(f"cache Text Encoder outputs for prompt: {p}")
+                            tokens_and_masks = sd3_tokenize_strategy.tokenize(p)
+                            sample_prompts_te_outputs[p] = text_encoding_strategy.encode_tokens(
+                                sd3_tokenize_strategy,
+                                [clip_l, clip_g, t5xxl],
+                                tokens_and_masks,
+                                args.apply_lg_attn_mask,
+                                args.apply_t5_attn_mask,
+                            )
+
+        accelerator.wait_for_everyone()
+
+        # now we can delete Text Encoders to free memory
+        if args.use_t5xxl_cache_only:
+            clip_l = None
+            clip_g = None
+        t5xxl = None
+
+        clean_memory_on_device(accelerator.device)
 
     # load VAE for caching latents
-    vae: sd3_models.SDVAE = None
+    if sd3_state_dict is None:
+        sd3_state_dict = utils.load_safetensors(
+            args.pretrained_model_name_or_path, "cpu", args.disable_mmap_load_safetensors, model_dtype
+        )
+
+    vae = sd3_utils.load_vae(args.vae, weight_dtype, "cpu", args.disable_mmap_load_safetensors, state_dict=sd3_state_dict)
     if cache_latents:
-        vae = sd3_train_utils.load_target_model("vae", args, sd3_state_dict, accelerator, attn_mode, vae_dtype, device_to_load)
-        vae.to(accelerator.device, dtype=vae_dtype)
+        # vae = sd3_train_utils.load_target_model("vae", args, sd3_state_dict, accelerator, attn_mode, vae_dtype, device_to_load)
+        vae.to(accelerator.device, dtype=weight_dtype)
         vae.requires_grad_(False)
         vae.eval()
 
@@ -250,127 +348,36 @@ def train(args):
 
         accelerator.wait_for_everyone()
 
-    # load clip_l, clip_g, t5xxl for caching text encoder outputs
-    # # models are usually loaded on CPU and moved to GPU later. This is to avoid OOM on GPU0.
-    # mmdit, clip_l, clip_g, t5xxl, vae = sd3_train_utils.load_target_model(
-    #     args, accelerator, attn_mode, weight_dtype, clip_dtype, t5xxl_device, t5xxl_dtype, vae_dtype
-    # )
-    clip_l = sd3_train_utils.load_target_model("clip_l", args, sd3_state_dict, accelerator, attn_mode, clip_dtype, device_to_load)
-    clip_g = sd3_train_utils.load_target_model("clip_g", args, sd3_state_dict, accelerator, attn_mode, clip_dtype, device_to_load)
-    assert clip_l is not None, "clip_l is required / clip_lは必須です"
-    assert clip_g is not None, "clip_g is required / clip_gは必須です"
-
-    t5xxl = sd3_train_utils.load_target_model("t5xxl", args, sd3_state_dict, accelerator, attn_mode, t5xxl_dtype, device_to_load)
-    # logit_scale = logit_scale.to(accelerator.device, dtype=weight_dtype)
-
-    # should be deleted after caching text encoder outputs when not training text encoder
-    # this strategy should not be used other than this process
-    text_encoding_strategy = strategy_sd3.Sd3TextEncodingStrategy()
-    strategy_base.TextEncodingStrategy.set_strategy(text_encoding_strategy)
-
-    # 学習を準備する：モデルを適切な状態にする
-    train_clip_l = False
-    train_clip_g = False
-    train_t5xxl = False
-
-    if args.train_text_encoder:
-        accelerator.print("enable text encoder training")
-        if args.gradient_checkpointing:
-            clip_l.gradient_checkpointing_enable()
-            clip_g.gradient_checkpointing_enable()
-        lr_te1 = args.learning_rate_te1 if args.learning_rate_te1 is not None else args.learning_rate  # 0 means not train
-        lr_te2 = args.learning_rate_te2 if args.learning_rate_te2 is not None else args.learning_rate  # 0 means not train
-        train_clip_l = lr_te1 != 0
-        train_clip_g = lr_te2 != 0
-
-        if not train_clip_l:
-            clip_l.to(weight_dtype)
-        if not train_clip_g:
-            clip_g.to(weight_dtype)
-        clip_l.requires_grad_(train_clip_l)
-        clip_g.requires_grad_(train_clip_g)
-        clip_l.train(train_clip_l)
-        clip_g.train(train_clip_g)
-    else:
-        clip_l.to(weight_dtype)
-        clip_g.to(weight_dtype)
-        clip_l.requires_grad_(False)
-        clip_g.requires_grad_(False)
-        clip_l.eval()
-        clip_g.eval()
-
-    if t5xxl is not None:
-        t5xxl.to(t5xxl_dtype)
-        t5xxl.requires_grad_(False)
-        t5xxl.eval()
-
-    # cache text encoder outputs
-    sample_prompts_te_outputs = None
-    if args.cache_text_encoder_outputs:
-        # Text Encodes are eval and no grad here
-        clip_l.to(accelerator.device)
-        clip_g.to(accelerator.device)
-        if t5xxl is not None:
-            t5xxl.to(t5xxl_device)
-
-        text_encoder_caching_strategy = strategy_sd3.Sd3TextEncoderOutputsCachingStrategy(
-            args.cache_text_encoder_outputs_to_disk,
-            args.text_encoder_batch_size,
-            args.skip_cache_check,
-            train_clip_g or train_clip_l or args.use_t5xxl_cache_only,
-            args.apply_lg_attn_mask,
-            args.apply_t5_attn_mask,
-        )
-        strategy_base.TextEncoderOutputsCachingStrategy.set_strategy(text_encoder_caching_strategy)
-
-        clip_l.to(accelerator.device, dtype=weight_dtype)
-        clip_g.to(accelerator.device, dtype=weight_dtype)
-        if t5xxl is not None:
-            t5xxl.to(t5xxl_device, dtype=t5xxl_dtype)
-
-        with accelerator.autocast():
-            train_dataset_group.new_cache_text_encoder_outputs([clip_l, clip_g, t5xxl], accelerator)
-
-        # cache sample prompt's embeddings to free text encoder's memory
-        if args.sample_prompts is not None:
-            logger.info(f"cache Text Encoder outputs for sample prompt: {args.sample_prompts}")
-            prompts = sd3_train_utils.load_prompts(args.sample_prompts)
-            sample_prompts_te_outputs = {}  # key: prompt, value: text encoder outputs
-            with accelerator.autocast(), torch.no_grad():
-                for prompt_dict in prompts:
-                    for p in [prompt_dict.get("prompt", ""), prompt_dict.get("negative_prompt", "")]:
-                        if p not in sample_prompts_te_outputs:
-                            logger.info(f"cache Text Encoder outputs for prompt: {p}")
-                            tokens_list = sd3_tokenize_strategy.tokenize(p)
-                            sample_prompts_te_outputs[p] = text_encoding_strategy.encode_tokens(
-                                sd3_tokenize_strategy,
-                                [clip_l, clip_g, t5xxl],
-                                tokens_list,
-                                args.apply_lg_attn_mask,
-                                args.apply_t5_attn_mask,
-                            )
-
-        accelerator.wait_for_everyone()
-
     # load MMDIT
-    # if full_fp16/bf16, model_dtype is casted to fp16/bf16. If not, model_dtype is None (float32).
-    # by loading with model_dtype, we can reduce memory usage.
-    model_dtype = match_mixed_precision(args, weight_dtype)  # None (default) or fp16/bf16 (full_xxxx)
-    mmdit = sd3_train_utils.load_target_model("mmdit", args, sd3_state_dict, accelerator, attn_mode, model_dtype, device_to_load)
+    mmdit = sd3_utils.load_mmdit(
+        sd3_state_dict,
+        model_dtype,
+        "cpu",
+    )
+
+    # attn_mode = "xformers" if args.xformers else "torch"
+    # assert (
+    #     attn_mode == "torch"
+    # ), f"attn_mode {attn_mode} is not supported yet. Please use `--sdpa` instead of `--xformers`. / attn_mode {attn_mode} はサポートされていません。`--xformers`の代わりに`--sdpa`を使ってください。"
+
+    # SD3 state dict may contain multiple models, so we need to load it and extract one by one. annoying.
+    logger.info(f"Loading SD3 models from {args.pretrained_model_name_or_path}")
+    device_to_load = accelerator.device if args.lowram else "cpu"
+    sd3_state_dict = utils.load_safetensors(args.pretrained_model_name_or_path, device_to_load, args.disable_mmap_load_safetensors)
+
     if args.gradient_checkpointing:
         mmdit.enable_gradient_checkpointing()
 
     train_mmdit = args.learning_rate != 0
     mmdit.requires_grad_(train_mmdit)
     if not train_mmdit:
-        mmdit.to(accelerator.device, dtype=weight_dtype)  # because of mmdie will not be prepared
+        mmdit.to(accelerator.device, dtype=weight_dtype)  # because of mmdit will not be prepared
 
     if not cache_latents:
-        # load VAE here if not cached
-        vae = sd3_train_utils.load_target_model("vae", args, sd3_state_dict, accelerator, attn_mode, vae_dtype, device_to_load)
+        # move to accelerator device
         vae.requires_grad_(False)
         vae.eval()
-        vae.to(accelerator.device, dtype=vae_dtype)
+        vae.to(accelerator.device, dtype=weight_dtype)
 
     mmdit.requires_grad_(train_mmdit)
     if not train_mmdit:
@@ -394,19 +401,24 @@ def train(args):
 
     training_models = []
     params_to_optimize = []
-    # if train_unet:
+    param_names = []
     training_models.append(mmdit)
-    # if block_lrs is None:
     params_to_optimize.append({"params": list(filter(lambda p: p.requires_grad, mmdit.parameters())), "lr": args.learning_rate})
-    # else:
-    #     params_to_optimize.extend(get_block_params_to_optimize(mmdit, block_lrs))
+    param_names.append([n for n, _ in mmdit.named_parameters()])
 
-    # if train_clip_l:
-    #     training_models.append(clip_l)
-    #     params_to_optimize.append({"params": list(clip_l.parameters()), "lr": args.learning_rate_te1 or args.learning_rate})
-    # if train_clip_g:
-    #     training_models.append(clip_g)
-    #     params_to_optimize.append({"params": list(clip_g.parameters()), "lr": args.learning_rate_te2 or args.learning_rate})
+    if train_clip:
+        if lr_te1 > 0:
+            training_models.append(clip_l)
+            params_to_optimize.append({"params": list(clip_l.parameters()), "lr": args.learning_rate_te1 or args.learning_rate})
+            param_names.append([n for n, _ in clip_l.named_parameters()])
+        if lr_te2 > 0:
+            training_models.append(clip_g)
+            params_to_optimize.append({"params": list(clip_g.parameters()), "lr": args.learning_rate_te2 or args.learning_rate})
+            param_names.append([n for n, _ in clip_g.named_parameters()])
+    if train_t5xxl:
+        training_models.append(t5xxl)
+        params_to_optimize.append({"params": list(t5xxl.parameters()), "lr": args.learning_rate_te3 or args.learning_rate})
+        param_names.append([n for n, _ in t5xxl.named_parameters()])
 
     # calculate number of trainable parameters
     n_params = 0
@@ -414,47 +426,49 @@ def train(args):
         for p in group["params"]:
             n_params += p.numel()
 
-    accelerator.print(f"train mmdit: {train_mmdit}")  # , clip_l: {train_clip_l}, clip_g: {train_clip_g}")
+    accelerator.print(f"train mmdit: {train_mmdit} , clip:{train_clip}, t5xxl:{train_t5xxl}")
     accelerator.print(f"number of models: {len(training_models)}")
     accelerator.print(f"number of trainable parameters: {n_params}")
 
     # 学習に必要なクラスを準備する
     accelerator.print("prepare optimizer, data loader etc.")
 
-    if args.fused_optimizer_groups:
+    if args.blockwise_fused_optimizers:
         # fused backward pass: https://pytorch.org/tutorials/intermediate/optimizer_step_in_backward_tutorial.html
-        # Instead of creating an optimizer for all parameters as in the tutorial, we create an optimizer for each group of parameters.
+        # Instead of creating an optimizer for all parameters as in the tutorial, we create an optimizer for each block of parameters.
         # This balances memory usage and management complexity.
 
-        # calculate total number of parameters
-        n_total_params = sum(len(params["params"]) for params in params_to_optimize)
-        params_per_group = math.ceil(n_total_params / args.fused_optimizer_groups)
-
-        # split params into groups, keeping the learning rate the same for all params in a group
-        # this will increase the number of groups if the learning rate is different for different params (e.g. U-Net and text encoders)
+        # split params into groups for mmdit. clip_l, clip_g, t5xxl are in each group
         grouped_params = []
-        param_group = []
-        param_group_lr = -1
-        for group in params_to_optimize:
-            lr = group["lr"]
-            for p in group["params"]:
-                # if the learning rate is different for different params, start a new group
-                if lr != param_group_lr:
-                    if param_group:
-                        grouped_params.append({"params": param_group, "lr": param_group_lr})
-                        param_group = []
-                    param_group_lr = lr
+        param_group = {}
+        group = params_to_optimize[0]
+        named_parameters = list(mmdit.named_parameters())
+        assert len(named_parameters) == len(group["params"]), "number of parameters does not match"
+        for p, np in zip(group["params"], named_parameters):
+            # determine target layer and block index for each parameter
+            block_type = "other"  # joint or other
+            if np[0].startswith("joint_blocks"):
+                block_idx = int(np[0].split(".")[1])
+                block_type = "joint"
+            else:
+                block_idx = -1
 
-                param_group.append(p)
+            param_group_key = (block_type, block_idx)
+            if param_group_key not in param_group:
+                param_group[param_group_key] = []
+            param_group[param_group_key].append(p)
 
-                # if the group has enough parameters, start a new group
-                if len(param_group) == params_per_group:
-                    grouped_params.append({"params": param_group, "lr": param_group_lr})
-                    param_group = []
-                    param_group_lr = -1
+        block_types_and_indices = []
+        for param_group_key, param_group in param_group.items():
+            block_types_and_indices.append(param_group_key)
+            grouped_params.append({"params": param_group, "lr": args.learning_rate})
 
-        if param_group:
-            grouped_params.append({"params": param_group, "lr": param_group_lr})
+            num_params = 0
+            for p in param_group:
+                num_params += p.numel()
+            accelerator.print(f"block {param_group_key}: {num_params} parameters")
+
+        grouped_params.extend(params_to_optimize[1:])  # add clip_l, clip_g, t5xxl if they are trained
 
         # prepare optimizers for each group
         optimizers = []
@@ -463,10 +477,15 @@ def train(args):
             optimizers.append(optimizer)
         optimizer = optimizers[0]  # avoid error in the following code
 
-        logger.info(f"using {len(optimizers)} optimizers for fused optimizer groups")
+        logger.info(f"using {len(optimizers)} optimizers for blockwise fused optimizers")
 
+        if train_util.is_schedulefree_optimizer(optimizers[0], args):
+            raise ValueError("Schedule-free optimizer is not supported with blockwise fused optimizers")
+        optimizer_train_fn = lambda: None  # dummy function
+        optimizer_eval_fn = lambda: None  # dummy function
     else:
         _, _, optimizer = train_util.get_optimizer(args, trainable_params=params_to_optimize)
+        optimizer_train_fn, optimizer_eval_fn = train_util.get_optimizer_train_eval_fn(optimizer, args)
 
     # prepare dataloader
     # strategies are set here because they cannot be referenced in another process. Copy them with the dataset
@@ -497,7 +516,7 @@ def train(args):
     train_dataset_group.set_max_train_steps(args.max_train_steps)
 
     # lr schedulerを用意する
-    if args.fused_optimizer_groups:
+    if args.blockwise_fused_optimizers:
         # prepare lr schedulers for each optimizer
         lr_schedulers = [train_util.get_scheduler_fix(args, optimizer, accelerator.num_processes) for optimizer in optimizers]
         lr_scheduler = lr_schedulers[0]  # avoid error in the following code
@@ -511,18 +530,22 @@ def train(args):
         ), "full_fp16 requires mixed precision='fp16' / full_fp16を使う場合はmixed_precision='fp16'を指定してください。"
         accelerator.print("enable full fp16 training.")
         mmdit.to(weight_dtype)
-        clip_l.to(weight_dtype)
-        clip_g.to(weight_dtype)
+        if clip_l is not None:
+            clip_l.to(weight_dtype)
+        if clip_g is not None:
+            clip_g.to(weight_dtype)
         if t5xxl is not None:
-            t5xxl.to(weight_dtype)  # TODO check works with fp16 or not
+            t5xxl.to(weight_dtype)
     elif args.full_bf16:
         assert (
             args.mixed_precision == "bf16"
         ), "full_bf16 requires mixed precision='bf16' / full_bf16を使う場合はmixed_precision='bf16'を指定してください。"
         accelerator.print("enable full bf16 training.")
         mmdit.to(weight_dtype)
-        clip_l.to(weight_dtype)
-        clip_g.to(weight_dtype)
+        if clip_l is not None:
+            clip_l.to(weight_dtype)
+        if clip_g is not None:
+            clip_g.to(weight_dtype)
         if t5xxl is not None:
             t5xxl.to(weight_dtype)
 
@@ -533,14 +556,7 @@ def train(args):
     #     clip_l.text_model.final_layer_norm.requires_grad_(False)
 
     # TextEncoderの出力をキャッシュするときには、すでに出力を取得済みなのでCPUへ移動する
-    if args.cache_text_encoder_outputs:
-        # move Text Encoders for sampling images. Text Encoder doesn't work on CPU with fp16
-        clip_l.to("cpu", dtype=torch.float32)
-        clip_g.to("cpu", dtype=torch.float32)
-        if t5xxl is not None:
-            t5xxl.to("cpu", dtype=torch.float32)
-        clean_memory_on_device(accelerator.device)
-    else:
+    if not args.cache_text_encoder_outputs:
         # make sure Text Encoders are on GPU
         # TODO support CPU for text encoders
         clip_l.to(accelerator.device)
@@ -548,18 +564,11 @@ def train(args):
         if t5xxl is not None:
             t5xxl.to(accelerator.device)
 
-    # TODO cache sample prompt's embeddings to free text encoder's memory
-    if args.cache_text_encoder_outputs:
-        if not args.save_t5xxl:
-            t5xxl = None  # free memory
     clean_memory_on_device(accelerator.device)
 
     if args.deepspeed:
         ds_model = deepspeed_utils.prepare_deepspeed_model(
-            args,
-            mmdit=mmdit,
-            clip_l=clip_l if train_clip_l else None,
-            clip_g=clip_g if train_clip_g else None,
+            args, mmdit=mmdit, clip_l=clip_l if train_clip else None, clip_g=clip_g if train_clip else None
         )
         # most of ZeRO stage uses optimizer partitioning, so we have to prepare optimizer and ds_model at the same time. # pull/1139#issuecomment-1986790007
         ds_model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
@@ -571,10 +580,11 @@ def train(args):
         # acceleratorがなんかよろしくやってくれるらしい
         if train_mmdit:
             mmdit = accelerator.prepare(mmdit)
-        if train_clip_l:
+        if train_clip:
             clip_l = accelerator.prepare(clip_l)
-        if train_clip_g:
             clip_g = accelerator.prepare(clip_g)
+        if train_t5xxl:
+            t5xxl = accelerator.prepare(t5xxl)
         optimizer, train_dataloader, lr_scheduler = accelerator.prepare(optimizer, train_dataloader, lr_scheduler)
 
     # 実験的機能：勾配も含めたfp16学習を行う　PyTorchにパッチを当ててfp16でのgrad scaleを有効にする
@@ -586,24 +596,110 @@ def train(args):
     # resumeする
     train_util.resume_from_local_or_hf_if_specified(accelerator, args)
 
+    # memory efficient block swapping
+
+    def submit_move_blocks(futures, thread_pool, block_idx_to_cpu, block_idx_to_cuda, blocks, device):
+        def move_blocks(bidx_to_cpu, block_to_cpu, bidx_to_cuda, block_to_cuda, dvc):
+            # print(f"Backward: Move block {bidx_to_cpu} to CPU")
+            block_to_cpu = block_to_cpu.to("cpu", non_blocking=True)
+            torch.cuda.empty_cache()
+
+            block_to_cuda = block_to_cuda.to(dvc, non_blocking=True)
+            torch.cuda.synchronize()
+            return bidx_to_cpu, bidx_to_cuda
+
+        block_to_cpu = blocks[block_idx_to_cpu]
+        block_to_cuda = blocks[block_idx_to_cuda]
+
+        futures[block_idx_to_cuda] = thread_pool.submit(
+            move_blocks, block_idx_to_cpu, block_to_cpu, block_idx_to_cuda, block_to_cuda, device
+        )
+
+    def wait_blocks_move(block_idx, futures):
+        if block_idx not in futures:
+            return
+        future = futures.pop(block_idx)
+        future.result()
+
     if args.fused_backward_pass:
         # use fused optimizer for backward pass: other optimizers will be supported in the future
         import library.adafactor_fused
 
         library.adafactor_fused.patch_adafactor_fused(optimizer)
-        for param_group in optimizer.param_groups:
-            for parameter in param_group["params"]:
+
+        blocks_to_swap = args.blocks_to_swap
+        num_blocks = len(accelerator.unwrap_model(mmdit).joint_blocks)
+        handled_block_indices = set()
+
+        n = 1  # only asynchronous purpose, no need to increase this number
+        # n = 2
+        # n = max(1, os.cpu_count() // 2)
+        thread_pool = ThreadPoolExecutor(max_workers=n)
+        futures = {}
+
+        for param_group, param_name_group in zip(optimizer.param_groups, param_names):
+            for parameter, param_name in zip(param_group["params"], param_name_group):
                 if parameter.requires_grad:
+                    grad_hook = None
 
-                    def __grad_hook(tensor: torch.Tensor, param_group=param_group):
-                        if accelerator.sync_gradients and args.max_grad_norm != 0.0:
-                            accelerator.clip_grad_norm_(tensor, args.max_grad_norm)
-                        optimizer.step_param(tensor, param_group)
-                        tensor.grad = None
+                    if blocks_to_swap:
+                        is_block = param_name.startswith("double_blocks")
+                        if is_block:
+                            block_idx = int(param_name.split(".")[1])
+                            if block_idx not in handled_block_indices:
+                                # swap following (already backpropagated) block
+                                handled_block_indices.add(block_idx)
 
-                    parameter.register_post_accumulate_grad_hook(__grad_hook)
+                                # if n blocks were already backpropagated
+                                num_blocks_propagated = num_blocks - block_idx - 1
+                                swapping = num_blocks_propagated > 0 and num_blocks_propagated <= blocks_to_swap
+                                waiting = block_idx > 0 and block_idx <= blocks_to_swap
+                                if swapping or waiting:
+                                    block_idx_to_cpu = num_blocks - num_blocks_propagated
+                                    block_idx_to_cuda = blocks_to_swap - num_blocks_propagated
+                                    block_idx_to_wait = block_idx - 1
 
-    elif args.fused_optimizer_groups:
+                                    # create swap hook
+                                    def create_swap_grad_hook(
+                                        bidx_to_cpu, bidx_to_cuda, bidx_to_wait, bidx: int, swpng: bool, wtng: bool
+                                    ):
+                                        def __grad_hook(tensor: torch.Tensor):
+                                            if accelerator.sync_gradients and args.max_grad_norm != 0.0:
+                                                accelerator.clip_grad_norm_(tensor, args.max_grad_norm)
+                                            optimizer.step_param(tensor, param_group)
+                                            tensor.grad = None
+
+                                            if swpng:
+                                                submit_move_blocks(
+                                                    futures,
+                                                    thread_pool,
+                                                    bidx_to_cpu,
+                                                    bidx_to_cuda,
+                                                    mmdit.joint_blocks,
+                                                    accelerator.device,
+                                                )
+                                            if wtng:
+                                                wait_blocks_move(bidx_to_wait, futures)
+
+                                        return __grad_hook
+
+                                    grad_hook = create_swap_grad_hook(
+                                        block_idx_to_cpu, block_idx_to_cuda, block_idx_to_wait, block_idx, swapping, waiting
+                                    )
+
+                    if grad_hook is None:
+
+                        def __grad_hook(tensor: torch.Tensor, param_group=param_group):
+                            if accelerator.sync_gradients and args.max_grad_norm != 0.0:
+                                accelerator.clip_grad_norm_(tensor, args.max_grad_norm)
+                            optimizer.step_param(tensor, param_group)
+                            tensor.grad = None
+
+                        grad_hook = __grad_hook
+
+                    parameter.register_post_accumulate_grad_hook(grad_hook)
+
+    elif args.blockwise_fused_optimizers:
         # prepare for additional optimizers and lr schedulers
         for i in range(1, len(optimizers)):
             optimizers[i] = accelerator.prepare(optimizers[i])
@@ -618,22 +714,59 @@ def train(args):
         num_parameters_per_group = [0] * len(optimizers)
         parameter_optimizer_map = {}
 
+        blocks_to_swap = args.blocks_to_swap
+        num_blocks = len(accelerator.unwrap_model(mmdit).joint_blocks)
+
+        n = 1  # only asynchronous purpose, no need to increase this number
+        # n = max(1, os.cpu_count() // 2)
+        thread_pool = ThreadPoolExecutor(max_workers=n)
+        futures = {}
+
         for opt_idx, optimizer in enumerate(optimizers):
             for param_group in optimizer.param_groups:
                 for parameter in param_group["params"]:
                     if parameter.requires_grad:
+                        block_type, block_idx = block_types_and_indices[opt_idx]
 
-                        def optimizer_hook(parameter: torch.Tensor):
-                            if accelerator.sync_gradients and args.max_grad_norm != 0.0:
-                                accelerator.clip_grad_norm_(parameter, args.max_grad_norm)
+                        def create_optimizer_hook(btype, bidx):
+                            def optimizer_hook(parameter: torch.Tensor):
+                                # print(f"optimizer_hook: {btype}, {bidx}")
+                                if accelerator.sync_gradients and args.max_grad_norm != 0.0:
+                                    accelerator.clip_grad_norm_(parameter, args.max_grad_norm)
 
-                            i = parameter_optimizer_map[parameter]
-                            optimizer_hooked_count[i] += 1
-                            if optimizer_hooked_count[i] == num_parameters_per_group[i]:
-                                optimizers[i].step()
-                                optimizers[i].zero_grad(set_to_none=True)
+                                i = parameter_optimizer_map[parameter]
+                                optimizer_hooked_count[i] += 1
+                                if optimizer_hooked_count[i] == num_parameters_per_group[i]:
+                                    optimizers[i].step()
+                                    optimizers[i].zero_grad(set_to_none=True)
 
-                        parameter.register_post_accumulate_grad_hook(optimizer_hook)
+                                    # swap blocks if necessary
+                                    if blocks_to_swap and btype == "joint":
+                                        num_blocks_propagated = num_blocks - bidx
+
+                                        swapping = num_blocks_propagated > 0 and num_blocks_propagated <= blocks_to_swap
+                                        waiting = bidx > 0 and bidx <= blocks_to_swap
+
+                                        if swapping:
+                                            block_idx_to_cpu = num_blocks - num_blocks_propagated
+                                            block_idx_to_cuda = blocks_to_swap - num_blocks_propagated
+                                            # print(f"Backward: Swap blocks {block_idx_to_cpu} and {block_idx_to_cuda}")
+                                            submit_move_blocks(
+                                                futures,
+                                                thread_pool,
+                                                block_idx_to_cpu,
+                                                block_idx_to_cuda,
+                                                mmdit.joint_blocks,
+                                                accelerator.device,
+                                            )
+
+                                        if waiting:
+                                            block_idx_to_wait = bidx - 1
+                                            wait_blocks_move(block_idx_to_wait, futures)
+
+                            return optimizer_hook
+
+                        parameter.register_post_accumulate_grad_hook(create_optimizer_hook(block_type, block_idx))
                         parameter_optimizer_map[parameter] = opt_idx
                         num_parameters_per_group[opt_idx] += 1
 
@@ -661,17 +794,9 @@ def train(args):
     progress_bar = tqdm(range(args.max_train_steps), smoothing=0, disable=not accelerator.is_local_main_process, desc="steps")
     global_step = 0
 
-    # noise_scheduler = DDPMScheduler(
-    #     beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000, clip_sample=False
-    # )
-
     noise_scheduler = sd3_train_utils.FlowMatchEulerDiscreteScheduler(num_train_timesteps=1000, shift=3.0)
     noise_scheduler_copy = copy.deepcopy(noise_scheduler)
 
-    # prepare_scheduler_for_custom_training(noise_scheduler, accelerator.device)
-    # if args.zero_terminal_snr:
-    #     custom_train_functions.fix_noise_scheduler_betas_for_zero_terminal_snr(noise_scheduler)
-
     if accelerator.is_main_process:
         init_kwargs = {}
         if args.wandb_run_name:
@@ -685,60 +810,13 @@ def train(args):
         )
 
     # For --sample_at_first
+    optimizer_eval_fn()
     sd3_train_utils.sample_images(accelerator, args, 0, global_step, mmdit, vae, [clip_l, clip_g, t5xxl], sample_prompts_te_outputs)
+    optimizer_train_fn()
     if len(accelerator.trackers) > 0:
         # log empty object to commit the sample images to wandb
         accelerator.log({}, step=0)
 
-    # following function will be moved to sd3_train_utils
-
-    def get_sigmas(timesteps, n_dim=4, dtype=torch.float32):
-        sigmas = noise_scheduler_copy.sigmas.to(device=accelerator.device, dtype=dtype)
-        schedule_timesteps = noise_scheduler_copy.timesteps.to(accelerator.device)
-        timesteps = timesteps.to(accelerator.device)
-        step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps]
-
-        sigma = sigmas[step_indices].flatten()
-        while len(sigma.shape) < n_dim:
-            sigma = sigma.unsqueeze(-1)
-        return sigma
-
-    def compute_density_for_timestep_sampling(
-        weighting_scheme: str, batch_size: int, logit_mean: float = None, logit_std: float = None, mode_scale: float = None
-    ):
-        """Compute the density for sampling the timesteps when doing SD3 training.
-
-        Courtesy: This was contributed by Rafie Walker in https://github.com/huggingface/diffusers/pull/8528.
-
-        SD3 paper reference: https://arxiv.org/abs/2403.03206v1.
-        """
-        if weighting_scheme == "logit_normal":
-            # See 3.1 in the SD3 paper ($rf/lognorm(0.00,1.00)$).
-            u = torch.normal(mean=logit_mean, std=logit_std, size=(batch_size,), device="cpu")
-            u = torch.nn.functional.sigmoid(u)
-        elif weighting_scheme == "mode":
-            u = torch.rand(size=(batch_size,), device="cpu")
-            u = 1 - u - mode_scale * (torch.cos(math.pi * u / 2) ** 2 - 1 + u)
-        else:
-            u = torch.rand(size=(batch_size,), device="cpu")
-        return u
-
-    def compute_loss_weighting_for_sd3(weighting_scheme: str, sigmas=None):
-        """Computes loss weighting scheme for SD3 training.
-
-        Courtesy: This was contributed by Rafie Walker in https://github.com/huggingface/diffusers/pull/8528.
-
-        SD3 paper reference: https://arxiv.org/abs/2403.03206v1.
-        """
-        if weighting_scheme == "sigma_sqrt":
-            weighting = (sigmas**-2.0).float()
-        elif weighting_scheme == "cosmap":
-            bot = 1 - 2 * sigmas + 2 * sigmas**2
-            weighting = 2 / (math.pi * bot)
-        else:
-            weighting = torch.ones_like(sigmas)
-        return weighting
-
     loss_recorder = train_util.LossRecorder()
     epoch = 0  # avoid error when max_train_steps is 0
     for epoch in range(num_train_epochs):
@@ -751,16 +829,16 @@ def train(args):
         for step, batch in enumerate(train_dataloader):
             current_step.value = global_step
 
-            if args.fused_optimizer_groups:
+            if args.blockwise_fused_optimizers:
                 optimizer_hooked_count = {i: 0 for i in range(len(optimizers))}  # reset counter for each step
 
             with accelerator.accumulate(*training_models):
                 if "latents" in batch and batch["latents"] is not None:
-                    latents = batch["latents"].to(accelerator.device).to(dtype=weight_dtype)
+                    latents = batch["latents"].to(accelerator.device, dtype=weight_dtype)
                 else:
                     with torch.no_grad():
                         # encode images to latents. images are [-1, 1]
-                        latents = vae.encode(batch["images"].to(vae_dtype)).to(weight_dtype)
+                        latents = vae.encode(batch["images"])
 
                     # NaNが含まれていれば警告を表示し0に置き換える
                     if torch.any(torch.isnan(latents)):
@@ -772,7 +850,7 @@ def train(args):
 
                 text_encoder_outputs_list = batch.get("text_encoder_outputs_list", None)
                 if text_encoder_outputs_list is not None:
-                    lg_out, t5_out, lg_pooled = text_encoder_outputs_list
+                    lg_out, t5_out, lg_pooled, l_attn_mask, g_attn_mask, t5_attn_mask = text_encoder_outputs_list
                     if args.use_t5xxl_cache_only:
                         lg_out = None
                         lg_pooled = None
@@ -781,7 +859,7 @@ def train(args):
                     t5_out = None
                     lg_pooled = None
 
-                if lg_out is None or (train_clip_l or train_clip_g):
+                if lg_out is None:
                     # not cached or training, so get from text encoders
                     input_ids_clip_l, input_ids_clip_g, _, l_attn_mask, g_attn_mask, _ = batch["input_ids_list"]
                     with torch.set_grad_enabled(args.train_text_encoder):
@@ -811,21 +889,10 @@ def train(args):
                 noise = torch.randn_like(latents)
                 bsz = latents.shape[0]
 
-                # Sample a random timestep for each image
-                # for weighting schemes where we sample timesteps non-uniformly
-                u = compute_density_for_timestep_sampling(
-                    weighting_scheme=args.weighting_scheme,
-                    batch_size=bsz,
-                    logit_mean=args.logit_mean,
-                    logit_std=args.logit_std,
-                    mode_scale=args.mode_scale,
+                # get noisy model input and timesteps
+                noisy_model_input, timesteps, sigmas = sd3_train_utils.get_noisy_model_input_and_timesteps(
+                    args, noise_scheduler_copy, latents, noise, accelerator.device, weight_dtype
                 )
-                indices = (u * noise_scheduler_copy.config.num_train_timesteps).long()
-                timesteps = noise_scheduler_copy.timesteps[indices].to(device=accelerator.device)
-
-                # Add noise according to flow matching.
-                sigmas = get_sigmas(timesteps, n_dim=latents.ndim, dtype=weight_dtype)
-                noisy_model_input = sigmas * noise + (1.0 - sigmas) * latents
 
                 # debug: NaN check for all inputs
                 if torch.any(torch.isnan(noisy_model_input)):
@@ -840,6 +907,7 @@ def train(args):
 
                 # call model
                 with accelerator.autocast():
+                    # TODO support attention mask
                     model_pred = mmdit(noisy_model_input, timesteps, context=context, y=lg_pooled)
 
                 # Follow: Section 5 of https://arxiv.org/abs/2206.00364.
@@ -848,21 +916,34 @@ def train(args):
 
                 # these weighting schemes use a uniform timestep sampling
                 # and instead post-weight the loss
-                weighting = compute_loss_weighting_for_sd3(weighting_scheme=args.weighting_scheme, sigmas=sigmas)
+                weighting = sd3_train_utils.compute_loss_weighting_for_sd3(weighting_scheme=args.weighting_scheme, sigmas=sigmas)
 
                 # flow matching loss
                 target = latents
 
-                # Compute regular loss. TODO simplify this
-                loss = torch.mean(
-                    (weighting.float() * (model_pred.float() - target.float()) ** 2).reshape(target.shape[0], -1),
-                    1,
+                # # Compute regular loss. TODO simplify this
+                # loss = torch.mean(
+                #     (weighting.float() * (model_pred.float() - target.float()) ** 2).reshape(target.shape[0], -1),
+                #     1,
+                # )
+                # calculate loss
+                loss = train_util.conditional_loss(
+                    model_pred.float(), target.float(), reduction="none", loss_type=args.loss_type, huber_c=None
                 )
+                if args.masked_loss or ("alpha_masks" in batch and batch["alpha_masks"] is not None):
+                    loss = apply_masked_loss(loss, batch)
+                loss = loss.mean([1, 2, 3])
+
+                if weighting is not None:
+                    loss = loss * weighting
+
+                loss_weights = batch["loss_weights"]  # 各sampleごとのweight
+                loss = loss * loss_weights
                 loss = loss.mean()
 
                 accelerator.backward(loss)
 
-                if not (args.fused_backward_pass or args.fused_optimizer_groups):
+                if not (args.fused_backward_pass or args.blockwise_fused_optimizers):
                     if accelerator.sync_gradients and args.max_grad_norm != 0.0:
                         params_to_clip = []
                         for m in training_models:
@@ -875,7 +956,7 @@ def train(args):
                 else:
                     # optimizer.step() and optimizer.zero_grad() are called in the optimizer hook
                     lr_scheduler.step()
-                    if args.fused_optimizer_groups:
+                    if args.blockwise_fused_optimizers:
                         for i in range(1, len(optimizers)):
                             lr_schedulers[i].step()
 
@@ -884,6 +965,7 @@ def train(args):
                 progress_bar.update(1)
                 global_step += 1
 
+                optimizer_eval_fn()
                 sd3_train_utils.sample_images(
                     accelerator, args, None, global_step, mmdit, vae, [clip_l, clip_g, t5xxl], sample_prompts_te_outputs
                 )
@@ -900,12 +982,13 @@ def train(args):
                             epoch,
                             num_train_epochs,
                             global_step,
-                            accelerator.unwrap_model(clip_l) if args.save_clip else None,
-                            accelerator.unwrap_model(clip_g) if args.save_clip else None,
-                            accelerator.unwrap_model(t5xxl) if args.save_t5xxl else None,
-                            accelerator.unwrap_model(mmdit),
+                            accelerator.unwrap_model(clip_l) if train_clip else None,
+                            accelerator.unwrap_model(clip_g) if train_clip else None,
+                            accelerator.unwrap_model(t5xxl) if train_t5xxl else None,
+                            accelerator.unwrap_model(mmdit) if train_mmdit else None,
                             vae,
                         )
+                optimizer_train_fn()
 
             current_loss = loss.detach().item()  # 平均なのでbatch sizeは関係ないはず
             if len(accelerator.trackers) > 0:
@@ -928,6 +1011,7 @@ def train(args):
 
         accelerator.wait_for_everyone()
 
+        optimizer_eval_fn()
         if args.save_every_n_epochs is not None:
             if accelerator.is_main_process:
                 sd3_train_utils.save_sd3_model_on_epoch_end_or_stepwise(
@@ -938,10 +1022,10 @@ def train(args):
                     epoch,
                     num_train_epochs,
                     global_step,
-                    accelerator.unwrap_model(clip_l) if args.save_clip else None,
-                    accelerator.unwrap_model(clip_g) if args.save_clip else None,
-                    accelerator.unwrap_model(t5xxl) if args.save_t5xxl else None,
-                    accelerator.unwrap_model(mmdit),
+                    accelerator.unwrap_model(clip_l) if train_clip else None,
+                    accelerator.unwrap_model(clip_g) if train_clip else None,
+                    accelerator.unwrap_model(t5xxl) if train_t5xxl else None,
+                    accelerator.unwrap_model(mmdit) if train_mmdit else None,
                     vae,
                 )
 
@@ -958,6 +1042,7 @@ def train(args):
         t5xxl = accelerator.unwrap_model(t5xxl)
 
     accelerator.end_training()
+    optimizer_eval_fn()
 
     if args.save_state or args.save_state_on_train_end:
         train_util.save_state_on_train_end(args, accelerator)
@@ -970,10 +1055,10 @@ def train(args):
             save_dtype,
             epoch,
             global_step,
-            clip_l if args.save_clip else None,
-            clip_g if args.save_clip else None,
-            t5xxl if args.save_t5xxl else None,
-            mmdit,
+            accelerator.unwrap_model(clip_l) if train_clip else None,
+            accelerator.unwrap_model(clip_g) if train_clip else None,
+            accelerator.unwrap_model(t5xxl) if train_t5xxl else None,
+            accelerator.unwrap_model(mmdit) if train_mmdit else None,
             vae,
         )
         logger.info("model saved.")
@@ -991,13 +1076,13 @@ def setup_parser() -> argparse.ArgumentParser:
     train_util.add_sd_saving_arguments(parser)
     train_util.add_optimizer_arguments(parser)
     config_util.add_config_arguments(parser)
-    custom_train_functions.add_custom_train_arguments(parser)
+    add_custom_train_arguments(parser)
     sd3_train_utils.add_sd3_training_arguments(parser)
 
     parser.add_argument(
         "--train_text_encoder", action="store_true", help="train text encoder (CLIP-L and G) / text encoderも学習する"
     )
-    # parser.add_argument("--train_t5xxl", action="store_true", help="train T5-XXL / T5-XXLも学習する")
+    parser.add_argument("--train_t5xxl", action="store_true", help="train T5-XXL / T5-XXLも学習する")
     parser.add_argument(
         "--use_t5xxl_cache_only", action="store_true", help="cache T5-XXL outputs only / T5-XXLの出力のみキャッシュする"
     )
@@ -1018,19 +1103,24 @@ def setup_parser() -> argparse.ArgumentParser:
         help="apply attention mask (zero embs) to T5-XXL / T5-XXLにアテンションマスク（ゼロ埋め）を適用する",
     )
 
-    # TE training is disabled temporarily
-    # parser.add_argument(
-    #     "--learning_rate_te1",
-    #     type=float,
-    #     default=None,
-    #     help="learning rate for text encoder 1 (ViT-L) / text encoder 1 (ViT-L)の学習率",
-    # )
-    # parser.add_argument(
-    #     "--learning_rate_te2",
-    #     type=float,
-    #     default=None,
-    #     help="learning rate for text encoder 2 (BiG-G) / text encoder 2 (BiG-G)の学習率",
-    # )
+    parser.add_argument(
+        "--learning_rate_te1",
+        type=float,
+        default=None,
+        help="learning rate for text encoder 1 (ViT-L) / text encoder 1 (ViT-L)の学習率",
+    )
+    parser.add_argument(
+        "--learning_rate_te2",
+        type=float,
+        default=None,
+        help="learning rate for text encoder 2 (BiG-G) / text encoder 2 (BiG-G)の学習率",
+    )
+    parser.add_argument(
+        "--learning_rate_te3",
+        type=float,
+        default=None,
+        help="learning rate for text encoder 3 (T5-XXL) / text encoder 3 (T5-XXL)の学習率",
+    )
 
     # parser.add_argument(
     #     "--diffusers_xformers", action="store_true", help="use xformers by diffusers / Diffusersでxformersを使用する"
@@ -1047,22 +1137,22 @@ def setup_parser() -> argparse.ArgumentParser:
     #     help=f"learning rates for each block of U-Net, comma-separated, {UNET_NUM_BLOCKS_FOR_BLOCK_LR} values / "
     #     + f"U-Netの各ブロックの学習率、カンマ区切り、{UNET_NUM_BLOCKS_FOR_BLOCK_LR}個の値",
     # )
+    parser.add_argument(
+        "--blockwise_fused_optimizers",
+        action="store_true",
+        help="enable blockwise optimizers for fused backward pass and optimizer step / fused backward passとoptimizer step のためブロック単位のoptimizerを有効にする",
+    )
     parser.add_argument(
         "--fused_optimizer_groups",
         type=int,
         default=None,
-        help="number of optimizers for fused backward pass and optimizer step / fused backward passとoptimizer stepのためのoptimizer数",
+        help="[DOES NOT WORK] number of optimizer groups for fused backward pass and optimizer step / fused backward passとoptimizer stepのためのoptimizerグループ数",
     )
     parser.add_argument(
         "--skip_latents_validity_check",
         action="store_true",
         help="[Deprecated] use 'skip_cache_check' instead / 代わりに 'skip_cache_check' を使用してください",
     )
-    parser.add_argument(
-        "--skip_cache_check",
-        action="store_true",
-        help="skip cache (latents and text encoder outputs) check / キャッシュ（latentsとtext encoder outputs）のチェックをスキップする",
-    )
     parser.add_argument(
         "--num_last_block_to_freeze",
         type=int,