mirror of
https://github.com/kohya-ss/sd-scripts.git
synced 2026-04-08 22:35:09 +00:00
add gradual latent
This commit is contained in:
318
sdxl_gen_img.py
318
sdxl_gen_img.py
@@ -345,6 +345,8 @@ class PipelineLike:
|
||||
self.control_nets: List[ControlNetLLLite] = []
|
||||
self.control_net_enabled = True # control_netsが空ならTrueでもFalseでもControlNetは動作しない
|
||||
|
||||
self.gradual_latent = None
|
||||
|
||||
# Textual Inversion
|
||||
def add_token_replacement(self, text_encoder_index, target_token_id, rep_token_ids):
|
||||
self.token_replacements_list[text_encoder_index][target_token_id] = rep_token_ids
|
||||
@@ -375,6 +377,14 @@ class PipelineLike:
|
||||
def set_control_nets(self, ctrl_nets):
|
||||
self.control_nets = ctrl_nets
|
||||
|
||||
def set_gradual_latent(self, gradual_latent):
|
||||
if gradual_latent is None:
|
||||
print("gradual_latent is disabled")
|
||||
self.gradual_latent = None
|
||||
else:
|
||||
print(f"gradual_latent is enabled: {gradual_latent}")
|
||||
self.gradual_latent = gradual_latent # (ds_ratio, start_timesteps, every_n_steps, ratio_step)
|
||||
|
||||
@torch.no_grad()
|
||||
def __call__(
|
||||
self,
|
||||
@@ -706,7 +716,108 @@ class PipelineLike:
|
||||
control_net.set_cond_image(None)
|
||||
|
||||
each_control_net_enabled = [self.control_net_enabled] * len(self.control_nets)
|
||||
|
||||
# # first, we downscale the latents to the half of the size
|
||||
# # 最初に1/2に縮小する
|
||||
# height, width = latents.shape[-2:]
|
||||
# # latents = torch.nn.functional.interpolate(latents.float(), scale_factor=0.5, mode="bicubic", align_corners=False).to(
|
||||
# # latents.dtype
|
||||
# # )
|
||||
# latents = latents[:, :, ::2, ::2]
|
||||
# current_scale = 0.5
|
||||
|
||||
# # how much to increase the scale at each step: .125 seems to work well (because it's 1/8?)
|
||||
# # 各ステップに拡大率をどのくらい増やすか:.125がよさそう(たぶん1/8なので)
|
||||
# scale_step = 0.125
|
||||
|
||||
# # timesteps at which to start increasing the scale: 1000 seems to be enough
|
||||
# # 拡大を開始するtimesteps: 1000で十分そうである
|
||||
# start_timesteps = 1000
|
||||
|
||||
# # how many steps to wait before increasing the scale again
|
||||
# # small values leads to blurry images (because the latents are blurry after the upscale, so some denoising might be needed)
|
||||
# # large values leads to flat images
|
||||
|
||||
# # 何ステップごとに拡大するか
|
||||
# # 小さいとボケる(拡大後のlatentsはボケた感じになるので、そこから数stepのdenoiseが必要と思われる)
|
||||
# # 大きすぎると細部が書き込まれずのっぺりした感じになる
|
||||
# every_n_steps = 5
|
||||
|
||||
# scale_step = input("scale step:")
|
||||
# scale_step = float(scale_step)
|
||||
# start_timesteps = input("start timesteps:")
|
||||
# start_timesteps = int(start_timesteps)
|
||||
# every_n_steps = input("every n steps:")
|
||||
# every_n_steps = int(every_n_steps)
|
||||
|
||||
# # for i, t in enumerate(tqdm(timesteps)):
|
||||
# i = 0
|
||||
# last_step = 0
|
||||
# while i < len(timesteps):
|
||||
# t = timesteps[i]
|
||||
# print(f"[{i}] t={t}")
|
||||
|
||||
# print(i, t, current_scale, latents.shape)
|
||||
# if t < start_timesteps and current_scale < 1.0 and i % every_n_steps == 0:
|
||||
# if i == last_step:
|
||||
# pass
|
||||
# else:
|
||||
# print("upscale")
|
||||
# current_scale = min(current_scale + scale_step, 1.0)
|
||||
|
||||
# h = int(height * current_scale) // 8 * 8
|
||||
# w = int(width * current_scale) // 8 * 8
|
||||
|
||||
# latents = torch.nn.functional.interpolate(latents.float(), size=(h, w), mode="bicubic", align_corners=False).to(
|
||||
# latents.dtype
|
||||
# )
|
||||
# last_step = i
|
||||
# i = max(0, i - every_n_steps + 1)
|
||||
|
||||
# diff = timesteps[i] - timesteps[last_step]
|
||||
# # resized_init_noise = torch.nn.functional.interpolate(
|
||||
# # init_noise.float(), size=(h, w), mode="bicubic", align_corners=False
|
||||
# # ).to(latents.dtype)
|
||||
# # latents = self.scheduler.add_noise(latents, resized_init_noise, diff)
|
||||
# latents = self.scheduler.add_noise(latents, torch.randn_like(latents), diff * 4)
|
||||
# # latents += torch.randn_like(latents) / 100 * diff
|
||||
# continue
|
||||
|
||||
enable_gradual_latent = False
|
||||
if self.gradual_latent:
|
||||
if not hasattr(self.scheduler, "set_resized_size"):
|
||||
print("gradual_latent is not supported for this scheduler. Ignoring.")
|
||||
print(self.scheduler.__class__.__name__)
|
||||
else:
|
||||
enable_gradual_latent = True
|
||||
current_ratio, start_timesteps, every_n_steps, ratio_step = self.gradual_latent
|
||||
step_elapsed = 1000
|
||||
|
||||
# first, we downscale the latents to the specified ratio / 最初に指定された比率にlatentsをダウンスケールする
|
||||
height, width = latents.shape[-2:]
|
||||
org_dtype = latents.dtype
|
||||
if org_dtype == torch.bfloat16:
|
||||
latents = latents.float()
|
||||
latents = torch.nn.functional.interpolate(
|
||||
latents, scale_factor=current_ratio, mode="bicubic", align_corners=False
|
||||
).to(org_dtype)
|
||||
|
||||
for i, t in enumerate(tqdm(timesteps)):
|
||||
resized_size = None
|
||||
if enable_gradual_latent:
|
||||
# gradually upscale the latents / latentsを徐々にアップスケールする
|
||||
if t < start_timesteps and current_ratio < 1.0 and step_elapsed >= every_n_steps:
|
||||
print("upscale")
|
||||
current_ratio = min(current_ratio + ratio_step, 1.0)
|
||||
h = int(height * current_ratio) // 8 * 8 # make divisible by 8 because size of latents must be divisible at bottom of UNet
|
||||
w = int(width * current_ratio) // 8 * 8
|
||||
resized_size = (h, w)
|
||||
self.scheduler.set_resized_size(resized_size)
|
||||
step_elapsed = 0
|
||||
else:
|
||||
self.scheduler.set_resized_size(None)
|
||||
step_elapsed += 1
|
||||
|
||||
# expand the latents if we are doing classifier free guidance
|
||||
latent_model_input = latents.repeat((num_latent_input, 1, 1, 1))
|
||||
latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
|
||||
@@ -775,6 +886,8 @@ class PipelineLike:
|
||||
if is_cancelled_callback is not None and is_cancelled_callback():
|
||||
return None
|
||||
|
||||
i += 1
|
||||
|
||||
if return_latents:
|
||||
return latents
|
||||
|
||||
@@ -1306,6 +1419,133 @@ def handle_dynamic_prompt_variants(prompt, repeat_count):
|
||||
return prompts
|
||||
|
||||
|
||||
# endregion
|
||||
|
||||
# region Gradual Latent hires fix
|
||||
|
||||
import diffusers.schedulers.scheduling_euler_ancestral_discrete
|
||||
from diffusers.schedulers.scheduling_euler_ancestral_discrete import EulerAncestralDiscreteSchedulerOutput
|
||||
|
||||
|
||||
class EulerAncestralDiscreteSchedulerGL(EulerAncestralDiscreteScheduler):
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self.resized_size = None
|
||||
|
||||
def set_resized_size(self, size):
|
||||
self.resized_size = size
|
||||
|
||||
def step(
|
||||
self,
|
||||
model_output: torch.FloatTensor,
|
||||
timestep: Union[float, torch.FloatTensor],
|
||||
sample: torch.FloatTensor,
|
||||
generator: Optional[torch.Generator] = None,
|
||||
return_dict: bool = True,
|
||||
) -> Union[EulerAncestralDiscreteSchedulerOutput, Tuple]:
|
||||
"""
|
||||
Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
|
||||
process from the learned model outputs (most often the predicted noise).
|
||||
|
||||
Args:
|
||||
model_output (`torch.FloatTensor`):
|
||||
The direct output from learned diffusion model.
|
||||
timestep (`float`):
|
||||
The current discrete timestep in the diffusion chain.
|
||||
sample (`torch.FloatTensor`):
|
||||
A current instance of a sample created by the diffusion process.
|
||||
generator (`torch.Generator`, *optional*):
|
||||
A random number generator.
|
||||
return_dict (`bool`):
|
||||
Whether or not to return a
|
||||
[`~schedulers.scheduling_euler_ancestral_discrete.EulerAncestralDiscreteSchedulerOutput`] or tuple.
|
||||
|
||||
Returns:
|
||||
[`~schedulers.scheduling_euler_ancestral_discrete.EulerAncestralDiscreteSchedulerOutput`] or `tuple`:
|
||||
If return_dict is `True`,
|
||||
[`~schedulers.scheduling_euler_ancestral_discrete.EulerAncestralDiscreteSchedulerOutput`] is returned,
|
||||
otherwise a tuple is returned where the first element is the sample tensor.
|
||||
|
||||
"""
|
||||
|
||||
if isinstance(timestep, int) or isinstance(timestep, torch.IntTensor) or isinstance(timestep, torch.LongTensor):
|
||||
raise ValueError(
|
||||
(
|
||||
"Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to"
|
||||
" `EulerDiscreteScheduler.step()` is not supported. Make sure to pass"
|
||||
" one of the `scheduler.timesteps` as a timestep."
|
||||
),
|
||||
)
|
||||
|
||||
if not self.is_scale_input_called:
|
||||
logger.warning(
|
||||
"The `scale_model_input` function should be called before `step` to ensure correct denoising. "
|
||||
"See `StableDiffusionPipeline` for a usage example."
|
||||
)
|
||||
|
||||
if self.step_index is None:
|
||||
self._init_step_index(timestep)
|
||||
|
||||
sigma = self.sigmas[self.step_index]
|
||||
|
||||
# 1. compute predicted original sample (x_0) from sigma-scaled predicted noise
|
||||
if self.config.prediction_type == "epsilon":
|
||||
pred_original_sample = sample - sigma * model_output
|
||||
elif self.config.prediction_type == "v_prediction":
|
||||
# * c_out + input * c_skip
|
||||
pred_original_sample = model_output * (-sigma / (sigma**2 + 1) ** 0.5) + (sample / (sigma**2 + 1))
|
||||
elif self.config.prediction_type == "sample":
|
||||
raise NotImplementedError("prediction_type not implemented yet: sample")
|
||||
else:
|
||||
raise ValueError(f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, or `v_prediction`")
|
||||
|
||||
sigma_from = self.sigmas[self.step_index]
|
||||
sigma_to = self.sigmas[self.step_index + 1]
|
||||
sigma_up = (sigma_to**2 * (sigma_from**2 - sigma_to**2) / sigma_from**2) ** 0.5
|
||||
sigma_down = (sigma_to**2 - sigma_up**2) ** 0.5
|
||||
|
||||
# 2. Convert to an ODE derivative
|
||||
derivative = (sample - pred_original_sample) / sigma
|
||||
|
||||
dt = sigma_down - sigma
|
||||
|
||||
prev_sample = sample + derivative * dt
|
||||
|
||||
device = model_output.device
|
||||
if self.resized_size is None:
|
||||
noise = diffusers.schedulers.scheduling_euler_ancestral_discrete.randn_tensor(
|
||||
model_output.shape, dtype=model_output.dtype, device=device, generator=generator
|
||||
)
|
||||
else:
|
||||
print(
|
||||
"resized_size", self.resized_size, "model_output.shape", model_output.shape, "prev_sample.shape", prev_sample.shape
|
||||
)
|
||||
org_dtype = prev_sample.dtype
|
||||
if org_dtype == torch.bfloat16:
|
||||
prev_sample = prev_sample.float()
|
||||
|
||||
prev_sample = torch.nn.functional.interpolate(
|
||||
prev_sample.float(), size=self.resized_size, mode="bicubic", align_corners=False
|
||||
).to(dtype=org_dtype)
|
||||
|
||||
noise = diffusers.schedulers.scheduling_euler_ancestral_discrete.randn_tensor(
|
||||
(model_output.shape[0], model_output.shape[1], self.resized_size[0], self.resized_size[1]),
|
||||
dtype=model_output.dtype,
|
||||
device=device,
|
||||
generator=generator,
|
||||
)
|
||||
|
||||
prev_sample = prev_sample + noise * sigma_up
|
||||
|
||||
# upon completion increase step index by one
|
||||
self._step_index += 1
|
||||
|
||||
if not return_dict:
|
||||
return (prev_sample,)
|
||||
|
||||
return EulerAncestralDiscreteSchedulerOutput(prev_sample=prev_sample, pred_original_sample=pred_original_sample)
|
||||
|
||||
|
||||
# endregion
|
||||
|
||||
|
||||
@@ -1407,7 +1647,7 @@ def main(args):
|
||||
scheduler_module = diffusers.schedulers.scheduling_euler_discrete
|
||||
has_clip_sample = False
|
||||
elif args.sampler == "euler_a" or args.sampler == "k_euler_a":
|
||||
scheduler_cls = EulerAncestralDiscreteScheduler
|
||||
scheduler_cls = EulerAncestralDiscreteSchedulerGL
|
||||
scheduler_module = diffusers.schedulers.scheduling_euler_ancestral_discrete
|
||||
has_clip_sample = False
|
||||
elif args.sampler == "dpmsolver" or args.sampler == "dpmsolver++":
|
||||
@@ -1700,6 +1940,16 @@ def main(args):
|
||||
if args.ds_depth_1 is not None:
|
||||
unet.set_deep_shrink(args.ds_depth_1, args.ds_timesteps_1, args.ds_depth_2, args.ds_timesteps_2, args.ds_ratio)
|
||||
|
||||
# Gradual Latent
|
||||
if args.gradual_latent_ratio is not None:
|
||||
gradual_latent = (
|
||||
args.gradual_latent_ratio,
|
||||
args.gradual_latent_timesteps,
|
||||
args.gradual_latent_every_n_steps,
|
||||
args.gradual_latent_ratio_step,
|
||||
)
|
||||
pipe.set_gradual_latent(gradual_latent)
|
||||
|
||||
# Textual Inversionを処理する
|
||||
if args.textual_inversion_embeddings:
|
||||
token_ids_embeds1 = []
|
||||
@@ -2297,6 +2547,12 @@ def main(args):
|
||||
ds_timesteps_2 = args.ds_timesteps_2
|
||||
ds_ratio = args.ds_ratio
|
||||
|
||||
# Gradual Latent
|
||||
gl_timesteps = None # means no override
|
||||
gl_ratio = args.gradual_latent_ratio
|
||||
gl_every_n_steps = args.gradual_latent_every_n_steps
|
||||
gl_ratio_step = args.gradual_latent_ratio_step
|
||||
|
||||
prompt_args = raw_prompt.strip().split(" --")
|
||||
prompt = prompt_args[0]
|
||||
print(f"prompt {prompt_index+1}/{len(prompt_list)}: {prompt}")
|
||||
@@ -2439,6 +2695,34 @@ def main(args):
|
||||
print(f"deep shrink ratio: {ds_ratio}")
|
||||
continue
|
||||
|
||||
# Gradual Latent
|
||||
m = re.match(r"glt ([\d\.]+)", parg, re.IGNORECASE)
|
||||
if m: # gradual latent timesteps
|
||||
gl_timesteps = int(m.group(1))
|
||||
print(f"gradual latent timesteps: {gl_timesteps}")
|
||||
continue
|
||||
|
||||
m = re.match(r"glr ([\d\.]+)", parg, re.IGNORECASE)
|
||||
if m: # gradual latent ratio
|
||||
gl_ratio = float(m.group(1))
|
||||
gl_timesteps = gl_timesteps if gl_timesteps is not None else -1 # -1 means override
|
||||
print(f"gradual latent ratio: {ds_ratio}")
|
||||
continue
|
||||
|
||||
m = re.match(r"gle ([\d\.]+)", parg, re.IGNORECASE)
|
||||
if m: # gradual latent every n steps
|
||||
gl_every_n_steps = int(m.group(1))
|
||||
gl_timesteps = gl_timesteps if gl_timesteps is not None else -1 # -1 means override
|
||||
print(f"gradual latent every n steps: {gl_every_n_steps}")
|
||||
continue
|
||||
|
||||
m = re.match(r"gls ([\d\.]+)", parg, re.IGNORECASE)
|
||||
if m: # gradual latent ratio step
|
||||
gl_ratio_step = float(m.group(1))
|
||||
gl_timesteps = gl_timesteps if gl_timesteps is not None else -1 # -1 means override
|
||||
print(f"gradual latent ratio step: {gl_ratio_step}")
|
||||
continue
|
||||
|
||||
except ValueError as ex:
|
||||
print(f"Exception in parsing / 解析エラー: {parg}")
|
||||
print(ex)
|
||||
@@ -2449,6 +2733,12 @@ def main(args):
|
||||
ds_depth_1 = args.ds_depth_1 or 3
|
||||
unet.set_deep_shrink(ds_depth_1, ds_timesteps_1, ds_depth_2, ds_timesteps_2, ds_ratio)
|
||||
|
||||
# override Gradual Latent
|
||||
if gl_ratio is not None:
|
||||
if gl_timesteps is None:
|
||||
gl_timesteps = args.gradual_latent_timesteps or 650
|
||||
pipe.set_gradual_latent((gl_ratio, gl_timesteps, gl_every_n_steps, gl_ratio_step))
|
||||
|
||||
# prepare seed
|
||||
if seeds is not None: # given in prompt
|
||||
# 数が足りないなら前のをそのまま使う
|
||||
@@ -2811,6 +3101,32 @@ def setup_parser() -> argparse.ArgumentParser:
|
||||
"--ds_ratio", type=float, default=0.5, help="Deep Shrink ratio for downsampling / Deep Shrinkのdownsampling比率"
|
||||
)
|
||||
|
||||
# gradual latent
|
||||
parser.add_argument(
|
||||
"--gradual_latent_timesteps",
|
||||
type=int,
|
||||
default=None,
|
||||
help="enable Gradual Latent hires fix and apply upscaling from this timesteps / Gradual Latent hires fixをこのtimestepsで有効にし、このtimestepsからアップスケーリングを適用する",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--gradual_latent_ratio",
|
||||
type=float,
|
||||
default=0.5,
|
||||
help=" this size ratio, 0.5 means 1/2 / Gradual Latent hires fixをこのサイズ比率で有効にする、0.5は1/2を意味する",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--gradual_latent_ratio_step",
|
||||
type=float,
|
||||
default=0.125,
|
||||
help="step to increase ratio for Gradual Latent / Gradual Latentのratioをどのくらいずつ上げるか",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--gradual_latent_every_n_steps",
|
||||
type=int,
|
||||
default=3,
|
||||
help="steps to increase size of latents every this steps for Gradual Latent / Gradual Latentでlatentsのサイズをこのステップごとに上げる",
|
||||
)
|
||||
|
||||
# # parser.add_argument(
|
||||
# "--control_net_image_path", type=str, default=None, nargs="*", help="image for ControlNet guidance / ControlNetでガイドに使う画像"
|
||||
# )
|
||||
|
||||
Reference in New Issue
Block a user