common lr logging, set default None to ddp_timeout

Author: Kohya S
Date:   2023-11-05 19:09:17 +09:00
Commit: 6231aa91e2 (parent 96d877be90)

4 changed files with 46 additions and 44 deletions


@@ -408,13 +408,8 @@ def train(args):
             current_loss = loss.detach().item()  # 平均なのでbatch sizeは関係ないはず
             if args.logging_dir is not None:
-                logs = {"loss": current_loss, "lr": float(lr_scheduler.get_last_lr()[0])}
-                if (
-                    args.optimizer_type.lower().startswith("DAdapt".lower()) or args.optimizer_type.lower() == "Prodigy".lower()
-                ):  # tracking d*lr value
-                    logs["lr/d*lr"] = (
-                        lr_scheduler.optimizers[0].param_groups[0]["d"] * lr_scheduler.optimizers[0].param_groups[0]["lr"]
-                    )
+                logs = {"loss": current_loss}
+                train_util.append_lr_to_logs(logs, lr_scheduler, args.optimizer_type, including_unet=True)
                 accelerator.log(logs, step=global_step)

             loss_recorder.add(epoch=epoch, step=step, loss=current_loss)
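For orientation (not part of the commit): the refactor replaces the single flat "lr" entry, plus the inline d*lr computation for D-Adaptation/Prodigy, with per-group keys produced by the new train_util.append_lr_to_logs helper. A minimal sketch of the before/after log dicts, with made-up values and an assumed unet + text_encoder1 param-group layout:

    # Hedged sketch of the per-step logging before and after this commit.
    # Values are illustrative; the keys follow the old inline code and the new helper.
    logs_before = {
        "loss": 0.123,
        "lr": 1.0e-4,       # lr_scheduler.get_last_lr()[0] only
        "lr/d*lr": 2.5e-4,  # only for D-Adaptation / Prodigy optimizers
    }

    logs_after = {
        "loss": 0.123,
        "lr/unet": 1.0e-4,             # one entry per named param group
        "lr/text_encoder1": 5.0e-5,
        "lr/d*lr/unet": 2.5e-4,        # still only for D-Adaptation / Prodigy
        "lr/d*lr/text_encoder1": 1.2e-4,
    }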


@@ -2864,7 +2864,10 @@ def add_training_arguments(parser: argparse.ArgumentParser, support_dreambooth:
         "--full_bf16", action="store_true", help="bf16 training including gradients / 勾配も含めてbf16で学習する"
     )  # TODO move to SDXL training, because it is not supported by SD1/2
     parser.add_argument(
-        "--ddp_timeout", type=int, default=30, help="DDP timeout (min) / DDPのタイムアウト(min)",
+        "--ddp_timeout",
+        type=int,
+        default=None,
+        help="DDP timeout (min, None for default of accelerate) / DDPのタイムアウト（分、Noneでaccelerateのデフォルト）",
     )
     parser.add_argument(
         "--clip_skip",
@@ -3806,12 +3809,15 @@ def prepare_accelerator(args: argparse.Namespace):
     if args.wandb_api_key is not None:
         wandb.login(key=args.wandb_api_key)

+    kwargs_handlers = (
+        None if args.ddp_timeout is None else [InitProcessGroupKwargs(timeout=datetime.timedelta(minutes=args.ddp_timeout))]
+    )
     accelerator = Accelerator(
         gradient_accumulation_steps=args.gradient_accumulation_steps,
         mixed_precision=args.mixed_precision,
         log_with=log_with,
         project_dir=logging_dir,
-        kwargs_handlers=[InitProcessGroupKwargs(timeout=datetime.timedelta(minutes=args.ddp_timeout))],
+        kwargs_handlers=kwargs_handlers,
     )
     return accelerator
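The effect of the new default is easier to see outside the script: when --ddp_timeout is left unset, no InitProcessGroupKwargs handler is passed at all, so accelerate keeps its own process-group timeout instead of always constructing a handler with the previous hard-coded default of 30 minutes. A standalone sketch of that logic, assuming accelerate's public API (the make_accelerator name is illustrative, not from the commit):

    import datetime

    from accelerate import Accelerator
    from accelerate.utils import InitProcessGroupKwargs


    def make_accelerator(ddp_timeout_minutes=None):
        # None -> pass no handler, keeping accelerate's default DDP timeout;
        # an integer -> override the process-group timeout, as prepare_accelerator now does.
        kwargs_handlers = (
            None
            if ddp_timeout_minutes is None
            else [InitProcessGroupKwargs(timeout=datetime.timedelta(minutes=ddp_timeout_minutes))]
        )
        return Accelerator(mixed_precision="no", kwargs_handlers=kwargs_handlers)


    accelerator = make_accelerator()      # accelerate's default timeout
    # accelerator = make_accelerator(60)  # explicit 60-minute DDP timeout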
@@ -4401,6 +4407,29 @@ def get_noise_noisy_latents_and_timesteps(args, noise_scheduler, latents):
     return noise, noisy_latents, timesteps


+def append_lr_to_logs(logs, lr_scheduler, optimizer_type, including_unet=True):
+    names = []
+    if including_unet:
+        names.append("unet")
+    names.append("text_encoder1")
+    names.append("text_encoder2")
+
+    append_lr_to_logs_with_names(logs, lr_scheduler, optimizer_type, names)
+
+
+def append_lr_to_logs_with_names(logs, lr_scheduler, optimizer_type, names):
+    lrs = lr_scheduler.get_last_lr()
+
+    for lr_index in range(len(lrs)):
+        name = names[lr_index]
+        logs["lr/" + name] = float(lrs[lr_index])
+
+        if optimizer_type.lower().startswith("DAdapt".lower()) or optimizer_type.lower() == "Prodigy".lower():
+            logs["lr/d*lr/" + name] = (
+                lr_scheduler.optimizers[-1].param_groups[lr_index]["d"] * lr_scheduler.optimizers[-1].param_groups[lr_index]["lr"]
+            )
+
+
 # scheduler:
 SCHEDULER_LINEAR_START = 0.00085
 SCHEDULER_LINEAR_END = 0.0120
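A quick, hedged smoke test for the new helpers (FakeOptimizer and FakeScheduler are illustrative stand-ins, and the import assumes the repository root is on sys.path so library.train_util resolves):

    from library.train_util import append_lr_to_logs  # assumption: repo root on sys.path


    class FakeOptimizer:
        def __init__(self, lrs):
            # "d" is only read for D-Adaptation / Prodigy optimizer types
            self.param_groups = [{"lr": lr, "d": 1.0} for lr in lrs]


    class FakeScheduler:
        def __init__(self, lrs):
            self._lrs = list(lrs)
            self.optimizers = [FakeOptimizer(lrs)]

        def get_last_lr(self):
            return list(self._lrs)


    logs = {"loss": 0.1}
    scheduler = FakeScheduler([1e-4, 5e-5, 5e-5])  # unet, text_encoder1, text_encoder2
    append_lr_to_logs(logs, scheduler, "AdamW", including_unet=True)
    print(logs)  # expect keys: loss, lr/unet, lr/text_encoder1, lr/text_encoder2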
@@ -4718,7 +4747,7 @@ class LossRecorder:
         self.loss_list: List[float] = []
         self.loss_total: float = 0.0

-    def add(self, *, epoch:int, step: int, loss: float) -> None:
+    def add(self, *, epoch: int, step: int, loss: float) -> None:
         if epoch == 0:
             self.loss_list.append(loss)
         else:


@@ -74,33 +74,22 @@ def get_block_params_to_optimize(unet: SdxlUNet2DConditionModel, block_lrs: List
 def append_block_lr_to_logs(block_lrs, logs, lr_scheduler, optimizer_type):
-    lrs = lr_scheduler.get_last_lr()
-
-    lr_index = 0
+    names = []
     block_index = 0
-    while lr_index < len(lrs):
+    while block_index < UNET_NUM_BLOCKS_FOR_BLOCK_LR + 2:
         if block_index < UNET_NUM_BLOCKS_FOR_BLOCK_LR:
-            name = f"block{block_index}"
             if block_lrs[block_index] == 0:
                 block_index += 1
                 continue
+            names.append(f"block{block_index}")
         elif block_index == UNET_NUM_BLOCKS_FOR_BLOCK_LR:
-            name = "text_encoder1"
+            names.append("text_encoder1")
         elif block_index == UNET_NUM_BLOCKS_FOR_BLOCK_LR + 1:
-            name = "text_encoder2"
-        else:
-            raise ValueError(f"unexpected block_index: {block_index}")
+            names.append("text_encoder2")

         block_index += 1

-        logs["lr/" + name] = float(lrs[lr_index])
-
-        if optimizer_type.lower().startswith("DAdapt".lower()) or optimizer_type.lower() == "Prodigy".lower():
-            logs["lr/d*lr/" + name] = (
-                lr_scheduler.optimizers[-1].param_groups[lr_index]["d"] * lr_scheduler.optimizers[-1].param_groups[lr_index]["lr"]
-            )
-
-        lr_index += 1
+    train_util.append_lr_to_logs_with_names(logs, lr_scheduler, optimizer_type, names)


 def train(args):
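The rewritten append_block_lr_to_logs now only assembles the list of names (skipping zero-lr blocks, which get no param group) and delegates the logging itself to train_util.append_lr_to_logs_with_names. A self-contained sketch of the name-building step, assuming UNET_NUM_BLOCKS_FOR_BLOCK_LR is 23 as in the SDXL training scripts (block_lr_names is an illustrative helper, not part of the commit):

    # Illustrative re-statement of the name-building loop; the constant value is an
    # assumption taken from the SDXL training scripts, not from this hunk.
    UNET_NUM_BLOCKS_FOR_BLOCK_LR = 23


    def block_lr_names(block_lrs):
        names = []
        for block_index in range(UNET_NUM_BLOCKS_FOR_BLOCK_LR + 2):
            if block_index < UNET_NUM_BLOCKS_FOR_BLOCK_LR:
                if block_lrs[block_index] == 0:
                    continue  # lr 0 means the block has no param group, so no log entry
                names.append(f"block{block_index}")
            elif block_index == UNET_NUM_BLOCKS_FOR_BLOCK_LR:
                names.append("text_encoder1")
            else:
                names.append("text_encoder2")
        return names


    print(block_lr_names([1e-4, 0, 1e-4] + [0] * 20))
    # ['block0', 'block2', 'text_encoder1', 'text_encoder2']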
@@ -647,15 +636,9 @@ def train(args):
                 if args.logging_dir is not None:
                     logs = {"loss": current_loss}
                     if block_lrs is None:
-                        logs["lr"] = float(lr_scheduler.get_last_lr()[0])
-                        if (
-                            args.optimizer_type.lower().startswith("DAdapt".lower()) or args.optimizer_type.lower() == "Prodigy".lower()
-                        ):  # tracking d*lr value
-                            logs["lr/d*lr"] = (
-                                lr_scheduler.optimizers[0].param_groups[0]["d"] * lr_scheduler.optimizers[0].param_groups[0]["lr"]
-                            )
+                        train_util.append_lr_to_logs(logs, lr_scheduler, args.optimizer_type, including_unet=train_unet)
                     else:
-                        append_block_lr_to_logs(block_lrs, logs, lr_scheduler, args.optimizer_type)
+                        append_block_lr_to_logs(block_lrs, logs, lr_scheduler, args.optimizer_type)  # U-Net is included in block_lrs

                     accelerator.log(logs, step=global_step)


@@ -394,13 +394,8 @@ def train(args):
             current_loss = loss.detach().item()
             if args.logging_dir is not None:
-                logs = {"loss": current_loss, "lr": float(lr_scheduler.get_last_lr()[0])}
-                if (
-                    args.optimizer_type.lower().startswith("DAdapt".lower()) or args.optimizer_type.lower() == "Prodigy".lower()
-                ):  # tracking d*lr value
-                    logs["lr/d*lr"] = (
-                        lr_scheduler.optimizers[0].param_groups[0]["d"] * lr_scheduler.optimizers[0].param_groups[0]["lr"]
-                    )
+                logs = {"loss": current_loss}
+                train_util.append_lr_to_logs(logs, lr_scheduler, args.optimizer_type, including_unet=True)
                 accelerator.log(logs, step=global_step)

             loss_recorder.add(epoch=epoch, step=step, loss=current_loss)