diff --git a/README.md b/README.md index 62551f27..03ee5d01 100644 --- a/README.md +++ b/README.md @@ -124,6 +124,11 @@ The majority of scripts is licensed under ASL 2.0 (including codes from Diffuser ## Change History +- 16 Feb. 2023, 2023/2/16: + - Noise offset is recorded to the metadata. Thanks to space-nuko! + - Show the moving average loss to prevent loss jumping in ``train_network.py`` and ``train_db.py``. Thanks to shirayu! + - Noise offsetがメタデータに記録されるようになりました。space-nuko氏に感謝します。 + - ``train_network.py``と``train_db.py``で学習中に表示されるlossの値が移動平均になりました。epochの先頭で表示されるlossが大きく変動する事象を解決します。shirayu氏に感謝します。 - 14 Feb. 2023, 2023/2/14: - Add support with multi-gpu trainining for ``train_network.py``. Thanks to Isotr0py! - Add ``--verbose`` option for ``resize_lora.py``. For details, see [this PR](https://github.com/kohya-ss/sd-scripts/pull/179). Thanks to mgz-dev! diff --git a/train_db.py b/train_db.py index 4a50dc94..e4f1e54c 100644 --- a/train_db.py +++ b/train_db.py @@ -206,6 +206,8 @@ def train(args): if accelerator.is_main_process: accelerator.init_trackers("dreambooth") + loss_list = [] + loss_total = 0.0 for epoch in range(num_train_epochs): print(f"epoch {epoch+1}/{num_train_epochs}") train_dataset.set_current_epoch(epoch + 1) @@ -216,7 +218,6 @@ def train(args): if args.gradient_checkpointing or global_step < args.stop_text_encoder_training: text_encoder.train() - loss_total = 0 for step, batch in enumerate(train_dataloader): # 指定したステップ数でText Encoderの学習を止める if global_step == args.stop_text_encoder_training: @@ -294,8 +295,13 @@ def train(args): logs = {"loss": current_loss, "lr": lr_scheduler.get_last_lr()[0]} accelerator.log(logs, step=global_step) + if epoch == 0: + loss_list.append(current_loss) + else: + loss_total -= loss_list[step] + loss_list[step] = current_loss loss_total += current_loss - avr_loss = loss_total / (step+1) + avr_loss = loss_total / len(loss_list) logs = {"loss": avr_loss} # , "lr": lr_scheduler.get_last_lr()[0]} progress_bar.set_postfix(**logs) @@ -303,7 +309,7 @@ def train(args): break if args.logging_dir is not None: - logs = {"epoch_loss": loss_total / len(train_dataloader)} + logs = {"loss/epoch": loss_total / len(loss_list)} accelerator.log(logs, step=epoch+1) accelerator.wait_for_everyone() diff --git a/train_network.py b/train_network.py index 1b8046d2..5983a7ef 100644 --- a/train_network.py +++ b/train_network.py @@ -353,6 +353,7 @@ def train(args): "ss_max_bucket_reso": train_dataset.max_bucket_reso, "ss_seed": args.seed, "ss_keep_tokens": args.keep_tokens, + "ss_noise_offset": args.noise_offset, "ss_dataset_dirs": json.dumps(train_dataset.dataset_dirs_info), "ss_reg_dataset_dirs": json.dumps(train_dataset.reg_dataset_dirs_info), "ss_tag_frequency": json.dumps(train_dataset.tag_frequency), @@ -392,6 +393,8 @@ def train(args): if accelerator.is_main_process: accelerator.init_trackers("network_train") + loss_list = [] + loss_total = 0.0 for epoch in range(num_train_epochs): print(f"epoch {epoch+1}/{num_train_epochs}") train_dataset.set_current_epoch(epoch + 1) @@ -400,7 +403,6 @@ def train(args): network.on_epoch_start(text_encoder, unet) - loss_total = 0 for step, batch in enumerate(train_dataloader): with accelerator.accumulate(network): with torch.no_grad(): @@ -464,8 +466,13 @@ def train(args): global_step += 1 current_loss = loss.detach().item() + if epoch == 0: + loss_list.append(current_loss) + else: + loss_total -= loss_list[step] + loss_list[step] = current_loss loss_total += current_loss - avr_loss = loss_total / (step+1) + avr_loss = loss_total / len(loss_list) logs = {"loss": avr_loss} # , "lr": lr_scheduler.get_last_lr()[0]} progress_bar.set_postfix(**logs) @@ -477,7 +484,7 @@ def train(args): break if args.logging_dir is not None: - logs = {"loss/epoch": loss_total / len(train_dataloader)} + logs = {"loss/epoch": loss_total / len(loss_list)} accelerator.log(logs, step=epoch+1) accelerator.wait_for_everyone()