save state on train end

This commit is contained in:
gesen2egee
2024-03-10 23:33:38 +08:00
parent 2d7389185c
commit 095b8035e6
9 changed files with 13 additions and 8 deletions

View File

@@ -457,7 +457,7 @@ def train(args):
accelerator.end_training() accelerator.end_training()
if args.save_state and is_main_process: if is_main_process and (args.save_state or args.save_state_on_train_end):
train_util.save_state_on_train_end(args, accelerator) train_util.save_state_on_train_end(args, accelerator)
del accelerator # この後メモリを使うのでこれは消す del accelerator # この後メモリを使うのでこれは消す

View File

@@ -2890,6 +2890,11 @@ def add_training_arguments(parser: argparse.ArgumentParser, support_dreambooth:
action="store_true", action="store_true",
help="save training state additionally (including optimizer states etc.) / optimizerなど学習状態も含めたstateを追加で保存する", help="save training state additionally (including optimizer states etc.) / optimizerなど学習状態も含めたstateを追加で保存する",
) )
parser.add_argument(
"--save_state_on_train_end",
action="store_true",
help="save training state additionally (including optimizer states etc.) on train end / optimizerなど学習状態も含めたstateを学習終了時に追加で保存する",
)
parser.add_argument("--resume", type=str, default=None, help="saved state to resume training / 学習再開するモデルのstate") parser.add_argument("--resume", type=str, default=None, help="saved state to resume training / 学習再開するモデルのstate")
parser.add_argument("--train_batch_size", type=int, default=1, help="batch size for training / 学習時のバッチサイズ") parser.add_argument("--train_batch_size", type=int, default=1, help="batch size for training / 学習時のバッチサイズ")

View File

@@ -712,7 +712,7 @@ def train(args):
accelerator.end_training() accelerator.end_training()
if args.save_state: # and is_main_process: if args.save_state or args.save_state_on_train_end:
train_util.save_state_on_train_end(args, accelerator) train_util.save_state_on_train_end(args, accelerator)
del accelerator # この後メモリを使うのでこれは消す del accelerator # この後メモリを使うのでこれは消す

View File

@@ -549,7 +549,7 @@ def train(args):
accelerator.end_training() accelerator.end_training()
if is_main_process and args.save_state: if is_main_process and (args.save_state or args.save_state_on_train_end):
train_util.save_state_on_train_end(args, accelerator) train_util.save_state_on_train_end(args, accelerator)
if is_main_process: if is_main_process:

View File

@@ -565,7 +565,7 @@ def train(args):
accelerator.end_training() accelerator.end_training()
if is_main_process and args.save_state: if is_main_process and (args.save_state or args.save_state_on_train_end):
train_util.save_state_on_train_end(args, accelerator) train_util.save_state_on_train_end(args, accelerator)
# del accelerator # この後メモリを使うのでこれは消す→printで使うので消さずにおく # del accelerator # この後メモリを使うのでこれは消す→printで使うので消さずにおく

View File

@@ -444,7 +444,7 @@ def train(args):
accelerator.end_training() accelerator.end_training()
if args.save_state and is_main_process: if is_main_process and (args.save_state or args.save_state_on_train_end):
train_util.save_state_on_train_end(args, accelerator) train_util.save_state_on_train_end(args, accelerator)
del accelerator # この後メモリを使うのでこれは消す del accelerator # この後メモリを使うのでこれは消す

View File

@@ -935,7 +935,7 @@ class NetworkTrainer:
accelerator.end_training() accelerator.end_training()
if is_main_process and args.save_state: if is_main_process and (args.save_state or args.save_state_on_train_end):
train_util.save_state_on_train_end(args, accelerator) train_util.save_state_on_train_end(args, accelerator)
if is_main_process: if is_main_process:

View File

@@ -732,7 +732,7 @@ class TextualInversionTrainer:
accelerator.end_training() accelerator.end_training()
if args.save_state and is_main_process: if is_main_process and (args.save_state or args.save_state_on_train_end):
train_util.save_state_on_train_end(args, accelerator) train_util.save_state_on_train_end(args, accelerator)
if is_main_process: if is_main_process:

View File

@@ -586,7 +586,7 @@ def train(args):
accelerator.end_training() accelerator.end_training()
if args.save_state and is_main_process: if is_main_process and (args.save_state or args.save_state_on_train_end):
train_util.save_state_on_train_end(args, accelerator) train_util.save_state_on_train_end(args, accelerator)
updated_embs = text_encoder.get_input_embeddings().weight[token_ids_XTI].data.detach().clone() updated_embs = text_encoder.get_input_embeddings().weight[token_ids_XTI].data.detach().clone()