add detail dataset config feature by extra config file (#227)

* add config file schema

* change config file specification

* refactor config utility

* unify batch_size to train_batch_size

* fix indent size

* use batch_size instead of train_batch_size

* make cache_latents configurable on subset

* rename options
* bucket_repo_range
* shuffle_keep_tokens

* update readme

* revert to min_bucket_reso & max_bucket_reso

* use subset structure in dataset

* format import lines

* split mode specific options

* use only valid subset

* change valid subsets name

* manage multiple datasets by dataset group

* update config file sanitizer

* prune redundant validation

* add comments

* update type annotation

* rename json_file_name to metadata_file

* ignore when image dir is invalid

* fix tag shuffle and dropout

* ignore duplicated subset

* add method to check latent cachability

* fix format

* fix bug

* update caption dropout default values

* update annotation

* fix bug

* add option to enable bucket shuffle across dataset

* update blueprint generate function

* use blueprint generator for dataset initialization

* delete duplicated function

* update config readme

* delete debug print

* print dataset and subset info as info

* enable bucket_shuffle_across_dataset option

* update config readme for clarification

* compensate quotes for string option example

* fix bug of bad usage of join

* conserve trained metadata backward compatibility

* enable shuffle in data loader by default

* delete resolved TODO

* add comment for image data handling

* fix reference bug

* fix undefined variable bug

* prevent raise overwriting

* assert image_dir and metadata_file validity

* add debug message for ignoring subset

* fix inconsistent import statement

* loosen too strict validation on float value

* sanitize argument parser separately

* make image_dir optional for fine tuning dataset

* fix import

* fix trailing characters in print

* parse flexible dataset config deterministically

* use relative import

* print supplementary message for parsing error

* add note about different methods

* add note of benefit of separate dataset

* add error example

* add note for english readme plan

---------

Co-authored-by: Kohya S <52813779+kohya-ss@users.noreply.github.com>
This commit is contained in:
fur0ut0
2023-03-01 20:58:08 +09:00
committed by GitHub
parent 82707654ad
commit 8abb8645ae
8 changed files with 1370 additions and 321 deletions

View File

@@ -14,7 +14,14 @@ from accelerate.utils import set_seed
from diffusers import DDPMScheduler
import library.train_util as train_util
from library.train_util import DreamBoothDataset, FineTuningDataset
from library.train_util import (
DreamBoothDataset,
)
import library.config_util as config_util
from library.config_util import (
ConfigSanitizer,
BlueprintGenerator,
)
def collate_fn(examples):
@@ -47,6 +54,7 @@ def train(args):
cache_latents = args.cache_latents
use_dreambooth_method = args.in_json is None
use_user_config = args.config_file is not None
if args.seed is not None:
set_seed(args.seed)
@@ -54,35 +62,45 @@ def train(args):
tokenizer = train_util.load_tokenizer(args)
# データセットを準備する
if use_dreambooth_method:
print("Use DreamBooth method.")
train_dataset = DreamBoothDataset(args.train_batch_size, args.train_data_dir, args.reg_data_dir,
tokenizer, args.max_token_length, args.caption_extension, args.shuffle_caption, args.keep_tokens,
args.resolution, args.enable_bucket, args.min_bucket_reso, args.max_bucket_reso,
args.bucket_reso_steps, args.bucket_no_upscale,
args.prior_loss_weight, args.flip_aug, args.color_aug, args.face_crop_aug_range,
args.random_crop, args.debug_dataset)
blueprint_generator = BlueprintGenerator(ConfigSanitizer(True, True, True))
if use_user_config:
print(f"Load config file from {args.config_file}")
user_config = config_util.load_user_config(args.config_file)
ignored = ["train_data_dir", "reg_data_dir", "in_json"]
if any(getattr(args, attr) is not None for attr in ignored):
print("ignore following options because config file is found: {0} / 設定ファイルが利用されるため以下のオプションは無視されます: {0}".format(', '.join(ignored)))
else:
print("Train with captions.")
train_dataset = FineTuningDataset(args.in_json, args.train_batch_size, args.train_data_dir,
tokenizer, args.max_token_length, args.shuffle_caption, args.keep_tokens,
args.resolution, args.enable_bucket, args.min_bucket_reso, args.max_bucket_reso,
args.bucket_reso_steps, args.bucket_no_upscale,
args.flip_aug, args.color_aug, args.face_crop_aug_range, args.random_crop,
args.dataset_repeats, args.debug_dataset)
if use_dreambooth_method:
print("Use DreamBooth method.")
user_config = {
"datasets": [{
"subsets": config_util.generate_dreambooth_subsets_config_by_subdirs(args.train_data_dir, args.reg_data_dir)
}]
}
else:
print("Train with captions.")
user_config = {
"datasets": [{
"subsets": [{
"image_dir": args.train_data_dir,
"metadata_file": args.in_json,
}]
}]
}
# 学習データのdropout率を設定する
train_dataset.set_caption_dropout(args.caption_dropout_rate, args.caption_dropout_every_n_epochs, args.caption_tag_dropout_rate)
train_dataset.make_buckets()
blueprint = blueprint_generator.generate(user_config, args, tokenizer=tokenizer)
train_dataset_group = config_util.generate_dataset_group_by_blueprint(blueprint.dataset_group)
if args.debug_dataset:
train_util.debug_dataset(train_dataset)
train_util.debug_dataset(train_dataset_group)
return
if len(train_dataset) == 0:
if len(train_dataset_group) == 0:
print("No data found. Please verify arguments (train_data_dir must be the parent of folders with images) / 画像がありません。引数指定を確認してくださいtrain_data_dirには画像があるフォルダではなく、画像があるフォルダの親フォルダを指定する必要があります")
return
if cache_latents:
assert train_dataset_group.is_latent_cachable(), "when caching latents, either color_aug or random_crop cannot be used / latentをキャッシュするときはcolor_augとrandom_cropは使えません"
# acceleratorを準備する
print("prepare accelerator")
accelerator, unwrap_model = train_util.prepare_accelerator(args)
@@ -107,7 +125,7 @@ def train(args):
vae.requires_grad_(False)
vae.eval()
with torch.no_grad():
train_dataset.cache_latents(vae)
train_dataset_group.cache_latents(vae)
vae.to("cpu")
if torch.cuda.is_available():
torch.cuda.empty_cache()
@@ -151,7 +169,7 @@ def train(args):
# DataLoaderのプロセス数0はメインプロセスになる
n_workers = min(args.max_data_loader_n_workers, os.cpu_count() - 1) # cpu_count-1 ただし最大で指定された数まで
train_dataloader = torch.utils.data.DataLoader(
train_dataset, batch_size=1, shuffle=False, collate_fn=collate_fn, num_workers=n_workers, persistent_workers=args.persistent_data_loader_workers)
train_dataset_group, batch_size=1, shuffle=True, collate_fn=collate_fn, num_workers=n_workers, persistent_workers=args.persistent_data_loader_workers)
# 学習ステップ数を計算する
if args.max_train_epochs is not None:
@@ -229,14 +247,15 @@ def train(args):
args.save_every_n_epochs = math.floor(num_train_epochs / args.save_n_epoch_ratio) or 1
# 学習する
# TODO: find a way to handle total batch size when there are multiple datasets
total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
print("running training / 学習開始")
print(f" num train images * repeats / 学習画像の数×繰り返し回数: {train_dataset.num_train_images}")
print(f" num reg images / 正則化画像の数: {train_dataset.num_reg_images}")
print(f" num train images * repeats / 学習画像の数×繰り返し回数: {train_dataset_group.num_train_images}")
print(f" num reg images / 正則化画像の数: {train_dataset_group.num_reg_images}")
print(f" num batches per epoch / 1epochのバッチ数: {len(train_dataloader)}")
print(f" num epochs / epoch数: {num_train_epochs}")
print(f" batch size per device / バッチサイズ: {args.train_batch_size}")
print(f" total train batch size (with parallel & distributed & accumulation) / 総バッチサイズ(並列学習、勾配合計含む): {total_batch_size}")
print(f" batch size per device / バッチサイズ: {', '.join([str(d.batch_size) for d in train_dataset_group.datasets])}")
#print(f" total train batch size (with parallel & distributed & accumulation) / 総バッチサイズ(並列学習、勾配合計含む): {total_batch_size}")
print(f" gradient accumulation steps / 勾配を合計するステップ数 = {args.gradient_accumulation_steps}")
print(f" total optimization steps / 学習ステップ数: {args.max_train_steps}")
@@ -247,12 +266,10 @@ def train(args):
"ss_learning_rate": args.learning_rate,
"ss_text_encoder_lr": args.text_encoder_lr,
"ss_unet_lr": args.unet_lr,
"ss_num_train_images": train_dataset.num_train_images, # includes repeating
"ss_num_reg_images": train_dataset.num_reg_images,
"ss_num_train_images": train_dataset_group.num_train_images,
"ss_num_reg_images": train_dataset_group.num_reg_images,
"ss_num_batches_per_epoch": len(train_dataloader),
"ss_num_epochs": num_train_epochs,
"ss_batch_size_per_device": args.train_batch_size,
"ss_total_batch_size": total_batch_size,
"ss_gradient_checkpointing": args.gradient_checkpointing,
"ss_gradient_accumulation_steps": args.gradient_accumulation_steps,
"ss_max_train_steps": args.max_train_steps,
@@ -264,26 +281,12 @@ def train(args):
"ss_mixed_precision": args.mixed_precision,
"ss_full_fp16": bool(args.full_fp16),
"ss_v2": bool(args.v2),
"ss_resolution": args.resolution,
"ss_clip_skip": args.clip_skip,
"ss_max_token_length": args.max_token_length,
"ss_color_aug": bool(args.color_aug),
"ss_flip_aug": bool(args.flip_aug),
"ss_random_crop": bool(args.random_crop),
"ss_shuffle_caption": bool(args.shuffle_caption),
"ss_cache_latents": bool(args.cache_latents),
"ss_enable_bucket": bool(train_dataset.enable_bucket),
"ss_bucket_no_upscale": bool(train_dataset.bucket_no_upscale),
"ss_min_bucket_reso": train_dataset.min_bucket_reso,
"ss_max_bucket_reso": train_dataset.max_bucket_reso,
"ss_seed": args.seed,
"ss_lowram": args.lowram,
"ss_keep_tokens": args.keep_tokens,
"ss_noise_offset": args.noise_offset,
"ss_dataset_dirs": json.dumps(train_dataset.dataset_dirs_info),
"ss_reg_dataset_dirs": json.dumps(train_dataset.reg_dataset_dirs_info),
"ss_tag_frequency": json.dumps(train_dataset.tag_frequency),
"ss_bucket_info": json.dumps(train_dataset.bucket_info),
"ss_training_comment": args.training_comment, # will not be updated after training
"ss_sd_scripts_commit_hash": train_util.get_git_revision_hash(),
"ss_optimizer": optimizer_name + (f"({optimizer_args})" if len(optimizer_args) > 0 else ""),
@@ -295,6 +298,89 @@ def train(args):
"ss_prior_loss_weight": args.prior_loss_weight,
}
if use_user_config:
# save metadata of multiple datasets
# NOTE: pack "ss_datasets" value as json one time
# or should also pack nested collections as json?
datasets_metadata = []
for dataset in train_dataset_group.datasets:
is_dreambooth_dataset = isinstance(dataset, DreamBoothDataset)
dataset_metadata = {
"is_dreambooth": is_dreambooth_dataset,
"batch_size_per_device": dataset.batch_size,
"num_train_images": dataset.num_train_images, # includes repeating
"num_reg_images": dataset.num_reg_images,
"resolution": (dataset.width, dataset.height),
"enable_bucket": bool(dataset.enable_bucket),
"min_bucket_reso": dataset.min_bucket_reso,
"max_bucket_reso": dataset.max_bucket_reso,
"tag_frequency": dataset.tag_frequency,
"bucket_info": dataset.bucket_info,
}
subsets_metadata = []
for subset in dataset.subsets:
subset_metadata = {
"image_dir": os.path.basename(subset.image_dir),
"img_count": subset.img_count,
"num_repeats": subset.num_repeats,
"color_aug": bool(subset.color_aug),
"flip_aug": bool(subset.flip_aug),
"random_crop": bool(subset.random_crop),
"shuffle_caption": bool(subset.shuffle_caption),
"keep_tokens": subset.keep_tokens,
}
if is_dreambooth_dataset:
subset_metadata["class_tokens"] = subset.class_tokens
subset_metadata["is_reg"] = subset.is_reg
subsets_metadata.append(subset_metadata)
dataset_metadata["subsets"] = subsets_metadata
datasets_metadata.append(dataset_metadata)
metadata["ss_datasets"] = json.dumps(datasets_metadata)
else:
# conserving backward compatiblity when using train_dataset_dir and reg_dataset_dir
assert len(train_dataset_group.datasets) == 1, f"There should be a single dataset but {len(train_dataset_group.datasets)} found. This seems to be a bug. / データセットは1個だけ存在するはずですが、実際には{len(train_dataset_group.datasets)}個でした。プログラムのバグかもしれません。"
dataset = train_dataset_group.datasets[0]
dataset_dirs_info = {}
reg_dataset_dirs_info = {}
if use_dreambooth_method:
for subset in dataset.subsets:
info = reg_dataset_dirs_info if subset.is_reg else dataset_dirs_info
info[os.path.basename(subset.image_dir)] = {
"n_repeats": subset.num_repeats,
"img_count": subset.img_count
}
else:
for subset in dataset.subsets:
dataset_dirs_info[os.path.basename(subset.metadata_file)] = {
"n_repeats": subset.num_repeats,
"img_count": subset.img_count
}
metadata |= {
"ss_batch_size_per_device": args.train_batch_size,
"ss_total_batch_size": total_batch_size,
"ss_resolution": args.resolution,
"ss_color_aug": bool(args.color_aug),
"ss_flip_aug": bool(args.flip_aug),
"ss_random_crop": bool(args.random_crop),
"ss_shuffle_caption": bool(args.shuffle_caption),
"ss_enable_bucket": bool(dataset.enable_bucket),
"ss_bucket_no_upscale": bool(dataset.bucket_no_upscale),
"ss_min_bucket_reso": dataset.min_bucket_reso,
"ss_max_bucket_reso": dataset.max_bucket_reso,
"ss_keep_tokens": args.keep_tokens,
"ss_dataset_dirs": json.dumps(dataset_dirs_info),
"ss_reg_dataset_dirs": json.dumps(reg_dataset_dirs_info),
"ss_tag_frequency": json.dumps(dataset.tag_frequency),
"ss_bucket_info": json.dumps(dataset.bucket_info),
}
# uncomment if another network is added
# for key, value in net_kwargs.items():
# metadata["ss_arg_" + key] = value
@@ -330,7 +416,7 @@ def train(args):
loss_total = 0.0
for epoch in range(num_train_epochs):
print(f"epoch {epoch+1}/{num_train_epochs}")
train_dataset.set_current_epoch(epoch + 1)
train_dataset_group.set_current_epoch(epoch + 1)
metadata["ss_epoch"] = str(epoch+1)
@@ -482,6 +568,7 @@ if __name__ == '__main__':
train_util.add_dataset_arguments(parser, True, True, True)
train_util.add_training_arguments(parser, True)
train_util.add_optimizer_arguments(parser)
config_util.add_config_arguments(parser)
parser.add_argument("--no_metadata", action='store_true', help="do not save metadata in output model / メタデータを出力先モデルに保存しない")
parser.add_argument("--save_model_as", type=str, default="safetensors", choices=[None, "ckpt", "pt", "safetensors"],