From 57d8483eaf5be6081c1f167f572291f449c6ba0c Mon Sep 17 00:00:00 2001
From: Kohya S
Date: Fri, 3 Feb 2023 08:45:33 +0900
Subject: [PATCH] add GIT captioning, refactoring, DataLoader

---
 .gitignore                             |   3 +-
 finetune/make_captions.py              | 101 +++++++++++++-----
 finetune/make_captions_by_git.py       | 136 +++++++++++++++++++++++++
 finetune/merge_captions_to_metadata.py |  36 ++++---
 finetune/merge_dd_tags_to_metadata.py  |  41 ++++----
 finetune/prepare_buckets_latents.py    | 125 +++++++++++++++++------
 finetune/tag_images_by_wd14_tagger.py  | 128 ++++++++++++++++-------
 library/train_util.py                  |  51 ++++++++--
 requirements.txt                       |   2 +-
 9 files changed, 479 insertions(+), 144 deletions(-)
 create mode 100644 finetune/make_captions_by_git.py

diff --git a/.gitignore b/.gitignore
index 7c088d5c..0904a2a4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,4 +3,5 @@ __pycache__
 wd14_tagger_model
 venv
 *.egg-info
-build
\ No newline at end of file
+build
+.vscode
\ No newline at end of file
diff --git a/finetune/make_captions.py b/finetune/make_captions.py
index 495450aa..a2a35b39 100644
--- a/finetune/make_captions.py
+++ b/finetune/make_captions.py
@@ -11,18 +11,59 @@ import torch
 from torchvision import transforms
 from torchvision.transforms.functional import InterpolationMode
 from blip.blip import blip_decoder
-# from Salesforce_BLIP.models.blip import blip_decoder
+import library.train_util as train_util

 DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

+IMAGE_SIZE = 384
+
+# Is a square resize really right? It seems questionable, but the BLIP source does it this way
+IMAGE_TRANSFORM = transforms.Compose([
+    transforms.Resize((IMAGE_SIZE, IMAGE_SIZE), interpolation=InterpolationMode.BICUBIC),
+    transforms.ToTensor(),
+    transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))
+])
+
+
+# Would like to share this across the scripts, but the processing differs slightly...
+class ImageLoadingTransformDataset(torch.utils.data.Dataset):
+  def __init__(self, image_paths):
+    self.images = image_paths
+
+  def __len__(self):
+    return len(self.images)
+
+  def __getitem__(self, idx):
+    img_path = self.images[idx]
+
+    try:
+      image = Image.open(img_path).convert("RGB")
+      # convert to tensor temporarily so dataloader will accept it
+      tensor = IMAGE_TRANSFORM(image)
+    except Exception as e:
+      print(f"Could not load image path / 画像を読み込めません: {img_path}, error: {e}")
+      return None
+
+    return (tensor, img_path)
+
+
+def collate_fn_remove_corrupted(batch):
+  """Collate function that allows removing corrupted examples from the
+  dataloader. It expects the dataloader to return 'None' when that occurs;
+  the 'None's in the batch are removed.
+  """
+  # Filter out all the Nones (corrupted examples)
+  batch = list(filter(lambda x: x is not None, batch))
+  return batch
+
+
 def main(args):
   # fix the seed for reproducibility
-  seed = args.seed # + utils.get_rank()
+  seed = args.seed  # + utils.get_rank()
   torch.manual_seed(seed)
   np.random.seed(seed)
   random.seed(seed)
-
+
   if not os.path.exists("blip"):
     args.train_data_dir = os.path.abspath(args.train_data_dir)  # convert to absolute path
@@ -31,24 +72,15 @@ def main(args):
     os.chdir('finetune')

   print(f"load images from {args.train_data_dir}")
-  image_paths = glob.glob(os.path.join(args.train_data_dir, "*.jpg")) + glob.glob(os.path.join(args.train_data_dir, "*.jpeg")) + \
-      glob.glob(os.path.join(args.train_data_dir, "*.png")) + glob.glob(os.path.join(args.train_data_dir, "*.webp"))
+  image_paths = train_util.glob_images(args.train_data_dir)
   print(f"found {len(image_paths)} images.")

   print(f"loading BLIP caption: {args.caption_weights}")
-  image_size = 384
-  model = blip_decoder(pretrained=args.caption_weights, image_size=image_size, vit='large', med_config="./blip/med_config.json")
+  model = blip_decoder(pretrained=args.caption_weights, image_size=IMAGE_SIZE, vit='large', med_config="./blip/med_config.json")
   model.eval()
   model = model.to(DEVICE)
   print("BLIP loaded")

-  # 正方形でいいのか? という気がするがソースがそうなので
-  transform = transforms.Compose([
-      transforms.Resize((image_size, image_size), interpolation=InterpolationMode.BICUBIC),
-      transforms.ToTensor(),
-      transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))
-  ])
-
   # captioningする
   def run_batch(path_imgs):
     imgs = torch.stack([im for _, im in path_imgs]).to(DEVICE)
@@ -66,18 +98,35 @@ def main(args):
       if args.debug:
         print(image_path, caption)

-  b_imgs = []
-  for image_path in tqdm(image_paths, smoothing=0.0):
-    raw_image = Image.open(image_path)
-    if raw_image.mode != "RGB":
-      print(f"convert image mode {raw_image.mode} to RGB: {image_path}")
-      raw_image = raw_image.convert("RGB")
+  # option to use a DataLoader to speed up image loading
+  if args.max_data_loader_n_workers is not None:
+    dataset = ImageLoadingTransformDataset(image_paths)
+    data = torch.utils.data.DataLoader(dataset, batch_size=args.batch_size, shuffle=False,
+                                       num_workers=args.max_data_loader_n_workers, collate_fn=collate_fn_remove_corrupted, drop_last=False)
+  else:
+    data = [[(None, ip)] for ip in image_paths]

-    image = transform(raw_image)
-    b_imgs.append((image_path, image))
-    if len(b_imgs) >= args.batch_size:
-      run_batch(b_imgs)
-      b_imgs.clear()
+  b_imgs = []
+  for data_entry in tqdm(data, smoothing=0.0):
+    for data in data_entry:
+      if data is None:
+        continue
+
+      img_tensor, image_path = data
+      if img_tensor is None:
+        try:
+          raw_image = Image.open(image_path)
+          if raw_image.mode != 'RGB':
+            raw_image = raw_image.convert("RGB")
+          img_tensor = IMAGE_TRANSFORM(raw_image)
+        except Exception as e:
+          print(f"Could not load image path / 画像を読み込めません: {image_path}, error: {e}")
+          continue
+
+      b_imgs.append((image_path, img_tensor))
+      if len(b_imgs) >= args.batch_size:
+        run_batch(b_imgs)
+        b_imgs.clear()

   if len(b_imgs) > 0:
     run_batch(b_imgs)
@@ -95,6 +144,8 @@ if __name__ == '__main__':
   parser.add_argument("--beam_search", action="store_true",
                       help="use beam search (default Nucleus sampling) / beam searchを使う(このオプション未指定時はNucleus sampling)")
   parser.add_argument("--batch_size", type=int, default=1, help="batch size in inference / 推論時のバッチサイズ")
+  parser.add_argument("--max_data_loader_n_workers", type=int, default=None,
+                      help="enable image reading by DataLoader with this number of workers (faster) / DataLoaderによる画像読み込みを有効にしてこのワーカー数を適用する(読み込みを高速化)")
   parser.add_argument("--num_beams", type=int, default=1, help="num of beams in beam search /beam search時のビーム数(多いと精度が上がるが時間がかかる)")
   parser.add_argument("--top_p", type=float, default=0.9, help="top_p in Nucleus sampling / Nucleus sampling時のtop_p")
   parser.add_argument("--max_length", type=int, default=75, help="max length of caption / captionの最大長")
diff --git a/finetune/make_captions_by_git.py b/finetune/make_captions_by_git.py
new file mode 100644
index 00000000..daaef9e7
--- /dev/null
+++ b/finetune/make_captions_by_git.py
@@ -0,0 +1,136 @@
+import argparse
+import os
+import re
+
+from PIL import Image
+from tqdm import tqdm
+import torch
+from transformers import AutoProcessor, AutoModelForCausalLM
+from transformers.generation.utils import GenerationMixin
+
+import library.train_util as train_util
+
+
+DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+PATTERN_REPLACE = [re.compile(r'with the (words?|letters?) (" ?[^"]*"|\w+)( on (the)? ?\w+)?'),
+                   re.compile(r'that says (" ?[^"]*"|\w+)')]
+
+
+# remove phrases like "with the word xxxx", which are false positives most of the time
+def remove_words(captions, debug):
+  removed_caps = []
+  for caption in captions:
+    cap = caption
+    for pat in PATTERN_REPLACE:
+      cap = pat.sub("", cap)
+    if debug and cap != caption:
+      print(caption)
+      print(cap)
+    removed_caps.append(cap)
+  return removed_caps
+
+
+def collate_fn_remove_corrupted(batch):
+  """Collate function that allows removing corrupted examples from the
+  dataloader. It expects the dataloader to return 'None' when that occurs;
+  the 'None's in the batch are removed.
+  """
+  # Filter out all the Nones (corrupted examples)
+  batch = list(filter(lambda x: x is not None, batch))
+  return batch
+
+
+def main(args):
+  # patch GIT so it also works with batch sizes larger than 1: for transformers 4.26.0
+  org_prepare_input_ids_for_generation = GenerationMixin._prepare_input_ids_for_generation
+  curr_batch_size = [args.batch_size]  # the last batch can be smaller than batch_size, so keep this replaceable
+
+  # input_ids must have as many rows as the batch size: the batch size cannot be referenced from inside this function, so pass it in from outside
+  # patching anywhere higher up the call stack would be much more work
+  def _prepare_input_ids_for_generation_patch(self, bos_token_id, encoder_outputs):
+    input_ids = org_prepare_input_ids_for_generation(self, bos_token_id, encoder_outputs)
+    if input_ids.size()[0] != curr_batch_size[0]:
+      input_ids = input_ids.repeat(curr_batch_size[0], 1)
+    return input_ids
+  GenerationMixin._prepare_input_ids_for_generation = _prepare_input_ids_for_generation_patch
+
+  print(f"load images from {args.train_data_dir}")
+  image_paths = train_util.glob_images(args.train_data_dir)
+  print(f"found {len(image_paths)} images.")
+
+  # ideally this would be downloaded explicitly instead of relying on the cache
+  print(f"loading GIT: {args.model_id}")
+  git_processor = AutoProcessor.from_pretrained(args.model_id)
+  git_model = AutoModelForCausalLM.from_pretrained(args.model_id).to(DEVICE)
+  print("GIT loaded")
+
+  # run captioning
+  def run_batch(path_imgs):
+    imgs = [im for _, im in path_imgs]
+
+    curr_batch_size[0] = len(path_imgs)
+    inputs = git_processor(images=imgs, return_tensors="pt").to(DEVICE)  # images are in PIL format
+    generated_ids = git_model.generate(pixel_values=inputs.pixel_values, max_length=args.max_length)
+    captions = git_processor.batch_decode(generated_ids, skip_special_tokens=True)
+
+    if args.remove_words:
+      captions = remove_words(captions, args.debug)
+
+    for (image_path, _), caption in zip(path_imgs, captions):
+      with open(os.path.splitext(image_path)[0] + args.caption_extension, "wt", encoding='utf-8') as f:
+        f.write(caption + "\n")
+        if args.debug:
+          print(image_path, caption)
+
+  # option to use a DataLoader to speed up image loading
+  if args.max_data_loader_n_workers is not None:
+    dataset = train_util.ImageLoadingDataset(image_paths)
+    data = torch.utils.data.DataLoader(dataset, batch_size=args.batch_size, shuffle=False,
+                                       num_workers=args.max_data_loader_n_workers, collate_fn=collate_fn_remove_corrupted, drop_last=False)
+  else:
+    data = [[(None, ip)] for ip in image_paths]
+
+  b_imgs = []
+  for data_entry in tqdm(data, smoothing=0.0):
+    for data in data_entry:
+      if data is None:
+        continue
+
+      image, image_path = data
+      if image is None:
+        try:
+          image = Image.open(image_path)
+          if image.mode != 'RGB':
+            image = image.convert("RGB")
+        except Exception as e:
+          print(f"Could not load image path / 画像を読み込めません: {image_path}, error: {e}")
+          continue
+
+      b_imgs.append((image_path, image))
+      if len(b_imgs) >= args.batch_size:
+        run_batch(b_imgs)
+        b_imgs.clear()
+
+  if len(b_imgs) > 0:
+    run_batch(b_imgs)
+
+  print("done!")
+
+
+if __name__ == '__main__':
+  parser = argparse.ArgumentParser()
+  parser.add_argument("train_data_dir", type=str, help="directory for train images / 学習画像データのディレクトリ")
+  parser.add_argument("--caption_extension", type=str, default=".caption", help="extension of caption file / 出力されるキャプションファイルの拡張子")
+  parser.add_argument("--model_id", type=str, default="microsoft/git-large-textcaps",
+                      help="model id for GIT in Hugging Face / 使用するGITのHugging FaceのモデルID")
+  parser.add_argument("--batch_size", type=int, default=1, help="batch size in inference / 推論時のバッチサイズ")
+  parser.add_argument("--max_data_loader_n_workers", type=int, default=None,
+                      help="enable image reading by DataLoader with this number of workers (faster) / DataLoaderによる画像読み込みを有効にしてこのワーカー数を適用する(読み込みを高速化)")
+  parser.add_argument("--max_length", type=int, default=50, help="max length of caption / captionの最大長")
+  parser.add_argument("--remove_words", action="store_true",
+                      help="remove like `with the words xxx` from caption / `with the words xxx`のような部分をキャプションから削除する")
+  parser.add_argument("--debug", action="store_true", help="debug mode")
+
+  args = parser.parse_args()
+  main(args)
diff --git a/finetune/merge_captions_to_metadata.py b/finetune/merge_captions_to_metadata.py
index 703f4f9d..cbc5033f 100644
--- a/finetune/merge_captions_to_metadata.py
+++ b/finetune/merge_captions_to_metadata.py
@@ -1,26 +1,24 @@
-# このスクリプトのライセンスは、Apache License 2.0とします
-# (c) 2022 Kohya S. @kohya_ss
-
 import argparse
-import glob
-import os
 import json
-
+from pathlib import Path
+from typing import List
 from tqdm import tqdm
+import library.train_util as train_util


 def main(args):
-  image_paths = glob.glob(os.path.join(args.train_data_dir, "*.jpg")) + glob.glob(os.path.join(args.train_data_dir, "*.jpeg")) + \
-      glob.glob(os.path.join(args.train_data_dir, "*.png")) + glob.glob(os.path.join(args.train_data_dir, "*.webp"))
+  assert not args.recursive or (args.recursive and args.full_path), "recursive requires full_path / recursiveはfull_pathと同時に指定してください"
+
+  train_data_dir_path = Path(args.train_data_dir)
+  image_paths: List[Path] = train_util.glob_images_pathlib(train_data_dir_path, args.recursive)
   print(f"found {len(image_paths)} images.")

-  if args.in_json is None and os.path.isfile(args.out_json):
+  if args.in_json is None and Path(args.out_json).is_file():
     args.in_json = args.out_json

   if args.in_json is not None:
     print(f"loading existing metadata: {args.in_json}")
-    with open(args.in_json, "rt", encoding='utf-8') as f:
-      metadata = json.load(f)
+    metadata = json.loads(Path(args.in_json).read_text(encoding='utf-8'))
     print("captions for existing images will be overwritten / 既存の画像のキャプションは上書きされます")
   else:
     print("new metadata will be created / 新しいメタデータファイルが作成されます")
@@ -28,12 +26,10 @@ def main(args):

   print("merge caption texts to metadata json.")
   for image_path in tqdm(image_paths):
-    caption_path = os.path.splitext(image_path)[0] + args.caption_extension
-    with open(caption_path, "rt", encoding='utf-8') as f:
-      lines = f.readlines()
-      caption = lines[0].strip() if len(lines) > 0 else ""
+    caption_path = image_path.with_suffix(args.caption_extension)
+    caption = caption_path.read_text(encoding='utf-8').strip()

-    image_key = image_path if args.full_path else os.path.splitext(os.path.basename(image_path))[0]
+    image_key = str(image_path) if args.full_path else image_path.stem
     if image_key not in metadata:
       metadata[image_key] = {}

@@ -43,8 +39,7 @@ def main(args):

   # metadataを書き出して終わり
   print(f"writing metadata: {args.out_json}")
-  with open(args.out_json, "wt", encoding='utf-8') as f:
-    json.dump(metadata, f, indent=2)
+  Path(args.out_json).write_text(json.dumps(metadata, indent=2), encoding='utf-8')
   print("done!")


@@ -52,12 +47,15 @@ if __name__ == '__main__':
   parser = argparse.ArgumentParser()
   parser.add_argument("train_data_dir", type=str, help="directory for train images / 学習画像データのディレクトリ")
   parser.add_argument("out_json", type=str, help="metadata file to output / メタデータファイル書き出し先")
-  parser.add_argument("--in_json", type=str, help="metadata file to input (if omitted and out_json exists, existing out_json is read) / 読み込むメタデータファイル(省略時、out_jsonが存在すればそれを読み込む)")
+  parser.add_argument("--in_json", type=str,
+                      help="metadata file to input (if omitted and out_json exists, existing out_json is read) / 読み込むメタデータファイル(省略時、out_jsonが存在すればそれを読み込む)")
   parser.add_argument("--caption_extention", type=str, default=None,
                       help="extension of caption file (for backward compatibility) / 読み込むキャプションファイルの拡張子(スペルミスしていたのを残してあります)")
   parser.add_argument("--caption_extension", type=str, default=".caption", help="extension of caption file / 読み込むキャプションファイルの拡張子")
   parser.add_argument("--full_path", action="store_true",
                       help="use full path as image-key in metadata (supports multiple directories) / メタデータで画像キーをフルパスにする(複数の学習画像ディレクトリに対応)")
+  parser.add_argument("--recursive", action="store_true",
+                      help="recursively look for training tags in all child folders of train_data_dir / train_data_dirのすべての子フォルダにある学習タグを再帰的に探す")
parser.add_argument("--debug", action="store_true", help="debug mode") args = parser.parse_args() diff --git a/finetune/merge_dd_tags_to_metadata.py b/finetune/merge_dd_tags_to_metadata.py index 16267cd8..4285feb0 100644 --- a/finetune/merge_dd_tags_to_metadata.py +++ b/finetune/merge_dd_tags_to_metadata.py @@ -1,27 +1,16 @@ -# このスクリプトのライセンスは、Apache License 2.0とします -# (c) 2022 Kohya S. @kohya_ss - import argparse import json from pathlib import Path - +from typing import List from tqdm import tqdm +import library.train_util as train_util def main(args): - image_paths = None - train_data_dir_path = Path(args.train_data_dir) - if args.recursive: - image_paths = list(train_data_dir_path.rglob('*.jpg')) + \ - list(train_data_dir_path.rglob('*.jpeg')) + \ - list(train_data_dir_path.rglob('*.png')) + \ - list(train_data_dir_path.rglob('*.webp')) - else: - image_paths = list(train_data_dir_path.glob('*.jpg')) + \ - list(train_data_dir_path.glob('*.jpeg')) + \ - list(train_data_dir_path.glob('*.png')) + \ - list(train_data_dir_path.glob('*.webp')) + assert not args.recursive or (args.recursive and args.full_path), "recursive requires full_path / recursiveはfull_pathと同時に指定してください" + train_data_dir_path = Path(args.train_data_dir) + image_paths: List[Path] = train_util.glob_images_pathlib(train_data_dir_path, args.recursive) print(f"found {len(image_paths)} images.") if args.in_json is None and Path(args.out_json).is_file(): @@ -37,21 +26,21 @@ def main(args): print("merge tags to metadata json.") for image_path in tqdm(image_paths): - tags_path = image_path.with_suffix('.txt') + tags_path = image_path.with_suffix(args.caption_extension) tags = tags_path.read_text(encoding='utf-8').strip() - image_key = image_path if args.full_path else image_path.stem - if str(image_key) not in metadata: - metadata[str(image_key)] = {} + image_key = str(image_path) if args.full_path else image_path.stem + if image_key not in metadata: + metadata[image_key] = {} - metadata[str(image_key)]['tags'] = tags + metadata[image_key]['tags'] = tags if args.debug: print(image_key, tags) # metadataを書き出して終わり print(f"writing metadata: {args.out_json}") Path(args.out_json).write_text(json.dumps(metadata, indent=2), encoding='utf-8') - + print("done!") @@ -59,10 +48,14 @@ if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument("train_data_dir", type=str, help="directory for train images / 学習画像データのディレクトリ") parser.add_argument("out_json", type=str, help="metadata file to output / メタデータファイル書き出し先") - parser.add_argument("--in_json", type=str, help="metadata file to input (if omitted and out_json exists, existing out_json is read) / 読み込むメタデータファイル(省略時、out_jsonが存在すればそれを読み込む)") + parser.add_argument("--in_json", type=str, + help="metadata file to input (if omitted and out_json exists, existing out_json is read) / 読み込むメタデータファイル(省略時、out_jsonが存在すればそれを読み込む)") parser.add_argument("--full_path", action="store_true", help="use full path as image-key in metadata (supports multiple directories) / メタデータで画像キーをフルパスにする(複数の学習画像ディレクトリに対応)") - parser.add_argument("--recursive", action="store_true", help="recursively look for training tags in all child folders of train_data_dir / train_data_dirのすべての子フォルダにある学習タグを再帰的に探す") + parser.add_argument("--recursive", action="store_true", + help="recursively look for training tags in all child folders of train_data_dir / train_data_dirのすべての子フォルダにある学習タグを再帰的に探す") + parser.add_argument("--caption_extension", type=str, default=".txt", + help="extension of caption (tag) file / 
読み込むキャプション(タグ)ファイルの拡張子") parser.add_argument("--debug", action="store_true", help="debug mode, print tags") args = parser.parse_args() diff --git a/finetune/prepare_buckets_latents.py b/finetune/prepare_buckets_latents.py index 87236c43..537626d8 100644 --- a/finetune/prepare_buckets_latents.py +++ b/finetune/prepare_buckets_latents.py @@ -1,20 +1,16 @@ -# このスクリプトのライセンスは、Apache License 2.0とします -# (c) 2022 Kohya S. @kohya_ss - import argparse -import glob import os import json from tqdm import tqdm import numpy as np -from diffusers import AutoencoderKL from PIL import Image import cv2 import torch from torchvision import transforms import library.model_util as model_util +import library.train_util as train_util DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') @@ -26,6 +22,16 @@ IMAGE_TRANSFORMS = transforms.Compose( ) +def collate_fn_remove_corrupted(batch): + """Collate function that allows to remove corrupted examples in the + dataloader. It expects that the dataloader returns 'None' when that occurs. + The 'None's in the batch are removed. + """ + # Filter out all the Nones (corrupted examples) + batch = list(filter(lambda x: x is not None, batch)) + return batch + + def get_latents(vae, images, weight_dtype): img_tensors = [IMAGE_TRANSFORMS(image) for image in images] img_tensors = torch.stack(img_tensors) @@ -35,9 +41,18 @@ def get_latents(vae, images, weight_dtype): return latents +def get_npz_filename_wo_ext(data_dir, image_key, is_full_path, flip): + if is_full_path: + base_name = os.path.splitext(os.path.basename(image_key))[0] + else: + base_name = image_key + if flip: + base_name += '_flip' + return os.path.join(data_dir, base_name) + + def main(args): - image_paths = glob.glob(os.path.join(args.train_data_dir, "*.jpg")) + glob.glob(os.path.join(args.train_data_dir, "*.jpeg")) + \ - glob.glob(os.path.join(args.train_data_dir, "*.png")) + glob.glob(os.path.join(args.train_data_dir, "*.webp")) + image_paths = train_util.glob_images(args.train_data_dir) print(f"found {len(image_paths)} images.") if os.path.exists(args.in_json): @@ -48,6 +63,25 @@ def main(args): print(f"no metadata / メタデータファイルがありません: {args.in_json}") return + # 既に存在するファイルをfilterする + if args.skip_existing: + filtered = [] + for image_path in image_paths: + image_key = image_path if args.full_path else os.path.splitext(os.path.basename(image_path))[0] + + npz_file_name_flip = get_npz_filename_wo_ext(args.train_data_dir, image_key, args.full_path, False) + ".npz" + if os.path.exists(npz_file_name_flip): + if not args.flip_aug: + continue + + npz_file_name_flip = get_npz_filename_wo_ext(args.train_data_dir, image_key, args.full_path, True) + ".npz" + if os.path.exists(npz_file_name_flip): + continue + + filtered.apppend(image_path) + print(f"number of skipped images (npz already exists) / npzファイルが存在するためスキップした画像数: {len(image_paths) - len(filtered)}") + image_paths = filtered + weight_dtype = torch.float32 if args.mixed_precision == "fp16": weight_dtype = torch.float16 @@ -70,15 +104,55 @@ def main(args): buckets_imgs = [[] for _ in range(len(bucket_resos))] bucket_counts = [0 for _ in range(len(bucket_resos))] img_ar_errors = [] - for i, image_path in enumerate(tqdm(image_paths, smoothing=0.0)): + + def process_batch(is_last): + for j in range(len(buckets_imgs)): + bucket = buckets_imgs[j] + if (is_last and len(bucket) > 0) or len(bucket) >= args.batch_size: + latents = get_latents(vae, [img for _, _, img in bucket], weight_dtype) + + for (image_key, _, _), latent in zip(bucket, latents): + 
npz_file_name = get_npz_filename_wo_ext(args.train_data_dir, image_key, args.full_path, False) + np.savez(npz_file_name, latent) + + # flip + if args.flip_aug: + latents = get_latents(vae, [img[:, ::-1].copy() for _, _, img in bucket], weight_dtype) # copyがないとTensor変換できない + + for (image_key, _, _), latent in zip(bucket, latents): + npz_file_name = get_npz_filename_wo_ext(args.train_data_dir, image_key, args.full_path, True) + np.savez(npz_file_name, latent) + + bucket.clear() + + # 読み込みの高速化のためにDataLoaderを使うオプション + if args.max_data_loader_n_workers is not None: + dataset = train_util.ImageLoadingDataset(image_paths) + data = torch.util.data.DataLoader(dataset, batch_size=1, shuffle=False, + num_workers=args.max_data_loader_n_workers, collate_fn=collate_fn_remove_corrupted, drop_last=False) + else: + data = [[(None, ip)] for ip in image_paths] + + for data_entry in tqdm(data, smoothing=0.0): + if data_entry[0] is None: + continue + + img_tensor, image_path = data_entry[0] + if img_tensor is not None: + image = transforms.functional.to_pil_image(img_tensor) + else: + try: + image = Image.open(image_path) + if image.mode != 'RGB': + image = image.convert("RGB") + except Exception as e: + print(f"Could not load image path / 画像を読み込めません: {image_path}, error: {e}") + continue + image_key = image_path if args.full_path else os.path.splitext(os.path.basename(image_path))[0] if image_key not in metadata: metadata[image_key] = {} - image = Image.open(image_path) - if image.mode != 'RGB': - image = image.convert("RGB") - aspect_ratio = image.width / image.height ar_errors = bucket_aspect_ratios - aspect_ratio bucket_id = np.abs(ar_errors).argmin() @@ -123,25 +197,10 @@ def main(args): metadata[image_key]['train_resolution'] = reso # バッチを推論するか判定して推論する - is_last = i == len(image_paths) - 1 - for j in range(len(buckets_imgs)): - bucket = buckets_imgs[j] - if (is_last and len(bucket) > 0) or len(bucket) >= args.batch_size: - latents = get_latents(vae, [img for _, _, img in bucket], weight_dtype) + process_batch(False) - for (image_key, reso, _), latent in zip(bucket, latents): - npz_file_name = os.path.splitext(os.path.basename(image_key))[0] if args.full_path else image_key - np.savez(os.path.join(args.train_data_dir, npz_file_name), latent) - - # flip - if args.flip_aug: - latents = get_latents(vae, [img[:, ::-1].copy() for _, _, img in bucket], weight_dtype) # copyがないとTensor変換できない - - for (image_key, reso, _), latent in zip(bucket, latents): - npz_file_name = os.path.splitext(os.path.basename(image_key))[0] if args.full_path else image_key - np.savez(os.path.join(args.train_data_dir, npz_file_name + '_flip'), latent) - - bucket.clear() + # 残りを処理する + process_batch(True) for i, (reso, count) in enumerate(zip(bucket_resos, bucket_counts)): print(f"bucket {i} {reso}: {count}") @@ -162,8 +221,10 @@ if __name__ == '__main__': parser.add_argument("out_json", type=str, help="metadata file to output / メタデータファイル書き出し先") parser.add_argument("model_name_or_path", type=str, help="model name or path to encode latents / latentを取得するためのモデル") parser.add_argument("--v2", action='store_true', - help='load Stable Diffusion v2.0 model / Stable Diffusion 2.0のモデルを読み込む') + help='not used (for backward compatibility) / 使用されません(互換性のため残してあります)') parser.add_argument("--batch_size", type=int, default=1, help="batch size in inference / 推論時のバッチサイズ") + parser.add_argument("--max_data_loader_n_workers", type=int, default=None, + help="enable image reading by DataLoader with this number of workers (faster) / 
DataLoaderによる画像読み込みを有効にしてこのワーカー数を適用する(読み込みを高速化)") parser.add_argument("--max_resolution", type=str, default="512,512", help="max resolution in fine tuning (width,height) / fine tuning時の最大画像サイズ 「幅,高さ」(使用メモリ量に関係します)") parser.add_argument("--min_bucket_reso", type=int, default=256, help="minimum resolution for buckets / bucketの最小解像度") @@ -174,6 +235,8 @@ if __name__ == '__main__': help="use full path as image-key in metadata (supports multiple directories) / メタデータで画像キーをフルパスにする(複数の学習画像ディレクトリに対応)") parser.add_argument("--flip_aug", action="store_true", help="flip augmentation, save latents for flipped images / 左右反転した画像もlatentを取得、保存する") + parser.add_argument("--skip_existing", action="store_true", + help="skip images if npz already exists (both normal and flipped exists if flip_aug is enabled) / npzが既に存在する画像をスキップする(flip_aug有効時は通常、反転の両方が存在する画像をスキップ)") args = parser.parse_args() main(args) diff --git a/finetune/tag_images_by_wd14_tagger.py b/finetune/tag_images_by_wd14_tagger.py index d7166114..609b8c50 100644 --- a/finetune/tag_images_by_wd14_tagger.py +++ b/finetune/tag_images_by_wd14_tagger.py @@ -1,6 +1,3 @@ -# このスクリプトのライセンスは、Apache License 2.0とします -# (c) 2022 Kohya S. @kohya_ss - import argparse import csv import glob @@ -12,35 +9,87 @@ from tqdm import tqdm import numpy as np from tensorflow.keras.models import load_model from huggingface_hub import hf_hub_download +import torch + +import library.train_util as train_util # from wd14 tagger IMAGE_SIZE = 448 -WD14_TAGGER_REPO = 'SmilingWolf/wd-v1-4-vit-tagger' +# wd-v1-4-swinv2-tagger-v2 / wd-v1-4-vit-tagger / wd-v1-4-vit-tagger-v2/ wd-v1-4-convnext-tagger / wd-v1-4-convnext-tagger-v2 +DEFAULT_WD14_TAGGER_REPO = 'SmilingWolf/wd-v1-4-convnext-tagger-v2' FILES = ["keras_metadata.pb", "saved_model.pb", "selected_tags.csv"] SUB_DIR = "variables" SUB_DIR_FILES = ["variables.data-00000-of-00001", "variables.index"] CSV_FILE = FILES[-1] +def preprocess_image(image): + image = np.array(image) + image = image[:, :, ::-1] # RGB->BGR + + # pad to square + size = max(image.shape[0:2]) + pad_x = size - image.shape[1] + pad_y = size - image.shape[0] + pad_l = pad_x // 2 + pad_t = pad_y // 2 + image = np.pad(image, ((pad_t, pad_y - pad_t), (pad_l, pad_x - pad_l), (0, 0)), mode='constant', constant_values=255) + + interp = cv2.INTER_AREA if size > IMAGE_SIZE else cv2.INTER_LANCZOS4 + image = cv2.resize(image, (IMAGE_SIZE, IMAGE_SIZE), interpolation=interp) + + image = image.astype(np.float32) + return image + + +class ImageLoadingPrepDataset(torch.utils.data.Dataset): + def __init__(self, image_paths): + self.images = image_paths + + def __len__(self): + return len(self.images) + + def __getitem__(self, idx): + img_path = self.images[idx] + + try: + image = Image.open(img_path).convert("RGB") + image = preprocess_image(image) + tensor = torch.tensor(image) + except Exception as e: + print(f"Could not load image path / 画像を読み込めません: {img_path}, error: {e}") + return None + + return (tensor, img_path) + + +def collate_fn_remove_corrupted(batch): + """Collate function that allows to remove corrupted examples in the + dataloader. It expects that the dataloader returns 'None' when that occurs. + The 'None's in the batch are removed. 
+  """
+  # Filter out all the Nones (corrupted examples)
+  batch = list(filter(lambda x: x is not None, batch))
+  return batch
+
+
 def main(args):
   # hf_hub_downloadをそのまま使うとsymlink関係で問題があるらしいので、キャッシュディレクトリとforce_filenameを指定してなんとかする
   # depreacatedの警告が出るけどなくなったらその時
   # https://github.com/toriato/stable-diffusion-webui-wd14-tagger/issues/22
   if not os.path.exists(args.model_dir) or args.force_download:
-    print("downloading wd14 tagger model from hf_hub")
+    print(f"downloading wd14 tagger model from hf_hub. id: {args.repo_id}")
     for file in FILES:
       hf_hub_download(args.repo_id, file, cache_dir=args.model_dir, force_download=True, force_filename=file)
     for file in SUB_DIR_FILES:
       hf_hub_download(args.repo_id, file, subfolder=SUB_DIR, cache_dir=os.path.join(
           args.model_dir, SUB_DIR), force_download=True, force_filename=file)
+  else:
+    print("using existing wd14 tagger model")

   # 画像を読み込む
-  image_paths = glob.glob(os.path.join(args.train_data_dir, "*.jpg")) + \
-      glob.glob(os.path.join(args.train_data_dir, "*.jpeg")) + \
-      glob.glob(os.path.join(args.train_data_dir, "*.png")) + \
-      glob.glob(os.path.join(args.train_data_dir, "*.webp")) + \
-      glob.glob(os.path.join(args.train_data_dir, "*.bmp"))
+  image_paths = train_util.glob_images(args.train_data_dir)
   print(f"found {len(image_paths)} images.")

   print("loading model and labels")
@@ -75,7 +124,7 @@ def main(args):
     # Everything else is tags: pick any where prediction confidence > threshold
     tag_text = ""
     for i, p in enumerate(prob[4:]):  # numpyとか使うのが良いけど、まあそれほど数も多くないのでループで
-      if p >= args.thresh:
+      if p >= args.thresh and i < len(tags):
         tag_text += ", " + tags[i]

     if len(tag_text) > 0:
@@ -86,34 +135,37 @@ def main(args):
     if args.debug:
       print(image_path, tag_text)

+  # option to use a DataLoader to speed up image loading
+  if args.max_data_loader_n_workers is not None:
+    dataset = ImageLoadingPrepDataset(image_paths)
+    data = torch.utils.data.DataLoader(dataset, batch_size=args.batch_size, shuffle=False,
+                                       num_workers=args.max_data_loader_n_workers, collate_fn=collate_fn_remove_corrupted, drop_last=False)
+  else:
+    data = [[(None, ip)] for ip in image_paths]
+
   b_imgs = []
-  for image_path in tqdm(image_paths, smoothing=0.0):
-    img = Image.open(image_path)  # cv2は日本語ファイル名で死ぬのとモード変換したいのでpillowで開く
-    if img.mode != 'RGB':
-      img = img.convert("RGB")
-    img = np.array(img)
-    img = img[:, :, ::-1]  # RGB->BGR
+  for data_entry in tqdm(data, smoothing=0.0):
+    for data in data_entry:
+      if data is None:
+        continue

-    # pad to square
-    size = max(img.shape[0:2])
-    pad_x = size - img.shape[1]
-    pad_y = size - img.shape[0]
-    pad_l = pad_x // 2
-    pad_t = pad_y // 2
-    img = np.pad(img, ((pad_t, pad_y - pad_t), (pad_l, pad_x - pad_l), (0, 0)), mode='constant', constant_values=255)
+      image, image_path = data
+      if image is not None:
+        image = image.detach().numpy()
+      else:
+        try:
+          image = Image.open(image_path)
+          if image.mode != 'RGB':
+            image = image.convert("RGB")
+          image = preprocess_image(image)
+        except Exception as e:
+          print(f"Could not load image path / 画像を読み込めません: {image_path}, error: {e}")
+          continue
+      b_imgs.append((image_path, image))

-    interp = cv2.INTER_AREA if size > IMAGE_SIZE else cv2.INTER_LANCZOS4
-    img = cv2.resize(img, (IMAGE_SIZE, IMAGE_SIZE), interpolation=interp)
-    # cv2.imshow("img", img)
-    # cv2.waitKey()
-    # cv2.destroyAllWindows()
-
-    img = img.astype(np.float32)
-    b_imgs.append((image_path, img))
-
-    if len(b_imgs) >= args.batch_size:
-      run_batch(b_imgs)
-      b_imgs.clear()
+      if len(b_imgs) >= args.batch_size:
+        run_batch(b_imgs)
+        b_imgs.clear()

   if len(b_imgs) > 0:
     run_batch(b_imgs)

@@ -124,7 +176,7 @@ def main(args):
 if __name__ == '__main__':
   parser = argparse.ArgumentParser()
   parser.add_argument("train_data_dir", type=str, help="directory for train images / 学習画像データのディレクトリ")
-  parser.add_argument("--repo_id", type=str, default=WD14_TAGGER_REPO,
+  parser.add_argument("--repo_id", type=str, default=DEFAULT_WD14_TAGGER_REPO,
                       help="repo id for wd14 tagger on Hugging Face / Hugging Faceのwd14 taggerのリポジトリID")
   parser.add_argument("--model_dir", type=str, default="wd14_tagger_model",
                       help="directory to store wd14 tagger model / wd14 taggerのモデルを格納するディレクトリ")
@@ -132,6 +184,8 @@ if __name__ == '__main__':
                       help="force downloading wd14 tagger models / wd14 taggerのモデルを再ダウンロードします")
   parser.add_argument("--thresh", type=float, default=0.35, help="threshold of confidence to add a tag / タグを追加するか判定する閾値")
   parser.add_argument("--batch_size", type=int, default=1, help="batch size in inference / 推論時のバッチサイズ")
+  parser.add_argument("--max_data_loader_n_workers", type=int, default=None,
+                      help="enable image reading by DataLoader with this number of workers (faster) / DataLoaderによる画像読み込みを有効にしてこのワーカー数を適用する(読み込みを高速化)")
   parser.add_argument("--caption_extention", type=str, default=None,
                       help="extension of caption file (for backward compatibility) / 出力されるキャプションファイルの拡張子(スペルミスしていたのを残してあります)")
   parser.add_argument("--caption_extension", type=str, default=".txt", help="extension of caption file / 出力されるキャプションファイルの拡張子")
diff --git a/library/train_util.py b/library/train_util.py
index 0946c31d..459b81a1 100644
--- a/library/train_util.py
+++ b/library/train_util.py
@@ -44,7 +44,7 @@ DEFAULT_LAST_OUTPUT_NAME = "last"

 # region dataset

-IMAGE_EXTENSIONS = [".png", ".jpg", ".jpeg", ".webp", ".bmp"]
+IMAGE_EXTENSIONS = [".png", ".jpg", ".jpeg", ".webp", ".bmp", ".PNG", ".JPG", ".JPEG", ".WEBP", ".BMP"]


 class ImageInfo():
@@ -141,7 +141,7 @@ class BaseDataset(torch.utils.data.Dataset):
         if type(str_to) == list:
           caption = random.choice(str_to)
         else:
-          caption = str_to 
+          caption = str_to
       else:
         caption = caption.replace(str_from, str_to)
@@ -247,7 +247,6 @@ class BaseDataset(torch.utils.data.Dataset):
       mean_img_ar_error = np.mean(np.abs(img_ar_errors))
       self.bucket_info["mean_img_ar_error"] = mean_img_ar_error
       print(f"mean ar error (without repeats): {mean_img_ar_error}")
-
     # 参照用indexを作る
     self.buckets_indices: list(BucketBatchIndex) = []
@@ -766,15 +765,30 @@ def debug_dataset(train_dataset, show_input_ids=False):
       break


-def glob_images(dir, base):
+def glob_images(directory, base="*"):
   img_paths = []
   for ext in IMAGE_EXTENSIONS:
     if base == '*':
-      img_paths.extend(glob.glob(os.path.join(glob.escape(dir), base + ext)))
+      img_paths.extend(glob.glob(os.path.join(glob.escape(directory), base + ext)))
     else:
-      img_paths.extend(glob.glob(glob.escape(os.path.join(dir, base + ext))))
+      img_paths.extend(glob.glob(glob.escape(os.path.join(directory, base + ext))))
+  img_paths = list(set(img_paths))  # remove duplicates
+  img_paths.sort()
   return img_paths

+
+def glob_images_pathlib(dir_path, recursive):
+  image_paths = []
+  if recursive:
+    for ext in IMAGE_EXTENSIONS:
+      image_paths += list(dir_path.rglob('*' + ext))
+  else:
+    for ext in IMAGE_EXTENSIONS:
+      image_paths += list(dir_path.glob('*' + ext))
+  image_paths = list(set(image_paths))  # remove duplicates
+  image_paths.sort()
+  return image_paths
+
 # endregion


@@ -1505,5 +1519,30 @@ def save_state_on_train_end(args: argparse.Namespace, accelerator):
   model_name = DEFAULT_LAST_OUTPUT_NAME if args.output_name is None else args.output_name
   accelerator.save_state(os.path.join(args.output_dir, LAST_STATE_NAME.format(model_name)))

+
+# endregion
+
+# region preprocessing
+
+
+class ImageLoadingDataset(torch.utils.data.Dataset):
+  def __init__(self, image_paths):
+    self.images = image_paths
+
+  def __len__(self):
+    return len(self.images)
+
+  def __getitem__(self, idx):
+    img_path = self.images[idx]
+
+    try:
+      image = Image.open(img_path).convert("RGB")
+      # convert to tensor temporarily so dataloader will accept it
+      tensor_pil = transforms.functional.pil_to_tensor(image)
+    except Exception as e:
+      print(f"Could not load image path / 画像を読み込めません: {img_path}, error: {e}")
+      return None
+
+    return (tensor_pil, img_path)
+
 # endregion
diff --git a/requirements.txt b/requirements.txt
index 36f48a0f..709a8342 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 accelerate==0.15.0
-transformers==4.25.1
+transformers==4.26.0
 ftfy
 albumentations
 opencv-python
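
Note on the pattern this patch introduces: all four preprocessing scripts now share the same idea — a torch DataLoader is used purely for parallel image reading (no shuffling, no training step), and a custom collate_fn drops images that failed to load instead of crashing the run. Below is a minimal, self-contained sketch of that pattern. It is an illustration, not part of the patch; the file paths and the batch_size/num_workers values are placeholders.

import numpy as np
import torch
from PIL import Image


class ImageLoadingDataset(torch.utils.data.Dataset):
  """Reads images on DataLoader worker processes; returns None for unreadable files."""

  def __init__(self, image_paths):
    self.images = image_paths

  def __len__(self):
    return len(self.images)

  def __getitem__(self, idx):
    path = self.images[idx]
    try:
      image = Image.open(path).convert("RGB")
      tensor = torch.from_numpy(np.array(image))  # HWC uint8 tensor
    except Exception as e:
      print(f"could not load {path}: {e}")
      return None  # marks this example as corrupted
    return tensor, path


def collate_fn_remove_corrupted(batch):
  # drop the Nones produced for corrupted files; the resulting batch
  # may therefore be shorter than batch_size, and callers must tolerate that
  return [item for item in batch if item is not None]


if __name__ == '__main__':
  paths = ["a.png", "b.jpg"]  # placeholder paths
  loader = torch.utils.data.DataLoader(ImageLoadingDataset(paths), batch_size=8, shuffle=False,
                                       num_workers=4, collate_fn=collate_fn_remove_corrupted, drop_last=False)
  for batch in loader:
    for tensor, path in batch:
      print(path, tuple(tensor.shape))

Because the collate function can shrink a batch, the scripts above accumulate items into b_imgs and only call run_batch once len(b_imgs) >= args.batch_size, which keeps the effective inference batch size stable even when some images are dropped.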