Add scripts.

Kohya S
2022-12-18 14:55:34 +09:00
parent 042a007930
commit 7a04196e66
19 changed files with 7129 additions and 2 deletions

3
.gitignore vendored Normal file

@@ -0,0 +1,3 @@
logs
__pycache__
wd14_tagger_model

README-ja.md

@@ -3,9 +3,21 @@ Scripts for Stable Diffusion training, image generation, and other utilities.
[README in English](./README.md)
Features that make these scripts easier to use, such as a GUI and PowerShell scripts, are provided in [bmaltais's repository](https://github.com/bmaltais/kohya_ss) (in English); please see it as well. Thanks to bmaltais!
The following scripts are included:
* DreamBooth training, including U-Net and Text Encoder
* fine-tuning, same as above
* image generation
* model conversion (mutual conversion between Stable Diffusion ckpt/safetensors and Diffusers)
## Usage
Articles are available on note.com; please refer to them (they may be moved here in the future).
* [Environment setup and DreamBooth training guide](https://note.com/kohya_ss/n/nee3ed1649fb6)
* [Fine-tuning script](https://note.com/kohya_ss/n/nbf7ce8d80f29):
Including BLIP captioning and tagging by DeepDanbooru or WD14 tagger
* [Image generation script](https://note.com/kohya_ss/n/n2693183a798e)
* [Model conversion script](https://note.com/kohya_ss/n/n374f316fe4ad)

README.md

@@ -2,11 +2,27 @@ This repository contains training, generation and utility scripts for Stable Diffusion.
[日本語版README](./README-ja.md)
For easier use (GUI and PowerShell scripts etc...), please visit [the repository maintained by bmaltais](https://github.com/bmaltais/kohya_ss). Thanks to @bmaltais!
This repository contains the scripts for:
* DreamBooth training, including U-Net and Text Encoder
* fine-tuning (native training), including U-Net and Text Encoder
* image generation
* model conversion (supports 1.x and 2.x, Stable Diffusion ckpt/safetensors and Diffusers)
## About requirements_*.txt
These files do not list requirements for PyTorch or Diffusers, because the appropriate versions depend on your environment. Please install PyTorch first, then Diffusers.
The scripts are tested with PyTorch 1.12.1 and 1.13.0, and Diffusers 0.10.2.
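
For a quick check of the versions installed in your environment, a minimal sketch (assumes both packages are already installed):

import torch
import diffusers

print(torch.__version__)      # e.g. 1.12.1 or 1.13.0
print(diffusers.__version__)  # e.g. 0.10.2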
## Links to how-to-use documents
All documents are currently in Japanese and CUI-based.
* [Environment setup and DreamBooth training guide](https://note.com/kohya_ss/n/nee3ed1649fb6)
* [Fine-tuning step-by-step guide](https://note.com/kohya_ss/n/nbf7ce8d80f29):
Including BLIP captioning and tagging by DeepDanbooru or WD14 tagger
* [Image generation](https://note.com/kohya_ss/n/n2693183a798e)
* [Model conversion](https://note.com/kohya_ss/n/n374f316fe4ad)

123
clean_captions_and_tags.py Normal file

@@ -0,0 +1,123 @@
# This script is licensed under the Apache License 2.0
# (c) 2022 Kohya S. @kohya_ss
import argparse
import glob
import os
import json
from tqdm import tqdm
def clean_tags(image_key, tags):
# replace '_' with ' '
tags = tags.replace('_', ' ')
# remove rating (DeepDanbooru only)
tokens = tags.split(", rating")
if len(tokens) == 1:
# WD14 tagger output takes this path, so don't print a message
# print("no rating:")
# print(f"{image_key} {tags}")
pass
else:
if len(tokens) > 2:
print("multiple ratings:")
print(f"{image_key} {tags}")
tags = tokens[0]
return tags
# searched and replaced in order, from top to bottom
# ('string to replace', 'replacement')
CAPTION_REPLACEMENTS = [
('anime anime', 'anime'),
('young ', ''),
('anime girl', 'girl'),
('cartoon female', 'girl'),
('cartoon lady', 'girl'),
('cartoon character', 'girl'), # a or ~s
('cartoon woman', 'girl'),
('cartoon women', 'girls'),
('cartoon girl', 'girl'),
('anime female', 'girl'),
('anime lady', 'girl'),
('anime character', 'girl'), # a or ~s
('anime woman', 'girl'),
('anime women', 'girls'),
('lady', 'girl'),
('female', 'girl'),
('woman', 'girl'),
('women', 'girls'),
('people', 'girls'),
('person', 'girl'),
('a cartoon figure', 'a figure'),
('a cartoon image', 'an image'),
('a cartoon picture', 'a picture'),
('an anime cartoon image', 'an image'),
('a cartoon anime drawing', 'a drawing'),
('a cartoon drawing', 'a drawing'),
('girl girl', 'girl'),
]
def clean_caption(caption):
for rf, rt in CAPTION_REPLACEMENTS:
replaced = True
while replaced:
bef = caption
caption = caption.replace(rf, rt)
replaced = bef != caption
return caption
def main(args):
if os.path.exists(args.in_json):
print(f"loading existing metadata: {args.in_json}")
with open(args.in_json, "rt", encoding='utf-8') as f:
metadata = json.load(f)
else:
print("no metadata / メタデータファイルがありません")
return
print("cleaning captions and tags.")
image_keys = list(metadata.keys())
for image_key in tqdm(image_keys):
tags = metadata[image_key].get('tags')
if tags is None:
print(f"image does not have tags / メタデータにタグがありません: {image_key}")
else:
metadata[image_key]['tags'] = clean_tags(image_key, tags)
caption = metadata[image_key].get('caption')
if caption is None:
print(f"image does not have caption / メタデータにキャプションがありません: {image_key}")
else:
metadata[image_key]['caption'] = clean_caption(caption)
# write out the metadata and finish
print(f"writing metadata: {args.out_json}")
with open(args.out_json, "wt", encoding='utf-8') as f:
json.dump(metadata, f, indent=2)
print("done!")
if __name__ == '__main__':
parser = argparse.ArgumentParser()
# parser.add_argument("train_data_dir", type=str, help="directory for train images / 学習画像データのディレクトリ")
parser.add_argument("in_json", type=str, help="metadata file to input / 読み込むメタデータファイル")
parser.add_argument("out_json", type=str, help="metadata file to output / メタデータファイル書き出し先")
args, unknown = parser.parse_known_args()
if len(unknown) == 1:
print("WARNING: train_data_dir argument is removed. This script will not work with three arguments in future. Please specify two arguments: in_json and out_json.")
print("All captions and tags in the metadata are processed.")
print("警告: train_data_dir引数は不要になりました。将来的には三つの引数を指定すると動かなくなる予定です。読み込み元のメタデータと書き出し先の二つの引数だけ指定してください。")
print("メタデータ内のすべてのキャプションとタグが処理されます。")
args.in_json = args.out_json
args.out_json = unknown[0]
elif len(unknown) > 0:
raise ValueError(f"error: unrecognized arguments: {unknown}")
main(args)
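
As an illustration of the iterated replacement above, a minimal usage sketch (assumes the script is importable as clean_captions_and_tags):

from clean_captions_and_tags import clean_caption

# 'cartoon woman' -> 'girl' is applied first, then 'an anime cartoon image' -> 'an image'
print(clean_caption("an anime cartoon image of a cartoon woman"))
# -> "an image of a girl"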


@@ -0,0 +1,93 @@
# convert Diffusers v1.x/v2.0 model to original Stable Diffusion
# v1: initial version
# v2: support safetensors
# v3: fix to support another format
# v4: support safetensors in Diffusers
import argparse
import os
import torch
from diffusers import StableDiffusionPipeline
import model_util
def convert(args):
# check arguments
load_dtype = torch.float16 if args.fp16 else None
save_dtype = None
if args.fp16:
save_dtype = torch.float16
elif args.bf16:
save_dtype = torch.bfloat16
elif args.float:
save_dtype = torch.float
is_load_ckpt = os.path.isfile(args.model_to_load)
is_save_ckpt = len(os.path.splitext(args.model_to_save)[1]) > 0
assert not is_load_ckpt or args.v1 != args.v2, f"v1 or v2 is required to load checkpoint / checkpointの読み込みにはv1/v2指定が必要です"
assert is_save_ckpt or args.reference_model is not None, f"reference model is required to save as Diffusers / Diffusers形式での保存には参照モデルが必要です"
# load the model
msg = "checkpoint" if is_load_ckpt else ("Diffusers" + (" as fp16" if args.fp16 else ""))
print(f"loading {msg}: {args.model_to_load}")
if is_load_ckpt:
v2_model = args.v2
text_encoder, vae, unet = model_util.load_models_from_stable_diffusion_checkpoint(v2_model, args.model_to_load)
else:
pipe = StableDiffusionPipeline.from_pretrained(args.model_to_load, torch_dtype=load_dtype, tokenizer=None, safety_checker=None)
text_encoder = pipe.text_encoder
vae = pipe.vae
unet = pipe.unet
if args.v1 == args.v2:
# auto-detect the version
v2_model = unet.config.cross_attention_dim == 1024
print("checking model version: model is " + ('v2' if v2_model else 'v1'))
else:
v2_model = args.v1
# convert and save
msg = ("checkpoint" + ("" if save_dtype is None else f" in {save_dtype}")) if is_save_ckpt else "Diffusers"
print(f"converting and saving as {msg}: {args.model_to_save}")
if is_save_ckpt:
original_model = args.model_to_load if is_load_ckpt else None
key_count = model_util.save_stable_diffusion_checkpoint(v2_model, args.model_to_save, text_encoder, unet,
original_model, args.epoch, args.global_step, save_dtype, vae)
print(f"model saved. total converted state_dict keys: {key_count}")
else:
print(f"copy scheduler/tokenizer config from: {args.reference_model}")
model_util.save_diffusers_checkpoint(v2_model, args.model_to_save, text_encoder, unet, args.reference_model, vae, args.use_safetensors)
print(f"model saved.")
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("--v1", action='store_true',
help='load v1.x model (v1 or v2 is required to load checkpoint) / 1.xのモデルを読み込む')
parser.add_argument("--v2", action='store_true',
help='load v2.0 model (v1 or v2 is required to load checkpoint) / 2.0のモデルを読み込む')
parser.add_argument("--fp16", action='store_true',
help='load as fp16 (Diffusers only) and save as fp16 (checkpoint only) / fp16形式で読み込みDiffusers形式のみ対応、保存するcheckpointのみ対応')
parser.add_argument("--bf16", action='store_true', help='save as bf16 (checkpoint only) / bf16形式で保存するcheckpointのみ対応')
parser.add_argument("--float", action='store_true',
help='save as float (checkpoint only) / float(float32)形式で保存するcheckpointのみ対応')
parser.add_argument("--epoch", type=int, default=0, help='epoch to write to checkpoint / checkpointに記録するepoch数の値')
parser.add_argument("--global_step", type=int, default=0,
help='global_step to write to checkpoint / checkpointに記録するglobal_stepの値')
parser.add_argument("--reference_model", type=str, default=None,
help="reference model for schduler/tokenizer, required in saving Diffusers, copy schduler/tokenizer from this / scheduler/tokenizerのコピー元のDiffusersモデル、Diffusers形式で保存するときに必要")
parser.add_argument("--use_safetensors", action='store_true',
help="use safetensors format to save Diffusers model (checkpoint depends on the file extension) / Duffusersモデルをsafetensors形式で保存するcheckpointは拡張子で自動判定")
parser.add_argument("model_to_load", type=str, default=None,
help="model to load: checkpoint file or Diffusers model's directory / 読み込むモデル、checkpointかDiffusers形式モデルのディレクトリ")
parser.add_argument("model_to_save", type=str, default=None,
help="model to save: checkpoint (with extension) or Diffusers model's directory (without extension) / 変換後のモデル、拡張子がある場合はcheckpoint、ない場合はDiffusesモデルとして保存")
args = parser.parse_args()
convert(args)
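
The v1/v2 auto-detection above keys off the U-Net's cross-attention width (1024 for v2, 768 for v1). A standalone sketch of the same check; the model id here is only an example:

from diffusers import UNet2DConditionModel

unet = UNet2DConditionModel.from_pretrained("stabilityai/stable-diffusion-2-base", subfolder="unet")
print("v2" if unet.config.cross_attention_dim == 1024 else "v1")  # -> "v2"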

239
detect_face_rotate.py Normal file

@@ -0,0 +1,239 @@
# This script is licensed under the Apache License 2.0, the same as train_dreambooth.py
# (c) 2022 Kohya S. @kohya_ss
# detect faces in (landscape) images, rotate the image so the face is upright, and crop a square centered on it
# v2: extract max face if multiple faces are found
# v3: add crop_ratio option
# v4: add multiple faces extraction and min/max size
import argparse
import math
import cv2
import glob
import os
from anime_face_detector import create_detector
from tqdm import tqdm
import numpy as np
KP_REYE = 11
KP_LEYE = 19
SCORE_THRES = 0.90
def detect_faces(detector, image):
preds = detector(image) # bgr
# print(len(preds))
faces = []
for pred in preds:
bb = pred['bbox']
score = bb[-1]
if score < SCORE_THRES:
continue
left, top, right, bottom = bb[:4]
cx = int((left + right) / 2)
cy = int((top + bottom) / 2)
fw = int(right - left)
fh = int(bottom - top)
lex, ley = pred['keypoints'][KP_LEYE, 0:2]
rex, rey = pred['keypoints'][KP_REYE, 0:2]
angle = math.atan2(ley - rey, lex - rex)
angle = angle / math.pi * 180
faces.append((cx, cy, fw, fh, angle))
faces.sort(key=lambda x: max(x[2], x[3]), reverse=True)  # largest first
return faces
def rotate_image(image, angle, cx, cy):
h, w = image.shape[0:2]
rot_mat = cv2.getRotationMatrix2D((cx, cy), angle, 1.0)
# # enlarge the image slightly to account for the rotation -> disabled for now
# nh = max(h, int(w * math.sin(angle)))
# nw = max(w, int(h * math.sin(angle)))
# if nh > h or nw > w:
# pad_y = nh - h
# pad_t = pad_y // 2
# pad_x = nw - w
# pad_l = pad_x // 2
# m = np.array([[0, 0, pad_l],
# [0, 0, pad_t]])
# rot_mat = rot_mat + m
# h, w = nh, nw
# cx += pad_l
# cy += pad_t
result = cv2.warpAffine(image, rot_mat, (w, h), flags=cv2.INTER_LINEAR, borderMode=cv2.BORDER_REFLECT)
return result, cx, cy
def process(args):
assert (not args.resize_fit) or args.resize_face_size is None, f"resize_fit and resize_face_size can't both be specified / resize_fitとresize_face_sizeはどちらか片方しか指定できません"
assert args.crop_ratio is None or args.resize_face_size is None, f"crop_ratio and resize_face_size can't both be specified / crop_ratio指定時はresize_face_sizeは指定できません"
# load the anime face detection model
print("loading face detector.")
detector = create_detector('yolov3')
# parse the crop arguments
if args.crop_size is None:
crop_width = crop_height = None
else:
tokens = args.crop_size.split(',')
assert len(tokens) == 2, f"crop_size must be 'width,height' / crop_sizeは'幅,高さ'で指定してください"
crop_width, crop_height = [int(t) for t in tokens]
if args.crop_ratio is None:
crop_h_ratio = crop_v_ratio = None
else:
tokens = args.crop_ratio.split(',')
assert len(tokens) == 2, f"crop_ratio must be 'horizontal,vertical' / crop_ratioは'幅,高さ'の倍率で指定してください"
crop_h_ratio, crop_v_ratio = [float(t) for t in tokens]
# process images
print("processing.")
output_extension = ".png"
os.makedirs(args.dst_dir, exist_ok=True)
paths = glob.glob(os.path.join(args.src_dir, "*.png")) + glob.glob(os.path.join(args.src_dir, "*.jpg")) + \
glob.glob(os.path.join(args.src_dir, "*.webp"))
for path in tqdm(paths):
basename = os.path.splitext(os.path.basename(path))[0]
# image = cv2.imread(path)  # fails on Japanese file names
image = cv2.imdecode(np.fromfile(path, np.uint8), cv2.IMREAD_UNCHANGED)
if len(image.shape) == 2:
image = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)
if image.shape[2] == 4:
print(f"image has alpha. ignore / 画像の透明度が設定されているため無視します: {path}")
image = image[:, :, :3].copy()  # without copy, the alpha information apparently stays attached internally
h, w = image.shape[:2]
faces = detect_faces(detector, image)
for i, face in enumerate(faces):
cx, cy, fw, fh, angle = face
face_size = max(fw, fh)
if args.min_size is not None and face_size < args.min_size:
continue
if args.max_size is not None and face_size >= args.max_size:
continue
face_suffix = f"_{i+1:02d}" if args.multiple_faces else ""
# rotate if the option is specified
face_img = image
if args.rotate:
face_img, cx, cy = rotate_image(face_img, angle, cx, cy)
# crop around the face if the option is specified
if crop_width is not None or crop_h_ratio is not None:
cur_crop_width, cur_crop_height = crop_width, crop_height
if crop_h_ratio is not None:
cur_crop_width = int(face_size * crop_h_ratio + .5)
cur_crop_height = int(face_size * crop_v_ratio + .5)
# resize if necessary
scale = 1.0
if args.resize_face_size is not None:
# resize based on the face size
scale = args.resize_face_size / face_size
if scale < cur_crop_width / w:
print(
f"image width too small in face size based resizing / 顔を基準にリサイズすると画像の幅がcrop sizeより小さい顔が相対的に大きすぎるので顔サイズが変わります: {path}")
scale = cur_crop_width / w
if scale < cur_crop_height / h:
print(
f"image height too small in face size based resizing / 顔を基準にリサイズすると画像の高さがcrop sizeより小さい顔が相対的に大きすぎるので顔サイズが変わります: {path}")
scale = cur_crop_height / h
elif crop_h_ratio is not None:
# don't resize when a ratio is specified
pass
else:
# crop size is specified
if w < cur_crop_width:
print(f"image width too small/ 画像の幅がcrop sizeより小さいので画質が劣化します: {path}")
scale = cur_crop_width / w
if h < cur_crop_height:
print(f"image height too small/ 画像の高さがcrop sizeより小さいので画質が劣化します: {path}")
scale = cur_crop_height / h
if args.resize_fit:
scale = max(cur_crop_width / w, cur_crop_height / h)
if scale != 1.0:
w = int(w * scale + .5)
h = int(h * scale + .5)
face_img = cv2.resize(face_img, (w, h), interpolation=cv2.INTER_AREA if scale < 1.0 else cv2.INTER_LANCZOS4)
cx = int(cx * scale + .5)
cy = int(cy * scale + .5)
fw = int(fw * scale + .5)
fh = int(fh * scale + .5)
cur_crop_width = min(cur_crop_width, face_img.shape[1])
cur_crop_height = min(cur_crop_height, face_img.shape[0])
x = cx - cur_crop_width // 2
cx = cur_crop_width // 2
if x < 0:
cx = cx + x
x = 0
elif x + cur_crop_width > w:
cx = cx + (x + cur_crop_width - w)
x = w - cur_crop_width
face_img = face_img[:, x:x+cur_crop_width]
y = cy - cur_crop_height // 2
cy = cur_crop_height // 2
if y < 0:
cy = cy + y
y = 0
elif y + cur_crop_height > h:
cy = cy + (y + cur_crop_height - h)
y = h - cur_crop_height
face_img = face_img[y:y + cur_crop_height]
# # debug
# print(path, cx, cy, angle)
# crp = cv2.resize(image, (image.shape[1]//8, image.shape[0]//8))
# cv2.imshow("image", crp)
# if cv2.waitKey() == 27:
# break
# cv2.destroyAllWindows()
# debug
if args.debug:
cv2.rectangle(face_img, (cx-fw//2, cy-fh//2), (cx+fw//2, cy+fh//2), (255, 0, 255), fw//20)
_, buf = cv2.imencode(output_extension, face_img)
with open(os.path.join(args.dst_dir, f"{basename}{face_suffix}_{cx:04d}_{cy:04d}_{fw:04d}_{fh:04d}{output_extension}"), "wb") as f:
buf.tofile(f)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("--src_dir", type=str, help="directory to load images / 画像を読み込むディレクトリ")
parser.add_argument("--dst_dir", type=str, help="directory to save images / 画像を保存するディレクトリ")
parser.add_argument("--rotate", action="store_true", help="rotate images to align faces / 顔が正立するように画像を回転する")
parser.add_argument("--resize_fit", action="store_true",
help="resize to fit smaller side after cropping / 切り出し後の画像の短辺がcrop_sizeにあうようにリサイズする")
parser.add_argument("--resize_face_size", type=int, default=None,
help="resize image before cropping by face size / 切り出し前に顔がこのサイズになるようにリサイズする")
parser.add_argument("--crop_size", type=str, default=None,
help="crop images with 'width,height' pixels, face centered / 顔を中心として'幅,高さ'のサイズで切り出す")
parser.add_argument("--crop_ratio", type=str, default=None,
help="crop images with 'horizontal,vertical' ratio to face, face centered / 顔を中心として顔サイズの'幅倍率,高さ倍率'のサイズで切り出す")
parser.add_argument("--min_size", type=int, default=None,
help="minimum face size to output (included) / 処理対象とする顔の最小サイズ(この値以上)")
parser.add_argument("--max_size", type=int, default=None,
help="maximum face size to output (excluded) / 処理対象とする顔の最大サイズ(この値未満)")
parser.add_argument("--multiple_faces", action="store_true",
help="output each faces / 複数の顔が見つかった場合、それぞれを切り出す")
parser.add_argument("--debug", action="store_true", help="render rect for face / 処理後画像の顔位置に矩形を描画します")
args = parser.parse_args()
process(args)
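
The rotation in rotate_image is a plain affine warp around the face center. A self-contained sketch with a dummy image and an assumed 10-degree correction:

import cv2
import numpy as np

img = np.zeros((256, 384, 3), np.uint8)                   # dummy image, h=256, w=384
rot_mat = cv2.getRotationMatrix2D((192, 128), 10.0, 1.0)  # rotate 10 degrees around the center
rot = cv2.warpAffine(img, rot_mat, (384, 256),
                     flags=cv2.INTER_LINEAR, borderMode=cv2.BORDER_REFLECT)
print(rot.shape)  # (256, 384, 3)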

1059
fine_tune.py Normal file

File diff suppressed because it is too large

2517
gen_img_diffusers.py Normal file

File diff suppressed because it is too large

96
hypernetwork_nai.py Normal file

@@ -0,0 +1,96 @@
# NAI compatible
import torch
class HypernetworkModule(torch.nn.Module):
def __init__(self, dim, multiplier=1.0):
super().__init__()
linear1 = torch.nn.Linear(dim, dim * 2)
linear2 = torch.nn.Linear(dim * 2, dim)
linear1.weight.data.normal_(mean=0.0, std=0.01)
linear1.bias.data.zero_()
linear2.weight.data.normal_(mean=0.0, std=0.01)
linear2.bias.data.zero_()
linears = [linear1, linear2]
self.linear = torch.nn.Sequential(*linears)
self.multiplier = multiplier
def forward(self, x):
return x + self.linear(x) * self.multiplier
class Hypernetwork(torch.nn.Module):
enable_sizes = [320, 640, 768, 1280]
# return self.modules[Hypernetwork.enable_sizes.index(size)]
def __init__(self, multiplier=1.0) -> None:
super().__init__()
self.modules = []  # note: this shadows nn.Module.modules(); layout kept for NAI compatibility
for size in Hypernetwork.enable_sizes:
self.modules.append((HypernetworkModule(size, multiplier), HypernetworkModule(size, multiplier)))
self.register_module(f"{size}_0", self.modules[-1][0])
self.register_module(f"{size}_1", self.modules[-1][1])
def apply_to_stable_diffusion(self, text_encoder, vae, unet):
blocks = unet.input_blocks + [unet.middle_block] + unet.output_blocks
for block in blocks:
for subblk in block:
if 'SpatialTransformer' in str(type(subblk)):
for tf_block in subblk.transformer_blocks:
for attn in [tf_block.attn1, tf_block.attn2]:
size = attn.context_dim
if size in Hypernetwork.enable_sizes:
attn.hypernetwork = self
else:
attn.hypernetwork = None
def apply_to_diffusers(self, text_encoder, vae, unet):
blocks = unet.down_blocks + [unet.mid_block] + unet.up_blocks
for block in blocks:
if hasattr(block, 'attentions'):
for subblk in block.attentions:
if 'SpatialTransformer' in str(type(subblk)) or 'Transformer2DModel' in str(type(subblk)): # 0.6.0 and 0.7~
for tf_block in subblk.transformer_blocks:
for attn in [tf_block.attn1, tf_block.attn2]:
size = attn.to_k.in_features
if size in Hypernetwork.enable_sizes:
attn.hypernetwork = self
else:
attn.hypernetwork = None
return True # TODO error checking
def forward(self, x, context):
size = context.shape[-1]
assert size in Hypernetwork.enable_sizes
module = self.modules[Hypernetwork.enable_sizes.index(size)]
return module[0].forward(context), module[1].forward(context)
def load_from_state_dict(self, state_dict):
# old ver to new ver
changes = {
'linear1.bias': 'linear.0.bias',
'linear1.weight': 'linear.0.weight',
'linear2.bias': 'linear.1.bias',
'linear2.weight': 'linear.1.weight',
}
for key_from, key_to in changes.items():
if key_from in state_dict:
state_dict[key_to] = state_dict[key_from]
del state_dict[key_from]
for size, sd in state_dict.items():
if type(size) == int:
self.modules[Hypernetwork.enable_sizes.index(size)][0].load_state_dict(sd[0], strict=True)
self.modules[Hypernetwork.enable_sizes.index(size)][1].load_state_dict(sd[1], strict=True)
return True
def get_state_dict(self):
state_dict = {}
for i, size in enumerate(Hypernetwork.enable_sizes):
sd0 = self.modules[i][0].state_dict()
sd1 = self.modules[i][1].state_dict()
state_dict[size] = [sd0, sd1]
return state_dict
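
Each pair of modules is a residual two-layer MLP, so the outputs keep the context shape. A minimal shape check (assumes the file is importable as hypernetwork_nai):

import torch
from hypernetwork_nai import Hypernetwork

hn = Hypernetwork()
ctx = torch.randn(2, 77, 768)  # 768 is one of the enabled sizes
k, v = hn(None, ctx)           # x is unused in forward
print(k.shape, v.shape)        # torch.Size([2, 77, 768]) twice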

98
make_captions.py Normal file

@@ -0,0 +1,98 @@
# This script is licensed under the Apache License 2.0
# (c) 2022 Kohya S. @kohya_ss
import argparse
import glob
import os
import json
from PIL import Image
from tqdm import tqdm
import numpy as np
import torch
from torchvision import transforms
from torchvision.transforms.functional import InterpolationMode
from models.blip import blip_decoder
# from Salesforce_BLIP.models.blip import blip_decoder
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
def main(args):
image_paths = glob.glob(os.path.join(args.train_data_dir, "*.jpg")) + \
glob.glob(os.path.join(args.train_data_dir, "*.png")) + glob.glob(os.path.join(args.train_data_dir, "*.webp"))
print(f"found {len(image_paths)} images.")
print(f"loading BLIP caption: {args.caption_weights}")
image_size = 384
model = blip_decoder(pretrained=args.caption_weights, image_size=image_size, vit='large')
model.eval()
model = model.to(DEVICE)
print("BLIP loaded")
# a square crop may not be ideal, but the original source does it this way
transform = transforms.Compose([
transforms.Resize((image_size, image_size), interpolation=InterpolationMode.BICUBIC),
transforms.ToTensor(),
transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))
])
# run captioning
def run_batch(path_imgs):
imgs = torch.stack([im for _, im in path_imgs]).to(DEVICE)
with torch.no_grad():
if args.beam_search:
captions = model.generate(imgs, sample=False, num_beams=args.num_beams,
max_length=args.max_length, min_length=args.min_length)
else:
captions = model.generate(imgs, sample=True, top_p=args.top_p, max_length=args.max_length, min_length=args.min_length)
for (image_path, _), caption in zip(path_imgs, captions):
with open(os.path.splitext(image_path)[0] + args.caption_extension, "wt", encoding='utf-8') as f:
f.write(caption + "\n")
if args.debug:
print(image_path, caption)
b_imgs = []
for image_path in tqdm(image_paths, smoothing=0.0):
raw_image = Image.open(image_path)
if raw_image.mode != "RGB":
print(f"convert image mode {raw_image.mode} to RGB: {image_path}")
raw_image = raw_image.convert("RGB")
image = transform(raw_image)
b_imgs.append((image_path, image))
if len(b_imgs) >= args.batch_size:
run_batch(b_imgs)
b_imgs.clear()
if len(b_imgs) > 0:
run_batch(b_imgs)
print("done!")
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("train_data_dir", type=str, help="directory for train images / 学習画像データのディレクトリ")
parser.add_argument("caption_weights", type=str,
help="BLIP caption weights (model_large_caption.pth) / BLIP captionの重みファイル(model_large_caption.pth)")
parser.add_argument("--caption_extention", type=str, default=None,
help="extension of caption file (for backward compatibility) / 出力されるキャプションファイルの拡張子(スペルミスしていたのを残してあります)")
parser.add_argument("--caption_extension", type=str, default=".caption", help="extension of caption file / 出力されるキャプションファイルの拡張子")
parser.add_argument("--beam_search", action="store_true",
help="use beam search (default Nucleus sampling) / beam searchを使うこのオプション未指定時はNucleus sampling")
parser.add_argument("--batch_size", type=int, default=1, help="batch size in inference / 推論時のバッチサイズ")
parser.add_argument("--num_beams", type=int, default=1, help="num of beams in beam search /beam search時のビーム数多いと精度が上がるが時間がかかる")
parser.add_argument("--top_p", type=float, default=0.9, help="top_p in Nucleus sampling / Nucleus sampling時のtop_p")
parser.add_argument("--max_length", type=int, default=75, help="max length of caption / captionの最大長")
parser.add_argument("--min_length", type=int, default=5, help="min length of caption / captionの最小長")
parser.add_argument("--debug", action="store_true", help="debug mode")
args = parser.parse_args()
# restore the misspelled option (backward compatibility)
if args.caption_extention is not None:
args.caption_extension = args.caption_extention
main(args)
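
The preprocessing is BLIP's CLIP-style pipeline. A quick standalone check of the transform above on a dummy image:

from PIL import Image
from torchvision import transforms
from torchvision.transforms.functional import InterpolationMode

transform = transforms.Compose([
    transforms.Resize((384, 384), interpolation=InterpolationMode.BICUBIC),
    transforms.ToTensor(),
    transforms.Normalize((0.48145466, 0.4578275, 0.40821073),
                         (0.26862954, 0.26130258, 0.27577711)),
])
print(transform(Image.new("RGB", (640, 480))).shape)  # torch.Size([3, 384, 384])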


@@ -0,0 +1,68 @@
# This script is licensed under the Apache License 2.0
# (c) 2022 Kohya S. @kohya_ss
import argparse
import glob
import os
import json
from tqdm import tqdm
def main(args):
image_paths = glob.glob(os.path.join(args.train_data_dir, "*.jpg")) + \
glob.glob(os.path.join(args.train_data_dir, "*.png")) + glob.glob(os.path.join(args.train_data_dir, "*.webp"))
print(f"found {len(image_paths)} images.")
if args.in_json is None and os.path.isfile(args.out_json):
args.in_json = args.out_json
if args.in_json is not None:
print(f"loading existing metadata: {args.in_json}")
with open(args.in_json, "rt", encoding='utf-8') as f:
metadata = json.load(f)
print("captions for existing images will be overwritten / 既存の画像のキャプションは上書きされます")
else:
print("new metadata will be created / 新しいメタデータファイルが作成されます")
metadata = {}
print("merge caption texts to metadata json.")
for image_path in tqdm(image_paths):
caption_path = os.path.splitext(image_path)[0] + args.caption_extension
with open(caption_path, "rt", encoding='utf-8') as f:
caption = f.readlines()[0].strip()
image_key = image_path if args.full_path else os.path.splitext(os.path.basename(image_path))[0]
if image_key not in metadata:
metadata[image_key] = {}
metadata[image_key]['caption'] = caption
if args.debug:
print(image_key, caption)
# write out the metadata and finish
print(f"writing metadata: {args.out_json}")
with open(args.out_json, "wt", encoding='utf-8') as f:
json.dump(metadata, f, indent=2)
print("done!")
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("train_data_dir", type=str, help="directory for train images / 学習画像データのディレクトリ")
parser.add_argument("out_json", type=str, help="metadata file to output / メタデータファイル書き出し先")
parser.add_argument("--in_json", type=str, help="metadata file to input (if omitted and out_json exists, existing out_json is read) / 読み込むメタデータファイル省略時、out_jsonが存在すればそれを読み込む")
parser.add_argument("--caption_extention", type=str, default=None,
help="extension of caption file (for backward compatibility) / 読み込むキャプションファイルの拡張子(スペルミスしていたのを残してあります)")
parser.add_argument("--caption_extension", type=str, default=".caption", help="extension of caption file / 読み込むキャプションファイルの拡張子")
parser.add_argument("--full_path", action="store_true",
help="use full path as image-key in metadata (supports multiple directories) / メタデータで画像キーをフルパスにする(複数の学習画像ディレクトリに対応)")
parser.add_argument("--debug", action="store_true", help="debug mode")
args = parser.parse_args()
# restore the misspelled option (backward compatibility)
if args.caption_extention is not None:
args.caption_extension = args.caption_extention
main(args)
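
The resulting metadata is a flat JSON object keyed by image stem (or by full path with --full_path). A sketch of the structure with hypothetical entries:

import json

metadata = {
    "img0001": {"caption": "a girl standing in a field"},
    "img0002": {"caption": "an image of a girl"},
}
print(json.dumps(metadata, indent=2))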


@@ -0,0 +1,60 @@
# This script is licensed under the Apache License 2.0
# (c) 2022 Kohya S. @kohya_ss
import argparse
import glob
import os
import json
from tqdm import tqdm
def main(args):
image_paths = glob.glob(os.path.join(args.train_data_dir, "*.jpg")) + \
glob.glob(os.path.join(args.train_data_dir, "*.png")) + glob.glob(os.path.join(args.train_data_dir, "*.webp"))
print(f"found {len(image_paths)} images.")
if args.in_json is None and os.path.isfile(args.out_json):
args.in_json = args.out_json
if args.in_json is not None:
print(f"loading existing metadata: {args.in_json}")
with open(args.in_json, "rt", encoding='utf-8') as f:
metadata = json.load(f)
print("tags data for existing images will be overwritten / 既存の画像のタグは上書きされます")
else:
print("new metadata will be created / 新しいメタデータファイルが作成されます")
metadata = {}
print("merge tags to metadata json.")
for image_path in tqdm(image_paths):
tags_path = os.path.splitext(image_path)[0] + '.txt'
with open(tags_path, "rt", encoding='utf-8') as f:
tags = f.readlines()[0].strip()
image_key = image_path if args.full_path else os.path.splitext(os.path.basename(image_path))[0]
if image_key not in metadata:
metadata[image_key] = {}
metadata[image_key]['tags'] = tags
if args.debug:
print(image_key, tags)
# write out the metadata and finish
print(f"writing metadata: {args.out_json}")
with open(args.out_json, "wt", encoding='utf-8') as f:
json.dump(metadata, f, indent=2)
print("done!")
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("train_data_dir", type=str, help="directory for train images / 学習画像データのディレクトリ")
parser.add_argument("out_json", type=str, help="metadata file to output / メタデータファイル書き出し先")
parser.add_argument("--in_json", type=str, help="metadata file to input (if omitted and out_json exists, existing out_json is read) / 読み込むメタデータファイル省略時、out_jsonが存在すればそれを読み込む")
parser.add_argument("--full_path", action="store_true",
help="use full path as image-key in metadata (supports multiple directories) / メタデータで画像キーをフルパスにする(複数の学習画像ディレクトリに対応)")
parser.add_argument("--debug", action="store_true", help="debug mode, print tags")
args = parser.parse_args()
main(args)
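
After running both merge scripts against the same out_json, each entry carries both fields, which clean_captions_and_tags.py then processes. A hypothetical entry:

metadata_entry = {
    "img0001": {
        "caption": "a girl standing in a field",
        "tags": "1girl, outdoors, field",
    }
}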

1182
model_util.py Normal file

File diff suppressed because it is too large

177
prepare_buckets_latents.py Normal file

@@ -0,0 +1,177 @@
# This script is licensed under the Apache License 2.0
# (c) 2022 Kohya S. @kohya_ss
import argparse
import glob
import os
import json
from tqdm import tqdm
import numpy as np
from diffusers import AutoencoderKL
from PIL import Image
import cv2
import torch
from torchvision import transforms
import model_util
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
IMAGE_TRANSFORMS = transforms.Compose(
[
transforms.ToTensor(),
transforms.Normalize([0.5], [0.5]),
]
)
def get_latents(vae, images, weight_dtype):
img_tensors = [IMAGE_TRANSFORMS(image) for image in images]
img_tensors = torch.stack(img_tensors)
img_tensors = img_tensors.to(DEVICE, weight_dtype)
with torch.no_grad():
latents = vae.encode(img_tensors).latent_dist.sample().float().to("cpu").numpy()
return latents
def main(args):
image_paths = glob.glob(os.path.join(args.train_data_dir, "*.jpg")) + \
glob.glob(os.path.join(args.train_data_dir, "*.png")) + glob.glob(os.path.join(args.train_data_dir, "*.webp"))
print(f"found {len(image_paths)} images.")
if os.path.exists(args.in_json):
print(f"loading existing metadata: {args.in_json}")
with open(args.in_json, "rt", encoding='utf-8') as f:
metadata = json.load(f)
else:
print(f"no metadata / メタデータファイルがありません: {args.in_json}")
return
weight_dtype = torch.float32
if args.mixed_precision == "fp16":
weight_dtype = torch.float16
elif args.mixed_precision == "bf16":
weight_dtype = torch.bfloat16
vae = model_util.load_vae(args.model_name_or_path, weight_dtype)
vae.eval()
vae.to(DEVICE, dtype=weight_dtype)
# compute bucket sizes
max_reso = tuple([int(t) for t in args.max_resolution.split(',')])
assert len(max_reso) == 2, f"illegal resolution (not 'width,height') / 画像サイズに誤りがあります。'幅,高さ'で指定してください: {args.max_resolution}"
bucket_resos, bucket_aspect_ratios = model_util.make_bucket_resolutions(
max_reso, args.min_bucket_reso, args.max_bucket_reso)
# assign each image to an appropriate bucket and compute latents
bucket_aspect_ratios = np.array(bucket_aspect_ratios)
buckets_imgs = [[] for _ in range(len(bucket_resos))]
bucket_counts = [0 for _ in range(len(bucket_resos))]
img_ar_errors = []
for i, image_path in enumerate(tqdm(image_paths, smoothing=0.0)):
image_key = image_path if args.full_path else os.path.splitext(os.path.basename(image_path))[0]
if image_key not in metadata:
metadata[image_key] = {}
image = Image.open(image_path)
if image.mode != 'RGB':
image = image.convert("RGB")
aspect_ratio = image.width / image.height
ar_errors = bucket_aspect_ratios - aspect_ratio
bucket_id = np.abs(ar_errors).argmin()
reso = bucket_resos[bucket_id]
ar_error = ar_errors[bucket_id]
img_ar_errors.append(abs(ar_error))
# decide the resize target -> resize so that the excess is trimmed
if ar_error <= 0:  # image is wider than the bucket -> fit the height
scale = reso[1] / image.height
else:
scale = reso[0] / image.width
resized_size = (int(image.width * scale + .5), int(image.height * scale + .5))
# print(image.width, image.height, bucket_id, bucket_resos[bucket_id], ar_errors[bucket_id], resized_size,
# bucket_resos[bucket_id][0] - resized_size[0], bucket_resos[bucket_id][1] - resized_size[1])
assert resized_size[0] == reso[0] or resized_size[1] == reso[
1], f"internal error, resized size not match: {reso}, {resized_size}, {image.width}, {image.height}"
assert resized_size[0] >= reso[0] and resized_size[1] >= reso[
1], f"internal error, resized size too small: {reso}, {resized_size}, {image.width}, {image.height}"
# resize and trim the image
# PIL has no INTER_AREA, so use cv2
image = np.array(image)
image = cv2.resize(image, resized_size, interpolation=cv2.INTER_AREA)
if resized_size[0] > reso[0]:
trim_size = resized_size[0] - reso[0]
image = image[:, trim_size//2:trim_size//2 + reso[0]]
elif resized_size[1] > reso[1]:
trim_size = resized_size[1] - reso[1]
image = image[trim_size//2:trim_size//2 + reso[1]]
assert image.shape[0] == reso[1] and image.shape[1] == reso[0], f"internal error, illegal trimmed size: {image.shape}, {reso}"
# # debug
# cv2.imwrite(f"r:\\test\\img_{i:05d}.jpg", image[:, :, ::-1])
# add to the batch
buckets_imgs[bucket_id].append((image_key, reso, image))
bucket_counts[bucket_id] += 1
metadata[image_key]['train_resolution'] = reso
# run inference when a batch is full (or this is the last image)
is_last = i == len(image_paths) - 1
for j in range(len(buckets_imgs)):
bucket = buckets_imgs[j]
if (is_last and len(bucket) > 0) or len(bucket) >= args.batch_size:
latents = get_latents(vae, [img for _, _, img in bucket], weight_dtype)
for (image_key, reso, _), latent in zip(bucket, latents):
np.savez(os.path.join(args.train_data_dir, os.path.splitext(os.path.basename(image_key))[0]), latent)
# flip
if args.flip_aug:
latents = get_latents(vae, [img[:, ::-1].copy() for _, _, img in bucket], weight_dtype)  # without copy, the tensor conversion fails
for (image_key, reso, _), latent in zip(bucket, latents):
np.savez(os.path.join(args.train_data_dir, os.path.splitext(os.path.basename(image_key))[0] + '_flip'), latent)
bucket.clear()
for i, (reso, count) in enumerate(zip(bucket_resos, bucket_counts)):
print(f"bucket {i} {reso}: {count}")
img_ar_errors = np.array(img_ar_errors)
print(f"mean ar error: {np.mean(img_ar_errors)}")
# write out the metadata and finish
print(f"writing metadata: {args.out_json}")
with open(args.out_json, "wt", encoding='utf-8') as f:
json.dump(metadata, f, indent=2)
print("done!")
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("train_data_dir", type=str, help="directory for train images / 学習画像データのディレクトリ")
parser.add_argument("in_json", type=str, help="metadata file to input / 読み込むメタデータファイル")
parser.add_argument("out_json", type=str, help="metadata file to output / メタデータファイル書き出し先")
parser.add_argument("model_name_or_path", type=str, help="model name or path to encode latents / latentを取得するためのモデル")
parser.add_argument("--v2", action='store_true',
help='load Stable Diffusion v2.0 model / Stable Diffusion 2.0のモデルを読み込む')
parser.add_argument("--batch_size", type=int, default=1, help="batch size in inference / 推論時のバッチサイズ")
parser.add_argument("--max_resolution", type=str, default="512,512",
help="max resolution in fine tuning (width,height) / fine tuning時の最大画像サイズ 「幅,高さ」(使用メモリ量に関係します)")
parser.add_argument("--min_bucket_reso", type=int, default=256, help="minimum resolution for buckets / bucketの最小解像度")
parser.add_argument("--max_bucket_reso", type=int, default=1024, help="maximum resolution for buckets / bucketの最小解像度")
parser.add_argument("--mixed_precision", type=str, default="no",
choices=["no", "fp16", "bf16"], help="use mixed precision / 混合精度を使う場合、その精度")
parser.add_argument("--full_path", action="store_true",
help="use full path as image-key in metadata (supports multiple directories) / メタデータで画像キーをフルパスにする(複数の学習画像ディレクトリに対応)")
parser.add_argument("--flip_aug", action="store_true",
help="flip augmentation, save latents for flipped images / 左右反転した画像もlatentを取得、保存する")
args = parser.parse_args()
main(args)
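
Bucket assignment simply minimizes the aspect-ratio error against the available bucket resolutions. A standalone sketch of that arithmetic with a hypothetical bucket list:

import numpy as np

bucket_resos = [(512, 512), (576, 448), (448, 576), (640, 384), (384, 640)]  # hypothetical
bucket_ars = np.array([w / h for w, h in bucket_resos])

w, h = 640, 448  # example image
ar_errors = bucket_ars - (w / h)
bucket_id = np.abs(ar_errors).argmin()
print(bucket_resos[bucket_id])  # -> (576, 448), the closest aspect ratio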

3
requirements_blip.txt Normal file

@@ -0,0 +1,3 @@
timm==0.4.12
transformers==4.16.2
fairscale==0.4.4


@@ -0,0 +1,8 @@
accelerate==0.14.0
transformers>=4.21.0
ftfy
albumentations
opencv-python
einops
pytorch_lightning
safetensors


@@ -0,0 +1,2 @@
tensorflow<2.11
huggingface-hub


@@ -0,0 +1,143 @@
# This script is licensed under the Apache License 2.0
# (c) 2022 Kohya S. @kohya_ss
import argparse
import csv
import glob
import os
from PIL import Image
import cv2
from tqdm import tqdm
import numpy as np
from tensorflow.keras.models import load_model
from huggingface_hub import hf_hub_download
# from wd14 tagger
IMAGE_SIZE = 448
WD14_TAGGER_REPO = 'SmilingWolf/wd-v1-4-vit-tagger'
FILES = ["keras_metadata.pb", "saved_model.pb", "selected_tags.csv"]
SUB_DIR = "variables"
SUB_DIR_FILES = ["variables.data-00000-of-00001", "variables.index"]
CSV_FILE = FILES[-1]
def main(args):
# using hf_hub_download as-is reportedly causes symlink problems, so work around it by specifying cache_dir and force_filename
# a deprecation warning is shown, but deal with that when the behavior is actually removed
# https://github.com/toriato/stable-diffusion-webui-wd14-tagger/issues/22
if not os.path.exists(args.model_dir) or args.force_download:
print("downloading wd14 tagger model from hf_hub")
for file in FILES:
hf_hub_download(args.repo_id, file, cache_dir=args.model_dir, force_download=True, force_filename=file)
for file in SUB_DIR_FILES:
hf_hub_download(args.repo_id, file, subfolder=SUB_DIR, cache_dir=os.path.join(
args.model_dir, SUB_DIR), force_download=True, force_filename=file)
# load images
image_paths = glob.glob(os.path.join(args.train_data_dir, "*.jpg")) + \
glob.glob(os.path.join(args.train_data_dir, "*.png")) + glob.glob(os.path.join(args.train_data_dir, "*.webp"))
print(f"found {len(image_paths)} images.")
print("loading model and labels")
model = load_model(args.model_dir)
# label_names = pd.read_csv("2022_0000_0899_6549/selected_tags.csv")
# read the CSV manually to avoid adding dependencies
with open(os.path.join(args.model_dir, CSV_FILE), "r", encoding="utf-8") as f:
reader = csv.reader(f)
l = [row for row in reader]
header = l[0] # tag_id,name,category,count
rows = l[1:]
assert header[0] == 'tag_id' and header[1] == 'name' and header[2] == 'category', f"unexpected csv format: {header}"
tags = [row[1] for row in rows[1:] if row[2] == '0']  # category 0 means general tags only
# run inference
def run_batch(path_imgs):
imgs = np.array([im for _, im in path_imgs])
probs = model(imgs, training=False)
probs = probs.numpy()
for (image_path, _), prob in zip(path_imgs, probs):
# ignore the first 4 outputs; they are ratings
# # First 4 labels are actually ratings: pick one with argmax
# ratings_names = label_names[:4]
# rating_index = ratings_names["probs"].argmax()
# found_rating = ratings_names[rating_index: rating_index + 1][["name", "probs"]]
# Everything else is tags: pick any where prediction confidence > threshold
tag_text = ""
for i, p in enumerate(prob[4:]):  # numpy would be cleaner, but the count is small enough for a loop
if p >= args.thresh:
tag_text += ", " + tags[i]
if len(tag_text) > 0:
tag_text = tag_text[2:]  # remove the leading ', '
with open(os.path.splitext(image_path)[0] + args.caption_extension, "wt", encoding='utf-8') as f:
f.write(tag_text + '\n')
if args.debug:
print(image_path, tag_text)
b_imgs = []
for image_path in tqdm(image_paths, smoothing=0.0):
img = Image.open(image_path)  # open with Pillow because cv2 fails on Japanese file names and we want mode conversion
if img.mode != 'RGB':
img = img.convert("RGB")
img = np.array(img)
img = img[:, :, ::-1] # RGB->BGR
# pad to square
size = max(img.shape[0:2])
pad_x = size - img.shape[1]
pad_y = size - img.shape[0]
pad_l = pad_x // 2
pad_t = pad_y // 2
img = np.pad(img, ((pad_t, pad_y - pad_t), (pad_l, pad_x - pad_l), (0, 0)), mode='constant', constant_values=255)
interp = cv2.INTER_AREA if size > IMAGE_SIZE else cv2.INTER_LANCZOS4
img = cv2.resize(img, (IMAGE_SIZE, IMAGE_SIZE), interpolation=interp)
# cv2.imshow("img", img)
# cv2.waitKey()
# cv2.destroyAllWindows()
img = img.astype(np.float32)
b_imgs.append((image_path, img))
if len(b_imgs) >= args.batch_size:
run_batch(b_imgs)
b_imgs.clear()
if len(b_imgs) > 0:
run_batch(b_imgs)
print("done!")
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("train_data_dir", type=str, help="directory for train images / 学習画像データのディレクトリ")
parser.add_argument("--repo_id", type=str, default=WD14_TAGGER_REPO,
help="repo id for wd14 tagger on Hugging Face / Hugging Faceのwd14 taggerのリポジトリID")
parser.add_argument("--model_dir", type=str, default="wd14_tagger_model",
help="directory to store wd14 tagger model / wd14 taggerのモデルを格納するディレクトリ")
parser.add_argument("--force_download", action='store_true',
help="force downloading wd14 tagger models / wd14 taggerのモデルを再ダウンロードします")
parser.add_argument("--thresh", type=float, default=0.35, help="threshold of confidence to add a tag / タグを追加するか判定する閾値")
parser.add_argument("--batch_size", type=int, default=1, help="batch size in inference / 推論時のバッチサイズ")
parser.add_argument("--caption_extention", type=str, default=None,
help="extension of caption file (for backward compatibility) / 出力されるキャプションファイルの拡張子(スペルミスしていたのを残してあります)")
parser.add_argument("--caption_extension", type=str, default=".txt", help="extension of caption file / 出力されるキャプションファイルの拡張子")
parser.add_argument("--debug", action="store_true", help="debug mode")
args = parser.parse_args()
# restore the misspelled option (backward compatibility)
if args.caption_extention is not None:
args.caption_extension = args.caption_extention
main(args)
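
Tag selection is a simple confidence threshold over the non-rating outputs. A standalone sketch with hypothetical labels and probabilities:

import numpy as np

tags = ["1girl", "smile", "outdoors"]  # hypothetical labels (after the 4 rating outputs)
prob = np.array([0.95, 0.40, 0.10])    # hypothetical confidences
thresh = 0.35
print(", ".join(t for t, p in zip(tags, prob) if p >= thresh))  # -> "1girl, smile"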

1228
train_db_fixed.py Normal file

File diff suppressed because it is too large Load Diff