fix: tokenizer 2 is not the same as the open clip tokenizer

This commit is contained in:
Kohya S
2023-07-14 12:27:19 +09:00
parent b4a3824ce4
commit 9de357e373

View File

@@ -114,7 +114,7 @@ def load_tokenizers(args: argparse.Namespace):
original_paths = [TOKENIZER1_PATH, TOKENIZER2_PATH]
tokeniers = []
for original_path in original_paths:
for i, original_path in enumerate(original_paths):
tokenizer: CLIPTokenizer = None
if args.tokenizer_cache_dir:
local_tokenizer_path = os.path.join(args.tokenizer_cache_dir, original_path.replace("/", "_"))
@@ -129,6 +129,9 @@ def load_tokenizers(args: argparse.Namespace):
print(f"save Tokenizer to cache: {local_tokenizer_path}")
tokenizer.save_pretrained(local_tokenizer_path)
if i == 1:
tokenizer.pad_token_id = 0 # fix pad token id to make same as open clip tokenizer
tokeniers.append(tokenizer)
if hasattr(args, "max_token_length") and args.max_token_length is not None: