Merge branch 'dev' into sd3

This commit is contained in:
Kohya S
2024-09-07 10:59:22 +09:00
8 changed files with 53 additions and 22 deletions

View File

@@ -18,4 +18,4 @@ jobs:
- uses: actions/checkout@v4 - uses: actions/checkout@v4
- name: typos-action - name: typos-action
uses: crate-ci/typos@v1.21.0 uses: crate-ci/typos@v1.24.3

View File

@@ -553,7 +553,12 @@ The majority of scripts is licensed under ASL 2.0 (including codes from Diffuser
### Working in progress ### Working in progress
- When enlarging images in the script (when the size of the training image is small and bucket_no_upscale is not specified), it has been changed to use Pillow's resize and LANCZOS interpolation instead of OpenCV2's resize and Lanczos4 interpolation. The quality of the image enlargement may be slightly improved. PR [#1426](https://github.com/kohya-ss/sd-scripts/pull/1426) Thanks to sdbds!
- Sample image generation during training now works on non-CUDA devices. PR [#1433](https://github.com/kohya-ss/sd-scripts/pull/1433) Thanks to millie-v!
- `--v_parameterization` is available in `sdxl_train.py`. The results are unpredictable, so use with caution. PR [#1505](https://github.com/kohya-ss/sd-scripts/pull/1505) Thanks to liesened! - `--v_parameterization` is available in `sdxl_train.py`. The results are unpredictable, so use with caution. PR [#1505](https://github.com/kohya-ss/sd-scripts/pull/1505) Thanks to liesened!
- Fused optimizer is available for SDXL training. PR [#1259](https://github.com/kohya-ss/sd-scripts/pull/1259) Thanks to 2kpr! - Fused optimizer is available for SDXL training. PR [#1259](https://github.com/kohya-ss/sd-scripts/pull/1259) Thanks to 2kpr!
- The memory usage during training is significantly reduced by integrating the optimizer's backward pass with step. The training results are the same as before, but if you have plenty of memory, the speed will be slower. - The memory usage during training is significantly reduced by integrating the optimizer's backward pass with step. The training results are the same as before, but if you have plenty of memory, the speed will be slower.
- Specify the `--fused_backward_pass` option in `sdxl_train.py`. At this time, only Adafactor is supported. Gradient accumulation is not available. - Specify the `--fused_backward_pass` option in `sdxl_train.py`. At this time, only Adafactor is supported. Gradient accumulation is not available.

View File

@@ -11,7 +11,7 @@ from PIL import Image
from tqdm import tqdm from tqdm import tqdm
import library.train_util as train_util import library.train_util as train_util
from library.utils import setup_logging from library.utils import setup_logging, pil_resize
setup_logging() setup_logging()
import logging import logging
@@ -42,8 +42,10 @@ def preprocess_image(image):
pad_t = pad_y // 2 pad_t = pad_y // 2
image = np.pad(image, ((pad_t, pad_y - pad_t), (pad_l, pad_x - pad_l), (0, 0)), mode="constant", constant_values=255) image = np.pad(image, ((pad_t, pad_y - pad_t), (pad_l, pad_x - pad_l), (0, 0)), mode="constant", constant_values=255)
interp = cv2.INTER_AREA if size > IMAGE_SIZE else cv2.INTER_LANCZOS4 if size > IMAGE_SIZE:
image = cv2.resize(image, (IMAGE_SIZE, IMAGE_SIZE), interpolation=interp) image = cv2.resize(image, (IMAGE_SIZE, IMAGE_SIZE), cv2.INTER_AREA)
else:
image = pil_resize(image, (IMAGE_SIZE, IMAGE_SIZE))
image = image.astype(np.float32) image = image.astype(np.float32)
return image return image

View File

@@ -73,7 +73,7 @@ import library.model_util as model_util
import library.huggingface_util as huggingface_util import library.huggingface_util as huggingface_util
import library.sai_model_spec as sai_model_spec import library.sai_model_spec as sai_model_spec
import library.deepspeed_utils as deepspeed_utils import library.deepspeed_utils as deepspeed_utils
from library.utils import setup_logging from library.utils import setup_logging, pil_resize
setup_logging() setup_logging()
import logging import logging
@@ -1708,7 +1708,7 @@ class DreamBoothDataset(BaseDataset):
def load_dreambooth_dir(subset: DreamBoothSubset): def load_dreambooth_dir(subset: DreamBoothSubset):
if not os.path.isdir(subset.image_dir): if not os.path.isdir(subset.image_dir):
logger.warning(f"not directory: {subset.image_dir}") logger.warning(f"not directory: {subset.image_dir}")
return [], [] return [], [], []
info_cache_file = os.path.join(subset.image_dir, self.IMAGE_INFO_CACHE_FILE) info_cache_file = os.path.join(subset.image_dir, self.IMAGE_INFO_CACHE_FILE)
use_cached_info_for_subset = subset.cache_info use_cached_info_for_subset = subset.cache_info
@@ -2263,9 +2263,7 @@ class ControlNetDataset(BaseDataset):
# ), f"image size is small / 画像サイズが小さいようです: {image_info.absolute_path}" # ), f"image size is small / 画像サイズが小さいようです: {image_info.absolute_path}"
# resize to target # resize to target
if cond_img.shape[0] != target_size_hw[0] or cond_img.shape[1] != target_size_hw[1]: if cond_img.shape[0] != target_size_hw[0] or cond_img.shape[1] != target_size_hw[1]:
cond_img = cv2.resize( cond_img = pil_resize(cond_img, (int(target_size_hw[1]), int(target_size_hw[0])))
cond_img, (int(target_size_hw[1]), int(target_size_hw[0])), interpolation=cv2.INTER_LANCZOS4
)
if flipped: if flipped:
cond_img = cond_img[:, ::-1, :].copy() # copy to avoid negative stride cond_img = cond_img[:, ::-1, :].copy() # copy to avoid negative stride
@@ -2659,7 +2657,10 @@ def trim_and_resize_if_required(
if image_width != resized_size[0] or image_height != resized_size[1]: if image_width != resized_size[0] or image_height != resized_size[1]:
# リサイズする # リサイズする
image = cv2.resize(image, resized_size, interpolation=cv2.INTER_AREA) # INTER_AREAでやりたいのでcv2でリサイズ if image_width > resized_size[0] and image_height > resized_size[1]:
image = cv2.resize(image, resized_size, interpolation=cv2.INTER_AREA) # INTER_AREAでやりたいのでcv2でリサイズ
else:
image = pil_resize(image, resized_size)
image_height, image_width = image.shape[0:2] image_height, image_width = image.shape[0:2]
@@ -5657,7 +5658,7 @@ def sample_images_common(
clean_memory_on_device(accelerator.device) clean_memory_on_device(accelerator.device)
torch.set_rng_state(rng_state) torch.set_rng_state(rng_state)
if cuda_rng_state is not None: if torch.cuda.is_available() and cuda_rng_state is not None:
torch.cuda.set_rng_state(cuda_rng_state) torch.cuda.set_rng_state(cuda_rng_state)
vae.to(org_vae_device) vae.to(org_vae_device)
@@ -5691,11 +5692,13 @@ def sample_image_inference(
if seed is not None: if seed is not None:
torch.manual_seed(seed) torch.manual_seed(seed)
torch.cuda.manual_seed(seed) if torch.cuda.is_available():
torch.cuda.manual_seed(seed)
else: else:
# True random sample image generation # True random sample image generation
torch.seed() torch.seed()
torch.cuda.seed() if torch.cuda.is_available():
torch.cuda.seed()
scheduler = get_my_scheduler( scheduler = get_my_scheduler(
sample_sampler=sampler_name, sample_sampler=sampler_name,
@@ -5730,8 +5733,9 @@ def sample_image_inference(
controlnet_image=controlnet_image, controlnet_image=controlnet_image,
) )
with torch.cuda.device(torch.cuda.current_device()): if torch.cuda.is_available():
torch.cuda.empty_cache() with torch.cuda.device(torch.cuda.current_device()):
torch.cuda.empty_cache()
image = pipeline.latents_to_image(latents)[0] image = pipeline.latents_to_image(latents)[0]

View File

@@ -10,6 +10,9 @@ from torchvision import transforms
from diffusers import EulerAncestralDiscreteScheduler from diffusers import EulerAncestralDiscreteScheduler
import diffusers.schedulers.scheduling_euler_ancestral_discrete import diffusers.schedulers.scheduling_euler_ancestral_discrete
from diffusers.schedulers.scheduling_euler_ancestral_discrete import EulerAncestralDiscreteSchedulerOutput from diffusers.schedulers.scheduling_euler_ancestral_discrete import EulerAncestralDiscreteSchedulerOutput
import cv2
from PIL import Image
import numpy as np
def fire_in_thread(f, *args, **kwargs): def fire_in_thread(f, *args, **kwargs):
@@ -301,6 +304,17 @@ class MemoryEfficientSafeOpen:
# return byte_tensor.view(torch.uint8).to(torch.float16).reshape(shape) # return byte_tensor.view(torch.uint8).to(torch.float16).reshape(shape)
raise ValueError(f"Unsupported float8 type: {dtype_str} (upgrade PyTorch to support float8 types)") raise ValueError(f"Unsupported float8 type: {dtype_str} (upgrade PyTorch to support float8 types)")
def pil_resize(image, size, interpolation=Image.LANCZOS):
    """Resize an OpenCV-style (BGR ndarray) image using Pillow.

    Converts BGR -> RGB, resizes with Pillow (LANCZOS by default, per the
    parameter default), then converts back to BGR so the result stays
    compatible with cv2-based callers.

    :param image: BGR image as a numpy array (cv2 convention)
    :param size: target (width, height) tuple, as expected by PIL's resize
    :param interpolation: a Pillow resampling filter constant
    :return: resized BGR image as a numpy array
    """
    rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    resized = Image.fromarray(rgb).resize(size, interpolation)
    return cv2.cvtColor(np.array(resized), cv2.COLOR_RGB2BGR)
# TODO make inf_utils.py # TODO make inf_utils.py

View File

@@ -3,7 +3,7 @@ transformers==4.44.0
diffusers[torch]==0.25.0 diffusers[torch]==0.25.0
ftfy==6.1.1 ftfy==6.1.1
# albumentations==1.3.0 # albumentations==1.3.0
opencv-python==4.7.0.68 opencv-python==4.8.1.78
einops==0.7.0 einops==0.7.0
pytorch-lightning==1.9.0 pytorch-lightning==1.9.0
bitsandbytes==0.43.3 bitsandbytes==0.43.3

View File

@@ -15,7 +15,7 @@ import os
from anime_face_detector import create_detector from anime_face_detector import create_detector
from tqdm import tqdm from tqdm import tqdm
import numpy as np import numpy as np
from library.utils import setup_logging from library.utils import setup_logging, pil_resize
setup_logging() setup_logging()
import logging import logging
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -172,7 +172,10 @@ def process(args):
if scale != 1.0: if scale != 1.0:
w = int(w * scale + .5) w = int(w * scale + .5)
h = int(h * scale + .5) h = int(h * scale + .5)
face_img = cv2.resize(face_img, (w, h), interpolation=cv2.INTER_AREA if scale < 1.0 else cv2.INTER_LANCZOS4) if scale < 1.0:
face_img = cv2.resize(face_img, (w, h), interpolation=cv2.INTER_AREA)
else:
face_img = pil_resize(face_img, (w, h))
cx = int(cx * scale + .5) cx = int(cx * scale + .5)
cy = int(cy * scale + .5) cy = int(cy * scale + .5)
fw = int(fw * scale + .5) fw = int(fw * scale + .5)

View File

@@ -6,7 +6,7 @@ import shutil
import math import math
from PIL import Image from PIL import Image
import numpy as np import numpy as np
from library.utils import setup_logging from library.utils import setup_logging, pil_resize
setup_logging() setup_logging()
import logging import logging
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -24,9 +24,9 @@ def resize_images(src_img_folder, dst_img_folder, max_resolution="512x512", divi
# Select interpolation method # Select interpolation method
if interpolation == 'lanczos4': if interpolation == 'lanczos4':
cv2_interpolation = cv2.INTER_LANCZOS4 pil_interpolation = Image.LANCZOS
elif interpolation == 'cubic': elif interpolation == 'cubic':
cv2_interpolation = cv2.INTER_CUBIC pil_interpolation = Image.BICUBIC
else: else:
cv2_interpolation = cv2.INTER_AREA cv2_interpolation = cv2.INTER_AREA
@@ -64,7 +64,10 @@ def resize_images(src_img_folder, dst_img_folder, max_resolution="512x512", divi
new_width = int(img.shape[1] * math.sqrt(scale_factor)) new_width = int(img.shape[1] * math.sqrt(scale_factor))
# Resize image # Resize image
img = cv2.resize(img, (new_width, new_height), interpolation=cv2_interpolation) if cv2_interpolation:
img = cv2.resize(img, (new_width, new_height), interpolation=cv2_interpolation)
else:
img = pil_resize(img, (new_width, new_height), interpolation=pil_interpolation)
else: else:
new_height, new_width = img.shape[0:2] new_height, new_width = img.shape[0:2]