From 95bc6e8749dde79b66b7150a86b7f84e6c6ec7fc Mon Sep 17 00:00:00 2001
From: Kohya S <ykumeykume@gmail.com>
Date: Sun, 3 Sep 2023 12:46:40 +0900
Subject: [PATCH 1/4] update readme

---
 README.md | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index ada3b3c9..524e63aa 100644
--- a/README.md
+++ b/README.md
@@ -22,11 +22,21 @@ __Stable Diffusion web UI now seems to support LoRA trained by ``sd-scripts``.__
 
 The feature of SDXL training is now available in sdxl branch as an experimental feature. 
 
-Aug 13, 2023: The feature will be merged into the main branch soon. Following are the changes from the previous version. 
+Sep 3, 2023: The feature will be merged into the main branch soon. Following are the changes from the previous version. 
+
+- ControlNet-LLLite is added. See [documenation](./docs/train_lllite_README.md) for details.
+- JPEG XL is supported. [#786](./pull/786) 
+- Peak memory usage is reduced. [#791](./pull/791)
+- Input perturbation noise is added. See [#798](./pull/798) for details.
+- Dataset subset now has `caption_prefix` and `caption_suffix` options. The strings are added to the beginning and the end of the captions before shuffling. You can specify the options in `.toml`.
+- Other minor changes.
+- Thanks for contributions from Isotr0py, vvern999, lansing  and others!
+
+Aug 13, 2023: 
 
 - LoRA-FA is added experimentally. Specify `--network_module networks.lora_fa` option instead of `--network_module networks.lora`. The trained model can be used as a normal LoRA model.
 
-Aug 12, 2023: Following are the changes from the previous version. 
+Aug 12, 2023: 
 
 - The default value of noise offset when omitted has been changed to 0 from 0.0357.
 - The different learning rates for each U-Net block are now supported. Specify with `--block_lr` option. Specify 23 values separated by commas like `--block_lr 1e-3,1e-3 ... 1e-3`.

From 5f08a21d123380c72a7acf8bde4d61f9fcfe16bc Mon Sep 17 00:00:00 2001
From: Kohya S <ykumeykume@gmail.com>
Date: Sun, 3 Sep 2023 12:48:35 +0900
Subject: [PATCH 2/4] update readme

---
 README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 524e63aa..0d932f1d 100644
--- a/README.md
+++ b/README.md
@@ -25,9 +25,9 @@ The feature of SDXL training is now available in sdxl branch as an experimental
 Sep 3, 2023: The feature will be merged into the main branch soon. Following are the changes from the previous version. 
 
 - ControlNet-LLLite is added. See [documenation](./docs/train_lllite_README.md) for details.
-- JPEG XL is supported. [#786](./pull/786) 
-- Peak memory usage is reduced. [#791](./pull/791)
-- Input perturbation noise is added. See [#798](./pull/798) for details.
+- JPEG XL is supported. [#786](https://github.com/kohya-ss/sd-scripts/pull/786) 
+- Peak memory usage is reduced. [#791](https://github.com/kohya-ss/sd-scripts/pull/791)
+- Input perturbation noise is added. See [#798](https://github.com/kohya-ss/sd-scripts/pull/798) for details.
 - Dataset subset now has `caption_prefix` and `caption_suffix` options. The strings are added to the beginning and the end of the captions before shuffling. You can specify the options in `.toml`.
 - Other minor changes.
 - Thanks for contributions from Isotr0py, vvern999, lansing  and others!

From 867e7d3238b7f6d5400bd3f44399d65c42ef8b39 Mon Sep 17 00:00:00 2001
From: Kohya S <ykumeykume@gmail.com>
Date: Sun, 3 Sep 2023 12:49:51 +0900
Subject: [PATCH 3/4] fix typo

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 0d932f1d..7d46dc9d 100644
--- a/README.md
+++ b/README.md
@@ -24,7 +24,7 @@ The feature of SDXL training is now available in sdxl branch as an experimental
 
 Sep 3, 2023: The feature will be merged into the main branch soon. Following are the changes from the previous version. 
 
-- ControlNet-LLLite is added. See [documenation](./docs/train_lllite_README.md) for details.
+- ControlNet-LLLite is added. See [documentation](./docs/train_lllite_README.md) for details.
 - JPEG XL is supported. [#786](https://github.com/kohya-ss/sd-scripts/pull/786) 
 - Peak memory usage is reduced. [#791](https://github.com/kohya-ss/sd-scripts/pull/791)
 - Input perturbation noise is added. See [#798](https://github.com/kohya-ss/sd-scripts/pull/798) for details.

From 207fc8b2561f459e0e7a8e9fa7b88fd7b0eda69c Mon Sep 17 00:00:00 2001
From: Kohya S <ykumeykume@gmail.com>
Date: Sun, 3 Sep 2023 17:50:27 +0900
Subject: [PATCH 4/4] fix regional LoRA to work

---
 networks/lora.py | 15 ++++++++++-----
 sdxl_gen_img.py  | 10 ++++++++--
 2 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/networks/lora.py b/networks/lora.py
index cd73cbe7..0c75cd42 100644
--- a/networks/lora.py
+++ b/networks/lora.py
@@ -241,9 +241,13 @@ class LoRAInfModule(LoRAModule):
         else:
             area = x.size()[1]
 
-        mask = self.network.mask_dic[area]
+        mask = self.network.mask_dic.get(area, None)
         if mask is None:
-            raise ValueError(f"mask is None for resolution {area}")
+            # raise ValueError(f"mask is None for resolution {area}")
+            # emb_layers in SDXL have no mask: fall back to a uniform mask (1 / num_sub_prompts)
+            # print(f"mask is None for resolution {area}, {x.size()}")
+            mask_size = (1, x.size()[1]) if len(x.size()) == 2 else (1, *x.size()[1:-1], 1)
+            return torch.ones(mask_size, dtype=x.dtype, device=x.device) / self.network.num_sub_prompts
         if len(x.size()) != 4:
             mask = torch.reshape(mask, (1, -1, 1))
         return mask
@@ -348,9 +352,10 @@ class LoRAInfModule(LoRAModule):
             out[-self.network.batch_size :] = x[-self.network.batch_size :]  # real_uncond
 
         # print("to_out_forward", self.lora_name, self.network.sub_prompt_index, self.network.num_sub_prompts)
-        # for i in range(len(masks)):
-        #     if masks[i] is None:
-        #         masks[i] = torch.zeros_like(masks[-1])
+        # if num_sub_prompts > number of LoRAs, fill the missing masks with zeros
+        for i in range(len(masks)):
+            if masks[i] is None:
+                masks[i] = torch.zeros_like(masks[0])
 
         mask = torch.cat(masks)
         mask_sum = torch.sum(mask, dim=0) + 1e-4
diff --git a/sdxl_gen_img.py b/sdxl_gen_img.py
index 35bd6c61..c506ad3f 100755
--- a/sdxl_gen_img.py
+++ b/sdxl_gen_img.py
@@ -451,10 +451,11 @@ class PipelineLike:
         tes_text_embs = []
         tes_uncond_embs = []
         tes_real_uncond_embs = []
-        # use last pool
+
         for tokenizer, text_encoder in zip(self.tokenizers, self.text_encoders):
             token_replacer = self.get_token_replacer(tokenizer)
 
+            # text_pool is overwritten on every iteration; the last one is kept, because it is from text encoder 2
             text_embeddings, text_pool, uncond_embeddings, uncond_pool, _ = get_weighted_text_embeddings(
                 tokenizer,
                 text_encoder,
@@ -529,6 +530,11 @@ class PipelineLike:
         c_vector = torch.cat([emb1, emb2, emb3], dim=1).to(self.device, dtype=text_embeddings.dtype).repeat(batch_size, 1)
         uc_vector = torch.cat([uc_emb1, emb2, emb3], dim=1).to(self.device, dtype=text_embeddings.dtype).repeat(batch_size, 1)
 
+        if reginonal_network:
+            # text_pool has num_sub_prompts entries per batch item; use only the last one of each for conditioning
+            num_sub_prompts = len(text_pool) // batch_size
+            text_pool = text_pool[num_sub_prompts - 1 :: num_sub_prompts]  # last subprompt
+
         c_vector = torch.cat([text_pool, c_vector], dim=1)
         uc_vector = torch.cat([uncond_pool, uc_vector], dim=1)
 
@@ -762,7 +768,7 @@ class PipelineLike:
         image = image.cpu().permute(0, 2, 3, 1).float().numpy()
 
         if torch.cuda.is_available():
-                torch.cuda.empty_cache()
+            torch.cuda.empty_cache()
 
         if output_type == "pil":
             # image = self.numpy_to_pil(image)