diff --git a/library/train_util.py b/library/train_util.py
index fc81a919..e9c5caef 100644
--- a/library/train_util.py
+++ b/library/train_util.py
@@ -14,7 +14,7 @@ import shutil
 import time
 import typing
 from typing import Any, Callable, Dict, List, NamedTuple, Optional, Sequence, Tuple, Union
-from accelerate import Accelerator, InitProcessGroupKwargs, DistributedDataParallelKwargs, PartialState, DataLoaderConfiguration
+from accelerate import Accelerator, InitProcessGroupKwargs, DistributedDataParallelKwargs, ProfileKwargs, PartialState, DataLoaderConfiguration
 import glob
 import math
 import os
@@ -5448,6 +5448,15 @@ def prepare_accelerator(args: argparse.Namespace):
             if args.ddp_gradient_as_bucket_view or args.ddp_static_graph
             else None
         ),
+        (
+            ProfileKwargs(
+                activities=["cpu", "cuda"],
+                output_trace_dir="/dev/shm/trace",
+                profile_memory=True,
+                record_shapes=True,
+                with_flops=True
+            )
+        )
     ]
     kwargs_handlers = [i for i in kwargs_handlers if i is not None]
     deepspeed_plugin = deepspeed_utils.prepare_deepspeed_plugin(args)
diff --git a/requirements-2.txt b/requirements-2.txt
new file mode 100644
index 00000000..2d7a2247
--- /dev/null
+++ b/requirements-2.txt
@@ -0,0 +1,48 @@
+accelerate
+transformers
+diffusers[torch]
+ftfy
+# albumentations==1.3.0
+opencv-python
+einops
+pytorch-lightning
+bitsandbytes
+prodigyopt
+lion-pytorch
+schedulefree
+tensorboard
+safetensors
+torchao
+pytorch-optimizer
+# gradio==3.16.2
+altair
+easygui
+toml
+voluptuous
+huggingface-hub
+# for Image utils
+imagesize
+numpy
+# for BLIP captioning
+# requests==2.28.2
+# timm==0.6.12
+# fairscale==0.4.13
+# for WD14 captioning (tensorflow)
+# tensorflow==2.10.1
+# for WD14 captioning (onnx)
+# onnx==1.15.0
+# onnxruntime-gpu==1.17.1
+# onnxruntime==1.17.1
+# for cuda 12.1(default 11.8)
+# onnxruntime-gpu --extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-cuda-12/pypi/simple/
+
+# this is for onnx:
+# protobuf==3.20.3
+# open clip for SDXL
+# open-clip-torch==2.20.0
+# For logging
+rich
+# for T5XXL tokenizer (SD3/FLUX)
+sentencepiece
+# for kohya_ss library
+-e .
diff --git a/train_native.py b/train_native.py
index 75d96133..91affa63 100644
--- a/train_native.py
+++ b/train_native.py
@@ -708,10 +708,10 @@ class NativeTrainer:
             unet.to(accelerator.device, dtype=weight_dtype)  # because of unet is not prepared
 
         # TODO: SDXL Model Specific
-        # TODO: Is casting to torch.tensor slowing down the performance so much? (20% slower)
+        # TODO: Why does casting to torch.tensor slow down the performance so much? (20% slower)
         training_models = []
         params_to_optimize = []
-        using_torchao = args.optimizer_type.endswith("4bit") or args.optimizer_type.endswith("Fp8")
+        using_torchao = args.optimizer_type.endswith("4bit") or args.optimizer_type.endswith("Fp8") 
         if train_unet:
             training_models.append(unet)
             if block_lrs is None:
@@ -1302,7 +1302,9 @@ class NativeTrainer:
                     f"initial_step is specified but not resuming. lr scheduler will be started from the beginning / initial_stepが指定されていますがresumeしていないため、lr schedulerは最初から始まります"
                 )
             logger.info(f"skipping {initial_step} steps / {initial_step}ステップをスキップします")
-            initial_step *= args.gradient_accumulation_steps
+
+            # 250406: Why multiply? It is already included.
+            #initial_step *= args.gradient_accumulation_steps
 
             # set epoch to start to make initial_step less than len(train_dataloader)
             epoch_to_start = initial_step // math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
@@ -1350,8 +1352,7 @@ class NativeTrainer:
             for skip_epoch in range(epoch_to_start):  # skip epochs
                 logger.info(f"skipping epoch {skip_epoch+1} because initial_step (multiplied) is {initial_step}")
                 initial_step -= len(train_dataloader)
-            # I have found that the log is screwed up. This should be divided back. 
-            global_step = int(initial_step / args.gradient_accumulation_steps)
+            global_step = initial_step
 
             # log device and dtype for each model
             logger.info(f"unet dtype: {unet_weight_dtype}, device: {unet.device}")
@@ -1380,6 +1381,8 @@ class NativeTrainer:
             initial_step = 1
 
         for step, batch in enumerate(skipped_dataloader or train_dataloader):
+            # Enable this for the profiler. Hint: select a big area (until EPOCH VALIDATION) and tab / shift-tab
+            #with accelerator.profile() as prof:
             current_step.value = global_step
             if initial_step > 0:
                 initial_step -= 1
@@ -1392,7 +1395,10 @@ class NativeTrainer:
             # Tne correct specific "network" operation has been removed.
             # The process_batch will wrap all the inference logic (because it will be used for validation dataset also)
             with accelerator.accumulate(*training_models):
-
+                # 250331: From HF guide
+                # 250406: No need
+                #optimizer.zero_grad(set_to_none=True)
+
                 # temporary, for batch processing
                 self.on_step_start(args, accelerator, text_encoders, unet, batch, weight_dtype)
 
@@ -1415,13 +1421,13 @@ class NativeTrainer:
 
                 accelerator.backward(loss)
 
+                # 250331: It is required to sync gradients manually. See torch.Tensor.grad
                 if accelerator.sync_gradients:
                     for training_model in training_models:
-                        self.all_reduce_training_model(accelerator, training_model)  # sync DDP grad manually
-                    if args.max_grad_norm != 0.0:
-                        if hasattr(training_model, "get_trainable_params"):
-                            params_to_clip = accelerator.unwrap_model(training_model).get_trainable_params()
-                            accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
+                        self.all_reduce_training_model(accelerator, training_model)  # sync DDP grad manually
+                        if (args.max_grad_norm != 0.0) and hasattr(training_model, "get_trainable_params"):
+                            params_to_clip = accelerator.unwrap_model(training_model).get_trainable_params()
+                            accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
 
                 optimizer.step()
                 lr_scheduler.step()