diff --git a/flux_train.py b/flux_train.py
index 022467ea..81c13e4c 100644
--- a/flux_train.py
+++ b/flux_train.py
@@ -706,7 +706,9 @@ def train(args):
         accelerator.unwrap_model(flux).prepare_block_swap_before_forward()
 
     # For --sample_at_first
+    optimizer_eval_fn()
     flux_train_utils.sample_images(accelerator, args, 0, global_step, flux, ae, [clip_l, t5xxl], sample_prompts_te_outputs)
+    optimizer_train_fn()
     if len(accelerator.trackers) > 0:
         # log empty object to commit the sample images to wandb
         accelerator.log({}, step=0)
diff --git a/train_network.py b/train_network.py
index 7b2b76a1..f0d397b9 100644
--- a/train_network.py
+++ b/train_network.py
@@ -1042,7 +1042,9 @@ class NetworkTrainer:
             text_encoder = None
 
         # For --sample_at_first
+        optimizer_eval_fn()
         self.sample_images(accelerator, args, 0, global_step, accelerator.device, vae, tokenizers, text_encoder, unet)
+        optimizer_train_fn()
         if len(accelerator.trackers) > 0:
             # log empty object to commit the sample images to wandb
             accelerator.log({}, step=0)