Add non_blocking to loading and moving tensors

2026-04-17 17:24:21 +00:00 · 2025-10-10 14:50:27 -04:00
parent 5e366acda4
commit d4081b2e66
9 changed files with 215 additions and 113 deletions
--- a/library/custom_offloading_utils.py
+++ b/library/custom_offloading_utils.py
@@ -53,7 +53,7 @@ def swap_weight_devices_cuda(device: torch.device, layer_to_cpu: nn.Module, laye
                    # print(
                    #     f"Module {module_to_cuda_name} not found in CPU model or shape mismatch, so not swapping and moving to device"
                    # )
-                    module_to_cuda.weight.data = module_to_cuda.weight.data.to(device)
+                    module_to_cuda.weight.data = module_to_cuda.weight.data.to(device, non_blocking=True)

    torch.cuda.current_stream().synchronize()  # this prevents the illegal loss value