 from accelerate.hooks import remove_hook_from_submodules
 from accelerate.test_utils import (
     require_bnb,
-    require_cuda,
     require_cuda_or_xpu,
     require_multi_device,
     require_multi_gpu_or_xpu,
     torch_device,
 )
 from accelerate.utils import is_hpu_available, offload_state_dict
+from accelerate.utils.memory import clear_device_cache
 from accelerate.utils.versions import is_torch_version
 
 
@@ -379,7 +379,7 @@ def test_dispatch_model_tied_weights_memory(self):
 
         torch_accelerator_module = getattr(torch, torch_device_type)
 
-        torch_accelerator_module.empty_cache()  # Needed in case we run several tests in a row.
+        clear_device_cache()  # Needed in case we run several tests in a row.
 
         model = nn.Sequential(
             OrderedDict(
@@ -443,7 +443,7 @@ def test_dispatch_model_tied_weights_memory_with_nested_offload_cpu(self):
         # Test that we do not duplicate tied weights at any point during dispatch_model call.
 
         torch_accelerator_module = getattr(torch, torch_device_type)
-        torch_accelerator_module.empty_cache()  # Needed in case we run several tests in a row.
+        clear_device_cache()  # Needed in case we run several tests in a row.
 
         class SubModule(torch.nn.Module):
             def __init__(self, ref_to_parameter):
@@ -521,7 +521,7 @@ def forward(self, x):
 
         torch.testing.assert_close(expected, output.cpu(), atol=ATOL, rtol=RTOL)
 
-        torch_accelerator_module.empty_cache()
+        clear_device_cache()
 
         free_memory_bytes_after_infer = torch_accelerator_module.mem_get_info(torch_device)[0]
 
@@ -536,14 +536,16 @@ def forward(self, x):
 
     # This test fails because sometimes data_ptr() of compute2.weight is the same as compute1.weight.
     # I checked that the values are not the same but it gives the same address. This does not happen on my local machine.
-    @require_cuda
+    @require_cuda_or_xpu
     @unittest.skip(
         "Flaky test, we should have enough coverage with test_dispatch_model_tied_weights_memory_with_nested_offload_cpu test"
     )
     def test_dispatch_model_tied_weights_memory_with_nested_offload_disk(self):
         # Test that we do not duplicate tied weights at any point during dispatch_model call.
 
-        torch.cuda.empty_cache()  # Needed in case we run several tests in a row.
+        torch_accelerator_module = getattr(torch, torch_device_type)
+
+        clear_device_cache()  # Needed in case we run several tests in a row.
 
         class SubModule(torch.nn.Module):
             def __init__(self, ref_to_parameter):
@@ -589,37 +591,43 @@ def forward(self, x):
         expected = model(x)
 
         # Just to initialize CUDA context.
-        a = torch.rand(5).to("cuda:0")  # noqa: F841
+        device_0 = f"{torch_device_type}:0"
+        a = torch.rand(5).to(device_0)  # noqa: F841
 
-        free_memory_bytes = torch.cuda.mem_get_info("cuda:0")[0]
+        free_memory_bytes = torch_accelerator_module.mem_get_info(device_0)[0]
         required_memory_bytes = 2 * 5000 * 5000 * (32 // 8)  # 200 MB
 
         # Leaving 150 MB of free memory for possible buffers, etc.
         n_vals = (free_memory_bytes - required_memory_bytes - int(200e6)) // (32 // 8)
-        foo = torch.rand(n_vals, device="cuda:0")  # noqa: F841
+        foo = torch.rand(n_vals, device=device_0)  # noqa: F841
 
-        free_memory_bytes_before_dispatch = torch.cuda.mem_get_info("cuda:0")[0]
+        free_memory_bytes_before_dispatch = torch_accelerator_module.mem_get_info(device_0)[0]
         with TemporaryDirectory() as tmp_dir:
             dispatch_model(model, device_map, offload_dir=tmp_dir)
-            free_memory_bytes_after_dispatch = torch.cuda.mem_get_info("cuda:0")[0]
+            free_memory_bytes_after_dispatch = torch_accelerator_module.mem_get_info(device_0)[0]
 
             assert (free_memory_bytes_after_dispatch - free_memory_bytes_before_dispatch) * 1e-6 < 130
 
+            oom_error = (
+                torch.OutOfMemoryError
+                if hasattr(torch, "OutOfMemoryError")
+                else torch_accelerator_module.OutOfMemoryError
+            )
             with torch.no_grad():
                 try:
                     output = model(x)
-                except torch.cuda.OutOfMemoryError as e:
-                    raise torch.cuda.OutOfMemoryError(
+                except oom_error as e:
+                    raise oom_error(
                         f"OOM error in dispatch_model. This is a bug and should not happen, see test_dispatch_model_tied_weights_memory_with_nested_offload_disk. {e}"
                     )
                 except Exception as e:
                     raise e
 
             torch.testing.assert_close(expected, output.cpu(), atol=ATOL, rtol=RTOL)
 
-            torch.cuda.empty_cache()
+            clear_device_cache()
 
-            free_memory_bytes_after_infer = torch.cuda.mem_get_info("cuda:0")[0]
+            free_memory_bytes_after_infer = torch_accelerator_module.mem_get_info(device_0)[0]
 
             # Check that we have no more references on GPU for the offloaded tied weight.
             n_non_empty = 0
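The change set above converges on one device-agnostic pattern: resolve the backend module from the active device type and route cache clearing through accelerate's helper. Below is a minimal sketch of that pattern outside the test file (assuming a CUDA or XPU accelerator is available; clear_device_cache and torch_device are the accelerate helpers imported in the diff, while the torch_device_type derivation is an illustrative assumption, not code from the commit):

import torch

from accelerate.test_utils import torch_device
from accelerate.utils.memory import clear_device_cache

# torch_device is a string such as "cuda", "xpu" or "cuda:0"; strip any index
# to recover the backend module name (illustrative, not taken from the diff).
torch_device_type = torch_device.split(":")[0]
torch_accelerator_module = getattr(torch, torch_device_type)

clear_device_cache()  # backend-agnostic replacement for torch.cuda.empty_cache()

# CUDA and XPU both expose mem_get_info; a CPU-only run would not reach here.
free_bytes, total_bytes = torch_accelerator_module.mem_get_info(f"{torch_device_type}:0")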