
Commit 75f969a

Disable cudaMallocAsync for post2 release
1 parent 0535783

File tree: 1 file changed, +10 -8 lines

exllamav2/model.py: +10 -8
@@ -9,14 +9,16 @@
 # Set CUDA context to lazy loading since we won't need 95% of the modules in Torch
 os.environ["CUDA_MODULE_LOADING"] = "LAZY"
 
-# Set cudaMallocAsync allocator by default as it appears slightly more memory efficient, unless Torch is already
-# imported in which case changing the allocator would cause it to crash
-if not "PYTORCH_CUDA_ALLOC_CONF" in os.environ:
-    try:
-        x = torch.__version__
-        # TODO: Should maybe be a warning here?
-    except NameError:
-        os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "backend:cudaMallocAsync"
+# Disabled for 0.0.13.post2
+#
+# # Set cudaMallocAsync allocator by default as it appears slightly more memory efficient, unless Torch is already
+# # imported in which case changing the allocator would cause it to crash
+# if not "PYTORCH_CUDA_ALLOC_CONF" in os.environ:
+#     try:
+#         x = torch.__version__
+#         # TODO: Should maybe be a warning here?
+#     except NameError:
+#         os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "backend:cudaMallocAsync"
 
 import torch
 import math
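The removed block applied the override only when referencing torch.__version__ raised NameError, its heuristic (per the original comment) for torch not having been imported yet, since changing the allocator backend after PyTorch's CUDA allocator is initialized is unsafe. Users who still want the cudaMallocAsync backend after this release can opt in from their own entry point instead. A minimal sketch, not part of this commit; the setdefault call here stands in for the removed "if not ... in os.environ" guard:

import os

# Choose the allocator backend before PyTorch initializes its CUDA allocator.
# PyTorch parses PYTORCH_CUDA_ALLOC_CONF when the allocator is set up, so this
# must run before the first `import torch` in the process.
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "backend:cudaMallocAsync")

import torch  # from here on, the backend choice is fixed for this process

Using setdefault preserves the behavior of the removed guard: a value the user already exported in the environment is never overwritten.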
