
vLLM Windows CUDA support [tested] #2158


Merged 14 commits on May 12, 2025
unsloth/models/__init__.py (2 changes: 1 addition & 1 deletion)
@@ -18,5 +18,5 @@
 from .qwen2 import FastQwen2Model
 from .granite import FastGraniteModel
 from .dpo import PatchDPOTrainer, PatchKTOTrainer
-from ._utils import is_bfloat16_supported, __version__
+from ._utils import is_bfloat16_supported, is_vLLM_available, __version__
 from .rl import PatchFastRL, vLLMSamplingParams
unsloth/models/_utils.py (4 changes: 4 additions & 0 deletions)
@@ -17,6 +17,7 @@
 __all__ = [
     "SUPPORTS_BFLOAT16",
     "is_bfloat16_supported",
+    "is_vLLM_available",

     "prepare_model_for_kbit_training",
     "xformers",
@@ -790,6 +791,9 @@ def is_bfloat16_supported():
     return SUPPORTS_BFLOAT16
 pass

+def is_vLLM_available():
+    return _is_package_available("vllm")
+pass

 # Patches models to add RoPE Scaling
 def patch_linear_scaling(
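For context, the new helper delegates to transformers' _is_package_available, which checks whether a package can be found without importing it. A minimal standalone sketch of the same idea using only importlib (the function name here is illustrative, and the real transformers helper additionally verifies the distribution metadata):

import importlib.util

def vllm_is_importable() -> bool:
    # True when a "vllm" module can be located in the current environment;
    # importlib.util.find_spec does not import the package itself.
    return importlib.util.find_spec("vllm") is not None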
unsloth/models/llama.py (4 changes: 1 addition & 3 deletions)
@@ -1654,9 +1654,7 @@ def from_pretrained(
            )
        pass
        if fast_inference:
-            from transformers.utils.import_utils import _is_package_available
-            _vllm_available = _is_package_available("vllm")
-            if _vllm_available == False:
+            if is_vLLM_available() == False:
                print("Unsloth: vLLM is not installed! Will use Unsloth inference!")
                fast_inference = False
            major_version, minor_version = torch.cuda.get_device_capability()
unsloth/models/loader.py (6 changes: 2 additions & 4 deletions)
@@ -14,6 +14,7 @@

 from ._utils import (
     is_bfloat16_supported,
+    is_vLLM_available,
     HAS_FLASH_ATTENTION,
     HAS_FLASH_ATTENTION_SOFTCAPPING,
     USE_MODELSCOPE,
@@ -338,10 +339,7 @@ def from_pretrained(
        pass

        if fast_inference:
-            import platform
-            from transformers.utils.import_utils import _is_package_available
-            _vllm_available = _is_package_available("vllm")
-            if _vllm_available == False:
+            if is_vLLM_available() == False:
                print("Unsloth: vLLM is not installed! Will use Unsloth inference!")
                fast_inference = False
        pass
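Both call sites now share the same fallback behaviour. A minimal sketch of that gate, pulled out into a standalone helper for illustration (resolve_fast_inference is not part of the PR; it assumes an unsloth install that includes this change):

from unsloth.models._utils import is_vLLM_available

def resolve_fast_inference(fast_inference: bool) -> bool:
    # Mirrors the check in llama.py and loader.py: fall back to Unsloth
    # inference when vLLM is not installed.
    if fast_inference and not is_vLLM_available():
        print("Unsloth: vLLM is not installed! Will use Unsloth inference!")
        fast_inference = False
    return fast_inference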