
Commit 13f9f7a

[Misc] Upgrade bitsandbytes to the latest version 0.44.0 (#8768)
1 parent 1e7d5c0 commit 13f9f7a

File tree

7 files changed: +44 -34 lines changed

docs/source/quantization/bnb.rst

Lines changed: 1 addition & 1 deletion

@@ -11,7 +11,7 @@ Below are the steps to utilize BitsAndBytes with vLLM.
 
 .. code-block:: console
 
-   $ pip install bitsandbytes>=0.42.0
+   $ pip install bitsandbytes>=0.44.0
 
 
 vLLM reads the model's config file and supports both in-flight quantization and pre-quantized checkpoint.
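For context, the documented install pairs with vLLM's Python API roughly as follows. This sketch is not part of the commit; the model name is purely illustrative, while the quantization and load_format values mirror this commit's test file:

# A minimal sketch of in-flight BitsAndBytes quantization with vLLM.
from vllm import LLM, SamplingParams

llm = LLM(model="huggyllama/llama-7b",  # illustrative model choice
          quantization="bitsandbytes",
          load_format="bitsandbytes")
outputs = llm.generate(["Hello, my name is"],
                       SamplingParams(temperature=0, max_tokens=32))
print(outputs[0].outputs[0].text)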

examples/lora_with_quantization_inference.py

Lines changed: 10 additions & 16 deletions

@@ -79,23 +79,17 @@ def initialize_engine(model: str, quantization: str,
         # It quantizes the model when loading, with some config info from the
         # LoRA adapter repo. So need to set the parameter of load_format and
         # qlora_adapter_name_or_path as below.
-        engine_args = EngineArgs(
-            model=model,
-            quantization=quantization,
-            qlora_adapter_name_or_path=lora_repo,
-            load_format="bitsandbytes",
-            enable_lora=True,
-            max_lora_rank=64,
-            # set it only in GPUs of limited memory
-            enforce_eager=True)
+        engine_args = EngineArgs(model=model,
+                                 quantization=quantization,
+                                 qlora_adapter_name_or_path=lora_repo,
+                                 load_format="bitsandbytes",
+                                 enable_lora=True,
+                                 max_lora_rank=64)
     else:
-        engine_args = EngineArgs(
-            model=model,
-            quantization=quantization,
-            enable_lora=True,
-            max_loras=4,
-            # set it only in GPUs of limited memory
-            enforce_eager=True)
+        engine_args = EngineArgs(model=model,
+                                 quantization=quantization,
+                                 enable_lora=True,
+                                 max_loras=4)
     return LLMEngine.from_engine_args(engine_args)
 
 
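For reference, an engine returned by initialize_engine is normally driven with the standard LLMEngine loop. The sketch below is not taken from this file; the request id and prompt are placeholders:

# Hypothetical driver for the engine built above (names are placeholders).
from vllm import SamplingParams

params = SamplingParams(temperature=0, max_tokens=64)
engine.add_request("req-0", "Hello, my name is", params)
while engine.has_unfinished_requests():
    for request_output in engine.step():
        if request_output.finished:
            print(request_output.outputs[0].text)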

requirements-test.txt

Lines changed: 1 addition & 1 deletion

@@ -30,5 +30,5 @@ datamodel_code_generator # required for minicpm3 test
 aiohttp
 
 # quantization
-bitsandbytes==0.42.0
+bitsandbytes>=0.44.0
 buildkite-test-collector==0.1.8

tests/quantization/test_bitsandbytes.py

Lines changed: 1 addition & 1 deletion

@@ -107,7 +107,7 @@ def validate_generated_texts(hf_runner,
                     quantization='bitsandbytes',
                     load_format='bitsandbytes',
                     tensor_parallel_size=vllm_tp_size,
-                    enforce_eager=True,
+                    enforce_eager=False,
                     gpu_memory_utilization=0.8) as llm:
         vllm_outputs = llm.generate_greedy(prompts, 8)
         vllm_logs = log_generated_texts(prompts, vllm_outputs, "VllmRunner")

vllm/config.py

Lines changed: 23 additions & 7 deletions

@@ -222,6 +222,7 @@ def __init__(self,
         self._verify_embedding_mode()
         self._verify_quantization()
         self._verify_cuda_graph()
+        self._verify_bnb_config()
 
     def _init_multimodal_config(
         self, limit_mm_per_prompt: Optional[Mapping[str, int]]
@@ -337,6 +338,28 @@ def _verify_cuda_graph(self) -> None:
         self.max_seq_len_to_capture = min(self.max_seq_len_to_capture,
                                           self.max_model_len)
 
+    def _verify_bnb_config(self) -> None:
+        """
+        The current version of bitsandbytes (0.44.0) with 8-bit models does not
+        yet support CUDA graph.
+        """
+        is_bitsandbytes = self.quantization == "bitsandbytes"
+        has_quantization_config = (getattr(self.hf_config,
+                                           "quantization_config", None)
+                                   is not None)
+        is_8bit = (self.hf_config.quantization_config.get(
+            "load_in_8bit", False) if has_quantization_config else False)
+        if all([
+                is_bitsandbytes,
+                has_quantization_config,
+                is_8bit,
+                not self.enforce_eager,
+        ]):
+            logger.warning(
+                "CUDA graph is not supported on BitAndBytes 8bit yet, "
+                "fallback to the eager mode.")
+            self.enforce_eager = True
+
     def verify_async_output_proc(self, parallel_config, speculative_config,
                                  device_config) -> None:
         if not self.use_async_output_proc:
@@ -401,13 +424,6 @@ def verify_with_parallel_config(
                 "Pipeline parallelism is only supported for the following "
                 f" architectures: {_PP_SUPPORTED_MODELS}.")
 
-        # Remove the constraint after the bitsandbytes issue is fixed:
-        # https://github.com/bitsandbytes-foundation/bitsandbytes/issues/1308
-        if self.quantization == "bitsandbytes" and self.enforce_eager is False:
-            logger.warning("CUDA graph is not supported on BitAndBytes yet, "
-                           "fallback to the eager mode.")
-            self.enforce_eager = True
-
         if pipeline_parallel_size > 1 and self.use_async_output_proc:
             logger.warning("Async output processor is not supported with "
                            "pipeline parallelism currently. Disabling it.")

vllm/model_executor/layers/quantization/bitsandbytes.py

Lines changed: 4 additions & 4 deletions

@@ -121,12 +121,12 @@ class BitsAndBytesLinearMethod(LinearMethodBase):
     def __init__(self, quant_config: BitsAndBytesConfig):
         try:
             import bitsandbytes
-            if bitsandbytes.__version__ < "0.42.0":
+            if bitsandbytes.__version__ < "0.44.0":
                 raise ImportError("bitsandbytes version is wrong. Please "
-                                  "install bitsandbytes>=0.42.0.")
+                                  "install bitsandbytes>=0.44.0.")
         except ImportError as err:
-            raise ImportError("Please install bitsandbytes>=0.42.0 via "
-                              "`pip install bitsandbytes>=0.42.0` to use "
+            raise ImportError("Please install bitsandbytes>=0.44.0 via "
+                              "`pip install bitsandbytes>=0.44.0` to use "
                               "bitsandbytes quantizer.") from err
 
         self.quant_config = quant_config
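One caveat with the check above: comparing __version__ as a string is lexicographic, so "0.100.0" < "0.44.0" evaluates True. A sketch of a numerically robust variant using the packaging library follows; this is an alternative, not what the commit does:

# Version check that parses versions instead of comparing strings;
# assumes the `packaging` package is installed.
from packaging.version import Version

import bitsandbytes

if Version(bitsandbytes.__version__) < Version("0.44.0"):
    raise ImportError("bitsandbytes version is wrong. Please "
                      "install bitsandbytes>=0.44.0.")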

vllm/model_executor/model_loader/loader.py

Lines changed: 4 additions & 4 deletions

@@ -851,12 +851,12 @@ def _get_quantized_weights_iterator(
         # only load the bitsandbytes module when needed
         try:
             import bitsandbytes
-            if bitsandbytes.__version__ < "0.42.0":
+            if bitsandbytes.__version__ < "0.44.0":
                 raise ImportError("bitsandbytes version is wrong. Please "
-                                  "install bitsandbytes>=0.42.0.")
+                                  "install bitsandbytes>=0.44.0.")
         except ImportError as err:
-            raise ImportError("Please install bitsandbytes>=0.42.0 via "
-                              "`pip install bitsandbytes>=0.42.0` to use "
+            raise ImportError("Please install bitsandbytes>=0.44.0 via "
+                              "`pip install bitsandbytes>=0.44.0` to use "
                               "bitsandbytes quantizer.") from err
 
         hf_weights_files, use_safetensors = self._prepare_weights(
