
Commit c6c10e6

jeejeelee authored and Yuqi Zhang committed
[Misc] Remove qlora_adapter_name_or_path (vllm-project#17699)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Signed-off-by: Yuqi Zhang <yuqizhang@google.com>
1 parent ca5cc26 commit c6c10e6
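
In short: this commit removes the engine-level QLoRA adapter plumbing, so the adapter repo is no longer threaded through EngineArgs or the load config. A before/after sketch of the call-site change, with values taken directly from the example diff below (no new API is introduced here):

from vllm import EngineArgs

# Before this commit (removed): the adapter repo was passed at engine
# construction time for bitsandbytes/QLoRA.
# engine_args = EngineArgs(model="huggyllama/llama-7b",
#                          quantization="bitsandbytes",
#                          qlora_adapter_name_or_path="timdettmers/qlora-flan-7b",
#                          enable_lora=True,
#                          max_lora_rank=64)

# After: one uniform construction for all quantization backends.
engine_args = EngineArgs(model="huggyllama/llama-7b",
                         quantization="bitsandbytes",
                         enable_lora=True,
                         max_lora_rank=64,
                         max_loras=4)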

File tree

3 files changed: +50 -63 lines changed


examples/offline_inference/lora_with_quantization_inference.py

Lines changed: 26 additions & 31 deletions
@@ -75,43 +75,38 @@ def initialize_engine(model: str, quantization: str,
                       lora_repo: Optional[str]) -> LLMEngine:
     """Initialize the LLMEngine."""
 
-    if quantization == "bitsandbytes":
-        # QLoRA (https://arxiv.org/abs/2305.14314) is a quantization technique.
-        # It quantizes the model when loading, with some config info from the
-        # LoRA adapter repo. So need to set the parameter of load_format and
-        # qlora_adapter_name_or_path as below.
-        engine_args = EngineArgs(model=model,
-                                 quantization=quantization,
-                                 qlora_adapter_name_or_path=lora_repo,
-                                 enable_lora=True,
-                                 max_lora_rank=64)
-    else:
-        engine_args = EngineArgs(model=model,
-                                 quantization=quantization,
-                                 enable_lora=True,
-                                 max_loras=4)
+    engine_args = EngineArgs(model=model,
+                             quantization=quantization,
+                             enable_lora=True,
+                             max_lora_rank=64,
+                             max_loras=4)
     return LLMEngine.from_engine_args(engine_args)
 
 
 def main():
     """Main function that sets up and runs the prompt processing."""
 
-    test_configs = [{
-        "name": "qlora_inference_example",
-        'model': "huggyllama/llama-7b",
-        'quantization': "bitsandbytes",
-        'lora_repo': 'timdettmers/qlora-flan-7b'
-    }, {
-        "name": "AWQ_inference_with_lora_example",
-        'model': 'TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ',
-        'quantization': "awq",
-        'lora_repo': 'jashing/tinyllama-colorist-lora'
-    }, {
-        "name": "GPTQ_inference_with_lora_example",
-        'model': 'TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ',
-        'quantization': "gptq",
-        'lora_repo': 'jashing/tinyllama-colorist-lora'
-    }]
+    test_configs = [
+        # QLoRA (https://arxiv.org/abs/2305.14314)
+        {
+            "name": "qlora_inference_example",
+            'model': "huggyllama/llama-7b",
+            'quantization': "bitsandbytes",
+            'lora_repo': 'timdettmers/qlora-flan-7b'
+        },
+        {
+            "name": "AWQ_inference_with_lora_example",
+            'model': 'TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ',
+            'quantization': "awq",
+            'lora_repo': 'jashing/tinyllama-colorist-lora'
+        },
+        {
+            "name": "GPTQ_inference_with_lora_example",
+            'model': 'TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ',
+            'quantization': "gptq",
+            'lora_repo': 'jashing/tinyllama-colorist-lora'
+        }
+    ]
 
     for test_config in test_configs:
         print(
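
A minimal driver for the unified initialize_engine() above. This sketch is illustrative and not part of the commit; it assumes vLLM's standard LLMEngine/LoRARequest request API, since the adapter is now supplied per request rather than at engine construction:

from huggingface_hub import snapshot_download

from vllm import EngineArgs, LLMEngine, SamplingParams
from vllm.lora.request import LoRARequest

# Build the engine exactly as the new initialize_engine() does.
engine = LLMEngine.from_engine_args(
    EngineArgs(model="huggyllama/llama-7b",
               quantization="bitsandbytes",
               enable_lora=True,
               max_lora_rank=64,
               max_loras=4))

# The LoRA adapter now travels with the request instead of EngineArgs.
lora_path = snapshot_download(repo_id="timdettmers/qlora-flan-7b")
engine.add_request("0",
                   "What is quantization?",
                   SamplingParams(temperature=0.0, max_tokens=64),
                   lora_request=LoRARequest("qlora-adapter", 1, lora_path))

while engine.has_unfinished_requests():
    for request_output in engine.step():
        if request_output.finished:
            print(request_output.outputs[0].text)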

vllm/engine/arg_utils.py

Lines changed: 17 additions & 17 deletions
@@ -6,6 +6,7 @@
 import json
 import re
 import threading
+import warnings
 from dataclasses import MISSING, dataclass, fields
 from itertools import permutations
 from typing import (Any, Callable, Dict, List, Literal, Optional, Type,
@@ -394,7 +395,13 @@ def __post_init__(self):
         if isinstance(self.compilation_config, (int, dict)):
             self.compilation_config = CompilationConfig.from_cli(
                 str(self.compilation_config))
-
+        if self.qlora_adapter_name_or_path is not None:
+            warnings.warn(
+                "The `qlora_adapter_name_or_path` is deprecated "
+                "and will be removed in v0.10.0. ",
+                DeprecationWarning,
+                stacklevel=2,
+            )
         # Setup plugins
         from vllm.plugins import load_general_plugins
         load_general_plugins()
@@ -504,10 +511,14 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
                                 **load_kwargs["ignore_patterns"])
         load_group.add_argument("--use-tqdm-on-load",
                                 **load_kwargs["use_tqdm_on_load"])
-        load_group.add_argument('--qlora-adapter-name-or-path',
-                                type=str,
-                                default=None,
-                                help='Name or path of the QLoRA adapter.')
+        load_group.add_argument(
+            "--qlora-adapter-name-or-path",
+            type=str,
+            default=None,
+            help="The `--qlora-adapter-name-or-path` has no effect, do not set"
+            " it, and it will be removed in v0.10.0.",
+            deprecated=True,
+        )
         load_group.add_argument('--pt-load-map-location',
                                 **load_kwargs["pt_load_map_location"])
 
@@ -534,7 +545,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
             deprecated=True,
             help="[DEPRECATED] The `--enable-reasoning` flag is deprecated as "
             "of v0.8.6. Use `--reasoning-parser` to specify the reasoning "
-            "parser backend insteadThis flag (`--enable-reasoning`) will be "
+            "parser backend instead. This flag (`--enable-reasoning`) will be "
             "removed in v0.10.0. When `--reasoning-parser` is specified, "
             "reasoning mode is automatically enabled.")
         guided_decoding_group.add_argument(
@@ -896,12 +907,6 @@ def create_model_config(self) -> ModelConfig:
 
     def create_load_config(self) -> LoadConfig:
 
-        if(self.qlora_adapter_name_or_path is not None) and \
-            self.quantization != "bitsandbytes":
-            raise ValueError(
-                "QLoRA adapter only support "
-                f"'bitsandbytes' quantization, but got {self.quantization}")
-
         if self.quantization == "bitsandbytes":
             self.load_format = "bitsandbytes"
 
@@ -1098,11 +1103,6 @@ def create_engine_config(
             max_cpu_loras=self.max_cpu_loras if self.max_cpu_loras
             and self.max_cpu_loras > 0 else None) if self.enable_lora else None
 
-        if self.qlora_adapter_name_or_path is not None and \
-            self.qlora_adapter_name_or_path != "":
-            self.model_loader_extra_config[
-                "qlora_adapter_name_or_path"] = self.qlora_adapter_name_or_path
-
         # bitsandbytes pre-quantized model need a specific model loader
         if model_config.quantization == "bitsandbytes":
             self.quantization = self.load_format = "bitsandbytes"
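
The net effect of the arg_utils.py changes: setting the field no longer alters loading behavior; it only raises a DeprecationWarning, and the CLI flag is kept as a documented no-op until v0.10.0. A standalone sketch of the same dataclass pattern (illustrative names, not vLLM code):

import warnings
from dataclasses import dataclass
from typing import Optional

@dataclass
class LoadArgs:  # hypothetical stand-in for vLLM's EngineArgs
    qlora_adapter_name_or_path: Optional[str] = None  # retained as a no-op

    def __post_init__(self):
        # Same pattern as the diff: warn, but do not change behavior.
        if self.qlora_adapter_name_or_path is not None:
            warnings.warn(
                "The `qlora_adapter_name_or_path` is deprecated "
                "and will be removed in v0.10.0.",
                DeprecationWarning,
                stacklevel=2,
            )

LoadArgs(qlora_adapter_name_or_path="some/adapter")  # emits DeprecationWarning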

vllm/model_executor/model_loader/weight_utils.py

Lines changed: 7 additions & 15 deletions
@@ -162,31 +162,23 @@ def get_quant_config(model_config: ModelConfig,
                              None)
     if hf_quant_config is not None:
         return quant_cls.from_config(hf_quant_config)
-    # In case of bitsandbytes/QLoRA, get quant config from the adapter model.
+    # Inflight BNB quantization
     if model_config.quantization == "bitsandbytes":
-        if (not load_config.model_loader_extra_config
-                or "qlora_adapter_name_or_path"
-                not in load_config.model_loader_extra_config):
-            return quant_cls.from_config({"adapter_name_or_path": ""})
-        model_name_or_path = load_config.model_loader_extra_config[
-            "qlora_adapter_name_or_path"]
-
-    else:
-        model_name_or_path = model_config.model
-    is_local = os.path.isdir(model_name_or_path)
+        return quant_cls.from_config({})
+    is_local = os.path.isdir(model_config.model)
     if not is_local:
         # Download the config files.
-        with get_lock(model_name_or_path, load_config.download_dir):
+        with get_lock(model_config.model, load_config.download_dir):
             hf_folder = snapshot_download(
-                model_name_or_path,
+                model_config.model,
                 revision=model_config.revision,
                 allow_patterns="*.json",
                 cache_dir=load_config.download_dir,
                 local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
                 tqdm_class=DisabledTqdm,
             )
     else:
-        hf_folder = model_name_or_path
+        hf_folder = model_config.model
 
     possible_config_filenames = quant_cls.get_config_filenames()
 
@@ -213,7 +205,7 @@ def get_quant_config(model_config: ModelConfig,
             config = json.load(f)
 
     if model_config.quantization == "bitsandbytes":
-        config["adapter_name_or_path"] = model_name_or_path
+        config["adapter_name_or_path"] = model_config.model
     elif model_config.quantization == "modelopt":
         if config["producer"]["name"] == "modelopt":
             return quant_cls.from_config(config)
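
With the weight_utils.py change, the bitsandbytes branch of get_quant_config() short-circuits with an empty config, and every download/lookup path keys off model_config.model instead of a possibly-adapter-derived name. A condensed sketch of the new control flow (signatures abbreviated from the diff, not a drop-in replacement):

def get_quant_config_sketch(model_config, load_config, quant_cls):
    # Pre-quantized checkpoints: take the config embedded in the HF model.
    hf_quant_config = getattr(model_config.hf_config, "quantization_config",
                              None)
    if hf_quant_config is not None:
        return quant_cls.from_config(hf_quant_config)
    # Inflight BNB quantization: no adapter lookup, an empty config suffices.
    if model_config.quantization == "bitsandbytes":
        return quant_cls.from_config({})
    # All other schemes resolve *.json config files from the model repo
    # itself (snapshot_download when remote, the local directory otherwise).
    ...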
