
Commit 6a6e1a0

Authored Feb 18, 2025
feat(vllm): Additional vLLM config options (Disable logging, dtype, and Per-Prompt media limits) (mudler#4855)
* Adding the following vLLM config options: disable_log_status, dtype, limit_mm_per_prompt
* Using " marks in the config.yaml file
* Adding in missing colon

Signed-off-by: TheDropZone <brandonbeiler@gmail.com>
1 parent 5b19af9 commit 6a6e1a0

File tree

5 files changed: +64 −23 lines

backend/backend.proto (+6 −1)
@@ -165,7 +165,7 @@ message Reply {
 
 message GrammarTrigger {
     string word = 1;
-    bool at_start = 2;
+    bool at_start = 2;
 }
 
 message ModelOptions {
@@ -229,6 +229,11 @@ message ModelOptions {
   int32 MaxModelLen = 54;
   int32 TensorParallelSize = 55;
   string LoadFormat = 58;
+  bool DisableLogStatus = 66;
+  string DType = 67;
+  int32 LimitImagePerPrompt = 68;
+  int32 LimitVideoPerPrompt = 69;
+  int32 LimitAudioPerPrompt = 70;
 
   string MMProj = 41;
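The new fields extend the existing ModelOptions message, so any client built from regenerated stubs can set them directly. A minimal sketch (Python, assuming the backend_pb2 stubs have been regenerated from this proto; all values are illustrative):

    import backend_pb2  # stubs generated from backend/backend.proto

    # Zero values ("", 0, false) are treated downstream as "not configured".
    opts = backend_pb2.ModelOptions(
        DisableLogStatus=True,    # toggles vLLM's stats logging off
        DType="float16",          # forwarded to vLLM's dtype
        LimitImagePerPrompt=2,    # per-modality caps for limit_mm_per_prompt
        LimitVideoPerPrompt=1,
        LimitAudioPerPrompt=1,
    )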

backend/python/vllm/backend.py (+14 −3)
@@ -109,6 +109,17 @@ async def LoadModel(self, request, context):
             engine_args.swap_space = request.SwapSpace
         if request.MaxModelLen != 0:
             engine_args.max_model_len = request.MaxModelLen
+        if request.DisableLogStatus:
+            engine_args.disable_log_status = request.DisableLogStatus
+        if request.DType != "":
+            engine_args.dtype = request.DType
+        if request.LimitImagePerPrompt != 0 or request.LimitVideoPerPrompt != 0 or request.LimitAudioPerPrompt != 0:
+            # limit-mm-per-prompt defaults to 1 per modality, based on vLLM docs
+            engine_args.limit_mm_per_prompt = {
+                "image": max(request.LimitImagePerPrompt, 1),
+                "video": max(request.LimitVideoPerPrompt, 1),
+                "audio": max(request.LimitAudioPerPrompt, 1)
+            }
 
         try:
             self.llm = AsyncLLMEngine.from_engine_args(engine_args)
@@ -269,7 +280,7 @@ async def _predict(self, request, context, streaming=False):
     def load_image(self, image_path: str):
         """
         Load an image from the given file path or base64 encoded data.
-
+
         Args:
             image_path (str): The path to the image file or base64 encoded data.
 
@@ -288,7 +299,7 @@ def load_image(self, image_path: str):
     def load_video(self, video_path: str):
         """
         Load a video from the given file path.
-
+
         Args:
             video_path (str): The path to the image file.
 
@@ -335,4 +346,4 @@ async def serve(address):
     )
     args = parser.parse_args()
 
-    asyncio.run(serve(args.addr))
+    asyncio.run(serve(args.addr))
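The max(..., 1) guards encode vLLM's documented default of one item per modality, so a partially specified limit is still sent to the engine as a complete mapping. A standalone sketch of that defaulting (helper name is illustrative):

    def mm_limits(image: int = 0, video: int = 0, audio: int = 0) -> dict:
        # 0 means "not set in the model config"; fall back to vLLM's default of 1
        return {
            "image": max(image, 1),
            "video": max(video, 1),
            "audio": max(audio, 1),
        }

    assert mm_limits() == {"image": 1, "video": 1, "audio": 1}
    assert mm_limits(image=2) == {"image": 2, "video": 1, "audio": 1}

One naming wrinkle visible in this commit: the YAML key is disable_log_stats (matching vLLM's --disable-log-stats flag), while the proto field is DisableLogStatus and the attribute set on engine_args is disable_log_status.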

core/backend/options.go (+6 −0)
@@ -159,6 +159,12 @@ func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions {
 		SwapSpace:            int32(c.SwapSpace),
 		MaxModelLen:          int32(c.MaxModelLen),
 		TensorParallelSize:   int32(c.TensorParallelSize),
+		DisableLogStatus:     c.DisableLogStatus,
+		DType:                c.DType,
+		// LimitMMPerPrompt vLLM
+		LimitImagePerPrompt:  int32(c.LimitMMPerPrompt.LimitImagePerPrompt),
+		LimitVideoPerPrompt:  int32(c.LimitMMPerPrompt.LimitVideoPerPrompt),
+		LimitAudioPerPrompt:  int32(c.LimitMMPerPrompt.LimitAudioPerPrompt),
 		MMProj:               c.MMProj,
 		FlashAttention:       c.FlashAttention,
 		CacheTypeKey:         c.CacheTypeK,

core/config/backend_config.go (+29 −19)
@@ -130,25 +130,28 @@ type LLMConfig struct {
 	TrimSpace  []string `yaml:"trimspace"`
 	TrimSuffix []string `yaml:"trimsuffix"`
 
-	ContextSize          *int      `yaml:"context_size"`
-	NUMA                 bool      `yaml:"numa"`
-	LoraAdapter          string    `yaml:"lora_adapter"`
-	LoraBase             string    `yaml:"lora_base"`
-	LoraAdapters         []string  `yaml:"lora_adapters"`
-	LoraScales           []float32 `yaml:"lora_scales"`
-	LoraScale            float32   `yaml:"lora_scale"`
-	NoMulMatQ            bool      `yaml:"no_mulmatq"`
-	DraftModel           string    `yaml:"draft_model"`
-	NDraft               int32     `yaml:"n_draft"`
-	Quantization         string    `yaml:"quantization"`
-	LoadFormat           string    `yaml:"load_format"`
-	GPUMemoryUtilization float32   `yaml:"gpu_memory_utilization"` // vLLM
-	TrustRemoteCode      bool      `yaml:"trust_remote_code"`      // vLLM
-	EnforceEager         bool      `yaml:"enforce_eager"`          // vLLM
-	SwapSpace            int       `yaml:"swap_space"`             // vLLM
-	MaxModelLen          int       `yaml:"max_model_len"`          // vLLM
-	TensorParallelSize   int       `yaml:"tensor_parallel_size"`   // vLLM
-	MMProj               string    `yaml:"mmproj"`
+	ContextSize          *int             `yaml:"context_size"`
+	NUMA                 bool             `yaml:"numa"`
+	LoraAdapter          string           `yaml:"lora_adapter"`
+	LoraBase             string           `yaml:"lora_base"`
+	LoraAdapters         []string         `yaml:"lora_adapters"`
+	LoraScales           []float32        `yaml:"lora_scales"`
+	LoraScale            float32          `yaml:"lora_scale"`
+	NoMulMatQ            bool             `yaml:"no_mulmatq"`
+	DraftModel           string           `yaml:"draft_model"`
+	NDraft               int32            `yaml:"n_draft"`
+	Quantization         string           `yaml:"quantization"`
+	LoadFormat           string           `yaml:"load_format"`
+	GPUMemoryUtilization float32          `yaml:"gpu_memory_utilization"` // vLLM
+	TrustRemoteCode      bool             `yaml:"trust_remote_code"`      // vLLM
+	EnforceEager         bool             `yaml:"enforce_eager"`          // vLLM
+	SwapSpace            int              `yaml:"swap_space"`             // vLLM
+	MaxModelLen          int              `yaml:"max_model_len"`          // vLLM
+	TensorParallelSize   int              `yaml:"tensor_parallel_size"`   // vLLM
+	DisableLogStatus     bool             `yaml:"disable_log_stats"`      // vLLM
+	DType                string           `yaml:"dtype"`                  // vLLM
+	LimitMMPerPrompt     LimitMMPerPrompt `yaml:"limit_mm_per_prompt"`    // vLLM
+	MMProj               string           `yaml:"mmproj"`
 
 	FlashAttention bool `yaml:"flash_attention"`
 	NoKVOffloading bool `yaml:"no_kv_offloading"`
@@ -166,6 +169,13 @@ type LLMConfig struct {
 	CFGScale float32 `yaml:"cfg_scale"` // Classifier-Free Guidance Scale
 }
 
+// LimitMMPerPrompt is a struct that holds the configuration for the limit-mm-per-prompt config in vLLM
+type LimitMMPerPrompt struct {
+	LimitImagePerPrompt int `yaml:"image"`
+	LimitVideoPerPrompt int `yaml:"video"`
+	LimitAudioPerPrompt int `yaml:"audio"`
+}
+
 // AutoGPTQ is a struct that holds the configuration specific to the AutoGPTQ backend
 type AutoGPTQ struct {
 	ModelBaseName string `yaml:"model_base_name"`
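Because limit_mm_per_prompt is a nested struct, the YAML key takes a mapping rather than a scalar. A quick shape check (Python with PyYAML, purely illustrative):

    import yaml

    cfg = yaml.safe_load("""
    dtype: "float16"
    disable_log_stats: true
    limit_mm_per_prompt:
      image: 2
      video: 2
      audio: 2
    """)

    assert cfg["disable_log_stats"] is True
    assert cfg["limit_mm_per_prompt"] == {"image": 2, "video": 2, "audio": 2}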

gallery/vllm.yaml (+9 −0)
@@ -16,6 +16,8 @@ config_file: |
   use_tokenizer_template: true
   # Uncomment to specify a quantization method (optional)
   # quantization: "awq"
+  # Uncomment to set dtype, choices are: "auto", "half", "float16", "bfloat16", "float", "float32". awq on vLLM does not support bfloat16
+  # dtype: "float16"
   # Uncomment to limit the GPU memory utilization (vLLM default is 0.9 for 90%)
   # gpu_memory_utilization: 0.5
   # Uncomment to trust remote code from huggingface
@@ -30,3 +32,10 @@ config_file: |
   # Allows you to partition and run large models. Performance gains are limited.
   # https://github.com/vllm-project/vllm/issues/1435
   # tensor_parallel_size: 2
+  # Uncomment to disable log stats
+  # disable_log_stats: true
+  # Uncomment to specify Multi-Modal limits per prompt, defaults to 1 per modality if not specified
+  # limit_mm_per_prompt:
+  #   image: 2
+  #   video: 2
+  #   audio: 2
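End to end, a model config using all three options lands on vLLM's engine arguments roughly like this (a sketch assuming vLLM's AsyncEngineArgs API; the model name is illustrative):

    from vllm.engine.arg_utils import AsyncEngineArgs

    engine_args = AsyncEngineArgs(
        model="Qwen/Qwen2-VL-2B-Instruct",  # illustrative multimodal model
        dtype="float16",
        disable_log_stats=True,
        limit_mm_per_prompt={"image": 2, "video": 2, "audio": 2},
    )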

0 commit comments