
Commit 6a6e1a0

Authored Feb 18, 2025
feat(vllm): Additional vLLM config options (Disable logging, dtype, and Per-Prompt media limits) (mudler#4855)
* Adding the following vLLM config options: disable_log_status, dtype, limit_mm_per_prompt
* Using " marks in the config.yaml file
* Adding in missing colon

Signed-off-by: TheDropZone <brandonbeiler@gmail.com>
1 parent 5b19af9 commit 6a6e1a0

File tree

5 files changed: +64 −23 lines

backend/backend.proto (+6 −1)
@@ -165,7 +165,7 @@ message Reply {
 
 message GrammarTrigger {
     string word = 1;
-    bool at_start = 2;
+    bool at_start = 2;
 }
 
 message ModelOptions {
@@ -229,6 +229,11 @@ message ModelOptions {
   int32 MaxModelLen = 54;
   int32 TensorParallelSize = 55;
   string LoadFormat = 58;
+  bool DisableLogStatus = 66;
+  string DType = 67;
+  int32 LimitImagePerPrompt = 68;
+  int32 LimitVideoPerPrompt = 69;
+  int32 LimitAudioPerPrompt = 70;
 
   string MMProj = 41;
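The new fields extend the existing ModelOptions message, so any client built from regenerated stubs can set them directly. A minimal sketch (Python, assuming the backend_pb2 stubs have been regenerated from this proto; all values are illustrative):

    import backend_pb2  # stubs generated from backend/backend.proto

    # Zero values ("", 0, false) are treated downstream as "not configured".
    opts = backend_pb2.ModelOptions(
        DisableLogStatus=True,    # toggles vLLM's stats logging off
        DType="float16",          # forwarded to vLLM's dtype
        LimitImagePerPrompt=2,    # per-modality caps for limit_mm_per_prompt
        LimitVideoPerPrompt=1,
        LimitAudioPerPrompt=1,
    )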

backend/python/vllm/backend.py (+14 −3)
@@ -109,6 +109,17 @@ async def LoadModel(self, request, context):
             engine_args.swap_space = request.SwapSpace
         if request.MaxModelLen != 0:
             engine_args.max_model_len = request.MaxModelLen
+        if request.DisableLogStatus:
+            engine_args.disable_log_status = request.DisableLogStatus
+        if request.DType != "":
+            engine_args.dtype = request.DType
+        if request.LimitImagePerPrompt != 0 or request.LimitVideoPerPrompt != 0 or request.LimitAudioPerPrompt != 0:
+            # limit-mm-per-prompt defaults to 1 per modality, based on vLLM docs
+            engine_args.limit_mm_per_prompt = {
+                "image": max(request.LimitImagePerPrompt, 1),
+                "video": max(request.LimitVideoPerPrompt, 1),
+                "audio": max(request.LimitAudioPerPrompt, 1)
+            }
 
         try:
             self.llm = AsyncLLMEngine.from_engine_args(engine_args)
@@ -269,7 +280,7 @@ async def _predict(self, request, context, streaming=False):
     def load_image(self, image_path: str):
         """
         Load an image from the given file path or base64 encoded data.
-
+
         Args:
             image_path (str): The path to the image file or base64 encoded data.
 
@@ -288,7 +299,7 @@ def load_image(self, image_path: str):
     def load_video(self, video_path: str):
         """
         Load a video from the given file path.
-
+
         Args:
             video_path (str): The path to the image file.
 
@@ -335,4 +346,4 @@ async def serve(address):
     )
     args = parser.parse_args()
 
-    asyncio.run(serve(args.addr))
+    asyncio.run(serve(args.addr))
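The max(..., 1) guards encode vLLM's documented default of one item per modality, so a partially specified limit is still sent to the engine as a complete mapping. A standalone sketch of that defaulting (helper name is illustrative):

    def mm_limits(image: int = 0, video: int = 0, audio: int = 0) -> dict:
        # 0 means "not set in the model config"; fall back to vLLM's default of 1
        return {
            "image": max(image, 1),
            "video": max(video, 1),
            "audio": max(audio, 1),
        }

    assert mm_limits() == {"image": 1, "video": 1, "audio": 1}
    assert mm_limits(image=2) == {"image": 2, "video": 1, "audio": 1}

One naming wrinkle visible in this commit: the YAML key is disable_log_stats (matching vLLM's --disable-log-stats flag), while the proto field is DisableLogStatus and the attribute set on engine_args is disable_log_status.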

core/backend/options.go (+6 −0)
@@ -159,6 +159,12 @@ func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions {
 		SwapSpace:            int32(c.SwapSpace),
 		MaxModelLen:          int32(c.MaxModelLen),
 		TensorParallelSize:   int32(c.TensorParallelSize),
+		DisableLogStatus:     c.DisableLogStatus,
+		DType:                c.DType,
+		// LimitMMPerPrompt vLLM
+		LimitImagePerPrompt:  int32(c.LimitMMPerPrompt.LimitImagePerPrompt),
+		LimitVideoPerPrompt:  int32(c.LimitMMPerPrompt.LimitVideoPerPrompt),
+		LimitAudioPerPrompt:  int32(c.LimitMMPerPrompt.LimitAudioPerPrompt),
 		MMProj:               c.MMProj,
 		FlashAttention:       c.FlashAttention,
 		CacheTypeKey:         c.CacheTypeK,

core/config/backend_config.go (+29 −19)
@@ -130,25 +130,28 @@ type LLMConfig struct {
 	TrimSpace  []string `yaml:"trimspace"`
 	TrimSuffix []string `yaml:"trimsuffix"`
 
-	ContextSize          *int      `yaml:"context_size"`
-	NUMA                 bool      `yaml:"numa"`
-	LoraAdapter          string    `yaml:"lora_adapter"`
-	LoraBase             string    `yaml:"lora_base"`
-	LoraAdapters         []string  `yaml:"lora_adapters"`
-	LoraScales           []float32 `yaml:"lora_scales"`
-	LoraScale            float32   `yaml:"lora_scale"`
-	NoMulMatQ            bool      `yaml:"no_mulmatq"`
-	DraftModel           string    `yaml:"draft_model"`
-	NDraft               int32     `yaml:"n_draft"`
-	Quantization         string    `yaml:"quantization"`
-	LoadFormat           string    `yaml:"load_format"`
-	GPUMemoryUtilization float32   `yaml:"gpu_memory_utilization"` // vLLM
-	TrustRemoteCode      bool      `yaml:"trust_remote_code"`      // vLLM
-	EnforceEager         bool      `yaml:"enforce_eager"`          // vLLM
-	SwapSpace            int       `yaml:"swap_space"`             // vLLM
-	MaxModelLen          int       `yaml:"max_model_len"`          // vLLM
-	TensorParallelSize   int       `yaml:"tensor_parallel_size"`   // vLLM
-	MMProj               string    `yaml:"mmproj"`
+	ContextSize          *int             `yaml:"context_size"`
+	NUMA                 bool             `yaml:"numa"`
+	LoraAdapter          string           `yaml:"lora_adapter"`
+	LoraBase             string           `yaml:"lora_base"`
+	LoraAdapters         []string         `yaml:"lora_adapters"`
+	LoraScales           []float32        `yaml:"lora_scales"`
+	LoraScale            float32          `yaml:"lora_scale"`
+	NoMulMatQ            bool             `yaml:"no_mulmatq"`
+	DraftModel           string           `yaml:"draft_model"`
+	NDraft               int32            `yaml:"n_draft"`
+	Quantization         string           `yaml:"quantization"`
+	LoadFormat           string           `yaml:"load_format"`
+	GPUMemoryUtilization float32          `yaml:"gpu_memory_utilization"` // vLLM
+	TrustRemoteCode      bool             `yaml:"trust_remote_code"`      // vLLM
+	EnforceEager         bool             `yaml:"enforce_eager"`          // vLLM
+	SwapSpace            int              `yaml:"swap_space"`             // vLLM
+	MaxModelLen          int              `yaml:"max_model_len"`          // vLLM
+	TensorParallelSize   int              `yaml:"tensor_parallel_size"`   // vLLM
+	DisableLogStatus     bool             `yaml:"disable_log_stats"`      // vLLM
+	DType                string           `yaml:"dtype"`                  // vLLM
+	LimitMMPerPrompt     LimitMMPerPrompt `yaml:"limit_mm_per_prompt"`    // vLLM
+	MMProj               string           `yaml:"mmproj"`
 
 	FlashAttention bool `yaml:"flash_attention"`
 	NoKVOffloading bool `yaml:"no_kv_offloading"`
@@ -166,6 +169,13 @@ type LLMConfig struct {
 	CFGScale float32 `yaml:"cfg_scale"` // Classifier-Free Guidance Scale
 }
 
+// LimitMMPerPrompt is a struct that holds the configuration for the limit-mm-per-prompt config in vLLM
+type LimitMMPerPrompt struct {
+	LimitImagePerPrompt int `yaml:"image"`
+	LimitVideoPerPrompt int `yaml:"video"`
+	LimitAudioPerPrompt int `yaml:"audio"`
+}
+
 // AutoGPTQ is a struct that holds the configuration specific to the AutoGPTQ backend
 type AutoGPTQ struct {
 	ModelBaseName string `yaml:"model_base_name"`
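Because limit_mm_per_prompt is a nested struct, the YAML key takes a mapping rather than a scalar. A quick shape check (Python with PyYAML, purely illustrative):

    import yaml

    cfg = yaml.safe_load("""
    dtype: "float16"
    disable_log_stats: true
    limit_mm_per_prompt:
      image: 2
      video: 2
      audio: 2
    """)

    assert cfg["disable_log_stats"] is True
    assert cfg["limit_mm_per_prompt"] == {"image": 2, "video": 2, "audio": 2}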

gallery/vllm.yaml (+9 −0)
@@ -16,6 +16,8 @@ config_file: |
   use_tokenizer_template: true
   # Uncomment to specify a quantization method (optional)
   # quantization: "awq"
+  # Uncomment to set dtype, choices are: "auto", "half", "float16", "bfloat16", "float", "float32". awq on vLLM does not support bfloat16
+  # dtype: "float16"
   # Uncomment to limit the GPU memory utilization (vLLM default is 0.9 for 90%)
   # gpu_memory_utilization: 0.5
   # Uncomment to trust remote code from huggingface
@@ -30,3 +32,10 @@ config_file: |
   # Allows you to partition and run large models. Performance gains are limited.
   # https://github.com/vllm-project/vllm/issues/1435
   # tensor_parallel_size: 2
+  # Uncomment to disable log stats
+  # disable_log_stats: true
+  # Uncomment to specify Multi-Modal limits per prompt, defaults to 1 per modality if not specified
+  # limit_mm_per_prompt:
+  #   image: 2
+  #   video: 2
+  #   audio: 2
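End to end, a model config using all three options lands on vLLM's engine arguments roughly like this (a sketch assuming vLLM's AsyncEngineArgs API; the model name is illustrative):

    from vllm.engine.arg_utils import AsyncEngineArgs

    engine_args = AsyncEngineArgs(
        model="Qwen/Qwen2-VL-2B-Instruct",  # illustrative multimodal model
        dtype="float16",
        disable_log_stats=True,
        limit_mm_per_prompt={"image": 2, "video": 2, "audio": 2},
    )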

0 commit comments