@@ -130,25 +130,28 @@ type LLMConfig struct {
130
130
TrimSpace []string `yaml:"trimspace"`
131
131
TrimSuffix []string `yaml:"trimsuffix"`
132
132
133
- ContextSize * int `yaml:"context_size"`
134
- NUMA bool `yaml:"numa"`
135
- LoraAdapter string `yaml:"lora_adapter"`
136
- LoraBase string `yaml:"lora_base"`
137
- LoraAdapters []string `yaml:"lora_adapters"`
138
- LoraScales []float32 `yaml:"lora_scales"`
139
- LoraScale float32 `yaml:"lora_scale"`
140
- NoMulMatQ bool `yaml:"no_mulmatq"`
141
- DraftModel string `yaml:"draft_model"`
142
- NDraft int32 `yaml:"n_draft"`
143
- Quantization string `yaml:"quantization"`
144
- LoadFormat string `yaml:"load_format"`
145
- GPUMemoryUtilization float32 `yaml:"gpu_memory_utilization"` // vLLM
146
- TrustRemoteCode bool `yaml:"trust_remote_code"` // vLLM
147
- EnforceEager bool `yaml:"enforce_eager"` // vLLM
148
- SwapSpace int `yaml:"swap_space"` // vLLM
149
- MaxModelLen int `yaml:"max_model_len"` // vLLM
150
- TensorParallelSize int `yaml:"tensor_parallel_size"` // vLLM
151
- MMProj string `yaml:"mmproj"`
133
+ ContextSize * int `yaml:"context_size"`
134
+ NUMA bool `yaml:"numa"`
135
+ LoraAdapter string `yaml:"lora_adapter"`
136
+ LoraBase string `yaml:"lora_base"`
137
+ LoraAdapters []string `yaml:"lora_adapters"`
138
+ LoraScales []float32 `yaml:"lora_scales"`
139
+ LoraScale float32 `yaml:"lora_scale"`
140
+ NoMulMatQ bool `yaml:"no_mulmatq"`
141
+ DraftModel string `yaml:"draft_model"`
142
+ NDraft int32 `yaml:"n_draft"`
143
+ Quantization string `yaml:"quantization"`
144
+ LoadFormat string `yaml:"load_format"`
145
+ GPUMemoryUtilization float32 `yaml:"gpu_memory_utilization"` // vLLM
146
+ TrustRemoteCode bool `yaml:"trust_remote_code"` // vLLM
147
+ EnforceEager bool `yaml:"enforce_eager"` // vLLM
148
+ SwapSpace int `yaml:"swap_space"` // vLLM
149
+ MaxModelLen int `yaml:"max_model_len"` // vLLM
150
+ TensorParallelSize int `yaml:"tensor_parallel_size"` // vLLM
151
+ DisableLogStatus bool `yaml:"disable_log_stats"` // vLLM
152
+ DType string `yaml:"dtype"` // vLLM
153
+ LimitMMPerPrompt LimitMMPerPrompt `yaml:"limit_mm_per_prompt"` // vLLM
154
+ MMProj string `yaml:"mmproj"`
152
155
153
156
FlashAttention bool `yaml:"flash_attention"`
154
157
NoKVOffloading bool `yaml:"no_kv_offloading"`
@@ -166,6 +169,13 @@ type LLMConfig struct {
166
169
CFGScale float32 `yaml:"cfg_scale"` // Classifier-Free Guidance Scale
167
170
}
168
171
172
+ // LimitMMPerPrompt holds the image, video and audio values for vLLM's limit-mm-per-prompt setting; LLMConfig reads it from the `limit_mm_per_prompt` YAML key.
173
+ type LimitMMPerPrompt struct {
174
+ LimitImagePerPrompt int `yaml:"image"` // Read from the `image` key. Presumably the maximum number of images per prompt; confirm against vLLM.
175
+ LimitVideoPerPrompt int `yaml:"video"` // Read from the `video` key. Presumably the maximum number of videos per prompt; confirm against vLLM.
176
+ LimitAudioPerPrompt int `yaml:"audio"` // Read from the `audio` key. Presumably the maximum number of audio inputs per prompt; confirm against vLLM.
177
+ }
178
+
169
179
// AutoGPTQ is a struct that holds the configuration specific to the AutoGPTQ backend
170
180
type AutoGPTQ struct {
171
181
ModelBaseName string `yaml:"model_base_name"`
0 commit comments