Config: Fix quantized cache naming scheme

kingbri1 · kingbri1 · commit fed4cb5a8bbd · 2025-03-11T23:17:33.000-04:00
The standard naming convention is "q8_0" rather than "Q8_0". Always
convert the string to lowercase, which avoids case mismatches, and
rewrite the enum to use llama.cpp's convention for quant naming.

Signed-off-by: kingbri <8082010+kingbri1@users.noreply.github.com>
diff --git a/bindings/types.ts b/bindings/types.ts
@@ -1,14 +1,13 @@
 // Subset for caching
 export enum GGMLType {
-    F32 = 0,
-    F16 = 1,
-    Q4_0 = 2,
-    Q4_1 = 3,
+    f32 = 0,
+    f16 = 1,
+    q4_0 = 2,
+    q4_1 = 3,
     // 4 and 5 were removed (Q4_2 and Q4_3)
-    Q5_0 = 6,
-    Q5_1 = 7,
-    Q8_0 = 8,
-    Q8_1 = 9,
+    q5_0 = 6,
+    q5_1 = 7,
+    q8_0 = 8,
 }
 
 export enum BindingFinishReason {
diff --git a/common/configModels.ts b/common/configModels.ts
@@ -27,15 +27,19 @@ export const ModelConfig = z.object({
     rope_freq_base: z.number().nullish().coalesce(0),
     enable_yarn: z.boolean().nullish().coalesce(false),
     cache_mode_k: z.union([
-        z.string().transform((str) => GGMLType[str as keyof typeof GGMLType]),
+        z.string().transform((str) =>
+            GGMLType[str.toLowerCase() as keyof typeof GGMLType]
+        ),
         z.number(),
     ])
-        .nullish().coalesce(GGMLType.F16),
+        .nullish().coalesce(GGMLType.f16),
     cache_mode_v: z.union([
-        z.string().transform((str) => GGMLType[str as keyof typeof GGMLType]),
+        z.string().transform((str) =>
+            GGMLType[str.toLowerCase() as keyof typeof GGMLType]
+        ),
         z.number(),
     ])
-        .nullish().coalesce(GGMLType.F16),
+        .nullish().coalesce(GGMLType.f16),
 });
 
 export type ModelConfig = z.infer<typeof ModelConfig>;
diff --git a/config_sample.yml b/config_sample.yml
@@ -59,7 +59,9 @@ model:
   enable_yarn: false
 
   # K cache quantization type (default: F16)
-  cache_mode_k: F16
+  # Possible values - f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0
+  cache_mode_k: f16
 
   # V cache quantization type (default: F16)
-  cache_mode_v: F16
+  # Possible values - f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0
+  cache_mode_v: f16