Commit b586060

Merge branch 'main' into do-pooling

2 parents c15d867 + fcfea66

File tree

4 files changed: +15 -8 lines changed


.github/workflows/test.yaml

Lines changed: 2 additions & 2 deletions
@@ -57,7 +57,7 @@ jobs:
 
   build-macos:
 
-    runs-on: macos-latest
+    runs-on: macos-13
     strategy:
       matrix:
         python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
@@ -107,7 +107,7 @@ jobs:
 
   build-macos-metal:
 
-    runs-on: macos-latest
+    runs-on: macos-13
 
     steps:
       - uses: actions/checkout@v3

llama_cpp/llama_cpp.py

Lines changed: 4 additions & 0 deletions
@@ -811,6 +811,7 @@ class llama_context_params(ctypes.Structure):
 # bool quantize_output_tensor; // quantize output.weight
 # bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
 # bool pure; // quantize all tensors to the default type
+# bool keep_split; // quantize to the same number of shards
 # void * imatrix; // pointer to importance matrix data
 # void * kv_overrides; // pointer to vector containing overrides
 # } llama_model_quantize_params;
@@ -826,6 +827,7 @@ class llama_model_quantize_params(ctypes.Structure):
         quantize_output_tensor (bool): quantize output.weight
         only_copy (bool): only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
         pure (bool): quantize all tensors to the default type
+        keep_split (bool): quantize to the same number of shards
         imatrix (ctypes.c_void_p): pointer to importance matrix data
         kv_overrides (ctypes.c_void_p): pointer to vector containing overrides
     """
@@ -839,6 +841,7 @@ class llama_model_quantize_params(ctypes.Structure):
     quantize_output_tensor: bool
     only_copy: bool
    pure: bool
+    keep_split: bool
     imatrix: ctypes.c_void_p
     kv_overrides: ctypes.c_void_p
 
@@ -851,6 +854,7 @@ class llama_model_quantize_params(ctypes.Structure):
         ("quantize_output_tensor", ctypes.c_bool),
         ("only_copy", ctypes.c_bool),
         ("pure", ctypes.c_bool),
+        ("keep_split", ctypes.c_bool),
         ("imatrix", ctypes.c_void_p),
         ("kv_overrides", ctypes.c_void_p),
     ]
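
All four hunks add the same keep_split field that upstream llama.cpp introduced for keeping quantized output sharded like the input. As a rough illustration, a caller could flip the flag on the defaults struct before invoking the low-level quantize binding. This is a minimal sketch assuming the existing llama_model_quantize_default_params / llama_model_quantize entry points; the file names are hypothetical:

import ctypes

import llama_cpp

# Start from the library defaults, then opt in to the new behaviour.
params = llama_cpp.llama_model_quantize_default_params()
params.ftype = llama_cpp.LLAMA_FTYPE_MOSTLY_Q4_K_M
params.keep_split = True  # write output with the same number of shards as the input

# Hypothetical paths; the C entry point takes bytes and a pointer to the params struct.
ret = llama_cpp.llama_model_quantize(
    b"model-00001-of-00003.gguf",
    b"model-q4_k_m.gguf",
    ctypes.byref(params),
)
if ret != 0:
    raise RuntimeError(f"llama_model_quantize failed with code {ret}")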

llama_cpp/server/settings.py

Lines changed: 8 additions & 5 deletions
@@ -2,8 +2,10 @@
 
 import multiprocessing
 
-from typing import Optional, List, Literal, Union
-from pydantic import Field, root_validator
+from typing import Optional, List, Literal, Union, Dict, cast
+from typing_extensions import Self
+
+from pydantic import Field, model_validator
 from pydantic_settings import BaseSettings
 
 import llama_cpp
@@ -173,15 +175,16 @@ class ModelSettings(BaseSettings):
         default=True, description="Whether to print debug information."
     )
 
-    @root_validator(pre=True) # pre=True to ensure this runs before any other validation
-    def set_dynamic_defaults(cls, values):
+    @model_validator(mode="before") # pre=True to ensure this runs before any other validation
+    def set_dynamic_defaults(self) -> Self:
         # If n_threads or n_threads_batch is -1, set it to multiprocessing.cpu_count()
         cpu_count = multiprocessing.cpu_count()
+        values = cast(Dict[str, int], self)
         if values.get('n_threads', 0) == -1:
             values['n_threads'] = cpu_count
         if values.get('n_threads_batch', 0) == -1:
             values['n_threads_batch'] = cpu_count
-        return values
+        return self
 
 
 class ServerSettings(BaseSettings):
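
The validator change tracks pydantic v2: root_validator(pre=True) is replaced by model_validator(mode="before"), which hands the validator the raw, unvalidated input (typically a dict) rather than a model instance, hence the cast of self to Dict[str, int] before mutating it. A self-contained sketch of the same pattern, using a hypothetical ThreadSettings model and a fixed stand-in for multiprocessing.cpu_count():

from typing import Dict, cast

from typing_extensions import Self

from pydantic import Field, model_validator
from pydantic_settings import BaseSettings


class ThreadSettings(BaseSettings):
    n_threads: int = Field(default=-1, description="-1 means autodetect")

    @model_validator(mode="before")
    def set_dynamic_defaults(self) -> Self:
        # In "before" mode `self` is actually the raw input mapping,
        # which is why the cast mirrors the one in settings.py.
        values = cast(Dict[str, int], self)
        if values.get("n_threads", 0) == -1:
            values["n_threads"] = 8  # stand-in for multiprocessing.cpu_count()
        return self


print(ThreadSettings(n_threads=-1).n_threads)  # prints 8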

vendor/llama.cpp

Lines changed: 1 addition & 1 deletion (submodule commit pointer updated; the commit totals of +15 -8 leave exactly one changed line for this entry)
