Commit b586060

Merge branch 'main' into do-pooling

2 parents c15d867 + fcfea66

File tree

4 files changed: +15 -8 lines changed


.github/workflows/test.yaml

Lines changed: 2 additions & 2 deletions
@@ -57,7 +57,7 @@ jobs:
 
   build-macos:
 
-    runs-on: macos-latest
+    runs-on: macos-13
     strategy:
       matrix:
         python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
@@ -107,7 +107,7 @@ jobs:
 
   build-macos-metal:
 
-    runs-on: macos-latest
+    runs-on: macos-13
 
     steps:
       - uses: actions/checkout@v3

llama_cpp/llama_cpp.py

Lines changed: 4 additions & 0 deletions
@@ -811,6 +811,7 @@ class llama_context_params(ctypes.Structure):
 # bool quantize_output_tensor; // quantize output.weight
 # bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
 # bool pure; // quantize all tensors to the default type
+# bool keep_split; // quantize to the same number of shards
 # void * imatrix; // pointer to importance matrix data
 # void * kv_overrides; // pointer to vector containing overrides
 # } llama_model_quantize_params;
@@ -826,6 +827,7 @@ class llama_model_quantize_params(ctypes.Structure):
         quantize_output_tensor (bool): quantize output.weight
         only_copy (bool): only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
         pure (bool): quantize all tensors to the default type
+        keep_split (bool): quantize to the same number of shards
         imatrix (ctypes.c_void_p): pointer to importance matrix data
         kv_overrides (ctypes.c_void_p): pointer to vector containing overrides
     """
@@ -839,6 +841,7 @@ class llama_model_quantize_params(ctypes.Structure):
     quantize_output_tensor: bool
     only_copy: bool
    pure: bool
+    keep_split: bool
     imatrix: ctypes.c_void_p
     kv_overrides: ctypes.c_void_p
 
@@ -851,6 +854,7 @@ class llama_model_quantize_params(ctypes.Structure):
         ("quantize_output_tensor", ctypes.c_bool),
         ("only_copy", ctypes.c_bool),
         ("pure", ctypes.c_bool),
+        ("keep_split", ctypes.c_bool),
         ("imatrix", ctypes.c_void_p),
         ("kv_overrides", ctypes.c_void_p),
     ]
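
All four hunks add the same keep_split field that upstream llama.cpp introduced for keeping quantized output sharded like the input. As a rough illustration, a caller could flip the flag on the defaults struct before invoking the low-level quantize binding. This is a minimal sketch assuming the existing llama_model_quantize_default_params / llama_model_quantize entry points; the file names are hypothetical:

import ctypes

import llama_cpp

# Start from the library defaults, then opt in to the new behaviour.
params = llama_cpp.llama_model_quantize_default_params()
params.ftype = llama_cpp.LLAMA_FTYPE_MOSTLY_Q4_K_M
params.keep_split = True  # write output with the same number of shards as the input

# Hypothetical paths; the C entry point takes bytes and a pointer to the params struct.
ret = llama_cpp.llama_model_quantize(
    b"model-00001-of-00003.gguf",
    b"model-q4_k_m.gguf",
    ctypes.byref(params),
)
if ret != 0:
    raise RuntimeError(f"llama_model_quantize failed with code {ret}")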

llama_cpp/server/settings.py

Lines changed: 8 additions & 5 deletions
@@ -2,8 +2,10 @@
 
 import multiprocessing
 
-from typing import Optional, List, Literal, Union
-from pydantic import Field, root_validator
+from typing import Optional, List, Literal, Union, Dict, cast
+from typing_extensions import Self
+
+from pydantic import Field, model_validator
 from pydantic_settings import BaseSettings
 
 import llama_cpp
@@ -173,15 +175,16 @@ class ModelSettings(BaseSettings):
         default=True, description="Whether to print debug information."
     )
 
-    @root_validator(pre=True) # pre=True to ensure this runs before any other validation
-    def set_dynamic_defaults(cls, values):
+    @model_validator(mode="before") # pre=True to ensure this runs before any other validation
+    def set_dynamic_defaults(self) -> Self:
         # If n_threads or n_threads_batch is -1, set it to multiprocessing.cpu_count()
         cpu_count = multiprocessing.cpu_count()
+        values = cast(Dict[str, int], self)
         if values.get('n_threads', 0) == -1:
             values['n_threads'] = cpu_count
         if values.get('n_threads_batch', 0) == -1:
             values['n_threads_batch'] = cpu_count
-        return values
+        return self
 
 
 class ServerSettings(BaseSettings):
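
The validator change tracks pydantic v2: root_validator(pre=True) is replaced by model_validator(mode="before"), which hands the validator the raw, unvalidated input (typically a dict) rather than a model instance, hence the cast of self to Dict[str, int] before mutating it. A self-contained sketch of the same pattern, using a hypothetical ThreadSettings model and a fixed stand-in for multiprocessing.cpu_count():

from typing import Dict, cast

from typing_extensions import Self

from pydantic import Field, model_validator
from pydantic_settings import BaseSettings


class ThreadSettings(BaseSettings):
    n_threads: int = Field(default=-1, description="-1 means autodetect")

    @model_validator(mode="before")
    def set_dynamic_defaults(self) -> Self:
        # In "before" mode `self` is actually the raw input mapping,
        # which is why the cast mirrors the one in settings.py.
        values = cast(Dict[str, int], self)
        if values.get("n_threads", 0) == -1:
            values["n_threads"] = 8  # stand-in for multiprocessing.cpu_count()
        return self


print(ThreadSettings(n_threads=-1).n_threads)  # prints 8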

vendor/llama.cpp

Lines changed: 1 addition & 1 deletion (submodule commit pointer updated; the commit totals of +15 -8 leave exactly one changed line for this entry)
