# LLAMA_VOCAB_PRE_TYPE_SUPERBPE = 30,
# LLAMA_VOCAB_PRE_TYPE_TRILLION = 31,
# LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32,
+ # LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33,
+ # LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34,
# };
LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0
LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1
LLAMA_VOCAB_PRE_TYPE_DBRX = 13
LLAMA_VOCAB_PRE_TYPE_SMAUG = 14
LLAMA_VOCAB_PRE_TYPE_PORO = 15
- LLAMA_VOCAV_PRE_TYPE_CHATGLM3 = 16
+ LLAMA_VOCAB_PRE_TYPE_CHATGLM3 = 16
LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17
LLAMA_VOCAB_PRE_TYPE_VIKING = 18
LLAMA_VOCAB_PRE_TYPE_JAIS = 19
LLAMA_VOCAB_PRE_TYPE_SUPERBPE = 30
LLAMA_VOCAB_PRE_TYPE_TRILLION = 31
LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32
+ LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33
+ LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34

# // note: these values should be synchronized with ggml_rope
@@ -891,17 +895,18 @@ class llama_context_params(ctypes.Structure):
# // model quantization parameters
# typedef struct llama_model_quantize_params {
- # int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
- # enum llama_ftype ftype; // quantize to this llama_ftype
- # enum ggml_type output_tensor_type; // output tensor type
- # enum ggml_type token_embedding_type; // token embeddings tensor type
- # bool allow_requantize; // allow quantizing non-f32/f16 tensors
- # bool quantize_output_tensor; // quantize output.weight
- # bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
- # bool pure; // quantize all tensors to the default type
- # bool keep_split; // quantize to the same number of shards
- # void * imatrix; // pointer to importance matrix data
- # void * kv_overrides; // pointer to vector containing overrides
+ # int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
+ # enum llama_ftype ftype; // quantize to this llama_ftype
+ # enum ggml_type output_tensor_type; // output tensor type
+ # enum ggml_type token_embedding_type; // token embeddings tensor type
+ # bool allow_requantize; // allow quantizing non-f32/f16 tensors
+ # bool quantize_output_tensor; // quantize output.weight
+ # bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
+ # bool pure; // quantize all tensors to the default type
+ # bool keep_split; // quantize to the same number of shards
+ # void * imatrix; // pointer to importance matrix data
+ # void * kv_overrides; // pointer to vector containing overrides
+ # void * tensor_types; // pointer to vector containing tensor types
# } llama_model_quantize_params;
class llama_model_quantize_params(ctypes.Structure):
"""Parameters for llama_model_quantize
@@ -918,6 +923,7 @@ class llama_model_quantize_params(ctypes.Structure):
keep_split (bool): quantize to the same number of shards
imatrix (ctypes.c_void_p): pointer to importance matrix data
kv_overrides (ctypes.c_void_p): pointer to vector containing overrides
+ tensor_types (ctypes.c_void_p): pointer to vector containing tensor types
"""

if TYPE_CHECKING:
@@ -932,6 +938,7 @@ class llama_model_quantize_params(ctypes.Structure):
keep_split: bool
imatrix: ctypes.c_void_p
kv_overrides: ctypes.c_void_p
+ tensor_types: ctypes.c_void_p

_fields_ = [
("nthread", ctypes.c_int32),
@@ -945,6 +952,7 @@ class llama_model_quantize_params(ctypes.Structure):
("keep_split", ctypes.c_bool),
("imatrix", ctypes.c_void_p),
("kv_overrides", ctypes.c_void_p),
+ ("tensor_types", ctypes.c_void_p),
]

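With the new `tensor_types` member in place, the struct is populated the same way as before. A minimal usage sketch, assuming the `llama_model_quantize_default_params()` and `llama_model_quantize()` bindings defined elsewhere in this module (they mirror the upstream llama.h API); the file names and ftype choice are illustrative only:

```python
import ctypes

import llama_cpp

# Start from the library defaults instead of zero-filling the struct by hand.
params = llama_cpp.llama_model_quantize_default_params()
params.nthread = 0  # <= 0: llama.cpp falls back to std::thread::hardware_concurrency()
params.ftype = llama_cpp.LLAMA_FTYPE_MOSTLY_Q4_K_M
params.quantize_output_tensor = True
params.tensor_types = None  # NULL: no per-tensor type overrides (the new field)

# The binding takes bytes paths and a pointer to the params struct.
llama_cpp.llama_model_quantize(
    b"model-f16.gguf",
    b"model-q4_k_m.gguf",
    ctypes.byref(params),
)
```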
@@ -3812,6 +3820,7 @@ def llama_sampler_init_softmax() -> llama_sampler_p:
# /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
+ # /// Setting k <= 0 makes this a noop
# LLAMA_API struct llama_sampler * llama_sampler_init_top_k (int32_t k);
@ctypes_function("llama_sampler_init_top_k", [ctypes.c_int32], llama_sampler_p_ctypes)
def llama_sampler_init_top_k(k: int) -> llama_sampler_p:
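As a rough usage sketch of the binding above: a top-k sampler is normally added to a sampler chain. This assumes the sampler-chain helpers (`llama_sampler_chain_default_params`, `llama_sampler_chain_init`, `llama_sampler_chain_add`, `llama_sampler_init_dist`, `llama_sampler_sample`, `llama_sampler_free`) that this module also binds from upstream llama.cpp; the k and seed values are arbitrary:

```python
import llama_cpp

# Build a sampler chain; the chain takes ownership of samplers added to it,
# so only the chain itself needs to be freed.
chain = llama_cpp.llama_sampler_chain_init(
    llama_cpp.llama_sampler_chain_default_params()
)
llama_cpp.llama_sampler_chain_add(chain, llama_cpp.llama_sampler_init_top_k(40))
llama_cpp.llama_sampler_chain_add(chain, llama_cpp.llama_sampler_init_dist(1234))
# Passing k <= 0 to llama_sampler_init_top_k would make that step a no-op,
# per the comment added in this change.

# ... during decoding: token = llama_cpp.llama_sampler_sample(chain, ctx, -1) ...

llama_cpp.llama_sampler_free(chain)
```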