
Commit 4c6514d

feat: Update llama.cpp
1 parent: 99f2ebf

3 files changed (+23, -14 lines)

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -143,7 +143,7 @@ if (LLAMA_BUILD)
     endif()
 
     # Building llava
-    add_subdirectory(vendor/llama.cpp/examples/llava)
+    add_subdirectory(vendor/llama.cpp/tools/mtmd)
     set_target_properties(llava_shared PROPERTIES OUTPUT_NAME "llava")
 
     if (WIN32)

llama_cpp/llama_cpp.py

Lines changed: 21 additions & 12 deletions
@@ -235,6 +235,8 @@
 #     LLAMA_VOCAB_PRE_TYPE_SUPERBPE       = 30,
 #     LLAMA_VOCAB_PRE_TYPE_TRILLION       = 31,
 #     LLAMA_VOCAB_PRE_TYPE_BAILINGMOE     = 32,
+#     LLAMA_VOCAB_PRE_TYPE_LLAMA4         = 33,
+#     LLAMA_VOCAB_PRE_TYPE_PIXTRAL        = 34,
 # };
 LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0
 LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1
@@ -252,7 +254,7 @@
 LLAMA_VOCAB_PRE_TYPE_DBRX = 13
 LLAMA_VOCAB_PRE_TYPE_SMAUG = 14
 LLAMA_VOCAB_PRE_TYPE_PORO = 15
-LLAMA_VOCAV_PRE_TYPE_CHATGLM3 = 16
+LLAMA_VOCAB_PRE_TYPE_CHATGLM3 = 16
 LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17
 LLAMA_VOCAB_PRE_TYPE_VIKING = 18
 LLAMA_VOCAB_PRE_TYPE_JAIS = 19
@@ -269,6 +271,8 @@
 LLAMA_VOCAB_PRE_TYPE_SUPERBPE = 30
 LLAMA_VOCAB_PRE_TYPE_TRILLION = 31
 LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32
+LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33
+LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34
 
 
 # // note: these values should be synchronized with ggml_rope
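
For orientation, a minimal sketch (not part of the commit) of how downstream code might branch on the two pre-tokenizer constants added above, assuming they are imported from llama_cpp.llama_cpp as shown in this file:

# Sketch: map the newly added pre-tokenizer types to readable names.
from llama_cpp.llama_cpp import (
    LLAMA_VOCAB_PRE_TYPE_LLAMA4,
    LLAMA_VOCAB_PRE_TYPE_PIXTRAL,
)

def pre_type_name(pre_type: int) -> str:
    # The constants are plain module-level integers (33 and 34).
    if pre_type == LLAMA_VOCAB_PRE_TYPE_LLAMA4:
        return "llama4"
    if pre_type == LLAMA_VOCAB_PRE_TYPE_PIXTRAL:
        return "pixtral"
    return "other"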
@@ -891,17 +895,18 @@ class llama_context_params(ctypes.Structure):
 
 # // model quantization parameters
 # typedef struct llama_model_quantize_params {
-#     int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
-#     enum llama_ftype ftype; // quantize to this llama_ftype
-#     enum ggml_type output_tensor_type; // output tensor type
-#     enum ggml_type token_embedding_type; // token embeddings tensor type
-#     bool allow_requantize; // allow quantizing non-f32/f16 tensors
-#     bool quantize_output_tensor; // quantize output.weight
-#     bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
-#     bool pure; // quantize all tensors to the default type
-#     bool keep_split; // quantize to the same number of shards
-#     void * imatrix; // pointer to importance matrix data
-#     void * kv_overrides; // pointer to vector containing overrides
+#     int32_t nthread;                     // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
+#     enum llama_ftype ftype;              // quantize to this llama_ftype
+#     enum ggml_type output_tensor_type;   // output tensor type
+#     enum ggml_type token_embedding_type; // token embeddings tensor type
+#     bool allow_requantize;               // allow quantizing non-f32/f16 tensors
+#     bool quantize_output_tensor;         // quantize output.weight
+#     bool only_copy;                      // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
+#     bool pure;                           // quantize all tensors to the default type
+#     bool keep_split;                     // quantize to the same number of shards
+#     void * imatrix;                      // pointer to importance matrix data
+#     void * kv_overrides;                 // pointer to vector containing overrides
+#     void * tensor_types;                 // pointer to vector containing tensor types
 # } llama_model_quantize_params;
 class llama_model_quantize_params(ctypes.Structure):
     """Parameters for llama_model_quantize
@@ -918,6 +923,7 @@ class llama_model_quantize_params(ctypes.Structure):
         keep_split (bool): quantize to the same number of shards
         imatrix (ctypes.c_void_p): pointer to importance matrix data
         kv_overrides (ctypes.c_void_p): pointer to vector containing overrides
+        tensor_types (ctypes.c_void_p): pointer to vector containing tensor types
     """
 
     if TYPE_CHECKING:
@@ -932,6 +938,7 @@ class llama_model_quantize_params(ctypes.Structure):
         keep_split: bool
         imatrix: ctypes.c_void_p
         kv_overrides: ctypes.c_void_p
+        tensor_types: ctypes.c_void_p
 
     _fields_ = [
         ("nthread", ctypes.c_int32),
@@ -945,6 +952,7 @@ class llama_model_quantize_params(ctypes.Structure):
         ("keep_split", ctypes.c_bool),
         ("imatrix", ctypes.c_void_p),
         ("kv_overrides", ctypes.c_void_p),
+        ("tensor_types", ctypes.c_void_p),
     ]
 
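To see the new field end to end, a minimal usage sketch (not part of the commit): constructing the updated struct from Python and leaving the new tensor_types pointer NULL, the analogue of passing nullptr from C. Fields not passed are zero-initialized by ctypes.

from llama_cpp.llama_cpp import llama_model_quantize_params

# Sketch: ctypes.Structure accepts keyword arguments matching _fields_.
params = llama_model_quantize_params(
    nthread=0,               # <= 0 means use std::thread::hardware_concurrency()
    quantize_output_tensor=True,
    imatrix=None,            # c_void_p fields accept None as a NULL pointer
    kv_overrides=None,
    tensor_types=None,       # the field added in this commit
)
print(params.tensor_types)   # -> None, i.e. a NULL pointer
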
@@ -3812,6 +3820,7 @@ def llama_sampler_init_softmax() -> llama_sampler_p:
 
 
 # /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
+# /// Setting k <= 0 makes this a noop
 # LLAMA_API struct llama_sampler * llama_sampler_init_top_k (int32_t k);
 @ctypes_function("llama_sampler_init_top_k", [ctypes.c_int32], llama_sampler_p_ctypes)
 def llama_sampler_init_top_k(k: int) -> llama_sampler_p:
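
A short usage sketch (not part of the commit), assuming an installed llama_cpp whose shared library loads on import; per the comment added above, k <= 0 turns the sampler into a no-op:

from llama_cpp import llama_cpp

sampler = llama_cpp.llama_sampler_init_top_k(40)  # keep the 40 most probable tokens
noop = llama_cpp.llama_sampler_init_top_k(0)      # k <= 0: sampler does nothing

# Samplers are heap-allocated by the C library and must be freed explicitly.
llama_cpp.llama_sampler_free(sampler)
llama_cpp.llama_sampler_free(noop)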

vendor/llama.cpp

Submodule commit reference updated (1 addition & 1 deletion).
