Commit 255e1b4

feat: Update llama.cpp
1 parent d634efc commit 255e1b4

File tree

3 files changed: +38 -103 lines changed


llama_cpp/_internals.py

Lines changed: 2 additions & 2 deletions
@@ -128,9 +128,9 @@ def token_get_score(self, token: int) -> float:
         assert self.model is not None
         return llama_cpp.llama_token_get_score(self.model, token)
 
-    def token_get_type(self, token: int) -> int:
+    def token_get_attr(self, token: int) -> int:
         assert self.model is not None
-        return llama_cpp.llama_token_get_type(self.model, token)
+        return llama_cpp.llama_token_get_attr(self.model, token)
 
     # Special tokens
 
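
For callers of the internal wrapper, only the method name changes here; note that the returned int is now a bitmask of the new LLAMA_TOKEN_ATTR_* flags rather than a single token-type value. A minimal sketch of the renamed call, assuming an already-constructed internal model wrapper named model (the variable name is illustrative):

    # Query the attribute bitmask for token id 0 (an arbitrary choice).
    attr = model.token_get_attr(0)
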

llama_cpp/llama_cpp.py

Lines changed: 35 additions & 100 deletions
@@ -333,7 +333,7 @@ def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCDa
 LLAMA_ROPE_TYPE_GLM = 4
 
 
-# enum llama_token_type {
+# enum llama_token_type { //TODO: remove, required until per token attributes are available from GGUF file
 # LLAMA_TOKEN_TYPE_UNDEFINED = 0,
 # LLAMA_TOKEN_TYPE_NORMAL = 1,
 # LLAMA_TOKEN_TYPE_UNKNOWN = 2,
@@ -351,6 +351,32 @@ def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCDa
 LLAMA_TOKEN_TYPE_BYTE = 6
 
 
+# enum llama_token_attr {
+# LLAMA_TOKEN_ATTR_UNDEFINED = 0,
+# LLAMA_TOKEN_ATTR_UNKNOWN = 1 << 0,
+# LLAMA_TOKEN_ATTR_UNUSED = 1 << 1,
+# LLAMA_TOKEN_ATTR_NORMAL = 1 << 2,
+# LLAMA_TOKEN_ATTR_CONTROL = 1 << 3, // SPECIAL?
+# LLAMA_TOKEN_ATTR_USER_DEFINED = 1 << 4,
+# LLAMA_TOKEN_ATTR_BYTE = 1 << 5,
+# LLAMA_TOKEN_ATTR_NORMALIZED = 1 << 6,
+# LLAMA_TOKEN_ATTR_LSTRIP = 1 << 7,
+# LLAMA_TOKEN_ATTR_RSTRIP = 1 << 8,
+# LLAMA_TOKEN_ATTR_SINGLE_WORD = 1 << 9,
+# };
+LLAMA_TOKEN_ATTR_UNDEFINED = 0
+LLAMA_TOKEN_ATTR_UNKNOWN = 1 << 0
+LLAMA_TOKEN_ATTR_UNUSED = 1 << 1
+LLAMA_TOKEN_ATTR_NORMAL = 1 << 2
+LLAMA_TOKEN_ATTR_CONTROL = 1 << 3
+LLAMA_TOKEN_ATTR_USER_DEFINED = 1 << 4
+LLAMA_TOKEN_ATTR_BYTE = 1 << 5
+LLAMA_TOKEN_ATTR_NORMALIZED = 1 << 6
+LLAMA_TOKEN_ATTR_LSTRIP = 1 << 7
+LLAMA_TOKEN_ATTR_RSTRIP = 1 << 8
+LLAMA_TOKEN_ATTR_SINGLE_WORD = 1 << 9
+
+
 # // model file types
 # enum llama_ftype {
 # LLAMA_FTYPE_ALL_F32 = 0,
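
Unlike the old exclusive llama_token_type values, llama_token_attr is a bit field, so one token can carry several attributes at once. A minimal sketch of testing flags with bitwise AND, using only the constants defined above:

    import llama_cpp

    # Example bitmask: a control token that also right-strips whitespace.
    attr = llama_cpp.LLAMA_TOKEN_ATTR_CONTROL | llama_cpp.LLAMA_TOKEN_ATTR_RSTRIP
    assert attr & llama_cpp.LLAMA_TOKEN_ATTR_CONTROL      # flag is set
    assert not (attr & llama_cpp.LLAMA_TOKEN_ATTR_BYTE)   # flag is not set
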
@@ -959,6 +985,9 @@ class llama_model_quantize_params(ctypes.Structure):
 # // modifies a preceding LLAMA_GRETYPE_CHAR or
 # // LLAMA_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
 # LLAMA_GRETYPE_CHAR_ALT = 6,
+
+# // any character (.)
+# LLAMA_GRETYPE_CHAR_ANY = 7,
 # };
 LLAMA_GRETYPE_END = 0
 LLAMA_GRETYPE_ALT = 1
@@ -967,6 +996,7 @@ class llama_model_quantize_params(ctypes.Structure):
 LLAMA_GRETYPE_CHAR_NOT = 4
 LLAMA_GRETYPE_CHAR_RNG_UPPER = 5
 LLAMA_GRETYPE_CHAR_ALT = 6
+LLAMA_GRETYPE_CHAR_ANY = 7
 
 
 # typedef struct llama_grammar_element {
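
LLAMA_GRETYPE_CHAR_ANY corresponds to the "." (any character) wildcard in GBNF grammars. A minimal sketch of building one grammar element with it, assuming the llama_grammar_element ctypes structure declared just below this hunk; for CHAR_ANY the value field is unused, so 0 is a placeholder:

    import llama_cpp

    # One grammar element matching any character (the GBNF ".").
    elem = llama_cpp.llama_grammar_element(
        type=llama_cpp.LLAMA_GRETYPE_CHAR_ANY, value=0
    )
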
@@ -2438,11 +2468,11 @@ def llama_token_get_score(
 ) -> float: ...
 
 
-# LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token);
+# LLAMA_API enum llama_token_attr llama_token_get_attr(const struct llama_model * model, llama_token token);
 @ctypes_function(
-    "llama_token_get_type", [llama_model_p_ctypes, llama_token], ctypes.c_int
+    "llama_token_get_attr", [llama_model_p_ctypes, llama_token], ctypes.c_int
 )
-def llama_token_get_type(
+def llama_token_get_attr(
     model: llama_model_p, token: Union[llama_token, int], /
 ) -> int: ...
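
At the C-API level the call shape is unchanged by the rename. A minimal usage sketch, assuming model is a valid llama_model_p previously obtained from llama_cpp.llama_load_model_from_file (loading omitted for brevity):

    import llama_cpp

    # Returns the llama_token_attr bitmask for a token id (0 is arbitrary).
    attr = llama_cpp.llama_token_get_attr(model, 0)
    if attr & llama_cpp.LLAMA_TOKEN_ATTR_CONTROL:
        print("token 0 is a control token")
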

@@ -3200,104 +3230,9 @@ def llama_grammar_accept_token(
 
 
 # //
-# // Beam search
+# // Model split
 # //
 
-# struct llama_beam_view {
-# const llama_token * tokens;
-
-
-# size_t n_tokens;
-# float p; // Cumulative beam probability (renormalized relative to all beams)
-# bool eob; // Callback should set this to true when a beam is at end-of-beam.
-# };
-class llama_beam_view(ctypes.Structure):
-    if TYPE_CHECKING:
-        tokens: CtypesArray[llama_token]
-        n_tokens: int
-        p: float
-        eob: bool
-
-    _fields_ = [
-        ("tokens", llama_token_p),
-        ("n_tokens", ctypes.c_size_t),
-        ("p", ctypes.c_float),
-        ("eob", ctypes.c_bool),
-    ]
-
-
-# // Passed to beam_search_callback function.
-# // Whenever 0 < common_prefix_length, this number of tokens should be copied from any of the beams
-# // (e.g. beams[0]) as they will be removed (shifted) from all beams in all subsequent callbacks.
-# // These pointers are valid only during the synchronous callback, so should not be saved.
-# struct llama_beams_state {
-# struct llama_beam_view * beam_views;
-# size_t n_beams; // Number of elements in beam_views[].
-# size_t common_prefix_length; // Current max length of prefix tokens shared by all beams.
-# bool last_call; // True iff this is the last callback invocation.
-# };
-class llama_beams_state(ctypes.Structure):
-    if TYPE_CHECKING:
-        beam_views: CtypesArray[llama_beam_view]
-        n_beams: int
-        common_prefix_length: int
-        last_call: bool
-
-    _fields_ = [
-        ("beam_views", ctypes.POINTER(llama_beam_view)),
-        ("n_beams", ctypes.c_size_t),
-        ("common_prefix_length", ctypes.c_size_t),
-        ("last_call", ctypes.c_bool),
-    ]
-
-
-# // Type of pointer to the beam_search_callback function.
-# // void* callback_data is any custom data passed to llama_beam_search, that is subsequently
-# // passed back to beam_search_callback. This avoids having to use global variables in the callback.
-# typedef void (*llama_beam_search_callback_fn_t)(void * callback_data, struct llama_beams_state);
-llama_beam_search_callback_fn_t = ctypes.CFUNCTYPE(
-    None, ctypes.c_void_p, llama_beams_state
-)
-
-
-# /// @details Deterministically returns entire sentence constructed by a beam search.
-# /// @param ctx Pointer to the llama_context.
-# /// @param callback Invoked for each iteration of the beam_search loop, passing in beams_state.
-# /// @param callback_data A pointer that is simply passed back to callback.
-# /// @param n_beams Number of beams to use.
-# /// @param n_past Number of tokens already evaluated.
-# /// @param n_predict Maximum number of tokens to predict. EOS may occur earlier.
-# /// @param n_threads Number of threads as passed to llama_eval().
-# LLAMA_API void llama_beam_search(
-# struct llama_context * ctx,
-# llama_beam_search_callback_fn_t callback,
-# void * callback_data,
-# size_t n_beams,
-# int32_t n_past,
-# int32_t n_predict);
-@ctypes_function(
-    "llama_beam_search",
-    [
-        llama_context_p_ctypes,
-        llama_beam_search_callback_fn_t,
-        ctypes.c_void_p,
-        ctypes.c_size_t,
-        ctypes.c_int32,
-        ctypes.c_int32,
-    ],
-    None,
-)
-def llama_beam_search(
-    ctx: llama_context_p,
-    callback: CtypesFuncPointer,
-    callback_data: ctypes.c_void_p,
-    n_beams: Union[ctypes.c_size_t, int],
-    n_past: Union[ctypes.c_int, int],
-    n_predict: Union[ctypes.c_int, int],
-    /,
-): ...
-
-
 # /// @details Build a split GGUF final path for this chunk.
 # /// llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4) => split_path = "/models/ggml-model-q4_0-00002-of-00004.gguf"
 # // Returns the split_path length.
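
The llama_split_path helper referenced in this context writes a shard path into a caller-provided buffer and returns its length. A minimal sketch of calling the binding, assuming it is exposed as llama_cpp.llama_split_path with the (split_path, maxlen, path_prefix, split_no, split_count) argument order the comment suggests:

    import ctypes

    import llama_cpp

    buf = ctypes.create_string_buffer(256)
    # Per the comment above, this should produce
    # b"/models/ggml-model-q4_0-00002-of-00004.gguf".
    n = llama_cpp.llama_split_path(buf, 256, b"/models/ggml-model-q4_0", 2, 4)
    print(buf.value, n)
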

vendor/llama.cpp (submodule update)
