@@ -333,7 +333,7 @@ def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCDa
LLAMA_ROPE_TYPE_GLM = 4


-# enum llama_token_type {
+# enum llama_token_type { //TODO: remove, required until per token attributes are available from GGUF file
#     LLAMA_TOKEN_TYPE_UNDEFINED = 0,
#     LLAMA_TOKEN_TYPE_NORMAL = 1,
#     LLAMA_TOKEN_TYPE_UNKNOWN = 2,
@@ -351,6 +351,32 @@ def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCDa
LLAMA_TOKEN_TYPE_BYTE = 6


+# enum llama_token_attr {
+#     LLAMA_TOKEN_ATTR_UNDEFINED = 0,
+#     LLAMA_TOKEN_ATTR_UNKNOWN = 1 << 0,
+#     LLAMA_TOKEN_ATTR_UNUSED = 1 << 1,
+#     LLAMA_TOKEN_ATTR_NORMAL = 1 << 2,
+#     LLAMA_TOKEN_ATTR_CONTROL = 1 << 3,  // SPECIAL?
+#     LLAMA_TOKEN_ATTR_USER_DEFINED = 1 << 4,
+#     LLAMA_TOKEN_ATTR_BYTE = 1 << 5,
+#     LLAMA_TOKEN_ATTR_NORMALIZED = 1 << 6,
+#     LLAMA_TOKEN_ATTR_LSTRIP = 1 << 7,
+#     LLAMA_TOKEN_ATTR_RSTRIP = 1 << 8,
+#     LLAMA_TOKEN_ATTR_SINGLE_WORD = 1 << 9,
+# };
+LLAMA_TOKEN_ATTR_UNDEFINED = 0
+LLAMA_TOKEN_ATTR_UNKNOWN = 1 << 0
+LLAMA_TOKEN_ATTR_UNUSED = 1 << 1
+LLAMA_TOKEN_ATTR_NORMAL = 1 << 2
+LLAMA_TOKEN_ATTR_CONTROL = 1 << 3
+LLAMA_TOKEN_ATTR_USER_DEFINED = 1 << 4
+LLAMA_TOKEN_ATTR_BYTE = 1 << 5
+LLAMA_TOKEN_ATTR_NORMALIZED = 1 << 6
+LLAMA_TOKEN_ATTR_LSTRIP = 1 << 7
+LLAMA_TOKEN_ATTR_RSTRIP = 1 << 8
+LLAMA_TOKEN_ATTR_SINGLE_WORD = 1 << 9
+
+
# // model file types
# enum llama_ftype {
#     LLAMA_FTYPE_ALL_F32 = 0,
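Unlike the old llama_token_type values, the new token attributes are bit flags, so a single token's attribute word can carry several of them at once. A minimal sketch of decoding such a mask with the constants added above (the helper name and the sample combination are illustrative, not part of the bindings):

# Hypothetical helper: list the names of the LLAMA_TOKEN_ATTR_* flags set in a mask.
_TOKEN_ATTR_NAMES = {
    LLAMA_TOKEN_ATTR_UNKNOWN: "UNKNOWN",
    LLAMA_TOKEN_ATTR_UNUSED: "UNUSED",
    LLAMA_TOKEN_ATTR_NORMAL: "NORMAL",
    LLAMA_TOKEN_ATTR_CONTROL: "CONTROL",
    LLAMA_TOKEN_ATTR_USER_DEFINED: "USER_DEFINED",
    LLAMA_TOKEN_ATTR_BYTE: "BYTE",
    LLAMA_TOKEN_ATTR_NORMALIZED: "NORMALIZED",
    LLAMA_TOKEN_ATTR_LSTRIP: "LSTRIP",
    LLAMA_TOKEN_ATTR_RSTRIP: "RSTRIP",
    LLAMA_TOKEN_ATTR_SINGLE_WORD: "SINGLE_WORD",
}

def token_attr_names(attr: int) -> list:
    # UNDEFINED is 0 (no flags set), so it needs a special case.
    if attr == LLAMA_TOKEN_ATTR_UNDEFINED:
        return ["UNDEFINED"]
    return [name for flag, name in _TOKEN_ATTR_NAMES.items() if attr & flag]

# token_attr_names(LLAMA_TOKEN_ATTR_CONTROL | LLAMA_TOKEN_ATTR_RSTRIP) -> ["CONTROL", "RSTRIP"]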
@@ -959,6 +985,9 @@ class llama_model_quantize_params(ctypes.Structure):
#     // modifies a preceding LLAMA_GRETYPE_CHAR or
#     // LLAMA_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
#     LLAMA_GRETYPE_CHAR_ALT = 6,
+
+#     // any character (.)
+#     LLAMA_GRETYPE_CHAR_ANY = 7,
# };
LLAMA_GRETYPE_END = 0
LLAMA_GRETYPE_ALT = 1
@@ -967,6 +996,7 @@ class llama_model_quantize_params(ctypes.Structure):
LLAMA_GRETYPE_CHAR_NOT = 4
LLAMA_GRETYPE_CHAR_RNG_UPPER = 5
LLAMA_GRETYPE_CHAR_ALT = 6
+LLAMA_GRETYPE_CHAR_ANY = 7


# typedef struct llama_grammar_element {
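LLAMA_GRETYPE_CHAR_ANY gives grammars a wildcard element matching any character, i.e. the "." of GBNF syntax. Assuming the two-field llama_grammar_element structure declared below (type, value), the rule x ::= . could be encoded like this (a sketch; the value field is unused for this element type):

# Sketch: the GBNF rule `x ::= .` as a ctypes array of grammar elements.
rule = (llama_grammar_element * 2)(
    llama_grammar_element(LLAMA_GRETYPE_CHAR_ANY, 0),  # match any character
    llama_grammar_element(LLAMA_GRETYPE_END, 0),       # end of rule definition
)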
@@ -2438,11 +2468,11 @@ def llama_token_get_score(
) -> float: ...


-# LLAMA_API enum llama_token_type llama_token_get_type (const struct llama_model * model, llama_token token);
+# LLAMA_API enum llama_token_attr llama_token_get_attr (const struct llama_model * model, llama_token token);
@ctypes_function(
-    "llama_token_get_type", [llama_model_p_ctypes, llama_token], ctypes.c_int
+    "llama_token_get_attr", [llama_model_p_ctypes, llama_token], ctypes.c_int
)
-def llama_token_get_type(
+def llama_token_get_attr(
    model: llama_model_p, token: Union[llama_token, int], /
) -> int: ...

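A short usage sketch for the renamed binding, combining it with the new attribute flags (a loaded model handle and a valid token id are assumed; obtaining them is elided):

# Sketch: fetch a token's attribute bitmask and test individual flags.
attr = llama_token_get_attr(model, token_id)
is_control = bool(attr & LLAMA_TOKEN_ATTR_CONTROL)
strips_right = bool(attr & LLAMA_TOKEN_ATTR_RSTRIP)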
@@ -3200,104 +3230,9 @@ def llama_grammar_accept_token(


# //
-# // Beam search
+# // Model split
# //

-# struct llama_beam_view {
-#     const llama_token * tokens;
-
-
-#     size_t n_tokens;
-#     float p;   // Cumulative beam probability (renormalized relative to all beams)
-#     bool eob;  // Callback should set this to true when a beam is at end-of-beam.
-# };
-class llama_beam_view(ctypes.Structure):
-    if TYPE_CHECKING:
-        tokens: CtypesArray[llama_token]
-        n_tokens: int
-        p: float
-        eob: bool
-
-    _fields_ = [
-        ("tokens", llama_token_p),
-        ("n_tokens", ctypes.c_size_t),
-        ("p", ctypes.c_float),
-        ("eob", ctypes.c_bool),
-    ]
-
-
-# // Passed to beam_search_callback function.
-# // Whenever 0 < common_prefix_length, this number of tokens should be copied from any of the beams
-# // (e.g. beams[0]) as they will be removed (shifted) from all beams in all subsequent callbacks.
-# // These pointers are valid only during the synchronous callback, so should not be saved.
-# struct llama_beams_state {
-#     struct llama_beam_view * beam_views;
-#     size_t n_beams;               // Number of elements in beam_views[].
-#     size_t common_prefix_length;  // Current max length of prefix tokens shared by all beams.
-#     bool last_call;               // True iff this is the last callback invocation.
-# };
-class llama_beams_state(ctypes.Structure):
-    if TYPE_CHECKING:
-        beam_views: CtypesArray[llama_beam_view]
-        n_beams: int
-        common_prefix_length: int
-        last_call: bool
-
-    _fields_ = [
-        ("beam_views", ctypes.POINTER(llama_beam_view)),
-        ("n_beams", ctypes.c_size_t),
-        ("common_prefix_length", ctypes.c_size_t),
-        ("last_call", ctypes.c_bool),
-    ]
-
-
-# // Type of pointer to the beam_search_callback function.
-# // void* callback_data is any custom data passed to llama_beam_search, that is subsequently
-# // passed back to beam_search_callback. This avoids having to use global variables in the callback.
-# typedef void (*llama_beam_search_callback_fn_t)(void * callback_data, struct llama_beams_state);
-llama_beam_search_callback_fn_t = ctypes.CFUNCTYPE(
-    None, ctypes.c_void_p, llama_beams_state
-)
-
-
-# /// @details Deterministically returns entire sentence constructed by a beam search.
-# /// @param ctx Pointer to the llama_context.
-# /// @param callback Invoked for each iteration of the beam_search loop, passing in beams_state.
-# /// @param callback_data A pointer that is simply passed back to callback.
-# /// @param n_beams Number of beams to use.
-# /// @param n_past Number of tokens already evaluated.
-# /// @param n_predict Maximum number of tokens to predict. EOS may occur earlier.
-# /// @param n_threads Number of threads as passed to llama_eval().
-# LLAMA_API void llama_beam_search(
-#     struct llama_context * ctx,
-#     llama_beam_search_callback_fn_t callback,
-#     void * callback_data,
-#     size_t n_beams,
-#     int32_t n_past,
-#     int32_t n_predict);
-@ctypes_function(
-    "llama_beam_search",
-    [
-        llama_context_p_ctypes,
-        llama_beam_search_callback_fn_t,
-        ctypes.c_void_p,
-        ctypes.c_size_t,
-        ctypes.c_int32,
-        ctypes.c_int32,
-    ],
-    None,
-)
-def llama_beam_search(
-    ctx: llama_context_p,
-    callback: CtypesFuncPointer,
-    callback_data: ctypes.c_void_p,
-    n_beams: Union[ctypes.c_size_t, int],
-    n_past: Union[ctypes.c_int, int],
-    n_predict: Union[ctypes.c_int, int],
-    /,
-): ...
-
-
# /// @details Build a split GGUF final path for this chunk.
# ///          llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4) => split_path = "/models/ggml-model-q4_0-00002-of-00004.gguf"
# //  Returns the split_path length.
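Assuming the llama_split_path binding mirrors the C signature (output buffer, buffer size, path prefix, split index, split count), the documented example could be reproduced from Python like this (ctypes is already imported at the top of the module; the buffer size is arbitrary):

# Sketch: build the path of split 2 of 4 for a model prefix; expects
# split_path.value == b"/models/ggml-model-q4_0-00002-of-00004.gguf".
split_path = ctypes.create_string_buffer(1024)
n = llama_split_path(split_path, ctypes.sizeof(split_path), b"/models/ggml-model-q4_0", 2, 4)
print(split_path.value.decode())  # n is the length of the generated path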