@@ -1736,15 +1736,13 @@ def llama_kv_cache_view_init(
     ctx: llama_context_p, n_seq_max: Union[ctypes.c_int32, int], /,
 ) -> llama_kv_cache_view:
     """Create an empty KV cache view. (use only for debugging purposes)"""
-    ...


 # // Free a KV cache view. (use only for debugging purposes)
 # LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view);
 @ctypes_function("llama_kv_cache_view_free", [llama_kv_cache_view_p], None)
 def llama_kv_cache_view_free(view: ctypes.pointer[llama_kv_cache_view], /):  # type: ignore
     """Free a KV cache view. (use only for debugging purposes)"""
-    ...


 # // Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes)
@@ -1754,7 +1752,6 @@ def llama_kv_cache_view_free(view: ctypes.pointer[llama_kv_cache_view], /): # t
 )
 def llama_kv_cache_view_update(ctx: llama_context_p, view: CtypesPointerOrRef[llama_kv_cache_view], /):  # type: ignore
     """Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes)"""
-    ...


 # // Returns the number of tokens in the KV cache (slow, use only for debug)
@@ -1767,7 +1764,6 @@ def llama_get_kv_cache_token_count(ctx: llama_context_p, /) -> int:
     """Returns the number of tokens in the KV cache (slow, use only for debug)
     If a KV cell has multiple sequences assigned to it, it will be counted multiple times
     """
-    ...


 # // Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
@@ -1777,7 +1773,6 @@ def llama_get_kv_cache_token_count(ctx: llama_context_p, /) -> int:
 )
 def llama_get_kv_cache_used_cells(ctx: llama_context_p, /) -> int:
     """Returns the number of used KV cells (i.e. have at least one sequence assigned to them)"""
-    ...


 # // Clear the KV cache - both cell info is erased and KV data is zeroed
@@ -1786,7 +1781,6 @@ def llama_get_kv_cache_used_cells(ctx: llama_context_p, /) -> int:
 @ctypes_function("llama_kv_cache_clear", [llama_context_p_ctypes], None)
 def llama_kv_cache_clear(ctx: llama_context_p, /):
     """Clear the KV cache"""
-    ...


 # // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
@@ -1823,7 +1817,6 @@ def llama_kv_cache_seq_rm(
     seq_id < 0 : match any sequence
     p0 < 0 : [0, p1]
     p1 < 0 : [p0, inf)"""
-    ...


 # // Copy all tokens that belong to the specified sequence to another sequence
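[Editor's note] The p0/p1 conventions in the llama_kv_cache_seq_rm docstring above (negative values meaning an open-ended range) are easiest to see in a small usage sketch. Illustrative only; it assumes `ctx` is an already-initialized llama_context and that the low-level bindings are re-exported from the `llama_cpp` package:

import llama_cpp

def truncate_sequence(ctx, seq_id: int, keep_tokens: int) -> None:
    # Drop everything from position `keep_tokens` onward for this sequence.
    # p1 = -1 means "up to infinity", matching the p1 < 0 : [p0, inf) rule.
    llama_cpp.llama_kv_cache_seq_rm(ctx, seq_id, keep_tokens, -1)

def clear_sequence(ctx, seq_id: int) -> None:
    # p0 = -1 and p1 = -1 together cover the whole position range.
    llama_cpp.llama_kv_cache_seq_rm(ctx, seq_id, -1, -1)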
@@ -1859,7 +1852,6 @@ def llama_kv_cache_seq_cp(
     Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
     p0 < 0 : [0, p1]
     p1 < 0 : [p0, inf)"""
-    ...


 # // Removes all tokens that do not belong to the specified sequence
@@ -1871,7 +1863,6 @@ def llama_kv_cache_seq_cp(
 )
 def llama_kv_cache_seq_keep(ctx: llama_context_p, seq_id: Union[llama_seq_id, int], /):
     """Removes all tokens that do not belong to the specified sequence"""
-    ...


 # // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
@@ -1947,7 +1938,6 @@ def llama_kv_cache_seq_div(
     If the KV cache is RoPEd, the KV data is updated accordingly
     p0 < 0 : [0, p1]
     p1 < 0 : [p0, inf)"""
-    ...


 # // Defragment the KV cache
@@ -1961,15 +1951,13 @@ def llama_kv_cache_defrag(ctx: llama_context_p, /):
     This will be applied:
         - lazily on next llama_decode()
         - explicitly with llama_kv_cache_update()"""
-    ...


 # // Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
 # LLAMA_API void llama_kv_cache_update(struct llama_context * ctx);
 @ctypes_function("llama_kv_cache_update", [llama_context_p_ctypes], None)
 def llama_kv_cache_update(ctx: llama_context_p, /):
     """Apply the KV cache updates (such as K-shifts, defragmentation, etc.)"""
-    ...


 # //
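[Editor's note] As the llama_kv_cache_defrag docstring says, defragmentation is applied lazily on the next llama_decode() or eagerly via llama_kv_cache_update(). A minimal sketch of the eager path, assuming an initialized `ctx`:

import llama_cpp

def defragment_now(ctx) -> None:
    # Schedule defragmentation, then force it (and any other pending KV cache
    # updates such as K-shifts) to run immediately instead of waiting for the
    # next llama_decode().
    llama_cpp.llama_kv_cache_defrag(ctx)
    llama_cpp.llama_kv_cache_update(ctx)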
@@ -1984,7 +1972,6 @@ def llama_kv_cache_update(ctx: llama_context_p, /):
 @ctypes_function("llama_state_get_size", [llama_context_p_ctypes], ctypes.c_size_t)
 def llama_state_get_size(ctx: llama_context_p, /) -> int:
     """Returns the *actual* size in bytes of the state (rng, logits, embedding and kv_cache) - will often be smaller after compacting tokens"""
-    ...


 # LLAMA_API DEPRECATED(size_t llama_get_state_size(struct llama_context * ctx),
@@ -1993,7 +1980,6 @@ def llama_state_get_size(ctx: llama_context_p, /) -> int:
 def llama_get_state_size(ctx: llama_context_p, /) -> int:
     """Returns the maximum size in bytes of the state (rng, logits, embedding
     and kv_cache) - will often be smaller after compacting tokens"""
-    ...


 # // Copies the state to the specified destination address.
@@ -2021,7 +2007,6 @@ def llama_state_get_data(
     """Copies the state to the specified destination address.
     Destination needs to have allocated enough memory.
     Returns the number of bytes copied"""
-    ...


 # LLAMA_API DEPRECATED(size_t llama_copy_state_data(
@@ -2042,7 +2027,6 @@ def llama_copy_state_data(
     """Copies the state to the specified destination address.
     Destination needs to have allocated enough memory.
     Returns the number of bytes copied"""
-    ...


 # // Set the state reading from the specified address
@@ -2064,7 +2048,6 @@ def llama_state_set_data(
 ) -> int:
     """Set the state reading from the specified address
     Returns the number of bytes read"""
-    ...


 # LLAMA_API DEPRECATED(size_t llama_set_state_data(
@@ -2080,7 +2063,6 @@ def llama_set_state_data(
     ctx: llama_context_p, src: CtypesArray[ctypes.c_uint8], /,
 ) -> int:
     """Set the state reading from the specified address"""
-    ...


 # Save/load session file
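[Editor's note] A hedged sketch of a state save/restore round trip, using the older llama_get_state_size / llama_copy_state_data / llama_set_state_data entry points whose signatures appear in the hunks above (they are marked deprecated in favor of the llama_state_* variants, which follow the same size-then-copy pattern). `ctx` is assumed to be a live llama_context:

import ctypes
import llama_cpp

def snapshot_state(ctx) -> bytes:
    # Allocate a buffer large enough for the full state, then copy into it.
    n = llama_cpp.llama_get_state_size(ctx)
    buf = (ctypes.c_uint8 * n)()
    n_copied = llama_cpp.llama_copy_state_data(ctx, buf)
    return bytes(buf[:n_copied])

def restore_state(ctx, blob: bytes) -> int:
    # Feed a previously captured snapshot back into the context.
    src = (ctypes.c_uint8 * len(blob)).from_buffer_copy(blob)
    return llama_cpp.llama_set_state_data(ctx, src)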
@@ -2203,7 +2185,6 @@ def llama_save_session_file(
 )
 def llama_state_seq_get_size(ctx: llama_context_p, seq_id: llama_seq_id, /) -> int:
     """Get the exact size needed to copy the KV cache of a single sequence"""
-    ...


 # // Copy the KV cache of a single sequence into the specified buffer
@@ -2260,7 +2241,6 @@ def llama_state_seq_set_data(
     /,
 ) -> int:
     """Copy the sequence data (originally copied with `llama_state_seq_get_data`) into the specified sequence"""
-    ...


 # LLAMA_API size_t llama_state_seq_save_file(
@@ -2357,7 +2337,6 @@ def llama_batch_get_one(

     NOTE: this is a helper function to facilitate transition to the new batch API - avoid using it
     """
-    ...


 # // Allocates a batch of tokens on the heap that can hold a maximum of n_tokens
@@ -2387,15 +2366,13 @@ def llama_batch_init(
     Otherwise, llama_batch.token will be allocated to store n_tokens llama_token
     The rest of the llama_batch members are allocated with size n_tokens
     All members are left uninitialized"""
-    ...


 # // Frees a batch of tokens allocated with llama_batch_init()
 # LLAMA_API void llama_batch_free(struct llama_batch batch);
 @ctypes_function("llama_batch_free", [llama_batch], None)
 def llama_batch_free(batch: llama_batch, /):
     """Frees a batch of tokens allocated with llama_batch_init()"""
-    ...


 # // Processes a batch of tokens with the ecoder part of the encoder-decoder model.
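[Editor's note] llama_batch_init() and llama_batch_free() pair up like malloc/free, so it can help to wrap them. A sketch under the assumption that the batch carries token input (embd=0) rather than embeddings, with the bindings re-exported from the `llama_cpp` package:

import contextlib
import llama_cpp

@contextlib.contextmanager
def scoped_batch(n_tokens: int, n_seq_max: int = 1):
    # embd=0 -> llama_batch.token is allocated for n_tokens tokens;
    # all members are left uninitialized, as the docstring above warns.
    batch = llama_cpp.llama_batch_init(n_tokens, 0, n_seq_max)
    try:
        yield batch
    finally:
        # Every llama_batch_init() needs a matching llama_batch_free().
        llama_cpp.llama_batch_free(batch)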
@@ -2411,7 +2388,6 @@ def llama_encode(ctx: llama_context_p, batch: llama_batch, /) -> int:
     Stores the encoder output internally for later use by the decoder cross-attention layers.
     0 - success
     < 0 - error"""
-    ...


 # // Positive return values does not mean a fatal error, but rather a warning.
@@ -2427,7 +2403,6 @@ def llama_decode(ctx: llama_context_p, batch: llama_batch, /) -> int:
     0 - success
     1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
     < 0 - error"""
-    ...


 # // Set the number of threads used for decoding
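[Editor's note] The llama_decode() return-code conventions above translate naturally into a small checked wrapper (illustrative; `ctx` and `batch` are assumed to be prepared elsewhere):

import llama_cpp

def checked_decode(ctx, batch) -> None:
    rc = llama_cpp.llama_decode(ctx, batch)
    if rc == 0:
        return  # success
    if rc == 1:
        # Not fatal: no KV slot was found for this batch.
        raise RuntimeError("KV cache full: shrink the batch or grow the context")
    if rc < 0:
        raise RuntimeError(f"llama_decode failed with error code {rc}")
    # rc > 1: treated as a warning by the C API; surface it but keep going.
    print(f"llama_decode returned warning code {rc}")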
@@ -2453,23 +2428,20 @@ def llama_set_n_threads(
     n_threads is the number of threads used for generation (single token)
     n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)
     """
-    ...


 # // Get the number of threads used for generation of a single token.
 # LLAMA_API uint32_t llama_n_threads(struct llama_context * ctx);
 @ctypes_function("llama_n_threads", [llama_context_p_ctypes], ctypes.c_uint32)
 def llama_n_threads(ctx: llama_context_p, /) -> int:
     """Get the number of threads used for generation of a single token"""
-    ...


 # // Get the number of threads used for prompt and batch processing (multiple token).
 # LLAMA_API uint32_t llama_n_threads_batch(struct llama_context * ctx);
 @ctypes_function("llama_n_threads_batch", [llama_context_p_ctypes], ctypes.c_uint32)
 def llama_n_threads_batch(ctx: llama_context_p, /) -> int:
     """Get the number of threads used for prompt and batch processing (multiple token)"""
-    ...


 # // Set whether the model is in embeddings mode or not
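[Editor's note] A short sketch of the thread-count setter and getters shown in this hunk (assuming `ctx` is initialized); generation and prompt/batch processing are tuned with separate counts:

import os
import llama_cpp

def configure_threads(ctx) -> None:
    n = os.cpu_count() or 1
    # First count: single-token generation; second: prompt/batch processing.
    llama_cpp.llama_set_n_threads(ctx, n, n)
    assert llama_cpp.llama_n_threads(ctx) == n
    assert llama_cpp.llama_n_threads_batch(ctx) == n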
@@ -2479,7 +2451,6 @@ def llama_n_threads_batch(ctx: llama_context_p, /) -> int:
 def llama_set_embeddings(ctx: llama_context_p, embeddings: bool, /):
     """Set whether the model is in embeddings model or not
     If true, embeddings will be returned but logits will not"""
-    ...


 # // Set whether to use causal attention or not
@@ -2489,7 +2460,6 @@ def llama_set_embeddings(ctx: llama_context_p, embeddings: bool, /):
 def llama_set_causal_attn(ctx: llama_context_p, causal_attn: bool, /):
     """Set whether to use causal attention or not
     If set to true, the model will only attend to the past tokens"""
-    ...


 # // Set abort callback
@@ -2506,7 +2476,6 @@ def llama_set_abort_callback(
     /,
 ):
     """Set abort callback"""
-    ...


 # // Wait until all computations are finished
@@ -2518,7 +2487,6 @@ def llama_synchronize(ctx: llama_context_p, /):
     """Wait until all computations are finished
     This is automatically done when using one of the functions below to obtain the computation results
     and is not necessary to call it explicitly in most cases"""
-    ...


 # // Token logits obtained from the last call to llama_decode()
@@ -2539,7 +2507,6 @@ def llama_get_logits(ctx: llama_context_p, /) -> CtypesArray[ctypes.c_float]:

     Returns:
         Pointer to the logits buffer of shape (n_tokens, n_vocab)"""
-    ...


 # // Logits for the ith token. For positive indices, Equivalent to:
@@ -2557,7 +2524,6 @@ def llama_get_logits_ith(
 ) -> CtypesArray[ctypes.c_float]:
     """Logits for the ith token. Equivalent to:
     llama_get_logits(ctx) + i*n_vocab"""
-    ...


 # // Get all output token embeddings.
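[Editor's note] llama_get_logits_ith(ctx, i) is pointer arithmetic over the same buffer as llama_get_logits(ctx), i.e. row i of an (n_tokens, n_vocab) array. A sketch of reading the logits row for the last token of a decoded batch; it assumes `ctx` and `model` are live and that logits were enabled for that batch entry:

import llama_cpp

def last_token_logits(ctx, model, n_tokens: int) -> list:
    n_vocab = llama_cpp.llama_n_vocab(model)
    # Read row (n_tokens - 1), mirroring the
    # llama_get_logits(ctx) + i*n_vocab equivalence in the docstring above.
    row = llama_cpp.llama_get_logits_ith(ctx, n_tokens - 1)
    return [row[j] for j in range(n_vocab)]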
@@ -2573,7 +2539,6 @@ def llama_get_logits_ith(
 def llama_get_embeddings(ctx: llama_context_p, /) -> CtypesArray[ctypes.c_float]:
     """Get the embeddings for the input
     shape: [n_embd] (1-dimensional)"""
-    ...


 # // Get the embeddings for the ith token. For positive indices, Equivalent to:
@@ -2592,7 +2557,6 @@ def llama_get_embeddings_ith(
 ) -> CtypesArray[ctypes.c_float]:
     """Get the embeddings for the ith sequence
     llama_get_embeddings(ctx) + i*n_embd"""
-    ...


 # // Get the embeddings for a sequence id
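[Editor's note] Sketch of pulling one embedding vector via llama_get_embeddings_ith; it assumes the context was decoded in embeddings mode (see llama_set_embeddings above) and that `ctx` and `model` are live:

import llama_cpp

def embedding_for_output(ctx, model, i: int) -> list:
    n_embd = llama_cpp.llama_n_embd(model)
    # Equivalent to llama_get_embeddings(ctx) + i*n_embd, per the docstring.
    vec = llama_cpp.llama_get_embeddings_ith(ctx, i)
    return [vec[j] for j in range(n_embd)]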