@@ -782,6 +782,7 @@ class llama_context_params(ctypes.Structure):
782
782
embeddings (bool): if true, extract embeddings (together with logits)
783
783
offload_kqv (bool): whether to offload the KQV ops (including the KV cache) to GPU
784
784
flash_attn (bool): whether to use flash attention
785
+ no_perf (bool): if true, performance timings are not measured
785
786
abort_callback (ggml_abort_callback): abort callback; if it returns true, execution of llama_decode() will be aborted
786
787
abort_callback_data (ctypes.c_void_p): data for abort_callback
787
788
"""
@@ -812,6 +813,7 @@ class llama_context_params(ctypes.Structure):
812
813
embeddings : bool
813
814
offload_kqv : bool
814
815
flash_attn : bool
816
+ no_perf : bool
815
817
abort_callback : Callable [[ctypes .c_void_p ], bool ]
816
818
abort_callback_data : ctypes .c_void_p
817
819
@@ -841,6 +843,7 @@ class llama_context_params(ctypes.Structure):
841
843
("embeddings" , ctypes .c_bool ),
842
844
("offload_kqv" , ctypes .c_bool ),
843
845
("flash_attn" , ctypes .c_bool ),
846
+ ("no_perf" , ctypes .c_bool ),
844
847
("abort_callback" , ggml_abort_callback ),
845
848
("abort_callback_data" , ctypes .c_void_p ),
846
849
]
0 commit comments