Commit d2f0d12

fix: Display performance metrics by default
1 parent: 83ed554

llama_cpp/llama.py

Lines changed: 4 additions & 0 deletions
@@ -94,6 +94,7 @@ def __init__(
         offload_kqv: bool = True,
         flash_attn: bool = False,
         # Sampling Params
+        no_perf: bool = False,
         last_n_tokens_size: int = 64,
         # LoRA Params
         lora_base: Optional[str] = None,
@@ -173,6 +174,7 @@ def __init__(
             embedding: Embedding mode only.
             offload_kqv: Offload K, Q, V to GPU.
             flash_attn: Use flash attention.
+            no_perf: Measure performance timings.
             last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
             lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model.
             lora_path: Path to a LoRA file to apply to the model.
@@ -351,6 +353,7 @@ def __init__(
         if type_v is not None:
             self.context_params.type_v = type_v
         # Sampling Params
+        self.context_params.no_perf = no_perf
         self.last_n_tokens_size = last_n_tokens_size
 
         self.cache: Optional[BaseLlamaCache] = None
@@ -2093,6 +2096,7 @@ def __getstate__(self):
             offload_kqv=self.context_params.offload_kqv,
             flash_attn=self.context_params.flash_attn,
             # Sampling Params
+            no_perf=self.context_params.no_perf,
             last_n_tokens_size=self.last_n_tokens_size,
             # LoRA Params
             lora_base=self.lora_base,
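
For context, a minimal usage sketch of the new parameter. The model path is a placeholder, not part of the commit; `Llama` and `no_perf` are taken from the diff above.

    from llama_cpp import Llama

    # After this commit, no_perf defaults to False, so llama.cpp's
    # performance timings are measured (and displayed) by default.
    # Pass no_perf=True to skip the timing measurements.
    llm = Llama(
        model_path="./model.gguf",  # placeholder path, assumed to exist
        no_perf=True,
    )

Because __getstate__ now includes no_perf, the setting also survives pickling a Llama instance.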
