@@ -94,6 +94,7 @@ def __init__(
94
94
offload_kqv : bool = True ,
95
95
flash_attn : bool = False ,
96
96
# Sampling Params
97
+ no_perf : bool = False ,
97
98
last_n_tokens_size : int = 64 ,
98
99
# LoRA Params
99
100
lora_base : Optional [str ] = None ,
@@ -173,6 +174,7 @@ def __init__(
173
174
embedding: Embedding mode only.
174
175
offload_kqv: Offload K, Q, V to GPU.
175
176
flash_attn: Use flash attention.
177
+ no_perf: If True, disable measurement of performance timings (timings are measured when False).
176
178
last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
177
179
lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model.
178
180
lora_path: Path to a LoRA file to apply to the model.
@@ -351,6 +353,7 @@ def __init__(
351
353
if type_v is not None :
352
354
self .context_params .type_v = type_v
353
355
# Sampling Params
356
+ self .context_params .no_perf = no_perf
354
357
self .last_n_tokens_size = last_n_tokens_size
355
358
356
359
self .cache : Optional [BaseLlamaCache ] = None
@@ -2093,6 +2096,7 @@ def __getstate__(self):
2093
2096
offload_kqv = self .context_params .offload_kqv ,
2094
2097
flash_attn = self .context_params .flash_attn ,
2095
2098
# Sampling Params
2099
+ no_perf = self .context_params .no_perf ,
2096
2100
last_n_tokens_size = self .last_n_tokens_size ,
2097
2101
# LoRA Params
2098
2102
lora_base = self .lora_base ,
0 commit comments