Commit f0df0df

Merge branch 'main' into cuda
2 parents 04af8b6 + 5ab40e6 commit f0df0df

10 files changed (+143, −46 lines)

.github/dependabot.yml

Lines changed: 6 additions & 2 deletions
@@ -8,8 +8,12 @@ updates:
   - package-ecosystem: "pip" # See documentation for possible values
     directory: "/" # Location of package manifests
     schedule:
-      interval: "weekly"
+      interval: "daily"
   - package-ecosystem: "github-actions"
     directory: "/"
     schedule:
-      interval: "weekly"
+      interval: "daily"
+  - package-ecosystem: "docker"
+    directory: "/"
+    schedule:
+      interval: "daily"

CHANGELOG.md

Lines changed: 21 additions & 0 deletions
@@ -7,6 +7,27 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.2.71]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@911b3900dded9a1cfe0f0e41b82c7a29baf3a217
+- fix: Make leading bos_token optional for image chat formats, fix nanollava system message by @abetlen in 77122638b4153e31d9f277b3d905c2900b536632
+- fix: free last image embed in llava chat handler by @abetlen in 3757328b703b2cd32dcbd5853271e3a8c8599fe7
+
+## [0.2.70]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@c0e6fbf8c380718102bd25fcb8d2e55f8f9480d1
+- feat: fill-in-middle support by @CISC in #1386
+- fix: adding missing args in create_completion for functionary chat handler by @skalade in #1430
+- docs: update README.md @eltociear in #1432
+- fix: chat_format log where auto-detected format prints None by @balvisio in #1434
+- feat(server): Add support for setting root_path by @abetlen in 0318702cdc860999ee70f277425edbbfe0e60419
+- feat(ci): Add docker checks and check deps more frequently by @Smartappli in #1426
+- fix: detokenization case where first token does not start with a leading space by @noamgat in #1375
+- feat: Implement streaming for Functionary v2 + Bug fixes by @jeffrey-fong in #1419
+- fix: Use memmove to copy str_value kv_override by @abetlen in 9f7a85571ae80d3b6ddbd3e1bae407b9f1e3448a
+- feat(server): Remove temperature bounds checks for server by @abetlen in 0a454bebe67d12a446981eb16028c168ca5faa81
+- fix(server): Propagate flash_attn to model load by @dthuerck in #1424
+
 ## [0.2.69]
 
 - feat: Update llama.cpp to ggerganov/llama.cpp@6ecf3189e00a1e8e737a78b6d10e1d7006e050a2
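The fill-in-middle entry (#1386) is what the prefix/suffix/middle token handling added to llama_cpp/llama.py below implements. A minimal usage sketch, assuming a code model whose GGUF vocabulary defines FIM tokens; the model path is a placeholder, not part of this commit:

```python
from llama_cpp import Llama

# Placeholder path: any GGUF model with FIM prefix/suffix/middle tokens.
llm = Llama(model_path="./models/code-model.gguf")

# When a suffix is given and the FIM tokens exist, the prompt is assembled
# as [PRE] prompt [SUF] suffix [MID] and the model generates the middle part.
out = llm.create_completion(
    prompt="def add(a, b):\n",
    suffix="\n    return result\n",
    max_tokens=64,
)
print(out["choices"][0]["text"])
```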

README.md

Lines changed: 2 additions & 2 deletions
@@ -516,7 +516,7 @@ chat_handler = Llava15ChatHandler(clip_model_path="path/to/llava/mmproj.bin")
 llm = Llama(
   model_path="./path/to/llava/llama-model.gguf",
   chat_handler=chat_handler,
-  n_ctx=2048, # n_ctx should be increased to accomodate the image embedding
+  n_ctx=2048, # n_ctx should be increased to accommodate the image embedding
 )
 llm.create_chat_completion(
     messages = [
@@ -547,7 +547,7 @@ llm = Llama.from_pretrained(
   repo_id="vikhyatk/moondream2",
   filename="*text-model*",
   chat_handler=chat_handler,
-  n_ctx=2048, # n_ctx should be increased to accomodate the image embedding
+  n_ctx=2048, # n_ctx should be increased to accommodate the image embedding
 )
 
 respoonse = llm.create_chat_completion(

llama_cpp/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *
 
-__version__ = "0.2.69"
+__version__ = "0.2.71"

llama_cpp/llama.py

Lines changed: 70 additions & 29 deletions
@@ -378,6 +378,7 @@ def __init__(
 
         self.chat_format = chat_format
         self.chat_handler = chat_handler
+        self._chat_handlers: Dict[str, llama_chat_format.LlamaChatCompletionHandler] = {}
 
         self.draft_model = draft_model
 
@@ -409,10 +410,33 @@ def __init__(
         if self.verbose:
             print(f"Model metadata: {self.metadata}", file=sys.stderr)
 
+        eos_token_id = int(self.metadata.get("tokenizer.ggml.eos_token_id", self.token_eos()))
+        bos_token_id = int(self.metadata.get("tokenizer.ggml.bos_token_id", self.token_bos()))
+
+        eos_token = self._model.token_get_text(eos_token_id)
+        bos_token = self._model.token_get_text(bos_token_id)
+
+        # Unfortunately the llama.cpp API does not return metadata arrays, so we can't get template names from tokenizer.chat_templates
+        template_choices = dict((name[10:], template) for name, template in self.metadata.items() if name.startswith("tokenizer.chat_template."))
+
+        if "tokenizer.chat_template" in self.metadata:
+            template_choices["chat_template.default"] = self.metadata["tokenizer.chat_template"]
+
+        if self.verbose and template_choices:
+            print(f"Available chat formats from metadata: {', '.join(template_choices.keys())}", file=sys.stderr)
+
+        for name, template in template_choices.items():
+            self._chat_handlers[name] = llama_chat_format.Jinja2ChatFormatter(
+                template=template,
+                eos_token=eos_token,
+                bos_token=bos_token,
+                stop_token_ids=[eos_token_id],
+            ).to_chat_handler()
+
         if (
             self.chat_format is None
             and self.chat_handler is None
-            and "tokenizer.chat_template" in self.metadata
+            and "chat_template.default" in template_choices
         ):
             chat_format = llama_chat_format.guess_chat_format_from_gguf_metadata(
                 self.metadata
@@ -423,35 +447,17 @@ def __init__(
             if self.verbose:
                 print(f"Guessed chat format: {chat_format}", file=sys.stderr)
         else:
-            template = self.metadata["tokenizer.chat_template"]
-            try:
-                eos_token_id = int(self.metadata["tokenizer.ggml.eos_token_id"])
-            except:
-                eos_token_id = self.token_eos()
-            try:
-                bos_token_id = int(self.metadata["tokenizer.ggml.bos_token_id"])
-            except:
-                bos_token_id = self.token_bos()
-
-            eos_token = self._model.token_get_text(eos_token_id)
-            bos_token = self._model.token_get_text(bos_token_id)
-
             if self.verbose:
-                print(f"Using gguf chat template: {template}", file=sys.stderr)
+                print(f"Using gguf chat template: {template_choices['chat_template.default']}", file=sys.stderr)
                 print(f"Using chat eos_token: {eos_token}", file=sys.stderr)
                 print(f"Using chat bos_token: {bos_token}", file=sys.stderr)
 
-            self.chat_handler = llama_chat_format.Jinja2ChatFormatter(
-                template=template,
-                eos_token=eos_token,
-                bos_token=bos_token,
-                stop_token_ids=[eos_token_id],
-            ).to_chat_handler()
+            self.chat_format = "chat_template.default"
 
         if self.chat_format is None and self.chat_handler is None:
             self.chat_format = "llama-2"
             if self.verbose:
-                print(f"Using fallback chat format: {chat_format}", file=sys.stderr)
+                print(f"Using fallback chat format: {self.chat_format}", file=sys.stderr)
 
     @property
     def ctx(self) -> llama_cpp.llama_context_p:
@@ -955,18 +961,53 @@ def _create_completion(
 
         completion_id: str = f"cmpl-{str(uuid.uuid4())}"
         created: int = int(time.time())
+        prefix_token_id: int = int(self.metadata.get("tokenizer.ggml.prefix_token_id", self._model.token_prefix()))
+        middle_token_id: int = int(self.metadata.get("tokenizer.ggml.middle_token_id", self._model.token_middle()))
+        suffix_token_id: int = int(self.metadata.get("tokenizer.ggml.suffix_token_id", self._model.token_suffix()))
         # If prompt is empty, initialize completion with BOS token to avoid
         # detokenization including a space at the beginning of the completion
         completion_tokens: List[int] = [] if len(prompt) > 0 else [self.token_bos()]
         # Add blank space to start of prompt to match OG llama tokenizer
         prompt_tokens: List[int] = (
             (
-                self.tokenize(prompt.encode("utf-8"), special=True)
-                if prompt != ""
-                else [self.token_bos()]
+                [prefix_token_id]
+                if prefix_token_id >= 0 and suffix is not None
+                else []
+            )
+            +
+            (
+                (
+                    self.tokenize(prompt.encode("utf-8"), add_bos=(prefix_token_id < 0 or suffix is None), special=(prefix_token_id < 0 or suffix is None))
+                    if prompt != ""
+                    else (
+                        []
+                        if prefix_token_id >= 0 and suffix is not None
+                        else [self.token_bos()]
+                    )
+                )
+                if isinstance(prompt, str)
+                else prompt
+            )
+            +
+            (
+                (
+                    [suffix_token_id]
+                    +
+                    (
+                        self.tokenize(suffix.encode("utf-8"), add_bos=False, special=False)
+                        if suffix
+                        else []
+                    )
+                )
+                if suffix_token_id >= 0 and suffix is not None
+                else []
+            )
+            +
+            (
+                [middle_token_id]
+                if middle_token_id >= 0 and suffix is not None
+                else []
             )
-            if isinstance(prompt, str)
-            else prompt
         )
         text: bytes = b""
         returned_tokens: int = 0
@@ -1346,7 +1387,7 @@ def logit_bias_processor(
         if echo:
             text_str = prompt + text_str
 
-        if suffix is not None:
+        if suffix_token_id < 0 and suffix is not None:
             text_str = text_str + suffix
 
         logprobs_or_none: Optional[CompletionLogprobs] = None
@@ -1684,7 +1725,7 @@ def create_chat_completion(
         Returns:
             Generated chat completion or a stream of chat completion chunks.
         """
-        handler = self.chat_handler or llama_chat_format.get_chat_completion_handler(
+        handler = self.chat_handler or self._chat_handlers.get(self.chat_format) or llama_chat_format.get_chat_completion_handler(
            self.chat_format
        )
        return handler(
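The net effect of the llama.py changes: every tokenizer.chat_template.* entry in the GGUF metadata is pre-compiled into self._chat_handlers at load time, the bundled default template is selected via chat_format = "chat_template.default", and create_chat_completion now checks that dict before the globally registered formats. A rough sketch of selecting a named metadata template, assuming a model that ships one under tokenizer.chat_template.tool_use (an illustrative name, not guaranteed by any particular model):

```python
from llama_cpp import Llama

# Placeholder path. With verbose=True, __init__ prints something like
# "Available chat formats from metadata: chat_template.default, chat_template.tool_use".
llm = Llama(
    model_path="./models/model.gguf",
    chat_format="chat_template.tool_use",  # resolved via llm._chat_handlers
    verbose=True,
)

response = llm.create_chat_completion(
    messages=[{"role": "user", "content": "Hello!"}],
)
print(response["choices"][0]["message"]["content"])
```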

llama_cpp/llama_chat_format.py

Lines changed: 23 additions & 10 deletions
@@ -2322,7 +2322,7 @@ def generate_streaming(tools, functions, function_call, prompt):
             prompt = prompt
             stops = ["\n", END_ASSISTANT_TOKEN]
 
-            completion = create_completion(stop=stops)
+            completion = create_completion(prompt=prompt, stop=stops, grammar=grammar)
             completion_text = completion["choices"][0]["text"]
             completion_tokens += completion["usage"]["completion_tokens"]
 
@@ -2349,7 +2349,7 @@ def generate_streaming(tools, functions, function_call, prompt):
                     completion_text.split(START_FUNCTION_CALL_TOKEN)[-1][:-1].strip()
                 )
                 grammar = get_grammar(function_calls[-1])
-                completion = create_completion(stop=END_FUNCTION_CALL_TOKEN)
+                completion = create_completion(prompt=prompt, stop=END_FUNCTION_CALL_TOKEN, grammar=grammar)
                 completion_tokens += completion["usage"]["completion_tokens"]
                 function_bodies.append(completion["choices"][0]["text"].strip())
         # If the prompt involves a function call, just append generated parameters to function_bodies
@@ -2363,7 +2363,7 @@ def generate_streaming(tools, functions, function_call, prompt):
             function_calls.append(function_call)
             grammar = get_grammar(function_call)
             stops = [STOP_TOKEN, FROM_TOKEN]
-            completion = create_completion(stop=stops)
+            completion = create_completion(prompt=prompt, stop=stops, grammar=grammar)
             completion_text = completion["choices"][0]["text"]
             completion_tokens += completion["usage"]["completion_tokens"]
             function_bodies.append(completion_text.strip())
@@ -2373,7 +2373,7 @@ def generate_streaming(tools, functions, function_call, prompt):
                 # Generate function name first
                 grammar = None
                 stops = CONTENT_TOKEN
-                completion = create_completion(stop=stops)
+                completion = create_completion(prompt=prompt, stop=stops, grammar=grammar)
                 completion_text = completion["choices"][0]["text"]
                 completion_tokens += completion["usage"]["completion_tokens"]
                 function_name = completion_text.strip()
@@ -2386,7 +2386,7 @@ def generate_streaming(tools, functions, function_call, prompt):
                     grammar = get_grammar(function_call)
                 # Generate content
                 stops = [RECIPIENT_TOKEN, STOP_TOKEN]
-                completion = create_completion(stop=stops)
+                completion = create_completion(prompt=prompt, stop=stops, grammar=grammar)
                 completion_text = completion["choices"][0]["text"]
                 completion_tokens += completion["usage"]["completion_tokens"]
                 if function_name == "all":
@@ -2413,7 +2413,7 @@ def generate_streaming(tools, functions, function_call, prompt):
                 # Check whether the model wants to generate another turn
                 prompt += completion_text.strip()
                 grammar = None
-                completion = create_completion(stop=stops)
+                completion = create_completion(prompt=prompt, stop=stops, grammar=grammar)
                 completion_tokens += completion["usage"]["completion_tokens"]
                 if "<|from|> assistant" in completion["choices"][0]["text"] or "<|from|>assistant" in completion["choices"][0]["text"]:
                     prompt += "\n<|from|>assistant\n<|recipient|>"
@@ -2603,13 +2603,23 @@ def __call__(
 
         image_urls = self.get_image_urls(messages)
         template = jinja2.Template(self.CHAT_FORMAT)
-        text = template.render(messages=messages, add_generation_prompt=True)
+        text = template.render(
+            messages=messages,
+            add_generation_prompt=True,
+            eos_token=llama.detokenize([llama.token_eos()]),
+            bos_token=llama.detokenize([llama.token_bos()]),
+        )
         split_text = self.split_text_on_image_urls(text, image_urls)
 
         def embed_image_bytes(image_bytes: bytes):
             if self._last_image_embed is not None and self._last_image_hash is not None and hash(image_bytes) == self._last_image_hash:
                 return self._last_image_embed
             with suppress_stdout_stderr(disable=self.verbose):
+                # Free the previous image embed
+                if self._last_image_embed is not None:
+                    self._llava_cpp.llava_image_embed_free(self._last_image_embed)
+                    self._last_image_embed = None
+                    self._last_image_hash = None
                 embed = (
                     self._llava_cpp.llava_image_embed_make_with_bytes(
                         self.clip_ctx,
@@ -2624,9 +2634,9 @@ def embed_image_bytes(image_bytes: bytes):
 
         # Evaluate prompt
        llama.reset()
-        for i, (type_, value) in enumerate(split_text):
+        for type_, value in split_text:
            if type_ == "text":
-                tokens = llama.tokenize(value.encode("utf8"), add_bos=i == 0)
+                tokens = llama.tokenize(value.encode("utf8"), add_bos=False, special=True)
                if llama.n_tokens + len(tokens) > llama.n_ctx():
                    raise ValueError("Prompt exceeds n_ctx") # TODO: Fix
                llama.eval(tokens)
@@ -2644,6 +2654,8 @@ def embed_image_bytes(image_bytes: bytes):
                         llama.n_batch,
                         n_past_p,
                     )
+                # Required to avoid issues with hf tokenizer
+                llama.input_ids[llama.n_tokens : n_past.value] = -1
                 llama.n_tokens = n_past.value
 
         # Get prompt tokens to avoid a cache miss
@@ -3033,6 +3045,7 @@ class NanoLlavaChatHandler(Llava15ChatHandler):
     # Answer the question<|im_end|><|im_start|>user
     # <image>
     # What is the picture about?<|im_end|><|im_start|>assistant
+    DEFAULT_SYSTEM_MESSAGE = "Answer the question"
 
     CHAT_FORMAT = (
         "{% for message in messages %}"
@@ -3564,4 +3577,4 @@ def chatml_function_calling(
         },
     }
 
-    raise ValueError("Automatic streaming tool choice is not supported")
+    raise ValueError("Automatic streaming tool choice is not supported")
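Most of the llama_chat_format.py changes touch the multimodal handlers: the Jinja template is now rendered with explicit bos_token/eos_token, the cached image embed is freed before a new one is created, and NanoLlavaChatHandler gains a DEFAULT_SYSTEM_MESSAGE. A minimal usage sketch along the lines of the README example; the file paths and image URL are placeholders:

```python
from llama_cpp import Llama
from llama_cpp.llama_chat_format import NanoLlavaChatHandler

# Placeholder paths for a nanollava-style text model + CLIP projector pair.
chat_handler = NanoLlavaChatHandler(clip_model_path="./models/nanollava-mmproj.gguf")
llm = Llama(
    model_path="./models/nanollava-text-model.gguf",
    chat_handler=chat_handler,
    n_ctx=2048,  # increased to accommodate the image embedding
)

# No system turn is supplied here; the handler can fall back to its
# DEFAULT_SYSTEM_MESSAGE ("Answer the question") added in this release.
response = llm.create_chat_completion(
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": "https://example.com/picture.png"}},
                {"type": "text", "text": "What is the picture about?"},
            ],
        }
    ]
)
print(response["choices"][0]["message"]["content"])
```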

llama_cpp/llama_cpp.py

Lines changed: 14 additions & 1 deletion
@@ -294,6 +294,11 @@ def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCDa
 # LLAMA_VOCAB_PRE_TYPE_MPT = 5,
 # LLAMA_VOCAB_PRE_TYPE_STARCODER = 6,
 # LLAMA_VOCAB_PRE_TYPE_GPT2 = 7,
+# LLAMA_VOCAB_PRE_TYPE_REFACT = 8,
+# LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9,
+# LLAMA_VOCAB_PRE_TYPE_QWEN2 = 10,
+# LLAMA_VOCAB_PRE_TYPE_OLMO = 11,
+# LLAMA_VOCAB_PRE_TYPE_DBRX = 12,
 # };
 LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0
 LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1
@@ -303,6 +308,11 @@ def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCDa
 LLAMA_VOCAB_PRE_TYPE_MPT = 5
 LLAMA_VOCAB_PRE_TYPE_STARCODER = 6
 LLAMA_VOCAB_PRE_TYPE_GPT2 = 7
+LLAMA_VOCAB_PRE_TYPE_REFACT = 8
+LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9
+LLAMA_VOCAB_PRE_TYPE_QWEN2 = 10
+LLAMA_VOCAB_PRE_TYPE_OLMO = 11
+LLAMA_VOCAB_PRE_TYPE_DBRX = 12
 
 
 # // note: these values should be synchronized with ggml_rope
@@ -371,6 +381,7 @@ def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCDa
 # LLAMA_FTYPE_MOSTLY_IQ2_M = 29, // except 1d tensors
 # LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors
 # LLAMA_FTYPE_MOSTLY_IQ1_M = 31, // except 1d tensors
+# LLAMA_FTYPE_MOSTLY_BF16 = 32, // except 1d tensors
 
 # LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
 # };
@@ -403,6 +414,8 @@ def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCDa
 LLAMA_FTYPE_MOSTLY_IQ2_S = 28
 LLAMA_FTYPE_MOSTLY_IQ2_M = 29
 LLAMA_FTYPE_MOSTLY_IQ4_XS = 30
+LLAMA_FTYPE_MOSTLY_IQ1_M = 31
+LLAMA_FTYPE_MOSTLY_BF16 = 32
 LLAMA_FTYPE_GUESSED = 1024
 
 # enum llama_rope_scaling_type {
@@ -494,7 +507,7 @@ class llama_token_data_array(ctypes.Structure):
 
 llama_token_data_array_p = ctypes.POINTER(llama_token_data_array)
 
-# typedef bool (*llama_progress_callback)(float progress, void *ctx);
+# typedef bool (*llama_progress_callback)(float progress, void * user_data);
 llama_progress_callback = ctypes.CFUNCTYPE(
     ctypes.c_bool, ctypes.c_float, ctypes.c_void_p
 )
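The last hunk only corrects the documented C parameter name (ctx becomes user_data); the ctypes signature is unchanged. For reference, a sketch of wrapping a Python function with that CFUNCTYPE so it can serve as a load-progress callback, typically assigned to the progress_callback field of llama_model_params in the low-level API:

```python
import ctypes

from llama_cpp.llama_cpp import llama_progress_callback

# Matches bool (*)(float progress, void * user_data). Returning True lets
# model loading continue; returning False aborts it.
def _on_progress(progress: float, user_data: ctypes.c_void_p) -> bool:
    print(f"load progress: {progress:.0%}")
    return True

# Keep a reference to the wrapper alive while loading runs, otherwise the
# ctypes thunk may be garbage collected out from under the C caller.
progress_cb = llama_progress_callback(_on_progress)
```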
