Commit f0df0df

Merge branch 'main' into cuda
2 parents 04af8b6 + 5ab40e6 commit f0df0df

10 files changed (+143, −46 lines)

.github/dependabot.yml

Lines changed: 6 additions & 2 deletions
@@ -8,8 +8,12 @@ updates:
   - package-ecosystem: "pip" # See documentation for possible values
     directory: "/" # Location of package manifests
     schedule:
-      interval: "weekly"
+      interval: "daily"
   - package-ecosystem: "github-actions"
     directory: "/"
     schedule:
-      interval: "weekly"
+      interval: "daily"
+  - package-ecosystem: "docker"
+    directory: "/"
+    schedule:
+      interval: "daily"

CHANGELOG.md

Lines changed: 21 additions & 0 deletions
@@ -7,6 +7,27 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.2.71]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@911b3900dded9a1cfe0f0e41b82c7a29baf3a217
+- fix: Make leading bos_token optional for image chat formats, fix nanollava system message by @abetlen in 77122638b4153e31d9f277b3d905c2900b536632
+- fix: free last image embed in llava chat handler by @abetlen in 3757328b703b2cd32dcbd5853271e3a8c8599fe7
+
+## [0.2.70]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@c0e6fbf8c380718102bd25fcb8d2e55f8f9480d1
+- feat: fill-in-middle support by @CISC in #1386
+- fix: adding missing args in create_completion for functionary chat handler by @skalade in #1430
+- docs: update README.md @eltociear in #1432
+- fix: chat_format log where auto-detected format prints None by @balvisio in #1434
+- feat(server): Add support for setting root_path by @abetlen in 0318702cdc860999ee70f277425edbbfe0e60419
+- feat(ci): Add docker checks and check deps more frequently by @Smartappli in #1426
+- fix: detokenization case where first token does not start with a leading space by @noamgat in #1375
+- feat: Implement streaming for Functionary v2 + Bug fixes by @jeffrey-fong in #1419
+- fix: Use memmove to copy str_value kv_override by @abetlen in 9f7a85571ae80d3b6ddbd3e1bae407b9f1e3448a
+- feat(server): Remove temperature bounds checks for server by @abetlen in 0a454bebe67d12a446981eb16028c168ca5faa81
+- fix(server): Propagate flash_attn to model load by @dthuerck in #1424
+
 ## [0.2.69]
 
 - feat: Update llama.cpp to ggerganov/llama.cpp@6ecf3189e00a1e8e737a78b6d10e1d7006e050a2
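The fill-in-middle entry (#1386) is what the prefix/suffix/middle token handling added to llama_cpp/llama.py below implements. A minimal usage sketch, assuming a code model whose GGUF vocabulary defines FIM tokens; the model path is a placeholder, not part of this commit:

```python
from llama_cpp import Llama

# Placeholder path: any GGUF model with FIM prefix/suffix/middle tokens.
llm = Llama(model_path="./models/code-model.gguf")

# When a suffix is given and the FIM tokens exist, the prompt is assembled
# as [PRE] prompt [SUF] suffix [MID] and the model generates the middle part.
out = llm.create_completion(
    prompt="def add(a, b):\n",
    suffix="\n    return result\n",
    max_tokens=64,
)
print(out["choices"][0]["text"])
```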

README.md

Lines changed: 2 additions & 2 deletions
@@ -516,7 +516,7 @@ chat_handler = Llava15ChatHandler(clip_model_path="path/to/llava/mmproj.bin")
 llm = Llama(
   model_path="./path/to/llava/llama-model.gguf",
   chat_handler=chat_handler,
-  n_ctx=2048, # n_ctx should be increased to accomodate the image embedding
+  n_ctx=2048, # n_ctx should be increased to accommodate the image embedding
 )
 llm.create_chat_completion(
     messages = [
@@ -547,7 +547,7 @@ llm = Llama.from_pretrained(
   repo_id="vikhyatk/moondream2",
   filename="*text-model*",
   chat_handler=chat_handler,
-  n_ctx=2048, # n_ctx should be increased to accomodate the image embedding
+  n_ctx=2048, # n_ctx should be increased to accommodate the image embedding
 )
 
 respoonse = llm.create_chat_completion(

llama_cpp/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *
 
-__version__ = "0.2.69"
+__version__ = "0.2.71"

llama_cpp/llama.py

Lines changed: 70 additions & 29 deletions
@@ -378,6 +378,7 @@ def __init__(
 
         self.chat_format = chat_format
         self.chat_handler = chat_handler
+        self._chat_handlers: Dict[str, llama_chat_format.LlamaChatCompletionHandler] = {}
 
         self.draft_model = draft_model
 
@@ -409,10 +410,33 @@ def __init__(
         if self.verbose:
             print(f"Model metadata: {self.metadata}", file=sys.stderr)
 
+        eos_token_id = int(self.metadata.get("tokenizer.ggml.eos_token_id", self.token_eos()))
+        bos_token_id = int(self.metadata.get("tokenizer.ggml.bos_token_id", self.token_bos()))
+
+        eos_token = self._model.token_get_text(eos_token_id)
+        bos_token = self._model.token_get_text(bos_token_id)
+
+        # Unfortunately the llama.cpp API does not return metadata arrays, so we can't get template names from tokenizer.chat_templates
+        template_choices = dict((name[10:], template) for name, template in self.metadata.items() if name.startswith("tokenizer.chat_template."))
+
+        if "tokenizer.chat_template" in self.metadata:
+            template_choices["chat_template.default"] = self.metadata["tokenizer.chat_template"]
+
+        if self.verbose and template_choices:
+            print(f"Available chat formats from metadata: {', '.join(template_choices.keys())}", file=sys.stderr)
+
+        for name, template in template_choices.items():
+            self._chat_handlers[name] = llama_chat_format.Jinja2ChatFormatter(
+                template=template,
+                eos_token=eos_token,
+                bos_token=bos_token,
+                stop_token_ids=[eos_token_id],
+            ).to_chat_handler()
+
         if (
             self.chat_format is None
             and self.chat_handler is None
-            and "tokenizer.chat_template" in self.metadata
+            and "chat_template.default" in template_choices
         ):
             chat_format = llama_chat_format.guess_chat_format_from_gguf_metadata(
                 self.metadata
@@ -423,35 +447,17 @@ def __init__(
             if self.verbose:
                 print(f"Guessed chat format: {chat_format}", file=sys.stderr)
         else:
-            template = self.metadata["tokenizer.chat_template"]
-            try:
-                eos_token_id = int(self.metadata["tokenizer.ggml.eos_token_id"])
-            except:
-                eos_token_id = self.token_eos()
-            try:
-                bos_token_id = int(self.metadata["tokenizer.ggml.bos_token_id"])
-            except:
-                bos_token_id = self.token_bos()
-
-            eos_token = self._model.token_get_text(eos_token_id)
-            bos_token = self._model.token_get_text(bos_token_id)
-
             if self.verbose:
-                print(f"Using gguf chat template: {template}", file=sys.stderr)
+                print(f"Using gguf chat template: {template_choices['chat_template.default']}", file=sys.stderr)
                 print(f"Using chat eos_token: {eos_token}", file=sys.stderr)
                 print(f"Using chat bos_token: {bos_token}", file=sys.stderr)
 
-            self.chat_handler = llama_chat_format.Jinja2ChatFormatter(
-                template=template,
-                eos_token=eos_token,
-                bos_token=bos_token,
-                stop_token_ids=[eos_token_id],
-            ).to_chat_handler()
+            self.chat_format = "chat_template.default"
 
         if self.chat_format is None and self.chat_handler is None:
             self.chat_format = "llama-2"
             if self.verbose:
-                print(f"Using fallback chat format: {chat_format}", file=sys.stderr)
+                print(f"Using fallback chat format: {self.chat_format}", file=sys.stderr)
 
     @property
     def ctx(self) -> llama_cpp.llama_context_p:
@@ -955,18 +961,53 @@ def _create_completion(
 
         completion_id: str = f"cmpl-{str(uuid.uuid4())}"
         created: int = int(time.time())
+        prefix_token_id: int = int(self.metadata.get("tokenizer.ggml.prefix_token_id", self._model.token_prefix()))
+        middle_token_id: int = int(self.metadata.get("tokenizer.ggml.middle_token_id", self._model.token_middle()))
+        suffix_token_id: int = int(self.metadata.get("tokenizer.ggml.suffix_token_id", self._model.token_suffix()))
         # If prompt is empty, initialize completion with BOS token to avoid
         # detokenization including a space at the beginning of the completion
         completion_tokens: List[int] = [] if len(prompt) > 0 else [self.token_bos()]
         # Add blank space to start of prompt to match OG llama tokenizer
         prompt_tokens: List[int] = (
             (
-                self.tokenize(prompt.encode("utf-8"), special=True)
-                if prompt != ""
-                else [self.token_bos()]
+                [prefix_token_id]
+                if prefix_token_id >= 0 and suffix is not None
+                else []
+            )
+            +
+            (
+                (
+                    self.tokenize(prompt.encode("utf-8"), add_bos=(prefix_token_id < 0 or suffix is None), special=(prefix_token_id < 0 or suffix is None))
+                    if prompt != ""
+                    else (
+                        []
+                        if prefix_token_id >= 0 and suffix is not None
+                        else [self.token_bos()]
+                    )
+                )
+                if isinstance(prompt, str)
+                else prompt
+            )
+            +
+            (
+                (
+                    [suffix_token_id]
+                    +
+                    (
+                        self.tokenize(suffix.encode("utf-8"), add_bos=False, special=False)
+                        if suffix
+                        else []
+                    )
+                )
+                if suffix_token_id >= 0 and suffix is not None
+                else []
+            )
+            +
+            (
+                [middle_token_id]
+                if middle_token_id >= 0 and suffix is not None
+                else []
             )
-            if isinstance(prompt, str)
-            else prompt
         )
         text: bytes = b""
         returned_tokens: int = 0
@@ -1346,7 +1387,7 @@ def logit_bias_processor(
         if echo:
             text_str = prompt + text_str
 
-        if suffix is not None:
+        if suffix_token_id < 0 and suffix is not None:
             text_str = text_str + suffix
 
         logprobs_or_none: Optional[CompletionLogprobs] = None
@@ -1684,7 +1725,7 @@ def create_chat_completion(
         Returns:
             Generated chat completion or a stream of chat completion chunks.
         """
-        handler = self.chat_handler or llama_chat_format.get_chat_completion_handler(
+        handler = self.chat_handler or self._chat_handlers.get(self.chat_format) or llama_chat_format.get_chat_completion_handler(
            self.chat_format
        )
        return handler(
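The net effect of the llama.py changes: every tokenizer.chat_template.* entry in the GGUF metadata is pre-compiled into self._chat_handlers at load time, the bundled default template is selected via chat_format = "chat_template.default", and create_chat_completion now checks that dict before the globally registered formats. A rough sketch of selecting a named metadata template, assuming a model that ships one under tokenizer.chat_template.tool_use (an illustrative name, not guaranteed by any particular model):

```python
from llama_cpp import Llama

# Placeholder path. With verbose=True, __init__ prints something like
# "Available chat formats from metadata: chat_template.default, chat_template.tool_use".
llm = Llama(
    model_path="./models/model.gguf",
    chat_format="chat_template.tool_use",  # resolved via llm._chat_handlers
    verbose=True,
)

response = llm.create_chat_completion(
    messages=[{"role": "user", "content": "Hello!"}],
)
print(response["choices"][0]["message"]["content"])
```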

llama_cpp/llama_chat_format.py

Lines changed: 23 additions & 10 deletions
@@ -2322,7 +2322,7 @@ def generate_streaming(tools, functions, function_call, prompt):
             prompt = prompt
             stops = ["\n", END_ASSISTANT_TOKEN]
 
-            completion = create_completion(stop=stops)
+            completion = create_completion(prompt=prompt, stop=stops, grammar=grammar)
             completion_text = completion["choices"][0]["text"]
             completion_tokens += completion["usage"]["completion_tokens"]
 
@@ -2349,7 +2349,7 @@ def generate_streaming(tools, functions, function_call, prompt):
                     completion_text.split(START_FUNCTION_CALL_TOKEN)[-1][:-1].strip()
                 )
                 grammar = get_grammar(function_calls[-1])
-                completion = create_completion(stop=END_FUNCTION_CALL_TOKEN)
+                completion = create_completion(prompt=prompt, stop=END_FUNCTION_CALL_TOKEN, grammar=grammar)
                 completion_tokens += completion["usage"]["completion_tokens"]
                 function_bodies.append(completion["choices"][0]["text"].strip())
         # If the prompt involves a function call, just append generated parameters to function_bodies
@@ -2363,7 +2363,7 @@ def generate_streaming(tools, functions, function_call, prompt):
             function_calls.append(function_call)
             grammar = get_grammar(function_call)
             stops = [STOP_TOKEN, FROM_TOKEN]
-            completion = create_completion(stop=stops)
+            completion = create_completion(prompt=prompt, stop=stops, grammar=grammar)
             completion_text = completion["choices"][0]["text"]
             completion_tokens += completion["usage"]["completion_tokens"]
             function_bodies.append(completion_text.strip())
@@ -2373,7 +2373,7 @@ def generate_streaming(tools, functions, function_call, prompt):
                 # Generate function name first
                 grammar = None
                 stops = CONTENT_TOKEN
-                completion = create_completion(stop=stops)
+                completion = create_completion(prompt=prompt, stop=stops, grammar=grammar)
                 completion_text = completion["choices"][0]["text"]
                 completion_tokens += completion["usage"]["completion_tokens"]
                 function_name = completion_text.strip()
@@ -2386,7 +2386,7 @@ def generate_streaming(tools, functions, function_call, prompt):
                     grammar = get_grammar(function_call)
                 # Generate content
                 stops = [RECIPIENT_TOKEN, STOP_TOKEN]
-                completion = create_completion(stop=stops)
+                completion = create_completion(prompt=prompt, stop=stops, grammar=grammar)
                 completion_text = completion["choices"][0]["text"]
                 completion_tokens += completion["usage"]["completion_tokens"]
                 if function_name == "all":
@@ -2413,7 +2413,7 @@ def generate_streaming(tools, functions, function_call, prompt):
                 # Check whether the model wants to generate another turn
                 prompt += completion_text.strip()
                 grammar = None
-                completion = create_completion(stop=stops)
+                completion = create_completion(prompt=prompt, stop=stops, grammar=grammar)
                 completion_tokens += completion["usage"]["completion_tokens"]
                 if "<|from|> assistant" in completion["choices"][0]["text"] or "<|from|>assistant" in completion["choices"][0]["text"]:
                     prompt += "\n<|from|>assistant\n<|recipient|>"
@@ -2603,13 +2603,23 @@ def __call__(
 
         image_urls = self.get_image_urls(messages)
         template = jinja2.Template(self.CHAT_FORMAT)
-        text = template.render(messages=messages, add_generation_prompt=True)
+        text = template.render(
+            messages=messages,
+            add_generation_prompt=True,
+            eos_token=llama.detokenize([llama.token_eos()]),
+            bos_token=llama.detokenize([llama.token_bos()]),
+        )
         split_text = self.split_text_on_image_urls(text, image_urls)
 
         def embed_image_bytes(image_bytes: bytes):
             if self._last_image_embed is not None and self._last_image_hash is not None and hash(image_bytes) == self._last_image_hash:
                 return self._last_image_embed
             with suppress_stdout_stderr(disable=self.verbose):
+                # Free the previous image embed
+                if self._last_image_embed is not None:
+                    self._llava_cpp.llava_image_embed_free(self._last_image_embed)
+                    self._last_image_embed = None
+                    self._last_image_hash = None
                 embed = (
                     self._llava_cpp.llava_image_embed_make_with_bytes(
                         self.clip_ctx,
@@ -2624,9 +2634,9 @@ def embed_image_bytes(image_bytes: bytes):
 
         # Evaluate prompt
        llama.reset()
-        for i, (type_, value) in enumerate(split_text):
+        for type_, value in split_text:
            if type_ == "text":
-                tokens = llama.tokenize(value.encode("utf8"), add_bos=i == 0)
+                tokens = llama.tokenize(value.encode("utf8"), add_bos=False, special=True)
                if llama.n_tokens + len(tokens) > llama.n_ctx():
                    raise ValueError("Prompt exceeds n_ctx") # TODO: Fix
                llama.eval(tokens)
@@ -2644,6 +2654,8 @@ def embed_image_bytes(image_bytes: bytes):
                         llama.n_batch,
                         n_past_p,
                     )
+                # Required to avoid issues with hf tokenizer
+                llama.input_ids[llama.n_tokens : n_past.value] = -1
                 llama.n_tokens = n_past.value
 
         # Get prompt tokens to avoid a cache miss
@@ -3033,6 +3045,7 @@ class NanoLlavaChatHandler(Llava15ChatHandler):
     # Answer the question<|im_end|><|im_start|>user
     # <image>
     # What is the picture about?<|im_end|><|im_start|>assistant
+    DEFAULT_SYSTEM_MESSAGE = "Answer the question"
 
     CHAT_FORMAT = (
         "{% for message in messages %}"
@@ -3564,4 +3577,4 @@ def chatml_function_calling(
         },
     }
 
-    raise ValueError("Automatic streaming tool choice is not supported")
+    raise ValueError("Automatic streaming tool choice is not supported")
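Most of the llama_chat_format.py changes touch the multimodal handlers: the Jinja template is now rendered with explicit bos_token/eos_token, the cached image embed is freed before a new one is created, and NanoLlavaChatHandler gains a DEFAULT_SYSTEM_MESSAGE. A minimal usage sketch along the lines of the README example; the file paths and image URL are placeholders:

```python
from llama_cpp import Llama
from llama_cpp.llama_chat_format import NanoLlavaChatHandler

# Placeholder paths for a nanollava-style text model + CLIP projector pair.
chat_handler = NanoLlavaChatHandler(clip_model_path="./models/nanollava-mmproj.gguf")
llm = Llama(
    model_path="./models/nanollava-text-model.gguf",
    chat_handler=chat_handler,
    n_ctx=2048,  # increased to accommodate the image embedding
)

# No system turn is supplied here; the handler can fall back to its
# DEFAULT_SYSTEM_MESSAGE ("Answer the question") added in this release.
response = llm.create_chat_completion(
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": "https://example.com/picture.png"}},
                {"type": "text", "text": "What is the picture about?"},
            ],
        }
    ]
)
print(response["choices"][0]["message"]["content"])
```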

llama_cpp/llama_cpp.py

Lines changed: 14 additions & 1 deletion
@@ -294,6 +294,11 @@ def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCDa
 # LLAMA_VOCAB_PRE_TYPE_MPT = 5,
 # LLAMA_VOCAB_PRE_TYPE_STARCODER = 6,
 # LLAMA_VOCAB_PRE_TYPE_GPT2 = 7,
+# LLAMA_VOCAB_PRE_TYPE_REFACT = 8,
+# LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9,
+# LLAMA_VOCAB_PRE_TYPE_QWEN2 = 10,
+# LLAMA_VOCAB_PRE_TYPE_OLMO = 11,
+# LLAMA_VOCAB_PRE_TYPE_DBRX = 12,
 # };
 LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0
 LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1
@@ -303,6 +308,11 @@ def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCDa
 LLAMA_VOCAB_PRE_TYPE_MPT = 5
 LLAMA_VOCAB_PRE_TYPE_STARCODER = 6
 LLAMA_VOCAB_PRE_TYPE_GPT2 = 7
+LLAMA_VOCAB_PRE_TYPE_REFACT = 8
+LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9
+LLAMA_VOCAB_PRE_TYPE_QWEN2 = 10
+LLAMA_VOCAB_PRE_TYPE_OLMO = 11
+LLAMA_VOCAB_PRE_TYPE_DBRX = 12
 
 
 # // note: these values should be synchronized with ggml_rope
@@ -371,6 +381,7 @@ def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCDa
 # LLAMA_FTYPE_MOSTLY_IQ2_M = 29, // except 1d tensors
 # LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors
 # LLAMA_FTYPE_MOSTLY_IQ1_M = 31, // except 1d tensors
+# LLAMA_FTYPE_MOSTLY_BF16 = 32, // except 1d tensors
 
 # LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
 # };
@@ -403,6 +414,8 @@ def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCDa
 LLAMA_FTYPE_MOSTLY_IQ2_S = 28
 LLAMA_FTYPE_MOSTLY_IQ2_M = 29
 LLAMA_FTYPE_MOSTLY_IQ4_XS = 30
+LLAMA_FTYPE_MOSTLY_IQ1_M = 31
+LLAMA_FTYPE_MOSTLY_BF16 = 32
 LLAMA_FTYPE_GUESSED = 1024
 
 # enum llama_rope_scaling_type {
@@ -494,7 +507,7 @@ class llama_token_data_array(ctypes.Structure):
 
 llama_token_data_array_p = ctypes.POINTER(llama_token_data_array)
 
-# typedef bool (*llama_progress_callback)(float progress, void *ctx);
+# typedef bool (*llama_progress_callback)(float progress, void * user_data);
 llama_progress_callback = ctypes.CFUNCTYPE(
     ctypes.c_bool, ctypes.c_float, ctypes.c_void_p
 )
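The last hunk only corrects the documented C parameter name (ctx becomes user_data); the ctypes signature is unchanged. For reference, a sketch of wrapping a Python function with that CFUNCTYPE so it can serve as a load-progress callback, typically assigned to the progress_callback field of llama_model_params in the low-level API:

```python
import ctypes

from llama_cpp.llama_cpp import llama_progress_callback

# Matches bool (*)(float progress, void * user_data). Returning True lets
# model loading continue; returning False aborts it.
def _on_progress(progress: float, user_data: ctypes.c_void_p) -> bool:
    print(f"load progress: {progress:.0%}")
    return True

# Keep a reference to the wrapper alive while loading runs, otherwise the
# ctypes thunk may be garbage collected out from under the C caller.
progress_cb = llama_progress_callback(_on_progress)
```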
