From 9d053d6f736c04a56f629266399d48935e09e386 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?=
Date: Thu, 9 May 2024 20:04:06 +0200
Subject: [PATCH 01/12] Templates sometimes have BOS in them, remove duplicate

---
 llama_cpp/llama.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 4212669eb..576e5aeb0 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -1016,6 +1016,10 @@ def _create_completion(
         )
         model_name: str = model if model is not None else self.model_path

+        # User or template may have added an unwanted extra BOS
+        if prompt_tokens[:2] == [self.token_bos()] * 2:
+            del prompt_tokens[0]
+
         # NOTE: This likely doesn't work correctly for the first token in the prompt
         # because of the extra space added to the start of the prompt_tokens
         if logit_bias is not None:

From a3df77d8d2243527e5910cee4c9811274e2207df Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?=
Date: Fri, 10 May 2024 23:09:01 +0200
Subject: [PATCH 02/12] tokenize chat format prompts before completion

This is to ensure that we don't duplicate any special tokens.

Hopefully I amended the existing formats correctly?
---
 llama_cpp/llama_chat_format.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py
index 84de989a5..57d4ac178 100644
--- a/llama_cpp/llama_chat_format.py
+++ b/llama_cpp/llama_chat_format.py
@@ -547,7 +547,7 @@ def chat_completion_handler(
             tools=tools,
             tool_choice=tool_choice,
         )
-        prompt = result.prompt
+        prompt = llama.tokenize(result.prompt.encode("utf-8"), add_bos=False, special=True)
         if result.stop is not None:
             stop = [] if stop is None else [stop] if isinstance(stop, str) else stop
             rstop = result.stop if isinstance(result.stop, list) else [result.stop]
@@ -958,7 +958,7 @@ def format_alpaca(
     _sep2 = "</s>"
     system_message = _get_system_message(messages)
     _messages = _map_roles(messages, _roles)
-    _prompt = _format_add_colon_two(system_message, _messages, _sep, _sep2)
+    _prompt = "<s>" + _format_add_colon_two(system_message, _messages, _sep, _sep2)
     return ChatFormatterResponse(prompt=_prompt)
@@ -991,7 +991,7 @@ def format(
     system_message = _system_message
     _messages = _map_roles(messages, _roles)
     _messages.append((_roles["assistant"], None))
-    _prompt = _format_add_colon_two(system_message, _messages, _sep, _sep2)
+    _prompt = "<s>" + _format_add_colon_two(system_message, _messages, _sep, _sep2)
     return ChatFormatterResponse(prompt=_prompt)
@@ -1007,7 +1007,7 @@ def format_oasst_llama(
     system_message = _system_template.format(system_message=system_message)
     _messages = _map_roles(messages, _roles)
     _messages.append((_roles["assistant"], None))
-    _prompt = _format_no_colon_single(system_message, _messages, _sep)
+    _prompt = "<s>" + _format_no_colon_single(system_message, _messages, _sep)
     return ChatFormatterResponse(prompt=_prompt)
@@ -1187,7 +1187,7 @@ def format_zephyr(
     _sep = "</s>"
     _messages = _map_roles(messages, _roles)
     _messages.append((_roles["assistant"], None))
-    _prompt = _format_chatml(system_message, _messages, _sep)
+    _prompt = "<s>" + _format_chatml(system_message, _messages, _sep)
     return ChatFormatterResponse(prompt=_prompt, stop=_sep)
@@ -1262,7 +1262,7 @@ def format_chatglm3(
     _sep = "</s>"
     _messages = _map_roles(messages, _roles)
     _messages.append((_roles["assistant"], None))
-    _prompt = _format_chatglm3(system_message, _messages, _sep)
+    _prompt = "<s>" + _format_chatglm3(system_message, _messages, _sep)
     return ChatFormatterResponse(prompt=_prompt, stop=_sep)
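The problem patches 01 and 02 address is easiest to see in isolation: when a chat template already spells out the BOS token and the rendered prompt is then tokenized with add_bos=True, the token sequence starts with two BOS ids. The standalone sketch below is not part of the patches; the token ids and the toy tokenizer are made up purely to illustrate the check introduced in patch 01.

    # Toy reproduction of the duplicate-BOS issue; ids are hypothetical (1 = BOS).
    BOS = 1

    def toy_tokenize(text, add_bos):
        # Stand-in for llama.tokenize(): a template that literally contains "<s>"
        # produces its own BOS id, and add_bos=True prepends another one.
        ids = ([BOS] if text.startswith("<s>") else []) + [101, 102, 103]
        return ([BOS] + ids) if add_bos else ids

    prompt_tokens = toy_tokenize("<s>[INST] hi [/INST]", add_bos=True)
    assert prompt_tokens[:2] == [BOS] * 2  # duplicate leading BOS

    # The check from patch 01: keep only one leading BOS.
    if prompt_tokens[:2] == [BOS] * 2:
        del prompt_tokens[0]
    assert prompt_tokens.count(BOS) == 1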
From 803e8fa1c48f051aef91b0cd8bec2e191531ee9c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?=
Date: Fri, 10 May 2024 23:11:43 +0200
Subject: [PATCH 03/12] updated comment

---
 llama_cpp/llama.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 576e5aeb0..25b45c9b7 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -1016,7 +1016,7 @@ def _create_completion(
         )
         model_name: str = model if model is not None else self.model_path

-        # User or template may have added an unwanted extra BOS
+        # User may have added an unwanted extra BOS
         if prompt_tokens[:2] == [self.token_bos()] * 2:
             del prompt_tokens[0]

From ed4e56b6f7093051cc653e265e445eff9e4bc998 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?=
Date: Sat, 11 May 2024 08:30:09 +0200
Subject: [PATCH 04/12] corrected a few

---
 llama_cpp/llama_chat_format.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py
index 57d4ac178..a5899137b 100644
--- a/llama_cpp/llama_chat_format.py
+++ b/llama_cpp/llama_chat_format.py
@@ -1187,7 +1187,7 @@ def format_zephyr(
     _sep = "</s>"
     _messages = _map_roles(messages, _roles)
     _messages.append((_roles["assistant"], None))
-    _prompt = "<s>" + _format_chatml(system_message, _messages, _sep)
+    _prompt = _format_chatml(system_message, _messages, _sep)
     return ChatFormatterResponse(prompt=_prompt, stop=_sep)
@@ -1262,7 +1262,7 @@ def format_chatglm3(
     _sep = "</s>"
     _messages = _map_roles(messages, _roles)
     _messages.append((_roles["assistant"], None))
-    _prompt = "<s>" + _format_chatglm3(system_message, _messages, _sep)
+    _prompt = _format_chatglm3(system_message, _messages, _sep)
     return ChatFormatterResponse(prompt=_prompt, stop=_sep)
@@ -1280,7 +1280,7 @@ def format_openchat(
     _sep = "<|end_of_turn|>"
     _messages = _map_roles(messages, _roles)
     _messages.append((_roles["assistant"], None))
-    _prompt = _format_chatml(system_message, _messages, _sep)
+    _prompt = "<s>" + _format_chatml(system_message, _messages, _sep)
     return ChatFormatterResponse(prompt=_prompt, stop=_sep)

From 06cf25d1ed4c681163b2246e46612a402e8e841b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?=
Date: Sat, 11 May 2024 09:49:18 +0200
Subject: [PATCH 05/12] add some missing internals

---
 llama_cpp/_internals.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py
index b404601d3..043f588a6 100644
--- a/llama_cpp/_internals.py
+++ b/llama_cpp/_internals.py
@@ -142,6 +142,14 @@ def token_eos(self) -> int:
         assert self.model is not None
         return llama_cpp.llama_token_eos(self.model)

+    def token_cls(self) -> int:
+        assert self.model is not None
+        return llama_cpp.llama_token_cls(self.model)
+
+    def token_sep(self) -> int:
+        assert self.model is not None
+        return llama_cpp.llama_token_sep(self.model)
+
     def token_nl(self) -> int:
         assert self.model is not None
         return llama_cpp.llama_token_nl(self.model)
@@ -162,6 +170,14 @@ def token_eot(self) -> int:
         assert self.model is not None
         return llama_cpp.llama_token_eot(self.model)

+    def add_bos_token(self) -> int:
+        assert self.model is not None
+        return llama_cpp.llama_add_bos_token(self.model)
+
+    def add_eos_token(self) -> int:
+        assert self.model is not None
+        return llama_cpp.llama_add_eos_token(self.model)
+
     # Tokenization

     def tokenize(self, text: bytes, add_bos: bool, special: bool):
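Patch 05 exposes two pieces of model metadata through the _LlamaModel wrapper, llama_add_bos_token() and llama_add_eos_token(), which report whether the model expects BOS/EOS to be inserted automatically. Patch 06 below combines them with the new added_special flag; a condensed sketch of that decision (assuming `llama` is a loaded llama_cpp.Llama, `result` is a formatter's ChatFormatterResponse, and noting that `_model` is internal API) looks roughly like this:

    # Rough sketch of the BOS/EOS logic introduced in patch 06, not a drop-in helper.
    def build_prompt_tokens(llama, result):
        tokens = []
        # Prepend BOS only if the formatter did not add special tokens itself
        # and the model metadata says a BOS token should be inserted.
        if not result.added_special and llama._model.add_bos_token() != 0:
            tokens.append(llama.token_bos())
        # Let the tokenizer translate special tokens written in the template text.
        tokens += llama.tokenize(result.prompt.encode("utf-8"), add_bos=False, special=True)
        # Append EOS under the same conditions the patch uses (the vocab_type() == 1
        # check is taken verbatim from the diff).
        if (
            not result.added_special
            and llama._model.vocab_type() == 1
            and llama._model.add_eos_token() != 0
        ):
            tokens.append(llama.token_eos())
        return tokens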
From bb6cf4f913bcade9d0b771be6045fdb3e150468a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?=
Date: Sat, 11 May 2024 10:27:13 +0200
Subject: [PATCH 06/12] proper bos/eos detection

---
 llama_cpp/llama_chat_format.py | 29 ++++++++++++++++-------------
 1 file changed, 16 insertions(+), 13 deletions(-)

diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py
index a5899137b..307752aea 100644
--- a/llama_cpp/llama_chat_format.py
+++ b/llama_cpp/llama_chat_format.py
@@ -159,6 +159,7 @@ class ChatFormatterResponse:
     prompt: str
     stop: Optional[Union[str, List[str]]] = None
     stopping_criteria: Optional[llama.StoppingCriteriaList] = None
+    added_special: bool = False


 class ChatFormatter(Protocol):
@@ -231,7 +232,7 @@ def stop_on_last_token(
             return tokens[-1] in self.stop_token_ids
         stopping_criteria = llama.StoppingCriteriaList([stop_on_last_token])

-        return ChatFormatterResponse(prompt=prompt, stop=[self.eos_token], stopping_criteria=stopping_criteria)
+        return ChatFormatterResponse(prompt=prompt, stop=[self.eos_token], stopping_criteria=stopping_criteria, added_special=True)

     def to_chat_handler(self) -> LlamaChatCompletionHandler:
         return chat_formatter_to_chat_completion_handler(self)
@@ -547,7 +548,11 @@ def chat_completion_handler(
             tools=tools,
             tool_choice=tool_choice,
         )
-        prompt = llama.tokenize(result.prompt.encode("utf-8"), add_bos=False, special=True)
+
+        prompt = [llama.token_bos()] if not result.added_special and llama._model.add_bos_token() != 0 else []
+        prompt += llama.tokenize(result.prompt.encode("utf-8"), add_bos=False, special=True)
+        prompt += [llama.token_eos()] if not result.added_special and llama._model.vocab_type() == 1 and llama._model.add_eos_token() != 0 else []
+
         if result.stop is not None:
             stop = [] if stop is None else [stop] if isinstance(stop, str) else stop
             rstop = result.stop if isinstance(result.stop, list) else [result.stop]
@@ -654,7 +659,7 @@ def format_autotokenizer(
         prompt: str = tokenizer.apply_chat_template(messages, tokenize=False)  # type: ignore
         assert isinstance(prompt, str)
         # Return formatted prompt and eos token by default
-        return ChatFormatterResponse(prompt=prompt, stop=tokenizer.eos_token)
+        return ChatFormatterResponse(prompt=prompt, stop=tokenizer.eos_token, added_special=True)

     return format_autotokenizer
@@ -708,7 +713,7 @@ def format_tokenizer_config(
             bos_token=bos_token,
             eos_token=eos_token,
         )
-        return ChatFormatterResponse(prompt=prompt, stop=[eos_token, bos_token])
+        return ChatFormatterResponse(prompt=prompt, stop=[eos_token, bos_token], added_special=True)

     return format_tokenizer_config
@@ -918,7 +923,7 @@ def format_llama2(
     messages: List[llama_types.ChatCompletionRequestMessage],
     **kwargs: Any,
 ) -> ChatFormatterResponse:
-    _system_template = "<s>[INST] <<SYS>>\n{system_message}\n<</SYS>>"
+    _system_template = "[INST] <<SYS>>\n{system_message}\n<</SYS>>"
     _roles = dict(user="[INST]", assistant="[/INST]")
     _messages = _map_roles(messages, _roles)
     system_message = _get_system_message(messages)
@@ -940,11 +945,10 @@ def format_llama3(
         user="<|start_header_id|>user<|end_header_id|>\n\n",
         assistant="<|start_header_id|>assistant<|end_header_id|>\n\n",
     )
-    _begin_token = "<|begin_of_text|>"
     _sep = "<|eot_id|>"
     _messages = _map_roles(messages, _roles)
     _messages.append((_roles["assistant"], None))
-    _prompt = _format_no_colon_single(_begin_token, _messages, _sep)
+    _prompt = _format_no_colon_single("", _messages, _sep)
     return ChatFormatterResponse(prompt=_prompt, stop=_sep)
@@ -958,7 +962,7 @@ def format_alpaca(
     _sep2 = "</s>"
     system_message = _get_system_message(messages)
     _messages = _map_roles(messages, _roles)
-    _prompt = "<s>" + _format_add_colon_two(system_message, _messages, _sep, _sep2)
+    _prompt = _format_add_colon_two(system_message, _messages, _sep, _sep2)
     return ChatFormatterResponse(prompt=_prompt)
@@ -991,7 +995,7 @@ def format(
     system_message = _system_message
     _messages = _map_roles(messages, _roles)
     _messages.append((_roles["assistant"], None))
-    _prompt = "<s>" + _format_add_colon_two(system_message, _messages, _sep, _sep2)
+    _prompt = _format_add_colon_two(system_message, _messages, _sep, _sep2)
     return ChatFormatterResponse(prompt=_prompt)
@@ -1007,7 +1011,7 @@ def format_oasst_llama(
     system_message = _system_template.format(system_message=system_message)
     _messages = _map_roles(messages, _roles)
     _messages.append((_roles["assistant"], None))
-    _prompt = "<s>" + _format_no_colon_single(system_message, _messages, _sep)
+    _prompt = _format_no_colon_single(system_message, _messages, _sep)
     return ChatFormatterResponse(prompt=_prompt)
@@ -1229,10 +1233,9 @@ def format_mistral_instruct(
     messages: List[llama_types.ChatCompletionRequestMessage],
     **kwargs: Any,
 ) -> ChatFormatterResponse:
-    bos = "<s>"
     eos = "</s>"
     stop = eos
-    prompt = bos
+    prompt = ""
     for message in messages:
         if (
             message["role"] == "user"
@@ -1280,7 +1283,7 @@ def format_openchat(
     _sep = "<|end_of_turn|>"
     _messages = _map_roles(messages, _roles)
     _messages.append((_roles["assistant"], None))
-    _prompt = "<s>" + _format_chatml(system_message, _messages, _sep)
+    _prompt = _format_chatml(system_message, _messages, _sep)
     return ChatFormatterResponse(prompt=_prompt, stop=_sep)

From 2e26f2d4d17d615dacd5a3dc0e4bffcfc326c876 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?=
Date: Sat, 11 May 2024 10:42:37 +0200
Subject: [PATCH 07/12] just let tokenizer do the job

---
 llama_cpp/llama_chat_format.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py
index 307752aea..564f5fd45 100644
--- a/llama_cpp/llama_chat_format.py
+++ b/llama_cpp/llama_chat_format.py
@@ -548,11 +548,7 @@ def chat_completion_handler(
             tools=tools,
             tool_choice=tool_choice,
         )
-
-        prompt = [llama.token_bos()] if not result.added_special and llama._model.add_bos_token() != 0 else []
-        prompt += llama.tokenize(result.prompt.encode("utf-8"), add_bos=False, special=True)
-        prompt += [llama.token_eos()] if not result.added_special and llama._model.vocab_type() == 1 and llama._model.add_eos_token() != 0 else []
-
+        prompt += llama.tokenize(result.prompt.encode("utf-8"), add_bos=not result.added_special, special=True)
         if result.stop is not None:
             stop = [] if stop is None else [stop] if isinstance(stop, str) else stop
             rstop = result.stop if isinstance(result.stop, list) else [result.stop]
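Patches 07 and 08 drop the hand-rolled BOS/EOS assembly again and hand the whole job to the tokenizer: special=True makes it map markup such as "<|eot_id|>" to single special-token ids, and add_bos=not result.added_special only prepends BOS when the formatter did not already render it. A hedged usage sketch, assuming `llama` is a llama_cpp.Llama loaded with a model that defines <|...|>-style special tokens (for example a Llama 3 GGUF):

    text = "<|start_header_id|>user<|end_header_id|>\n\nHello<|eot_id|>"

    # special=True: the <|...|> markers become single special-token ids.
    with_special = llama.tokenize(text.encode("utf-8"), add_bos=True, special=True)

    # special=False: the same markers are split into ordinary text tokens.
    as_plain_text = llama.tokenize(text.encode("utf-8"), add_bos=True, special=False)

    # For such a model the special-aware encoding is noticeably shorter.
    print(len(with_special), len(as_plain_text))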
From aa25cd3dbb30444387f0269b183a20a3e4813fae Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?=
Date: Sat, 11 May 2024 10:44:28 +0200
Subject: [PATCH 08/12] typo--

---
 llama_cpp/llama_chat_format.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py
index 564f5fd45..252267e94 100644
--- a/llama_cpp/llama_chat_format.py
+++ b/llama_cpp/llama_chat_format.py
@@ -548,7 +548,7 @@ def chat_completion_handler(
             tools=tools,
             tool_choice=tool_choice,
         )
-        prompt += llama.tokenize(result.prompt.encode("utf-8"), add_bos=not result.added_special, special=True)
+        prompt = llama.tokenize(result.prompt.encode("utf-8"), add_bos=not result.added_special, special=True)
         if result.stop is not None:
             stop = [] if stop is None else [stop] if isinstance(stop, str) else stop
             rstop = result.stop if isinstance(result.stop, list) else [result.stop]

From aef3b1c31a2ad372338106a5a90fa24331ab1dfe Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?=
Date: Sat, 11 May 2024 10:51:34 +0200
Subject: [PATCH 09/12] align test with new response

---
 tests/test_llama_chat_format.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/test_llama_chat_format.py b/tests/test_llama_chat_format.py
index c10aee42e..f031bf72b 100644
--- a/tests/test_llama_chat_format.py
+++ b/tests/test_llama_chat_format.py
@@ -21,12 +21,13 @@ def test_mistral_instruct():
     response = llama_chat_format.format_mistral_instruct(
         messages=messages,
     )
+    prompt = ("" if response.added_special else "<s>") + response.prompt
     reference = chat_formatter.render(
         messages=messages,
         bos_token="<s>",
         eos_token="</s>",
     )
-    assert response.prompt == reference
+    assert prompt == reference


 mistral_7b_tokenizer_config = """{

From b9a1e61f2490d91620e371f98a7831d668c2bb25 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?=
Date: Wed, 22 May 2024 11:21:34 +0200
Subject: [PATCH 10/12] changed to a warning

---
 llama_cpp/llama.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 25b45c9b7..a1dc4d60c 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -1016,9 +1016,8 @@ def _create_completion(
         )
         model_name: str = model if model is not None else self.model_path

-        # User may have added an unwanted extra BOS
         if prompt_tokens[:2] == [self.token_bos()] * 2:
-            del prompt_tokens[0]
+            print(f'*** WARNING: Detected duplicate leading "{self._model.token_get_text(self.token_bos())}" in prompt, this will likely reduce response quality, consider removing it...', file=sys.stderr)

         # NOTE: This likely doesn't work correctly for the first token in the prompt
         # because of the extra space added to the start of the prompt_tokens

From a6e5917ca4009285562204887b1d020c85cc5d1a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?=
Date: Wed, 29 May 2024 20:24:42 +0200
Subject: [PATCH 11/12] move to another PR

---
 llama_cpp/_internals.py | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py
index 043f588a6..750bd965d 100644
--- a/llama_cpp/_internals.py
+++ b/llama_cpp/_internals.py
@@ -170,14 +170,6 @@ def token_eot(self) -> int:
         assert self.model is not None
         return llama_cpp.llama_token_eot(self.model)

-    def add_bos_token(self) -> int:
-        assert self.model is not None
-        return llama_cpp.llama_add_bos_token(self.model)
-
-    def add_eos_token(self) -> int:
-        assert self.model is not None
-        return llama_cpp.llama_add_eos_token(self.model)
-
     # Tokenization

     def tokenize(self, text: bytes, add_bos: bool, special: bool):
From 71805353ef3fd7f9f560ff36030f7f5336f12283 Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Tue, 4 Jun 2024 10:13:15 -0400
Subject: [PATCH 12/12] Use python warnings module

---
 llama_cpp/llama.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 3f0f5d3f9..45e1526ef 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -8,6 +8,7 @@
 import ctypes
 import typing
 import fnmatch
+import warnings
 import multiprocessing

 from typing import (
@@ -1020,7 +1021,10 @@ def _create_completion(
         model_name: str = model if model is not None else self.model_path

         if prompt_tokens[:2] == [self.token_bos()] * 2:
-            print(f'*** WARNING: Detected duplicate leading "{self._model.token_get_text(self.token_bos())}" in prompt, this will likely reduce response quality, consider removing it...', file=sys.stderr)
+            warnings.warn(
+                f'Detected duplicate leading "{self._model.token_get_text(self.token_bos())}" in prompt, this will likely reduce response quality, consider removing it...',
+                RuntimeWarning,
+            )

         # NOTE: This likely doesn't work correctly for the first token in the prompt
         # because of the extra space added to the start of the prompt_tokens
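Patch 12 routes the duplicate-BOS notice through Python's warnings machinery instead of printing to stderr, so callers can manage it with the standard filters. A small sketch of how a downstream application might handle it (standard library only; the message text follows the patch):

    import warnings

    # Hide the duplicate-BOS warning, e.g. when the duplication is intentional.
    warnings.filterwarnings(
        "ignore",
        message="Detected duplicate leading",
        category=RuntimeWarning,
    )

    # Or escalate it to an exception while debugging a chat template:
    # warnings.filterwarnings("error", category=RuntimeWarning)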