From 9d053d6f736c04a56f629266399d48935e09e386 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?=
Date: Thu, 9 May 2024 20:04:06 +0200
Subject: [PATCH 01/12] Templates sometimes have BOS in them, remove duplicate

---
 llama_cpp/llama.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 4212669eb..576e5aeb0 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -1016,6 +1016,10 @@ def _create_completion(
         )
         model_name: str = model if model is not None else self.model_path

+        # User or template may have added an unwanted extra BOS
+        if prompt_tokens[:2] == [self.token_bos()] * 2:
+            del prompt_tokens[0]
+
         # NOTE: This likely doesn't work correctly for the first token in the prompt
         # because of the extra space added to the start of the prompt_tokens
         if logit_bias is not None:

From a3df77d8d2243527e5910cee4c9811274e2207df Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?=
Date: Fri, 10 May 2024 23:09:01 +0200
Subject: [PATCH 02/12] tokenize chat format prompts before completion

This is to ensure that we don't duplicate any special tokens.

Hopefully I amended the existing formats correctly?
---
 llama_cpp/llama_chat_format.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py
index 84de989a5..57d4ac178 100644
--- a/llama_cpp/llama_chat_format.py
+++ b/llama_cpp/llama_chat_format.py
@@ -547,7 +547,7 @@ def chat_completion_handler(
             tools=tools,
             tool_choice=tool_choice,
         )
-        prompt = result.prompt
+        prompt = llama.tokenize(result.prompt.encode("utf-8"), add_bos=False, special=True)
         if result.stop is not None:
             stop = [] if stop is None else [stop] if isinstance(stop, str) else stop
             rstop = result.stop if isinstance(result.stop, list) else [result.stop]
@@ -958,7 +958,7 @@ def format_alpaca(
     _sep2 = "</s>"
     system_message = _get_system_message(messages)
     _messages = _map_roles(messages, _roles)
-    _prompt = _format_add_colon_two(system_message, _messages, _sep, _sep2)
+    _prompt = "<s>" + _format_add_colon_two(system_message, _messages, _sep, _sep2)
     return ChatFormatterResponse(prompt=_prompt)
@@ -991,7 +991,7 @@ def format(
     system_message = _system_message
     _messages = _map_roles(messages, _roles)
     _messages.append((_roles["assistant"], None))
-    _prompt = _format_add_colon_two(system_message, _messages, _sep, _sep2)
+    _prompt = "<s>" + _format_add_colon_two(system_message, _messages, _sep, _sep2)
     return ChatFormatterResponse(prompt=_prompt)
@@ -1007,7 +1007,7 @@ def format_oasst_llama(
     system_message = _system_template.format(system_message=system_message)
     _messages = _map_roles(messages, _roles)
     _messages.append((_roles["assistant"], None))
-    _prompt = _format_no_colon_single(system_message, _messages, _sep)
+    _prompt = "<s>" + _format_no_colon_single(system_message, _messages, _sep)
     return ChatFormatterResponse(prompt=_prompt)
@@ -1187,7 +1187,7 @@ def format_zephyr(
     _sep = "</s>"
     _messages = _map_roles(messages, _roles)
     _messages.append((_roles["assistant"], None))
-    _prompt = _format_chatml(system_message, _messages, _sep)
+    _prompt = "<s>" + _format_chatml(system_message, _messages, _sep)
     return ChatFormatterResponse(prompt=_prompt, stop=_sep)
@@ -1262,7 +1262,7 @@ def format_chatglm3(
     _sep = "</s>"
     _messages = _map_roles(messages, _roles)
     _messages.append((_roles["assistant"], None))
-    _prompt = _format_chatglm3(system_message, _messages, _sep)
+    _prompt = "<s>" + _format_chatglm3(system_message, _messages, _sep)
     return ChatFormatterResponse(prompt=_prompt, stop=_sep)
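The problem patches 01 and 02 address is easiest to see in isolation: when a chat template already spells out the BOS token and the rendered prompt is then tokenized with add_bos=True, the token sequence starts with two BOS ids. The standalone sketch below is not part of the patches; the token ids and the toy tokenizer are made up purely to illustrate the check introduced in patch 01.

    # Toy reproduction of the duplicate-BOS issue; ids are hypothetical (1 = BOS).
    BOS = 1

    def toy_tokenize(text, add_bos):
        # Stand-in for llama.tokenize(): a template that literally contains "<s>"
        # produces its own BOS id, and add_bos=True prepends another one.
        ids = ([BOS] if text.startswith("<s>") else []) + [101, 102, 103]
        return ([BOS] + ids) if add_bos else ids

    prompt_tokens = toy_tokenize("<s>[INST] hi [/INST]", add_bos=True)
    assert prompt_tokens[:2] == [BOS] * 2  # duplicate leading BOS

    # The check from patch 01: keep only one leading BOS.
    if prompt_tokens[:2] == [BOS] * 2:
        del prompt_tokens[0]
    assert prompt_tokens.count(BOS) == 1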
From 803e8fa1c48f051aef91b0cd8bec2e191531ee9c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?=
Date: Fri, 10 May 2024 23:11:43 +0200
Subject: [PATCH 03/12] updated comment

---
 llama_cpp/llama.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 576e5aeb0..25b45c9b7 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -1016,7 +1016,7 @@ def _create_completion(
         )
         model_name: str = model if model is not None else self.model_path

-        # User or template may have added an unwanted extra BOS
+        # User may have added an unwanted extra BOS
         if prompt_tokens[:2] == [self.token_bos()] * 2:
             del prompt_tokens[0]

From ed4e56b6f7093051cc653e265e445eff9e4bc998 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?=
Date: Sat, 11 May 2024 08:30:09 +0200
Subject: [PATCH 04/12] corrected a few

---
 llama_cpp/llama_chat_format.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py
index 57d4ac178..a5899137b 100644
--- a/llama_cpp/llama_chat_format.py
+++ b/llama_cpp/llama_chat_format.py
@@ -1187,7 +1187,7 @@ def format_zephyr(
     _sep = "</s>"
     _messages = _map_roles(messages, _roles)
     _messages.append((_roles["assistant"], None))
-    _prompt = "<s>" + _format_chatml(system_message, _messages, _sep)
+    _prompt = _format_chatml(system_message, _messages, _sep)
     return ChatFormatterResponse(prompt=_prompt, stop=_sep)
@@ -1262,7 +1262,7 @@ def format_chatglm3(
     _sep = "</s>"
     _messages = _map_roles(messages, _roles)
     _messages.append((_roles["assistant"], None))
-    _prompt = "<s>" + _format_chatglm3(system_message, _messages, _sep)
+    _prompt = _format_chatglm3(system_message, _messages, _sep)
     return ChatFormatterResponse(prompt=_prompt, stop=_sep)
@@ -1280,7 +1280,7 @@ def format_openchat(
     _sep = "<|end_of_turn|>"
     _messages = _map_roles(messages, _roles)
     _messages.append((_roles["assistant"], None))
-    _prompt = _format_chatml(system_message, _messages, _sep)
+    _prompt = "<s>" + _format_chatml(system_message, _messages, _sep)
     return ChatFormatterResponse(prompt=_prompt, stop=_sep)

From 06cf25d1ed4c681163b2246e46612a402e8e841b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?=
Date: Sat, 11 May 2024 09:49:18 +0200
Subject: [PATCH 05/12] add some missing internals

---
 llama_cpp/_internals.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py
index b404601d3..043f588a6 100644
--- a/llama_cpp/_internals.py
+++ b/llama_cpp/_internals.py
@@ -142,6 +142,14 @@ def token_eos(self) -> int:
         assert self.model is not None
         return llama_cpp.llama_token_eos(self.model)

+    def token_cls(self) -> int:
+        assert self.model is not None
+        return llama_cpp.llama_token_cls(self.model)
+
+    def token_sep(self) -> int:
+        assert self.model is not None
+        return llama_cpp.llama_token_sep(self.model)
+
     def token_nl(self) -> int:
         assert self.model is not None
         return llama_cpp.llama_token_nl(self.model)
@@ -162,6 +170,14 @@ def token_eot(self) -> int:
         assert self.model is not None
         return llama_cpp.llama_token_eot(self.model)

+    def add_bos_token(self) -> int:
+        assert self.model is not None
+        return llama_cpp.llama_add_bos_token(self.model)
+
+    def add_eos_token(self) -> int:
+        assert self.model is not None
+        return llama_cpp.llama_add_eos_token(self.model)
+
     # Tokenization

     def tokenize(self, text: bytes, add_bos: bool, special: bool):
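Patch 05 exposes two pieces of model metadata through the _LlamaModel wrapper, llama_add_bos_token() and llama_add_eos_token(), which report whether the model expects BOS/EOS to be inserted automatically. Patch 06 below combines them with the new added_special flag; a condensed sketch of that decision (assuming `llama` is a loaded llama_cpp.Llama, `result` is a formatter's ChatFormatterResponse, and noting that `_model` is internal API) looks roughly like this:

    # Rough sketch of the BOS/EOS logic introduced in patch 06, not a drop-in helper.
    def build_prompt_tokens(llama, result):
        tokens = []
        # Prepend BOS only if the formatter did not add special tokens itself
        # and the model metadata says a BOS token should be inserted.
        if not result.added_special and llama._model.add_bos_token() != 0:
            tokens.append(llama.token_bos())
        # Let the tokenizer translate special tokens written in the template text.
        tokens += llama.tokenize(result.prompt.encode("utf-8"), add_bos=False, special=True)
        # Append EOS under the same conditions the patch uses (the vocab_type() == 1
        # check is taken verbatim from the diff).
        if (
            not result.added_special
            and llama._model.vocab_type() == 1
            and llama._model.add_eos_token() != 0
        ):
            tokens.append(llama.token_eos())
        return tokens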
From bb6cf4f913bcade9d0b771be6045fdb3e150468a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?=
Date: Sat, 11 May 2024 10:27:13 +0200
Subject: [PATCH 06/12] proper bos/eos detection

---
 llama_cpp/llama_chat_format.py | 29 ++++++++++++++++-------------
 1 file changed, 16 insertions(+), 13 deletions(-)

diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py
index a5899137b..307752aea 100644
--- a/llama_cpp/llama_chat_format.py
+++ b/llama_cpp/llama_chat_format.py
@@ -159,6 +159,7 @@ class ChatFormatterResponse:
     prompt: str
     stop: Optional[Union[str, List[str]]] = None
     stopping_criteria: Optional[llama.StoppingCriteriaList] = None
+    added_special: bool = False


 class ChatFormatter(Protocol):
@@ -231,7 +232,7 @@ def stop_on_last_token(
             return tokens[-1] in self.stop_token_ids
         stopping_criteria = llama.StoppingCriteriaList([stop_on_last_token])

-        return ChatFormatterResponse(prompt=prompt, stop=[self.eos_token], stopping_criteria=stopping_criteria)
+        return ChatFormatterResponse(prompt=prompt, stop=[self.eos_token], stopping_criteria=stopping_criteria, added_special=True)

     def to_chat_handler(self) -> LlamaChatCompletionHandler:
         return chat_formatter_to_chat_completion_handler(self)
@@ -547,7 +548,11 @@ def chat_completion_handler(
             tools=tools,
             tool_choice=tool_choice,
         )
-        prompt = llama.tokenize(result.prompt.encode("utf-8"), add_bos=False, special=True)
+
+        prompt = [llama.token_bos()] if not result.added_special and llama._model.add_bos_token() != 0 else []
+        prompt += llama.tokenize(result.prompt.encode("utf-8"), add_bos=False, special=True)
+        prompt += [llama.token_eos()] if not result.added_special and llama._model.vocab_type() == 1 and llama._model.add_eos_token() != 0 else []
+
         if result.stop is not None:
             stop = [] if stop is None else [stop] if isinstance(stop, str) else stop
             rstop = result.stop if isinstance(result.stop, list) else [result.stop]
@@ -654,7 +659,7 @@ def format_autotokenizer(
         prompt: str = tokenizer.apply_chat_template(messages, tokenize=False)  # type: ignore
         assert isinstance(prompt, str)
         # Return formatted prompt and eos token by default
-        return ChatFormatterResponse(prompt=prompt, stop=tokenizer.eos_token)
+        return ChatFormatterResponse(prompt=prompt, stop=tokenizer.eos_token, added_special=True)

     return format_autotokenizer
@@ -708,7 +713,7 @@ def format_tokenizer_config(
             bos_token=bos_token,
             eos_token=eos_token,
         )
-        return ChatFormatterResponse(prompt=prompt, stop=[eos_token, bos_token])
+        return ChatFormatterResponse(prompt=prompt, stop=[eos_token, bos_token], added_special=True)

     return format_tokenizer_config
@@ -918,7 +923,7 @@ def format_llama2(
     messages: List[llama_types.ChatCompletionRequestMessage],
     **kwargs: Any,
 ) -> ChatFormatterResponse:
-    _system_template = "<s>[INST] <<SYS>>\n{system_message}\n<</SYS>>"
+    _system_template = "[INST] <<SYS>>\n{system_message}\n<</SYS>>"
     _roles = dict(user="[INST]", assistant="[/INST]")
     _messages = _map_roles(messages, _roles)
     system_message = _get_system_message(messages)
@@ -940,11 +945,10 @@ def format_llama3(
         user="<|start_header_id|>user<|end_header_id|>\n\n",
         assistant="<|start_header_id|>assistant<|end_header_id|>\n\n",
     )
-    _begin_token = "<|begin_of_text|>"
     _sep = "<|eot_id|>"
     _messages = _map_roles(messages, _roles)
     _messages.append((_roles["assistant"], None))
-    _prompt = _format_no_colon_single(_begin_token, _messages, _sep)
+    _prompt = _format_no_colon_single("", _messages, _sep)
     return ChatFormatterResponse(prompt=_prompt, stop=_sep)
@@ -958,7 +962,7 @@ def format_alpaca(
     _sep2 = "</s>"
     system_message = _get_system_message(messages)
     _messages = _map_roles(messages, _roles)
-    _prompt = "<s>" + _format_add_colon_two(system_message, _messages, _sep, _sep2)
+    _prompt = _format_add_colon_two(system_message, _messages, _sep, _sep2)
     return ChatFormatterResponse(prompt=_prompt)
@@ -991,7 +995,7 @@ def format(
     system_message = _system_message
     _messages = _map_roles(messages, _roles)
     _messages.append((_roles["assistant"], None))
-    _prompt = "<s>" + _format_add_colon_two(system_message, _messages, _sep, _sep2)
+    _prompt = _format_add_colon_two(system_message, _messages, _sep, _sep2)
     return ChatFormatterResponse(prompt=_prompt)
@@ -1007,7 +1011,7 @@ def format_oasst_llama(
     system_message = _system_template.format(system_message=system_message)
     _messages = _map_roles(messages, _roles)
     _messages.append((_roles["assistant"], None))
-    _prompt = "<s>" + _format_no_colon_single(system_message, _messages, _sep)
+    _prompt = _format_no_colon_single(system_message, _messages, _sep)
     return ChatFormatterResponse(prompt=_prompt)
@@ -1229,10 +1233,9 @@ def format_mistral_instruct(
     messages: List[llama_types.ChatCompletionRequestMessage],
     **kwargs: Any,
 ) -> ChatFormatterResponse:
-    bos = "<s>"
     eos = "</s>"
     stop = eos
-    prompt = bos
+    prompt = ""
     for message in messages:
         if (
             message["role"] == "user"
@@ -1280,7 +1283,7 @@ def format_openchat(
     _sep = "<|end_of_turn|>"
     _messages = _map_roles(messages, _roles)
     _messages.append((_roles["assistant"], None))
-    _prompt = "<s>" + _format_chatml(system_message, _messages, _sep)
+    _prompt = _format_chatml(system_message, _messages, _sep)
     return ChatFormatterResponse(prompt=_prompt, stop=_sep)

From 2e26f2d4d17d615dacd5a3dc0e4bffcfc326c876 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?=
Date: Sat, 11 May 2024 10:42:37 +0200
Subject: [PATCH 07/12] just let tokenizer do the job

---
 llama_cpp/llama_chat_format.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py
index 307752aea..564f5fd45 100644
--- a/llama_cpp/llama_chat_format.py
+++ b/llama_cpp/llama_chat_format.py
@@ -548,11 +548,7 @@ def chat_completion_handler(
             tools=tools,
             tool_choice=tool_choice,
         )
-
-        prompt = [llama.token_bos()] if not result.added_special and llama._model.add_bos_token() != 0 else []
-        prompt += llama.tokenize(result.prompt.encode("utf-8"), add_bos=False, special=True)
-        prompt += [llama.token_eos()] if not result.added_special and llama._model.vocab_type() == 1 and llama._model.add_eos_token() != 0 else []
-
+        prompt += llama.tokenize(result.prompt.encode("utf-8"), add_bos=not result.added_special, special=True)
         if result.stop is not None:
             stop = [] if stop is None else [stop] if isinstance(stop, str) else stop
             rstop = result.stop if isinstance(result.stop, list) else [result.stop]
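Patches 07 and 08 drop the hand-rolled BOS/EOS assembly again and hand the whole job to the tokenizer: special=True makes it map markup such as "<|eot_id|>" to single special-token ids, and add_bos=not result.added_special only prepends BOS when the formatter did not already render it. A hedged usage sketch, assuming `llama` is a llama_cpp.Llama loaded with a model that defines <|...|>-style special tokens (for example a Llama 3 GGUF):

    text = "<|start_header_id|>user<|end_header_id|>\n\nHello<|eot_id|>"

    # special=True: the <|...|> markers become single special-token ids.
    with_special = llama.tokenize(text.encode("utf-8"), add_bos=True, special=True)

    # special=False: the same markers are split into ordinary text tokens.
    as_plain_text = llama.tokenize(text.encode("utf-8"), add_bos=True, special=False)

    # For such a model the special-aware encoding is noticeably shorter.
    print(len(with_special), len(as_plain_text))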
From aa25cd3dbb30444387f0269b183a20a3e4813fae Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?=
Date: Sat, 11 May 2024 10:44:28 +0200
Subject: [PATCH 08/12] typo--

---
 llama_cpp/llama_chat_format.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py
index 564f5fd45..252267e94 100644
--- a/llama_cpp/llama_chat_format.py
+++ b/llama_cpp/llama_chat_format.py
@@ -548,7 +548,7 @@ def chat_completion_handler(
             tools=tools,
             tool_choice=tool_choice,
         )
-        prompt += llama.tokenize(result.prompt.encode("utf-8"), add_bos=not result.added_special, special=True)
+        prompt = llama.tokenize(result.prompt.encode("utf-8"), add_bos=not result.added_special, special=True)
         if result.stop is not None:
             stop = [] if stop is None else [stop] if isinstance(stop, str) else stop
             rstop = result.stop if isinstance(result.stop, list) else [result.stop]

From aef3b1c31a2ad372338106a5a90fa24331ab1dfe Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?=
Date: Sat, 11 May 2024 10:51:34 +0200
Subject: [PATCH 09/12] align test with new response

---
 tests/test_llama_chat_format.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/test_llama_chat_format.py b/tests/test_llama_chat_format.py
index c10aee42e..f031bf72b 100644
--- a/tests/test_llama_chat_format.py
+++ b/tests/test_llama_chat_format.py
@@ -21,12 +21,13 @@ def test_mistral_instruct():
     response = llama_chat_format.format_mistral_instruct(
         messages=messages,
     )
+    prompt = ("" if response.added_special else "<s>") + response.prompt
     reference = chat_formatter.render(
         messages=messages,
         bos_token="<s>",
         eos_token="</s>",
     )
-    assert response.prompt == reference
+    assert prompt == reference


 mistral_7b_tokenizer_config = """{

From b9a1e61f2490d91620e371f98a7831d668c2bb25 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?=
Date: Wed, 22 May 2024 11:21:34 +0200
Subject: [PATCH 10/12] changed to a warning

---
 llama_cpp/llama.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 25b45c9b7..a1dc4d60c 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -1016,9 +1016,8 @@ def _create_completion(
         )
         model_name: str = model if model is not None else self.model_path

-        # User may have added an unwanted extra BOS
         if prompt_tokens[:2] == [self.token_bos()] * 2:
-            del prompt_tokens[0]
+            print(f'*** WARNING: Detected duplicate leading "{self._model.token_get_text(self.token_bos())}" in prompt, this will likely reduce response quality, consider removing it...', file=sys.stderr)

         # NOTE: This likely doesn't work correctly for the first token in the prompt
         # because of the extra space added to the start of the prompt_tokens

From a6e5917ca4009285562204887b1d020c85cc5d1a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?=
Date: Wed, 29 May 2024 20:24:42 +0200
Subject: [PATCH 11/12] move to another PR

---
 llama_cpp/_internals.py | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py
index 043f588a6..750bd965d 100644
--- a/llama_cpp/_internals.py
+++ b/llama_cpp/_internals.py
@@ -170,14 +170,6 @@ def token_eot(self) -> int:
         assert self.model is not None
         return llama_cpp.llama_token_eot(self.model)

-    def add_bos_token(self) -> int:
-        assert self.model is not None
-        return llama_cpp.llama_add_bos_token(self.model)
-
-    def add_eos_token(self) -> int:
-        assert self.model is not None
-        return llama_cpp.llama_add_eos_token(self.model)
-
     # Tokenization

     def tokenize(self, text: bytes, add_bos: bool, special: bool):
From 71805353ef3fd7f9f560ff36030f7f5336f12283 Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Tue, 4 Jun 2024 10:13:15 -0400
Subject: [PATCH 12/12] Use python warnings module

---
 llama_cpp/llama.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 3f0f5d3f9..45e1526ef 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -8,6 +8,7 @@
 import ctypes
 import typing
 import fnmatch
+import warnings
 import multiprocessing

 from typing import (
@@ -1020,7 +1021,10 @@ def _create_completion(
         model_name: str = model if model is not None else self.model_path

         if prompt_tokens[:2] == [self.token_bos()] * 2:
-            print(f'*** WARNING: Detected duplicate leading "{self._model.token_get_text(self.token_bos())}" in prompt, this will likely reduce response quality, consider removing it...', file=sys.stderr)
+            warnings.warn(
+                f'Detected duplicate leading "{self._model.token_get_text(self.token_bos())}" in prompt, this will likely reduce response quality, consider removing it...',
+                RuntimeWarning,
+            )

         # NOTE: This likely doesn't work correctly for the first token in the prompt
         # because of the extra space added to the start of the prompt_tokens
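Patch 12 routes the duplicate-BOS notice through Python's warnings machinery instead of printing to stderr, so callers can manage it with the standard filters. A small sketch of how a downstream application might handle it (standard library only; the message text follows the patch):

    import warnings

    # Hide the duplicate-BOS warning, e.g. when the duplication is intentional.
    warnings.filterwarnings(
        "ignore",
        message="Detected duplicate leading",
        category=RuntimeWarning,
    )

    # Or escalate it to an exception while debugging a chat template:
    # warnings.filterwarnings("error", category=RuntimeWarning)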