From 84d0874b7d17ebe4c28377166ddb4f9b91b05d8f Mon Sep 17 00:00:00 2001
From: jeffrey-fong
Date: Fri, 26 Apr 2024 11:16:32 +0800
Subject: [PATCH 1/4] fix completion tokens tracking, prompt forming

---
 llama_cpp/llama_chat_format.py | 66 +++++++++++++++++-----------------
 1 file changed, 34 insertions(+), 32 deletions(-)

diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py
index 17b570a0f..f11f2c631 100644
--- a/llama_cpp/llama_chat_format.py
+++ b/llama_cpp/llama_chat_format.py
@@ -1828,27 +1828,35 @@ def prepare_messages_for_inference(
         version: Literal["v1", "v2"],
         functions: Optional[List[llama_types.ChatCompletionFunctions]] = None,
         tools: Optional[List[llama_types.ChatCompletionTool]] = None,
+        tool_choice: Union[Dict, str] = "auto",
     ):
         all_messages: List[llama_types.ChatCompletionRequestMessage] = []
-        if functions is not None:
+        if tool_choice == "none":
             all_messages.append(
                 llama_types.ChatCompletionRequestSystemMessage(
-                    role="system", content=generate_schema_from_functions(functions)
+                    role="system", content=generate_schema_from_functions([])
                 )
             )
-        elif tools is not None:
-            all_messages.append(
-                llama_types.ChatCompletionRequestSystemMessage(
-                    role="system",
-                    content=generate_schema_from_functions(
-                        [
-                            tool["function"]
-                            for tool in tools
-                            if tool["type"] == "function"
-                        ]
-                    ),
+        else:
+            if functions is not None:
+                all_messages.append(
+                    llama_types.ChatCompletionRequestSystemMessage(
+                        role="system", content=generate_schema_from_functions(functions)
+                    )
+                )
+            elif tools is not None and tool_choice != "none":
+                all_messages.append(
+                    llama_types.ChatCompletionRequestSystemMessage(
+                        role="system",
+                        content=generate_schema_from_functions(
+                            [
+                                tool["function"]
+                                for tool in tools
+                                if tool["type"] == "function"
+                            ]
+                        ),
+                    )
                 )
-            )
 
         all_messages.append(
             llama_types.ChatCompletionRequestSystemMessage(
@@ -1888,7 +1896,7 @@ def prepare_messages_for_inference(
             function_call = "auto"
 
         prompt = prepare_messages_for_inference(
-            messages, tokenizer, version, functions, tools
+            messages, tokenizer, version, functions, tools, function_call
         )
 
         # If no tools/functions are provided
@@ -1985,17 +1993,12 @@ def create_completion(stop):
 
         content = ""
         function_calls, function_bodies = [], []
+        completion_tokens = 0
 
         if version == "v1":
             # If no or "auto" tool_choice/function_call
             if isinstance(function_call, str) and function_call == "auto":
                 stops = ["\n", END_ASSISTANT_TOKEN]
-            # If tool_choice/function_call is "none"
-            elif isinstance(function_call, str) and function_call == "none":
-                prompt = prepare_messages_for_inference(
-                    messages, tokenizer, version, [], []
-                )
-                stops = END_ASSISTANT_TOKEN
             # If tool_choice/function_call is provided
             elif isinstance(function_call, dict):
                 prompt += f"{START_FUNCTION_CALL_TOKEN}{function_call['name']}:\n"
@@ -2009,12 +2012,15 @@ def create_completion(stop):
 
         completion = create_completion(stop=stops)
         completion_text = completion["choices"][0]["text"]
+        completion_tokens += completion["usage"]["completion_tokens"]
+
         # If the generation does not involve a function call
         if (
             START_FUNCTION_CALL_TOKEN not in prompt
             and START_FUNCTION_CALL_TOKEN not in completion_text
         ):
+            completion["usage"]["completion_tokens"] = completion_tokens
             return _convert_completion_to_chat(completion, stream=stream)  # type: ignore
         # If the generation involves a function call in completion, generate the parameters
         elif (
@@ -2032,23 +2038,14 @@ def create_completion(stop):
             )
             grammar = get_grammar(function_calls[-1])
             completion = create_completion(stop=END_FUNCTION_CALL_TOKEN)
+            completion_tokens += completion["usage"]["completion_tokens"]
             function_bodies.append(completion["choices"][0]["text"].strip())
         # If the prompt involves a function call, just append generated parameters to function_bodies
         else:
             function_bodies.append(completion_text.strip())
     else:
-        # If tool_choice/function_call is "none"
-        if isinstance(function_call, str) and function_call == "none":
-            prompt = (
-                prepare_messages_for_inference(messages, tokenizer, version, [], [])
-                + "all\n<|content|>"
-            )
-            stops = [STOP_TOKEN, FROM_TOKEN]
-            completion = create_completion(stop=stops)
-            completion["choices"][0]["text"] = completion["choices"][0]["text"].strip()
-            return _convert_completion_to_chat(completion, stream=stream)  # type: ignore
         # If tool_choice/function_call is provided
-        elif isinstance(function_call, dict):
+        if isinstance(function_call, dict):
             prompt += f"{function_call['name']}\n{CONTENT_TOKEN}"
             function_call = function_call["name"]
             function_calls.append(function_call)
@@ -2056,6 +2053,7 @@ def create_completion(stop):
             stops = [STOP_TOKEN, FROM_TOKEN]
             completion = create_completion(stop=stops)
             completion_text = completion["choices"][0]["text"]
+            completion_tokens += completion["usage"]["completion_tokens"]
             function_bodies.append(completion_text.strip())
         # If "auto" or no tool_choice/function_call
         elif isinstance(function_call, str) and function_call == "auto":
@@ -2065,6 +2063,7 @@ def create_completion(stop):
                 stops = CONTENT_TOKEN
                 completion = create_completion(stop=stops)
                 completion_text = completion["choices"][0]["text"]
+                completion_tokens += completion["usage"]["completion_tokens"]
                 function_name = completion_text.strip()
                 if function_name == "all":
                     prompt += "all\n<|content|>"
@@ -2077,6 +2076,7 @@ def create_completion(stop):
                 stops = [RECIPIENT_TOKEN, STOP_TOKEN]
                 completion = create_completion(stop=stops)
                 completion_text = completion["choices"][0]["text"]
+                completion_tokens += completion["usage"]["completion_tokens"]
                 if function_name == "all":
                     content += completion_text.removesuffix("\n<|from|>assistant\n").removesuffix("\n<|from|> assistant\n")
                     content = content.lstrip()
@@ -2092,6 +2092,7 @@ def create_completion(stop):
                     prompt += completion_text.strip()
                     grammar = None
                     completion = create_completion(stop=stops)
+                    completion_tokens += completion["usage"]["completion_tokens"]
                     if "<|from|> assistant" in completion["choices"][0]["text"] or "<|from|>assistant" in completion["choices"][0]["text"]:
                         prompt += "\n<|from|>assistant\n<|recipient|>"
                     else:
@@ -2126,6 +2127,7 @@ def create_completion(stop):
             "arguments": tool_calls[0]["function"]["arguments"],
         }
     } if len(tool_calls) == 1 else {}
+    completion["usage"]["completion_tokens"] = completion_tokens
     return llama_types.CreateChatCompletionResponse(
         id="chat" + completion["id"],
         object="chat.completion",

From c4b1cccbf952c81d5a89ae09c0a6b70c3700237d Mon Sep 17 00:00:00 2001
From: jeffrey-fong
Date: Fri, 26 Apr 2024 12:38:58 +0800
Subject: [PATCH 2/4] fix 'function_call' and 'tool_calls' depending on 'functions' and 'tools', incompatibility with python 3.8

---
 llama_cpp/llama_chat_format.py | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py
index f11f2c631..27abdac06 100644
--- a/llama_cpp/llama_chat_format.py
+++ b/llama_cpp/llama_chat_format.py
@@ -2078,11 +2078,21 @@ def create_completion(stop):
                 completion_text = completion["choices"][0]["text"]
                 completion_tokens += completion["usage"]["completion_tokens"]
                 if function_name == "all":
-                    content += completion_text.removesuffix("\n<|from|>assistant\n").removesuffix("\n<|from|> assistant\n")
+                    if completion_text.endswith("\n<|from|>assistant\n"):
+                        content += completion_text[:-len("\n<|from|>assistant\n")]
+                    elif completion_text.endswith("\n<|from|> assistant\n"):
+                        content += completion_text[:-len("\n<|from|> assistant\n")]
+                    else:
+                        content += completion_text
                     content = content.lstrip()
                     # Check whether the model wants to generate another turn
                     if "<|from|> assistant" in completion_text or "<|from|>assistant" in completion_text:
-                        cleaned_completion_text = completion_text.removesuffix("\n<|from|>assistant\n").removesuffix("\n<|from|> assistant\n").strip()
+                        if completion_text.endswith("\n<|from|>assistant\n"):
+                            cleaned_completion_text = completion_text[:-len("\n<|from|>assistant\n")].strip()
+                        elif completion_text.endswith("\n<|from|> assistant\n"):
+                            cleaned_completion_text = completion_text[:-len("\n<|from|> assistant\n")].strip()
+                        else:
+                            cleaned_completion_text = completion_text.strip()
                         prompt += f"{cleaned_completion_text}\n<|from|>assistant\n<|recipient|>"
                     else:
                         break
@@ -2125,8 +2135,9 @@ def create_completion(stop):
         "function_call": {
             "name": tool_calls[0]["function"]["name"],
             "arguments": tool_calls[0]["function"]["arguments"],
-        }
-    } if len(tool_calls) == 1 else {}
+        } if len(tool_calls) > 0 and tools is None else None,
+        "tool_calls": tool_calls if len(tool_calls) > 0 and tools is not None else None,
+    }
     completion["usage"]["completion_tokens"] = completion_tokens
     return llama_types.CreateChatCompletionResponse(
         id="chat" + completion["id"],
         object="chat.completion",
@@ -2140,7 +2151,7 @@ def create_completion(stop):
                 "message": {
                     "role": "assistant",
                     "content": None if content == "" else content,
-                    "tool_calls": tool_calls,
+                    "tool_calls": tool_calls if tools is not None else None,
                     **function_call_dict,
                 },
                 "finish_reason": "tool_calls" if len(tool_calls) > 0 else "stop",

From 7f6a41af1d4d2f73be41ff9ae05b12821280c142 Mon Sep 17 00:00:00 2001
From: jeffrey-fong
Date: Fri, 26 Apr 2024 15:15:13 +0800
Subject: [PATCH 3/4] Updated README

---
 README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README.md b/README.md
index b5e7d2035..a33524cab 100644
--- a/README.md
+++ b/README.md
@@ -484,6 +484,8 @@ Due to discrepancies between llama.cpp and HuggingFace's tokenizers, it is requi
     tokenizer=LlamaHFTokenizer.from_pretrained("meetkai/functionary-small-v2.2-GGUF")
 )
 ```
+
+**NOTE**: There is no need to provide the default system messages used in Functionary as they are added automatically in the Functionary chat handler. Thus, the messages should contain just the chat messages and/or system messages that provide additional context for the model (e.g.: datetime, etc.).
 
 ### Multi-modal Models
 

From cbe960477d1892df6f0c5ecb417396828b0231ea Mon Sep 17 00:00:00 2001
From: jeffrey-fong
Date: Fri, 26 Apr 2024 16:01:14 +0800
Subject: [PATCH 4/4] fix for openai server compatibility

---
 llama_cpp/llama_chat_format.py | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py
index 27abdac06..71aac8061 100644
--- a/llama_cpp/llama_chat_format.py
+++ b/llama_cpp/llama_chat_format.py
@@ -2131,13 +2131,15 @@ def create_completion(stop):
         )
 
     # TODO: support stream mode
-    function_call_dict: Union[Dict[str, str], Dict[Literal["function_call"], llama_types.ChatCompletionRequestAssistantMessageFunctionCall]] = {
-        "function_call": {
-            "name": tool_calls[0]["function"]["name"],
-            "arguments": tool_calls[0]["function"]["arguments"],
-        } if len(tool_calls) > 0 and tools is None else None,
-        "tool_calls": tool_calls if len(tool_calls) > 0 and tools is not None else None,
-    }
+    function_call_dict: Union[Dict[str, str], Dict[Literal["function_call"], llama_types.ChatCompletionRequestAssistantMessageFunctionCall]] = {}
+    if len(tool_calls) > 0:
+        if tools is not None:
+            function_call_dict["tool_calls"] = tool_calls
+        else:
+            function_call_dict["function_call"] = {
+                "name": tool_calls[0]["function"]["name"],
+                "arguments": tool_calls[0]["function"]["arguments"],
+            }
     completion["usage"]["completion_tokens"] = completion_tokens
     return llama_types.CreateChatCompletionResponse(
         id="chat" + completion["id"],
@@ -2151,7 +2153,6 @@ def create_completion(stop):
                 "message": {
                     "role": "assistant",
                     "content": None if content == "" else content,
-                    "tool_calls": tool_calls if tools is not None else None,
                     **function_call_dict,
                 },
                 "finish_reason": "tool_calls" if len(tool_calls) > 0 else "stop",
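For context, below is a minimal usage sketch of the behavior this patch series touches: per-request completion-token accounting across the handler's internal generations, and `tool_calls` (when `tools` is passed) versus `function_call` (when legacy `functions` is passed) in the returned message, with no Functionary system prompts supplied by the caller. The repo id follows the README snippet above; the quantization filename, the weather tool, and the prompt are illustrative assumptions and are not part of the patches.

```python
from llama_cpp import Llama
from llama_cpp.llama_tokenizer import LlamaHFTokenizer

# Repo id follows the README example; the exact GGUF filename is an assumption.
llm = Llama.from_pretrained(
    repo_id="meetkai/functionary-small-v2.2-GGUF",
    filename="functionary-small-v2.2.q4_0.gguf",
    chat_format="functionary-v2",
    tokenizer=LlamaHFTokenizer.from_pretrained("meetkai/functionary-small-v2.2-GGUF"),
    n_ctx=4096,
)

# No Functionary system prompts in `messages` -- the chat handler injects them itself.
response = llm.create_chat_completion(
    messages=[{"role": "user", "content": "What is the weather like in Istanbul?"}],
    tools=[
        {
            "type": "function",
            "function": {
                # Hypothetical tool, for illustration only.
                "name": "get_current_weather",
                "description": "Get the current weather for a city",
                "parameters": {
                    "type": "object",
                    "properties": {"city": {"type": "string"}},
                    "required": ["city"],
                },
            },
        }
    ],
    tool_choice="auto",
)

message = response["choices"][0]["message"]
# With `tools` supplied, the patched handler reports the call under `tool_calls`;
# passing legacy `functions` instead would surface it under `function_call`.
print(message.get("tool_calls"))
# After these patches, `completion_tokens` sums every internal generation, not just the last one.
print(response["usage"]["completion_tokens"])
```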