@@ -2286,9 +2286,9 @@ def __call__(
        stream=stream,
    )

-
-@register_chat_completion_handler("chatml-function-calling")
-def chatml_function_calling(
+def base_function_calling(
+    function_calling_template,
+    end_token,
    llama: llama.Llama,
    messages: List[llama_types.ChatCompletionRequestMessage],
    functions: Optional[List[llama_types.ChatCompletionFunction]] = None,
@@ -2320,65 +2320,13 @@ def chatml_function_calling(
) -> Union[
    llama_types.CreateChatCompletionResponse,
    Iterator[llama_types.CreateChatCompletionStreamResponse],
-]:
-    print(logprobs)
-    function_calling_template = (
-        "{% for message in messages %}"
-        "<|im_start|>{{ message.role }}\n"
-        # System message
-        "{% if message.role == 'system' %}"
-        "{{ message.content }}"
-        "{% if tool_calls %}"
-        "\n\nYou have access to the following functions:\n"
-        "{% for tool in tools %}"
-        "\nfunctions.{{ tool.function.name }}:\n"
-        "{{ tool.function.parameters | tojson }}"
-        "\n{% endfor %}"
-        "\n\nYou can respond to users messages with either a single message or one or more function calls."
-        "\n\nTo respond with a message begin the message with 'message:', use the following format:"
-        "\n\nmessage:"
-        "\n<message>"
-        "\n\nTo respond with one or more function calls begin the message with 'functions.<function_name>:', use the following format:"
-        "\n\nfunctions.<function_name>:"
-        '\n{ "arg1": "value1", "arg2": "value2" }'
-        "\nfunctions.<function_name>:"
-        '\n{ "arg1": "value1", "arg2": "value2" }'
-        "{% endif %}"
-        "<|im_end|>\n"
-        "{% endif %}"
-        # User message
-        "{% if message.role == 'user' %}"
-        "{{ message.content }}"
-        "<|im_end|>\n"
-        "{% endif %}"
-        # Assistant message
-        "{% if message.role == 'assistant' %}"
-        ## Reglar message
-        "{% if message.content and message.content | length > 0 %}"
-        "{% if tool_calls %}"
-        "message:\n"
-        "{% endif %}"
-        "{{ message.content }}"
-        "<|im_end|>\n"
-        "{% endif %}"
-        ## Function calls
-        "{% if 'tool_calls' in message %}"
-        "{% for tool_call in message.tool_calls %}"
-        "functions.{{ tool_call.function.name }}:\n"
-        "{{ tool_call.function.arguments }}"
-        "{% endfor %}"
-        "<|im_end|>\n"
-        "{% endif %}"
-        "{% endif %}"
-        "{% endfor %}"
-        "{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"
-    )
+]:
+
    template_renderer = jinja2.Environment(
        loader=jinja2.BaseLoader(),
        autoescape=jinja2.select_autoescape(["html", "xml"]),
-        undefined=jinja2.StrictUndefined,
+        undefined=jinja2.DebugUndefined,
    ).from_string(function_calling_template)
-
    # Convert legacy functions to tools
    if functions is not None:
        tools = [
@@ -2403,8 +2351,7 @@ def chatml_function_calling(
                },
            }

-    stop = [stop, "<|im_end|>"] if isinstance(stop, str) else stop + ["<|im_end|>"] if stop else ["<|im_end|>"]
-
+    stop = [stop, end_token] if isinstance(stop, str) else stop + [end_token] if stop else [end_token]
    # Case 1: No tool choice by user
    if (
        tool_choice is None
@@ -2418,7 +2365,6 @@ def chatml_function_calling(
            tool_calls=None,
            add_generation_prompt=True,
        )
-
        if response_format is not None and response_format["type"] == "json_object":
            grammar = _grammar_for_response_format(response_format)
@@ -2506,14 +2452,18 @@ def chatml_function_calling(
    function_names = " | ".join(
        [f'''"functions.{tool['function']['name']}:"''' for tool in tools]
    )
+
    initial_gbnf_tool_grammar = (
        """root ::= functions | "message:"\n"""
        f"""functions ::= {function_names}\n"""
    )
+
    follow_up_gbnf_tool_grammar = (
-        """root ::= functions | "<|im_end|>"\n"""
+        f"""root ::= functions | "</done>"\n"""
        f"""functions ::= {function_names}\n"""
    )
+
+
    prompt = template_renderer.render(
        messages=messages,
        tools=tools,
@@ -2522,14 +2472,14 @@ def chatml_function_calling(
    )
    completion_or_chunks = llama.create_completion(
        prompt=prompt,
-        temperature=0,
+        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        min_p=min_p,
        typical_p=typical_p,
        stream=False,
-        stop=[":"],
-        max_tokens=None,
+        stop=stop,
+        max_tokens=max_tokens,
        presence_penalty=presence_penalty,
        frequency_penalty=frequency_penalty,
        repeat_penalty=repeat_penalty,
@@ -2555,7 +2505,7 @@ def chatml_function_calling(
                min_p=min_p,
                typical_p=typical_p,
                stream=stream,
-                stop=["<|im_end|>"],
+                stop=["</s>"],
                logprobs=top_logprobs if logprobs else None,
                max_tokens=None,
                presence_penalty=presence_penalty,
@@ -2567,15 +2517,12 @@ def chatml_function_calling(
                mirostat_eta=mirostat_eta,
                model=model,
                logits_processor=logits_processor,
-                grammar=llama_grammar.LlamaGrammar.from_string(
-                    follow_up_gbnf_tool_grammar, verbose=llama.verbose
-                ),
            ),
            stream=stream,
        )

    # One or more function calls
-    tool_name = text[len("functions.") :]
+    tool_name = text[len("functions.") :].replace(":", "")
    tool = next((tool for tool in tools if tool["function"]["name"] == tool_name), None)
    if not stream:
        completions: List[llama_types.CreateCompletionResponse] = []
@@ -2621,7 +2568,6 @@ def chatml_function_calling(
            completions_tool_name.append(tool_name)
            prompt += completion_or_chunks["choices"][0]["text"]
            prompt += "\n"
-
            response = llama.create_completion(
                prompt=prompt,
                temperature=temperature,
@@ -2644,10 +2590,14 @@ def chatml_function_calling(
                grammar=llama_grammar.LlamaGrammar.from_string(
                    follow_up_gbnf_tool_grammar, verbose=llama.verbose
                ),
+
            )
+
            response = cast(llama_types.CreateCompletionResponse, response)

-            tool_name = response["choices"][0]["text"][len("functions.") :]
+            if response["choices"][0]["text"] == "</done>":
+                break
+            tool_name = response["choices"][0]["text"][len("functions.") :].replace(":", "")
            tool = next(
                (tool for tool in tools if tool["function"]["name"] == tool_name), None
            )
@@ -2710,3 +2660,196 @@ def chatml_function_calling(
        }

    raise ValueError("Automatic streaming tool choice is not supported")
+
+
+@register_chat_completion_handler("chatml-function-calling")
+def chatml_function_calling(
+    llama: llama.Llama,
+    messages: List[llama_types.ChatCompletionRequestMessage],
+    functions: Optional[List[llama_types.ChatCompletionFunction]] = None,
+    function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None,
+    tools: Optional[List[llama_types.ChatCompletionTool]] = None,
+    tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None,
+    temperature: float = 0.2,
+    top_p: float = 0.95,
+    top_k: int = 40,
+    min_p: float = 0.05,
+    typical_p: float = 1.0,
+    stream: bool = False,
+    stop: Optional[Union[str, List[str]]] = [],
+    response_format: Optional[llama_types.ChatCompletionRequestResponseFormat] = None,
+    max_tokens: Optional[int] = None,
+    presence_penalty: float = 0.0,
+    frequency_penalty: float = 0.0,
+    repeat_penalty: float = 1.1,
+    tfs_z: float = 1.0,
+    mirostat_mode: int = 0,
+    mirostat_tau: float = 5.0,
+    mirostat_eta: float = 0.1,
+    model: Optional[str] = None,
+    logits_processor: Optional[llama.LogitsProcessorList] = None,
+    grammar: Optional[llama.LlamaGrammar] = None,
+    logprobs: Optional[bool] = None,
+    top_logprobs: Optional[int] = None,
+    **kwargs,  # type: ignore
+) -> Union[
+    llama_types.CreateChatCompletionResponse,
+    Iterator[llama_types.CreateChatCompletionStreamResponse],
+]:
+    function_calling_template = (
+        "{% for message in messages %}"
+        "<|im_start|>{{ message.role }}\n"
+        # System message
+        "{% if message.role == 'system' %}"
+        "{{ message.content }}"
+        "{% if tool_calls %}"
+        "\n\nYou have access to the following functions:\n"
+        "{% for tool in tools %}"
+        "\nfunctions.{{ tool.function.name }}:\n"
+        "{{ tool.function.parameters | tojson }}"
+        "\n{% endfor %}"
+        "\n\nYou can respond to users messages with either a single message or one or more function calls."
+        "\n\nTo respond with a message begin the message with 'message:', use the following format:"
+        "\n\nmessage:"
+        "\n<message>"
+        "\n\nTo respond with one or more function calls begin the message with 'functions.<function_name>:', use the following format:"
+        "\n\nfunctions.<function_name>:"
+        '\n{ "arg1": "value1", "arg2": "value2" };'
+        "\nfunctions.<function_name>:"
+        '\n{ "arg1": "value1", "arg2": "value2" }'
+        "\n\nWhen you are done with the function calls, end the message with </done>."
+        "{% endif %}"
+        "<|im_end|>\n"
+        "{% endif %}"
+        # User message
+        "{% if message.role == 'user' %}"
+        "{{ message.content }}"
+        "<|im_end|>\n"
+        "{% endif %}"
+        # Assistant message
+        "{% if message.role == 'assistant' %}"
+        ## Regular message
+        "{% if message.content and message.content | length > 0 %}"
+        "{% if tool_calls %}"
+        "message:\n"
+        "{% endif %}"
+        "{{ message.content }}"
+        "<|im_end|>\n"
+        "{% endif %}"
+        ## Function calls
+        "{% if tool_calls %}"
+        "{% for tool_call in message.tool_calls %}"
+        "functions.{{ tool_call.function.name }}:\n"
+        "{{ (tool_call.arguments | default('{}') | tojson) }}"
+        "{% if not loop.last %};{% endif %}"  # Semicolon separator if not the last function call
+        "{% endfor %}"
+        "<|im_end|>\n"
+        "{% endif %}"
+        "{% endif %}"
+        # Tool message (treated as Assistant response)
+        "{% if message.role == 'tool' %}"
+        "ASSISTANT:\n"
+        "Function response: {{ message.content | default('No response available') }}"
+        "<|im_end|>\n"
+        "{% endif %}"
+        "{% endfor %}"
+        "{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"
+    )
+    return base_function_calling(end_token="<|im_end|>",
+                                 **locals())
+
+@register_chat_completion_handler("vicuna-function-calling")
+def vicuna_function_calling(
+    llama: llama.Llama,
+    messages: List[llama_types.ChatCompletionRequestMessage],
+    functions: Optional[List[llama_types.ChatCompletionFunction]] = None,
+    function_call: Optional[llama_types.ChatCompletionRequestFunctionCall] = None,
+    tools: Optional[List[llama_types.ChatCompletionTool]] = None,
+    tool_choice: Optional[llama_types.ChatCompletionToolChoiceOption] = None,
+    temperature: float = 0.2,
+    top_p: float = 0.95,
+    top_k: int = 40,
+    min_p: float = 0.05,
+    typical_p: float = 1.0,
+    stream: bool = False,
+    stop: Optional[Union[str, List[str]]] = [],
+    response_format: Optional[llama_types.ChatCompletionRequestResponseFormat] = None,
+    max_tokens: Optional[int] = None,
+    presence_penalty: float = 0.0,
+    frequency_penalty: float = 0.0,
+    repeat_penalty: float = 1.1,
+    tfs_z: float = 1.0,
+    mirostat_mode: int = 0,
+    mirostat_tau: float = 5.0,
+    mirostat_eta: float = 0.1,
+    model: Optional[str] = None,
+    logits_processor: Optional[llama.LogitsProcessorList] = None,
+    grammar: Optional[llama.LlamaGrammar] = None,
+    logprobs: Optional[bool] = None,
+    top_logprobs: Optional[int] = None,
+    **kwargs,  # type: ignore
+) -> Union[
+    llama_types.CreateChatCompletionResponse,
+    Iterator[llama_types.CreateChatCompletionStreamResponse],
+]:
+    function_calling_template = (
+        "{% for message in messages %}"
+        "{{ message.role.upper() }}\n"  # Vicuna uses upper case for roles
+        # System message
+        "{% if message.role == 'system' %}"
+        "{{ message.content }}"
+        "{% if tool_calls %}"
+        "\n\nYou have access to the following functions:\n"
+        "{% for tool in tools %}"
+        "\nfunctions.{{ tool.function.name }}:\n"
+        "{{ tool.function.parameters | tojson }}"
+        "\n{% endfor %}"
+        "\n\nYou can respond to users messages with either a single message or multiple function calls."
+        "\n\nTo respond with a message begin the message with 'message:', use the following format:"
+        "\n\nmessage:"
+        "\n<message>"
+        "\n\nTo respond with one or more function calls begin the message with 'functions.<function_name>:', use the following format:"
+        "\n\nfunctions.<function_name>:"
+        '\n{ "arg1": "value1", "arg2": "value2" };'
+        "\nfunctions.<another_function_name>:"
+        '\n{ "arg1": "value3", "arg2": "value4" }'
+        "\n\nWhen you are done with the function calls, end the message with </done>."
+        "{% endif %}"
+        "</s>\n"
+        "{% endif %}"
+        # User message
+        "{% if message.role == 'user' %}"
+        "{{ message.content }}"
+        "</s>\n"
+        "{% endif %}"
+        # Assistant message
+        "{% if message.role == 'assistant' %}"
+        ## Regular message
+        "{% if message.content and message.content | length > 0 %}"
+        "{% if tool_calls %}"
+        "message:\n"
+        "{% endif %}"
+        "{{ message.content }}"
+        "</s>\n"
+        "{% endif %}"
+        ## Function calls
+        "{% if tool_calls %}"
+        "{% for tool_call in message.tool_calls %}"
+        "functions.{{ tool_call.function.name }}:\n"
+        "{{ (tool_call.arguments | default('{}') | tojson) }}"
+        "{% if not loop.last %};{% endif %}"  # Semicolon separator if not the last function call
+        "{% endfor %}"
+        "</s>\n"
+        "{% endif %}"
+        "{% endif %}"
+        # Tool message (treated as Assistant response)
+        "{% if message.role == 'tool' %}"
+        "ASSISTANT:\n"
+        "Function response: {{ message.content | default('No response available') }}"
+        "</s>\n"
+        "{% endif %}"
+        "{% endfor %}"
+        "{% if add_generation_prompt %}</s>ASSISTANT\n{% endif %}"  # Vicuna adds the role for prompt continuation
+    )
+    return base_function_calling(end_token="</s>",
+                                 **locals())
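
Usage note (not part of the diff): a minimal sketch of how the handlers registered above could be exercised through llama-cpp-python's high-level API. The model path and the weather tool are illustrative assumptions; the Llama constructor's chat_format parameter and create_chat_completion with tools/tool_choice are existing llama-cpp-python features.

    # Sketch only: assumes llama-cpp-python is installed and the handlers in this
    # diff are registered; the model file and tool schema below are hypothetical.
    from llama_cpp import Llama

    llm = Llama(
        model_path="./vicuna-7b-v1.5.Q4_K_M.gguf",   # hypothetical local GGUF file
        chat_format="vicuna-function-calling",       # handler added in this diff
    )

    response = llm.create_chat_completion(
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "What is the weather in Berlin?"},
        ],
        tools=[
            {
                "type": "function",
                "function": {
                    "name": "get_weather",           # hypothetical tool
                    "parameters": {
                        "type": "object",
                        "properties": {"city": {"type": "string"}},
                        "required": ["city"],
                    },
                },
            }
        ],
        tool_choice="auto",
    )
    print(response["choices"][0]["message"])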