From b85ad93ed72b58f6c0c913dddb0c5a0e8e51a21b Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Tue, 10 Sep 2024 17:02:24 -0300 Subject: [PATCH 01/12] Add a tool parser for the Llama 3.1 tool use Signed-off-by: Max de Bayser --- vllm/entrypoints/openai/cli_args.py | 2 +- vllm/entrypoints/openai/serving_chat.py | 3 + .../openai/tool_parsers/__init__.py | 3 +- .../openai/tool_parsers/llama_tool_parser.py | 105 ++++++++++++++++++ 4 files changed, 111 insertions(+), 2 deletions(-) create mode 100644 vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index 7ccee0b6b55..a7bb38811cc 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -174,7 +174,7 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: parser.add_argument( "--tool-call-parser", type=str, - choices=["mistral", "hermes"], + choices=["mistral", "hermes", "llama"], default=None, help= "Select the tool call parser depending on the model that you're using." diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index a81d2aa989a..aa14eeff47a 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -28,6 +28,7 @@ PromptAdapterPath, TextTokensPrompt) from vllm.entrypoints.openai.tool_parsers import (Hermes2ProToolParser, + LlamaToolParser, MistralToolParser, ToolParser) from vllm.inputs import TokensPrompt @@ -83,6 +84,8 @@ def __init__(self, self.tool_parser = MistralToolParser elif tool_parser == "hermes": self.tool_parser = Hermes2ProToolParser + elif tool_parser == "llama": + self.tool_parser = LlamaToolParser else: raise TypeError("Error: --enable-auto-tool-choice requires " "--tool-call-parser") diff --git a/vllm/entrypoints/openai/tool_parsers/__init__.py b/vllm/entrypoints/openai/tool_parsers/__init__.py index 5d5d53784fe..1b614eee1c2 100644 --- a/vllm/entrypoints/openai/tool_parsers/__init__.py +++ b/vllm/entrypoints/openai/tool_parsers/__init__.py @@ -1,5 +1,6 @@ from .abstract_tool_parser import ToolParser from .hermes_tool_parser import Hermes2ProToolParser from .mistral_tool_parser import MistralToolParser +from .llama_tool_parser import LlamaToolParser -__all__ = ["ToolParser", "Hermes2ProToolParser", "MistralToolParser"] \ No newline at end of file +__all__ = ["ToolParser", "Hermes2ProToolParser", "MistralToolParser", "LlamaToolParse"] diff --git a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py new file mode 100644 index 00000000000..2e76ca60f47 --- /dev/null +++ b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py @@ -0,0 +1,105 @@ +import json +from json import JSONDecoder +import re +from typing import Dict, List, Sequence, Union + +import partial_json_parser +from partial_json_parser.core.options import Allow + +from vllm.entrypoints.openai.protocol import (DeltaFunctionCall, DeltaMessage, + DeltaToolCall, + ExtractedToolCallInformation, + FunctionCall, + InitialDeltaToolCall, ToolCall) +from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( + ToolParser) +from vllm.entrypoints.openai.tool_parsers.utils import ( + extract_intermediate_diff) +from vllm.logger import init_logger +from vllm.transformers_utils.tokenizer import AnyTokenizer + +logger = init_logger(__name__) + + +class LlamaToolParser(ToolParser): + """ + Tool call parser for Llama 3.1 models intended for use with the + 
examples/tool_chat_template_llama.jinja template. + + Used when --enable-auto-tool-choice --tool-call-parser mistral are all set + """ + + def __init__(self, tokenizer: AnyTokenizer): + super().__init__(tokenizer) + + # initialize properties used for state when parsing tool calls in + # streaming mode + self.prev_tool_call_arr: List[Dict] = [] + self.current_tool_id: int = -1 + self.current_tool_name_sent: bool = False + self.current_tool_initial_sent: bool = False + self.streamed_args_for_tool: List[str] = [ + ] # map what has been streamed for each tool so far to a list + self.bot_token = "<|python_tag|>" + self.bot_token_id = self.model_tokenizer.vocab[self.bot_token] + self.tool_call_regex = re.compile(r"\[{.*?}\]", re.DOTALL) + + def extract_tool_calls(self, + model_output: str) -> ExtractedToolCallInformation: + """ + Extract the tool calls from a complete model response. Requires + find-and-replacing single quotes with double quotes for JSON parsing, + make sure your tool call arguments don't ever include quotes! + """ + + try: + # load the JSON, and then use it to build the Function and + # Tool Call + dec = JSONDecoder() + function_call_arr = [] + + # depending on the prompt format the Llama model may or may not + # prefix the output with the <|python_tag|> token + start_idx = len(self.bot_token) if model_output.startswith(self.bot_token) else 0 + while start_idx < len(model_output): + (obj, end_idx) = dec.raw_decode(model_output[start_idx:]) + start_idx += end_idx + len('; ') + function_call_arr.append(obj) + + tool_calls: List[ToolCall] = [ + ToolCall( + type="function", + function=FunctionCall( + name=raw_function_call["name"], + # function call args are JSON but as a string + arguments=json.dumps(raw_function_call["arguments"] \ + if "arguments" in raw_function_call \ + else raw_function_call["parameters"]))) + for raw_function_call in function_call_arr + ] + + # get any content before the tool call + ret = ExtractedToolCallInformation( + tools_called=True, + tool_calls=tool_calls, + content=None) + return ret + + except Exception as e: + logger.error("Error in extracting tool call from response: %s", e) + print("ERROR", e) + # return information to just treat the tool call as regular JSON + return ExtractedToolCallInformation(tools_called=False, + tool_calls=[], + content=model_output) + + def extract_tool_calls_streaming( + self, + previous_text: str, + current_text: str, + delta_text: str, + previous_token_ids: Sequence[int], + current_token_ids: Sequence[int], + delta_token_ids: Sequence[int], + ) -> Union[DeltaMessage, None]: + raise NotImplementedError("streaming tool calls not supported for Llama 3.1 yet") From 1abbe937b012e6acf8e1c092f11357ca215d37be Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Wed, 11 Sep 2024 23:27:33 -0300 Subject: [PATCH 02/12] Add streaming support for llama tool use Signed-off-by: Max de Bayser --- .../openai/tool_parsers/llama_tool_parser.py | 172 +++++++++++++++++- 1 file changed, 165 insertions(+), 7 deletions(-) diff --git a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py index 2e76ca60f47..adb7f0d382b 100644 --- a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py @@ -1,5 +1,5 @@ import json -from json import JSONDecoder +from json import JSONDecoder, JSONDecodeError import re from typing import Dict, List, Sequence, Union @@ -21,6 +21,20 @@ logger = init_logger(__name__) +# partial_json_parser 
doesn't support extra data and +# JSONDecorder.raw_decode doesn't support partial JSON +def partial_json_loads(input_str): + try: + return (partial_json_parser.loads(input_str, Allow.ALL & ~Allow.STR), + len(input_str)) + except JSONDecodeError as e: + if "Extra data" in e.msg: + dec = JSONDecoder() + return dec.raw_decode(input_str) + else: + raise + + class LlamaToolParser(ToolParser): """ Tool call parser for Llama 3.1 models intended for use with the @@ -51,6 +65,12 @@ def extract_tool_calls(self, find-and-replacing single quotes with double quotes for JSON parsing, make sure your tool call arguments don't ever include quotes! """ + # case -- if a tool call token is not present, return a text response + if not (model_output.startswith(self.bot_token) + or model_output.startswith('{')): + return ExtractedToolCallInformation(tools_called=False, + tool_calls=[], + content=model_output) try: # load the JSON, and then use it to build the Function and @@ -60,7 +80,8 @@ def extract_tool_calls(self, # depending on the prompt format the Llama model may or may not # prefix the output with the <|python_tag|> token - start_idx = len(self.bot_token) if model_output.startswith(self.bot_token) else 0 + start_idx = len(self.bot_token) if model_output.startswith( + self.bot_token) else 0 while start_idx < len(model_output): (obj, end_idx) = dec.raw_decode(model_output[start_idx:]) start_idx += end_idx + len('; ') @@ -79,10 +100,9 @@ def extract_tool_calls(self, ] # get any content before the tool call - ret = ExtractedToolCallInformation( - tools_called=True, - tool_calls=tool_calls, - content=None) + ret = ExtractedToolCallInformation(tools_called=True, + tool_calls=tool_calls, + content=None) return ret except Exception as e: @@ -102,4 +122,142 @@ def extract_tool_calls_streaming( current_token_ids: Sequence[int], delta_token_ids: Sequence[int], ) -> Union[DeltaMessage, None]: - raise NotImplementedError("streaming tool calls not supported for Llama 3.1 yet") + + if not (current_text.startswith(self.bot_token) + or current_text.startswith('{')): + return DeltaMessage(content=delta_text) + + # bit mask flags for partial JSON parsing. If the name hasn't been + # sent yet, don't allow sending + # an incomplete string since OpenAI only ever (as far as I have + # seen) allows sending the entire tool/ function name at once. + flags = Allow.ALL if self.current_tool_name_sent \ + else Allow.ALL & ~Allow.STR + try: + tool_call_arr = [] + try: + # depending on the prompt format the Llama model may or may not + # prefix the output with the <|python_tag|> token + start_idx = len(self.bot_token) if current_text.startswith( + self.bot_token) else 0 + while start_idx < len(current_text): + (obj, + end_idx) = partial_json_loads(current_text[start_idx:]) + start_idx += end_idx + len('; ') + # depending on the promt Llama can use + # either arguments or parameters + if "parameters" in obj: + assert "arguments" not in obj, "model generated both parameters and arguments" + obj["arguments"] = obj["parameters"] + tool_call_arr.append(obj) + except partial_json_parser.core.exceptions.MalformedJSON: + logger.debug('not enough tokens to parse into JSON yet') + return None + + # select as the current tool call the one we're on the state at + current_tool_call: Dict = tool_call_arr[self.current_tool_id] \ + if len(tool_call_arr) > 0 else {} + + # case -- if no tokens have been streamed for the tool, e.g. 
+ # only the array brackets, stream nothing + if len(tool_call_arr) == 0: + return None + + # case: we are starting a new tool in the array + # -> array has > 0 length AND length has moved past cursor + elif (len(tool_call_arr) > 0 + and len(tool_call_arr) > self.current_tool_id + 1): + + # if we're moving on to a new call, first make sure we + # haven't missed anything in the previous one that was + # auto-generated due to JSON completions, but wasn't + # streamed to the client yet. + if self.current_tool_id >= 0: + cur_arguments = current_tool_call.get("arguments") + if cur_arguments: + cur_args_json = json.dumps(cur_arguments) + argument_diff = cur_args_json[ + len(self.streamed_args_for_tool[self. + current_tool_id]):] + + logger.debug("got arguments diff: %s", argument_diff) + delta = DeltaMessage(tool_calls=[ + DeltaToolCall(index=self.current_tool_id, + function=DeltaFunctionCall( + arguments=argument_diff). + model_dump(exclude_none=True)) + ]) + self.streamed_args_for_tool[ + self.current_tool_id] += argument_diff + else: + delta = None + else: + delta = None + # re-set stuff pertaining to progress in the current tool + self.current_tool_id = len(tool_call_arr) - 1 + self.current_tool_name_sent = False + self.current_tool_initial_sent = False + self.streamed_args_for_tool.append("") + logger.debug("starting on new tool %d", self.current_tool_id) + return delta + + # case: update an existing tool - this is handled below + + # if the current tool initial data incl. the id, type=function + # and idx not sent, send that + if not self.current_tool_initial_sent: + self.current_tool_initial_sent = True + delta = DeltaMessage(tool_calls=[ + InitialDeltaToolCall( + index=self.current_tool_id).model_dump( + exclude_none=True) + ]) + + # if the current tool name hasn't been sent, send if available + # - otherwise send nothing + elif not self.current_tool_name_sent: + function_name = current_tool_call.get("name") + if function_name: + + delta = DeltaMessage(tool_calls=[ + DeltaToolCall(index=self.current_tool_id, + function=DeltaFunctionCall( + name=function_name).model_dump( + exclude_none=True)) + ]) + self.current_tool_name_sent = True + else: + delta = None + + # now we know we're on the same tool call and we're streaming + # arguments + else: + + cur_arguments = current_tool_call.get("arguments") + + if not cur_arguments: + delta = None + else: + cur_args_json = json.dumps(cur_arguments) + sent = len( + self.streamed_args_for_tool[self.current_tool_id]) + argument_diff = cur_args_json[sent:-1] # remove final "}" + logger.debug("got arguments diff: %s", argument_diff) + delta = DeltaMessage(tool_calls=[ + DeltaToolCall(index=self.current_tool_id, + function=DeltaFunctionCall( + arguments=argument_diff).model_dump( + exclude_none=True)) + ]) + self.streamed_args_for_tool[ + self.current_tool_id] += argument_diff + + self.prev_tool_call_arr = tool_call_arr + return delta + + except Exception as e: + logger.error("Error trying to handle streaming tool call: %s", e) + logger.debug( + "Skipping chunk as a result of tool streaming extraction " + "error") + return None From fa6ebb949afa8fbb44707df720fa2beaa0a98058 Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Wed, 11 Sep 2024 23:45:34 -0300 Subject: [PATCH 03/12] bring tool parser up to date with latest changes Signed-off-by: Max de Bayser --- .../openai/tool_parsers/__init__.py | 4 ++- .../openai/tool_parsers/llama_tool_parser.py | 25 ++++++------------- 2 files changed, 10 insertions(+), 19 deletions(-) diff --git 
a/vllm/entrypoints/openai/tool_parsers/__init__.py b/vllm/entrypoints/openai/tool_parsers/__init__.py index 1b614eee1c2..5f503d08224 100644 --- a/vllm/entrypoints/openai/tool_parsers/__init__.py +++ b/vllm/entrypoints/openai/tool_parsers/__init__.py @@ -3,4 +3,6 @@ from .mistral_tool_parser import MistralToolParser from .llama_tool_parser import LlamaToolParser -__all__ = ["ToolParser", "Hermes2ProToolParser", "MistralToolParser", "LlamaToolParse"] +__all__ = [ + "ToolParser", "Hermes2ProToolParser", "MistralToolParser", "LlamaToolParse" +] diff --git a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py index adb7f0d382b..28a49edcd30 100644 --- a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py @@ -9,14 +9,14 @@ from vllm.entrypoints.openai.protocol import (DeltaFunctionCall, DeltaMessage, DeltaToolCall, ExtractedToolCallInformation, - FunctionCall, - InitialDeltaToolCall, ToolCall) + FunctionCall, ToolCall) from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( ToolParser) from vllm.entrypoints.openai.tool_parsers.utils import ( extract_intermediate_diff) from vllm.logger import init_logger from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.utils import random_uuid logger = init_logger(__name__) @@ -51,11 +51,11 @@ def __init__(self, tokenizer: AnyTokenizer): self.prev_tool_call_arr: List[Dict] = [] self.current_tool_id: int = -1 self.current_tool_name_sent: bool = False - self.current_tool_initial_sent: bool = False self.streamed_args_for_tool: List[str] = [ ] # map what has been streamed for each tool so far to a list self.bot_token = "<|python_tag|>" - self.bot_token_id = self.model_tokenizer.vocab[self.bot_token] + self.bot_token_id = self.model_tokenizer.tokenizer.encode( + self.bot_token, add_special_tokens=False)[0] self.tool_call_regex = re.compile(r"\[{.*?}\]", re.DOTALL) def extract_tool_calls(self, @@ -196,31 +196,20 @@ def extract_tool_calls_streaming( # re-set stuff pertaining to progress in the current tool self.current_tool_id = len(tool_call_arr) - 1 self.current_tool_name_sent = False - self.current_tool_initial_sent = False self.streamed_args_for_tool.append("") logger.debug("starting on new tool %d", self.current_tool_id) return delta - # case: update an existing tool - this is handled below - - # if the current tool initial data incl. 
the id, type=function - # and idx not sent, send that - if not self.current_tool_initial_sent: - self.current_tool_initial_sent = True - delta = DeltaMessage(tool_calls=[ - InitialDeltaToolCall( - index=self.current_tool_id).model_dump( - exclude_none=True) - ]) - # if the current tool name hasn't been sent, send if available # - otherwise send nothing - elif not self.current_tool_name_sent: + if not self.current_tool_name_sent: function_name = current_tool_call.get("name") if function_name: delta = DeltaMessage(tool_calls=[ DeltaToolCall(index=self.current_tool_id, + type="function", + id=f"chatcmpl-tool-{random_uuid()}", function=DeltaFunctionCall( name=function_name).model_dump( exclude_none=True)) From be5eeabcd2e9bd8b4fbeb371058debfe333497c0 Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Thu, 12 Sep 2024 12:14:57 -0300 Subject: [PATCH 04/12] add unit tests and chat template Signed-off-by: Max de Bayser --- examples/tool_chat_template_llama.jinja | 110 ++++++++++++++++++ tests/tool_use/test_chat_completions.py | 16 +-- tests/tool_use/test_parallel_tool_calls.py | 30 +++-- tests/tool_use/test_tool_calls.py | 21 ++-- tests/tool_use/utils.py | 72 +++++++++++- .../openai/tool_parsers/llama_tool_parser.py | 2 +- 6 files changed, 227 insertions(+), 24 deletions(-) create mode 100644 examples/tool_chat_template_llama.jinja diff --git a/examples/tool_chat_template_llama.jinja b/examples/tool_chat_template_llama.jinja new file mode 100644 index 00000000000..e70a3896527 --- /dev/null +++ b/examples/tool_chat_template_llama.jinja @@ -0,0 +1,110 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- set date_string = "26 Jul 2024" %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "" %} +{%- endif %} + +{#- System message + builtin tools #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if builtin_tools is defined or tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{%- if builtin_tools is defined %} + {{- "Tools: " + builtin_tools | reject('equalto', 'code_interpreter') | join(", ") + "\n\n"}} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' 
}} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} +{%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {%- if builtin_tools is defined and tool_call.name in builtin_tools %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- "<|python_tag|>" + tool_call.name + ".call(" }} + {%- for arg_name, arg_val in tool_call.arguments | items %} + {{- arg_name + '="' + arg_val + '"' }} + {%- if not loop.last %} + {{- ", " }} + {%- endif %} + {%- endfor %} + {{- ")" }} + {%- else %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {%- endif %} + {%- if builtin_tools is defined %} + {#- This means we're in ipython mode #} + {{- "<|eom_id|>" }} + {%- else %} + {{- "<|eot_id|>" }} + {%- endif %} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} + diff --git a/tests/tool_use/test_chat_completions.py b/tests/tool_use/test_chat_completions.py index 038ff81d2b6..e54e922ee26 100644 --- a/tests/tool_use/test_chat_completions.py +++ b/tests/tool_use/test_chat_completions.py @@ -3,18 +3,19 @@ import openai import pytest -from .utils import MESSAGES_WITHOUT_TOOLS, WEATHER_TOOL +from .utils import MESSAGES_WITHOUT_TOOLS, WEATHER_TOOL, ServerConfig, adapt_prompt_to_model # test: make sure chat completions without tools provided work even when tools # are enabled. This makes sure tool call chat templates work, AND that the tool # parser stream processing doesn't change the output of the model. 
@pytest.mark.asyncio -async def test_chat_completion_without_tools(client: openai.AsyncOpenAI): +async def test_chat_completion_without_tools(client: openai.AsyncOpenAI, + server_config: ServerConfig): models = await client.models.list() model_name: str = models.data[0].id chat_completion = await client.chat.completions.create( - messages=MESSAGES_WITHOUT_TOOLS, + messages=adapt_prompt_to_model(MESSAGES_WITHOUT_TOOLS, server_config), temperature=0, max_tokens=150, model=model_name, @@ -34,7 +35,7 @@ async def test_chat_completion_without_tools(client: openai.AsyncOpenAI): # make the same request, streaming stream = await client.chat.completions.create( - messages=MESSAGES_WITHOUT_TOOLS, + messages=adapt_prompt_to_model(MESSAGES_WITHOUT_TOOLS, server_config), temperature=0, max_tokens=150, model=model_name, @@ -77,11 +78,12 @@ async def test_chat_completion_without_tools(client: openai.AsyncOpenAI): # tools, to make sure we can still get normal chat completion responses # and that they won't be parsed as tools @pytest.mark.asyncio -async def test_chat_completion_with_tools(client: openai.AsyncOpenAI): +async def test_chat_completion_with_tools(client: openai.AsyncOpenAI, + server_config: ServerConfig): models = await client.models.list() model_name: str = models.data[0].id chat_completion = await client.chat.completions.create( - messages=MESSAGES_WITHOUT_TOOLS, + messages=adapt_prompt_to_model(MESSAGES_WITHOUT_TOOLS, server_config), temperature=0, max_tokens=150, model=model_name, @@ -102,7 +104,7 @@ async def test_chat_completion_with_tools(client: openai.AsyncOpenAI): # make the same request, streaming stream = await client.chat.completions.create( - messages=MESSAGES_WITHOUT_TOOLS, + messages=adapt_prompt_to_model(MESSAGES_WITHOUT_TOOLS, server_config), temperature=0, max_tokens=150, model=model_name, diff --git a/tests/tool_use/test_parallel_tool_calls.py b/tests/tool_use/test_parallel_tool_calls.py index b03b5a2075a..51d33781639 100644 --- a/tests/tool_use/test_parallel_tool_calls.py +++ b/tests/tool_use/test_parallel_tool_calls.py @@ -6,7 +6,7 @@ from .utils import (MESSAGES_ASKING_FOR_PARALLEL_TOOLS, MESSAGES_WITH_PARALLEL_TOOL_RESPONSE, SEARCH_TOOL, - WEATHER_TOOL) + WEATHER_TOOL, ServerConfig, adapt_prompt_to_model) # test: getting the model to generate parallel tool calls (streaming/not) @@ -14,11 +14,18 @@ # may be added in the future. e.g. llama 3.1 models are not designed to support # parallel tool calls. 
@pytest.mark.asyncio -async def test_parallel_tool_calls(client: openai.AsyncOpenAI): +async def test_parallel_tool_calls(client: openai.AsyncOpenAI, + server_config: ServerConfig): + + if not server_config.get("supports_parallel", True): + pytest.skip("The {} model doesn't support parallel tool calls".format( + server_config["model"])) + models = await client.models.list() model_name: str = models.data[0].id chat_completion = await client.chat.completions.create( - messages=MESSAGES_ASKING_FOR_PARALLEL_TOOLS, + messages=adapt_prompt_to_model(MESSAGES_ASKING_FOR_PARALLEL_TOOLS, + server_config), temperature=0, max_tokens=200, model=model_name, @@ -55,7 +62,8 @@ async def test_parallel_tool_calls(client: openai.AsyncOpenAI): # make the same request, streaming stream = await client.chat.completions.create( model=model_name, - messages=MESSAGES_ASKING_FOR_PARALLEL_TOOLS, + messages=adapt_prompt_to_model(MESSAGES_ASKING_FOR_PARALLEL_TOOLS, + server_config), temperature=0, max_tokens=200, tools=[WEATHER_TOOL, SEARCH_TOOL], @@ -136,11 +144,18 @@ async def test_parallel_tool_calls(client: openai.AsyncOpenAI): # test: providing parallel tool calls back to the model to get a response # (streaming/not) @pytest.mark.asyncio -async def test_parallel_tool_calls_with_results(client: openai.AsyncOpenAI): +async def test_parallel_tool_calls_with_results(client: openai.AsyncOpenAI, + server_config: ServerConfig): + + if not server_config.get("supports_parallel", True): + pytest.skip("The {} model doesn't support parallel tool calls".format( + server_config["model"])) + models = await client.models.list() model_name: str = models.data[0].id chat_completion = await client.chat.completions.create( - messages=MESSAGES_WITH_PARALLEL_TOOL_RESPONSE, + messages=adapt_prompt_to_model(MESSAGES_WITH_PARALLEL_TOOL_RESPONSE, + server_config), temperature=0, max_tokens=200, model=model_name, @@ -158,7 +173,8 @@ async def test_parallel_tool_calls_with_results(client: openai.AsyncOpenAI): assert "78" in choice.message.content # Orlando temp in tool response stream = await client.chat.completions.create( - messages=MESSAGES_WITH_PARALLEL_TOOL_RESPONSE, + messages=adapt_prompt_to_model(MESSAGES_WITH_PARALLEL_TOOL_RESPONSE, + server_config), temperature=0, max_tokens=200, model=model_name, diff --git a/tests/tool_use/test_tool_calls.py b/tests/tool_use/test_tool_calls.py index c3abe9e1f50..e2c4bb89999 100644 --- a/tests/tool_use/test_tool_calls.py +++ b/tests/tool_use/test_tool_calls.py @@ -5,17 +5,20 @@ import pytest from .utils import (MESSAGES_ASKING_FOR_TOOLS, MESSAGES_WITH_TOOL_RESPONSE, - SEARCH_TOOL, WEATHER_TOOL) + SEARCH_TOOL, WEATHER_TOOL, ServerConfig, + adapt_prompt_to_model) # test: request a chat completion that should return tool calls, so we know they # are parsable @pytest.mark.asyncio -async def test_tool_call_and_choice(client: openai.AsyncOpenAI): +async def test_tool_call_and_choice(client: openai.AsyncOpenAI, + server_config: ServerConfig): models = await client.models.list() model_name: str = models.data[0].id chat_completion = await client.chat.completions.create( - messages=MESSAGES_ASKING_FOR_TOOLS, + messages=adapt_prompt_to_model(MESSAGES_ASKING_FOR_TOOLS, + server_config), temperature=0, max_tokens=100, model=model_name, @@ -59,7 +62,8 @@ async def test_tool_call_and_choice(client: openai.AsyncOpenAI): # make the same request, streaming stream = await client.chat.completions.create( model=model_name, - messages=MESSAGES_ASKING_FOR_TOOLS, + messages=adapt_prompt_to_model(MESSAGES_ASKING_FOR_TOOLS, 
+ server_config), temperature=0, max_tokens=100, tools=[WEATHER_TOOL, SEARCH_TOOL], @@ -136,11 +140,13 @@ async def test_tool_call_and_choice(client: openai.AsyncOpenAI): # test: providing tools and results back to model to get a non-tool response # (streaming/not) @pytest.mark.asyncio -async def test_tool_call_with_results(client: openai.AsyncOpenAI): +async def test_tool_call_with_results(client: openai.AsyncOpenAI, + server_config: ServerConfig): models = await client.models.list() model_name: str = models.data[0].id chat_completion = await client.chat.completions.create( - messages=MESSAGES_WITH_TOOL_RESPONSE, + messages=adapt_prompt_to_model(MESSAGES_WITH_TOOL_RESPONSE, + server_config), temperature=0, max_tokens=100, model=model_name, @@ -157,7 +163,8 @@ async def test_tool_call_with_results(client: openai.AsyncOpenAI): assert "98" in choice.message.content # the temperature from the response stream = await client.chat.completions.create( - messages=MESSAGES_WITH_TOOL_RESPONSE, + messages=adapt_prompt_to_model(MESSAGES_WITH_TOOL_RESPONSE, + server_config), temperature=0, max_tokens=100, model=model_name, diff --git a/tests/tool_use/utils.py b/tests/tool_use/utils.py index e447469e334..d35aeb75f4f 100644 --- a/tests/tool_use/utils.py +++ b/tests/tool_use/utils.py @@ -1,15 +1,64 @@ -from typing import Dict, List +from typing import Any, Callable, Dict, List, Optional from openai.types.chat import (ChatCompletionMessageParam, ChatCompletionToolParam) from typing_extensions import TypedDict from tests.utils import VLLM_PATH +import json +from copy import deepcopy -class ServerConfig(TypedDict): +class ServerConfig(TypedDict, total=False): model: str arguments: List[str] + system_prompt: Optional[str] + supports_parallel: Optional[bool] + format_tool_output: Optional[Callable[[str], str]] + + +def format_llama_tool_output(output: str) -> str: + return json.dumps({"output": output}) + + +def format_tool_output_id(output: str) -> str: + return output + + +def patch_tool_output(messages: List[Dict[str, Any]], + config: ServerConfig) -> List[Dict[str, Any]]: + fmt_fun = config.get("format_tool_output") + if not fmt_fun: + return messages + new_messages = deepcopy(messages) + for message in new_messages: + if message["role"] == "tool": + message["content"] = fmt_fun(message["content"]) + return new_messages + + +def patch_system_prompt(messages: List[Dict[str, Any]], + system_prompt: str) -> List[Dict[str, Any]]: + new_messages = deepcopy(messages) + if new_messages[0]["role"] == "system": + new_messages[0]["content"] = system_prompt + else: + new_messages.insert(0, {"role": "system", "content": system_prompt}) + return new_messages + + +def ensure_system_prompt(messages: List[Dict[str, Any]], + config: ServerConfig) -> List[Dict[str, Any]]: + prompt = config.get("system_prompt") + if prompt: + return patch_system_prompt(messages, prompt) + else: + return messages + + +def adapt_prompt_to_model(messages: List[Dict[str, Any]], + config: ServerConfig) -> List[Dict[str, Any]]: + return ensure_system_prompt(patch_tool_output(messages, config), config) # universal args for all models go here. 
also good if you need to test locally @@ -25,6 +74,25 @@ class ServerConfig(TypedDict): str(VLLM_PATH / "examples/tool_chat_template_hermes.jinja") ] }, + "llama": { + "model": + "meta-llama/Meta-Llama-3.1-8B-Instruct", + "arguments": [ + "--tool-call-parser", "llama", "--chat-template", + str(VLLM_PATH / "examples/tool_chat_template_llama.jinja") + ], + "system_prompt": + "You are a helpful assistant with tool calling capabilities. " + "Only reply with a tool call if the function exists in the " + "library provided by the user. If it doesn't exist, just " + "reply directly in natural language. When you receive a tool " + "call response, use the output to format an answer to the " + "original user question.", + "supports_parallel": + False, + "format_tool_output": + format_llama_tool_output + }, "mistral": { "model": "mistralai/Mistral-7B-Instruct-v0.3", diff --git a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py index 28a49edcd30..5b51d854633 100644 --- a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py @@ -144,7 +144,7 @@ def extract_tool_calls_streaming( (obj, end_idx) = partial_json_loads(current_text[start_idx:]) start_idx += end_idx + len('; ') - # depending on the promt Llama can use + # depending on the prompt Llama can use # either arguments or parameters if "parameters" in obj: assert "arguments" not in obj, "model generated both parameters and arguments" From d8d6de456048b4c4dd938ef55c3c0e3f7b3886eb Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Thu, 12 Sep 2024 14:01:44 -0300 Subject: [PATCH 05/12] Improve streaming logic to support partial results Signed-off-by: Max de Bayser --- tests/tool_use/test_chat_completions.py | 3 +- tests/tool_use/utils.py | 4 +- .../openai/tool_parsers/__init__.py | 5 +- .../openai/tool_parsers/llama_tool_parser.py | 77 ++++++++++++------- 4 files changed, 57 insertions(+), 32 deletions(-) diff --git a/tests/tool_use/test_chat_completions.py b/tests/tool_use/test_chat_completions.py index e54e922ee26..41fe6a87c1a 100644 --- a/tests/tool_use/test_chat_completions.py +++ b/tests/tool_use/test_chat_completions.py @@ -3,7 +3,8 @@ import openai import pytest -from .utils import MESSAGES_WITHOUT_TOOLS, WEATHER_TOOL, ServerConfig, adapt_prompt_to_model +from .utils import (MESSAGES_WITHOUT_TOOLS, WEATHER_TOOL, ServerConfig, + adapt_prompt_to_model) # test: make sure chat completions without tools provided work even when tools diff --git a/tests/tool_use/utils.py b/tests/tool_use/utils.py index d35aeb75f4f..20b86fab9e2 100644 --- a/tests/tool_use/utils.py +++ b/tests/tool_use/utils.py @@ -1,3 +1,5 @@ +import json +from copy import deepcopy from typing import Any, Callable, Dict, List, Optional from openai.types.chat import (ChatCompletionMessageParam, @@ -5,8 +7,6 @@ from typing_extensions import TypedDict from tests.utils import VLLM_PATH -import json -from copy import deepcopy class ServerConfig(TypedDict, total=False): diff --git a/vllm/entrypoints/openai/tool_parsers/__init__.py b/vllm/entrypoints/openai/tool_parsers/__init__.py index 5f503d08224..393b721c327 100644 --- a/vllm/entrypoints/openai/tool_parsers/__init__.py +++ b/vllm/entrypoints/openai/tool_parsers/__init__.py @@ -1,8 +1,9 @@ from .abstract_tool_parser import ToolParser from .hermes_tool_parser import Hermes2ProToolParser -from .mistral_tool_parser import MistralToolParser from .llama_tool_parser import LlamaToolParser +from 
.mistral_tool_parser import MistralToolParser __all__ = [ - "ToolParser", "Hermes2ProToolParser", "MistralToolParser", "LlamaToolParse" + "ToolParser", "Hermes2ProToolParser", "MistralToolParser", + "LlamaToolParser" ] diff --git a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py index 5b51d854633..96798cc3053 100644 --- a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py @@ -1,6 +1,6 @@ import json -from json import JSONDecoder, JSONDecodeError import re +from json import JSONDecodeError, JSONDecoder from typing import Dict, List, Sequence, Union import partial_json_parser @@ -12,8 +12,7 @@ FunctionCall, ToolCall) from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( ToolParser) -from vllm.entrypoints.openai.tool_parsers.utils import ( - extract_intermediate_diff) +from vllm.entrypoints.openai.tool_parsers.utils import find_common_prefix from vllm.logger import init_logger from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.utils import random_uuid @@ -23,10 +22,9 @@ # partial_json_parser doesn't support extra data and # JSONDecorder.raw_decode doesn't support partial JSON -def partial_json_loads(input_str): +def partial_json_loads(input_str, flags): try: - return (partial_json_parser.loads(input_str, Allow.ALL & ~Allow.STR), - len(input_str)) + return (partial_json_parser.loads(input_str, flags), len(input_str)) except JSONDecodeError as e: if "Extra data" in e.msg: dec = JSONDecoder() @@ -35,6 +33,14 @@ def partial_json_loads(input_str): raise +def is_complete_json(input_str): + try: + json.loads(input_str) + return True + except JSONDecodeError: + return False + + class LlamaToolParser(ToolParser): """ Tool call parser for Llama 3.1 models intended for use with the @@ -135,6 +141,7 @@ def extract_tool_calls_streaming( else Allow.ALL & ~Allow.STR try: tool_call_arr = [] + is_complete = [] try: # depending on the prompt format the Llama model may or may not # prefix the output with the <|python_tag|> token @@ -142,12 +149,17 @@ def extract_tool_calls_streaming( self.bot_token) else 0 while start_idx < len(current_text): (obj, - end_idx) = partial_json_loads(current_text[start_idx:]) + end_idx) = partial_json_loads(current_text[start_idx:], + flags) + is_complete.append( + is_complete_json(current_text[start_idx:start_idx + + end_idx])) start_idx += end_idx + len('; ') # depending on the prompt Llama can use # either arguments or parameters if "parameters" in obj: - assert "arguments" not in obj, "model generated both parameters and arguments" + assert "arguments" not in obj, \ + "model generated both parameters and arguments" obj["arguments"] = obj["parameters"] tool_call_arr.append(obj) except partial_json_parser.core.exceptions.MalformedJSON: @@ -176,9 +188,9 @@ def extract_tool_calls_streaming( cur_arguments = current_tool_call.get("arguments") if cur_arguments: cur_args_json = json.dumps(cur_arguments) - argument_diff = cur_args_json[ - len(self.streamed_args_for_tool[self. 
- current_tool_id]):] + sent = len( + self.streamed_args_for_tool[self.current_tool_id]) + argument_diff = cur_args_json[sent:] logger.debug("got arguments diff: %s", argument_diff) delta = DeltaMessage(tool_calls=[ @@ -202,7 +214,7 @@ def extract_tool_calls_streaming( # if the current tool name hasn't been sent, send if available # - otherwise send nothing - if not self.current_tool_name_sent: + elif not self.current_tool_name_sent: function_name = current_tool_call.get("name") if function_name: @@ -221,25 +233,36 @@ def extract_tool_calls_streaming( # now we know we're on the same tool call and we're streaming # arguments else: - cur_arguments = current_tool_call.get("arguments") + delta = None - if not cur_arguments: - delta = None - else: - cur_args_json = json.dumps(cur_arguments) + if cur_arguments: sent = len( self.streamed_args_for_tool[self.current_tool_id]) - argument_diff = cur_args_json[sent:-1] # remove final "}" - logger.debug("got arguments diff: %s", argument_diff) - delta = DeltaMessage(tool_calls=[ - DeltaToolCall(index=self.current_tool_id, - function=DeltaFunctionCall( - arguments=argument_diff).model_dump( - exclude_none=True)) - ]) - self.streamed_args_for_tool[ - self.current_tool_id] += argument_diff + cur_args_json = json.dumps(cur_arguments) + prev_arguments = self.prev_tool_call_arr[ + self.current_tool_id].get("arguments") + + argument_diff = None + if is_complete[self.current_tool_id]: + argument_diff = cur_args_json[sent:] + elif prev_arguments: + prev_args_json = json.dumps(prev_arguments) + if cur_args_json != prev_args_json: + + prefix = find_common_prefix( + prev_args_json, cur_args_json) + argument_diff = prefix[sent:] + + if argument_diff is not None: + delta = DeltaMessage(tool_calls=[ + DeltaToolCall(index=self.current_tool_id, + function=DeltaFunctionCall( + arguments=argument_diff). 
+ model_dump(exclude_none=True)) + ]) + self.streamed_args_for_tool[ + self.current_tool_id] += argument_diff self.prev_tool_call_arr = tool_call_arr return delta From 37336f31b51b8b4114eca193783fdf3c71837b00 Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Thu, 12 Sep 2024 16:16:12 -0300 Subject: [PATCH 06/12] address review comments Signed-off-by: Max de Bayser --- examples/tool_chat_template_llama.jinja | 40 +++++-------------- vllm/entrypoints/openai/cli_args.py | 2 +- vllm/entrypoints/openai/serving_chat.py | 6 +-- .../openai/tool_parsers/__init__.py | 4 +- .../openai/tool_parsers/llama_tool_parser.py | 6 +-- 5 files changed, 18 insertions(+), 40 deletions(-) diff --git a/examples/tool_chat_template_llama.jinja b/examples/tool_chat_template_llama.jinja index e70a3896527..838e5e12e72 100644 --- a/examples/tool_chat_template_llama.jinja +++ b/examples/tool_chat_template_llama.jinja @@ -3,7 +3,7 @@ {%- set tools = custom_tools %} {%- endif %} {%- if not tools_in_user_message is defined %} - {%- set tools_in_user_message = true %} + {%- set tools_in_user_message = false %} {%- endif %} {%- if not date_string is defined %} {%- set date_string = "26 Jul 2024" %} @@ -22,14 +22,11 @@ {#- System message + builtin tools #} {{- "<|start_header_id|>system<|end_header_id|>\n\n" }} -{%- if builtin_tools is defined or tools is not none %} +{%- if tools is not none %} {{- "Environment: ipython\n" }} {%- endif %} -{%- if builtin_tools is defined %} - {{- "Tools: " + builtin_tools | reject('equalto', 'code_interpreter') | join(", ") + "\n\n"}} -{%- endif %} -{{- "Cutting Knowledge Date: December 2023\n" }} -{{- "Today Date: " + date_string + "\n\n" }} +{{- "Knowledge Cutoff Date: December 2023\n" }} +{{- "Today's Date: " + date_string + "\n\n" }} {%- if tools is not none and not tools_in_user_message %} {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' 
}} @@ -71,29 +68,12 @@ {{- raise_exception("This model only supports single tool-calls at once!") }} {%- endif %} {%- set tool_call = message.tool_calls[0].function %} - {%- if builtin_tools is defined and tool_call.name in builtin_tools %} - {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} - {{- "<|python_tag|>" + tool_call.name + ".call(" }} - {%- for arg_name, arg_val in tool_call.arguments | items %} - {{- arg_name + '="' + arg_val + '"' }} - {%- if not loop.last %} - {{- ", " }} - {%- endif %} - {%- endfor %} - {{- ")" }} - {%- else %} - {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} - {{- '{"name": "' + tool_call.name + '", ' }} - {{- '"parameters": ' }} - {{- tool_call.arguments | tojson }} - {{- "}" }} - {%- endif %} - {%- if builtin_tools is defined %} - {#- This means we're in ipython mode #} - {{- "<|eom_id|>" }} - {%- else %} - {{- "<|eot_id|>" }} - {%- endif %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {{- "<|eot_id|>" }} {%- elif message.role == "tool" or message.role == "ipython" %} {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} {%- if message.content is mapping or message.content is iterable %} diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index a7bb38811cc..0ae1e3ed860 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -174,7 +174,7 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: parser.add_argument( "--tool-call-parser", type=str, - choices=["mistral", "hermes", "llama"], + choices=["mistral", "hermes", "llama3_json"], default=None, help= "Select the tool call parser depending on the model that you're using." 
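For readers following the parser changes, a minimal standalone sketch (not part of the patch) of the non-streaming extraction path added in llama_tool_parser.py: it strips the optional <|python_tag|> prefix, then raw-decodes JSON objects separated by "; ", accepting either "arguments" or "parameters" for the argument dict. The sample output string and tool name are hypothetical, and error handling is omitted.

# Sketch of the extract_tool_calls() decoding loop (illustrative only).
import json
from json import JSONDecoder

BOT_TOKEN = "<|python_tag|>"

def sketch_extract(model_output: str):
    dec = JSONDecoder()
    calls = []
    # the prompt format determines whether Llama prefixes output with <|python_tag|>
    start = len(BOT_TOKEN) if model_output.startswith(BOT_TOKEN) else 0
    while start < len(model_output):
        obj, end = dec.raw_decode(model_output[start:])
        start += end + len("; ")  # multiple calls are separated by "; "
        # the model may emit either "arguments" or "parameters"
        args = obj.get("arguments", obj.get("parameters"))
        calls.append((obj["name"], json.dumps(args)))
    return calls

# sketch_extract('{"name": "get_current_weather", "parameters": {"city": "Dallas"}}')
# -> [('get_current_weather', '{"city": "Dallas"}')]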
diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index d5f0d4a8bf5..49509e629f8 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -28,7 +28,7 @@ PromptAdapterPath, TextTokensPrompt) from vllm.entrypoints.openai.tool_parsers import (Hermes2ProToolParser, - LlamaToolParser, + Llama3JsonToolParser, MistralToolParser, ToolParser) from vllm.inputs import TokensPrompt @@ -84,8 +84,8 @@ def __init__(self, self.tool_parser = MistralToolParser elif tool_parser == "hermes": self.tool_parser = Hermes2ProToolParser - elif tool_parser == "llama": - self.tool_parser = LlamaToolParser + elif tool_parser == "llama3_json": + self.tool_parser = Llama3JsonToolParser else: raise TypeError("Error: --enable-auto-tool-choice requires " "--tool-call-parser") diff --git a/vllm/entrypoints/openai/tool_parsers/__init__.py b/vllm/entrypoints/openai/tool_parsers/__init__.py index 393b721c327..0069a2b8044 100644 --- a/vllm/entrypoints/openai/tool_parsers/__init__.py +++ b/vllm/entrypoints/openai/tool_parsers/__init__.py @@ -1,9 +1,9 @@ from .abstract_tool_parser import ToolParser from .hermes_tool_parser import Hermes2ProToolParser -from .llama_tool_parser import LlamaToolParser +from .llama_tool_parser import Llama3JsonToolParser from .mistral_tool_parser import MistralToolParser __all__ = [ "ToolParser", "Hermes2ProToolParser", "MistralToolParser", - "LlamaToolParser" + "Llama3JsonToolParser" ] diff --git a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py index 96798cc3053..15579ded18a 100644 --- a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py @@ -41,7 +41,7 @@ def is_complete_json(input_str): return False -class LlamaToolParser(ToolParser): +class Llama3JsonToolParser(ToolParser): """ Tool call parser for Llama 3.1 models intended for use with the examples/tool_chat_template_llama.jinja template. @@ -67,9 +67,7 @@ def __init__(self, tokenizer: AnyTokenizer): def extract_tool_calls(self, model_output: str) -> ExtractedToolCallInformation: """ - Extract the tool calls from a complete model response. Requires - find-and-replacing single quotes with double quotes for JSON parsing, - make sure your tool call arguments don't ever include quotes! + Extract the tool calls from a complete model response. """ # case -- if a tool call token is not present, return a text response if not (model_output.startswith(self.bot_token) From e7d34dc01dca6911f8d0ed77995e5dfe700d1246 Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Fri, 13 Sep 2024 18:07:32 -0300 Subject: [PATCH 07/12] Add documentation about the llama 3 tool use Signed-off-by: Max de Bayser --- .../serving/openai_compatible_server.md | 26 +++++++++++++++++-- ...a => tool_chat_template_llama3_json.jinja} | 0 2 files changed, 24 insertions(+), 2 deletions(-) rename examples/{tool_chat_template_llama.jinja => tool_chat_template_llama3_json.jinja} (100%) diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md index eb4ea0fb565..e0eba7f09bd 100644 --- a/docs/source/serving/openai_compatible_server.md +++ b/docs/source/serving/openai_compatible_server.md @@ -157,10 +157,10 @@ vLLM will use guided decoding to ensure the response matches the tool parameter To enable this feature, you should set the following flags: * `--enable-auto-tool-choice` -- **mandatory** Auto tool choice. 
tells vLLM that you want to enable the model to generate its own tool calls when it deems appropriate. -* `--tool-call-parser` -- select the tool parser to use - currently either `hermes` or `mistral`. Additional tool parsers +* `--tool-call-parser` -- select the tool parser to use - currently either `hermes`, `mistral` or `llama3_json`. Additional tool parsers will continue to be added in the future. * `--chat-template` -- **optional** for auto tool choice. the path to the chat template which handles `tool`-role messages and `assistant`-role messages -that contain previously generated tool calls. Hermes and Mistral models have tool-compatible chat templates in their +that contain previously generated tool calls. Hermes, Mistral and Llama models have tool-compatible chat templates in their `tokenizer_config.json` files, but you can specify a custom template. This argument can be set to `tool_use` if your model has a tool use-specific chat template configured in the `tokenizer_config.json`. In this case, it will be used per the `transformers` specification. More on this [here](https://huggingface.co/docs/transformers/en/chat_templating#why-do-some-models-have-multiple-templates) from HuggingFace; and you can find an example of this in a `tokenizer_config.json` [here](https://huggingface.co/NousResearch/Hermes-2-Pro-Llama-3-8B/blob/main/tokenizer_config.json) @@ -197,3 +197,25 @@ when tools are provided, that results in much better reliability when working wi Recommended flags: `--tool-call-parser mistral --chat-template examples/tool_chat_template_mistral_parallel.jinja` + +#### Llama Models +Supported models: +* `meta-llama/Meta-Llama-3.1-8B-Instruct` +* `meta-llama/Meta-Llama-3.1-70B-Instruct` +* `meta-llama/Meta-Llama-3.1-405B-Instruct` +* `meta-llama/Meta-Llama-3.1-405B-Instruct-FP8` + +The tool calling that is supported is the [JSON based tool calling](https://llama.meta.com/docs/model-cards-and-prompt-formats/llama3_1/#json-based-tool-calling). +Other tool calling formats like the built in python tool calling or custom tool calling are not supported. + +Known issues: +1. Parallel tool calls are not supported. +2. The model can generate parameters with a wrong format, such as generating + an array serialized as string instead of an array. + +The `tool_chat_template_llama3_json.jinja` file contains the "official" Llama chat template, but tweaked so that +it works better with vLLM. 
+ +Recommended flags: `--tool-call-parser llama3_json --chat-template examples/tool_chat_template_llama3_json.jinja` + + diff --git a/examples/tool_chat_template_llama.jinja b/examples/tool_chat_template_llama3_json.jinja similarity index 100% rename from examples/tool_chat_template_llama.jinja rename to examples/tool_chat_template_llama3_json.jinja From 3bb941a435f4c2ebb3b240c553c7aed4dda62b16 Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Tue, 24 Sep 2024 21:41:33 -0300 Subject: [PATCH 08/12] Move more logic to the chat template Signed-off-by: Max de Bayser --- examples/tool_chat_template_llama3_json.jinja | 12 ++-- tests/tool_use/test_chat_completions.py | 10 +-- tests/tool_use/test_parallel_tool_calls.py | 14 ++-- tests/tool_use/test_tool_calls.py | 21 ++---- tests/tool_use/utils.py | 67 +++++-------------- .../openai/tool_parsers/llama_tool_parser.py | 2 +- 6 files changed, 41 insertions(+), 85 deletions(-) diff --git a/examples/tool_chat_template_llama3_json.jinja b/examples/tool_chat_template_llama3_json.jinja index 838e5e12e72..d5f4206422c 100644 --- a/examples/tool_chat_template_llama3_json.jinja +++ b/examples/tool_chat_template_llama3_json.jinja @@ -3,7 +3,7 @@ {%- set tools = custom_tools %} {%- endif %} {%- if not tools_in_user_message is defined %} - {%- set tools_in_user_message = false %} + {%- set tools_in_user_message = true %} {%- endif %} {%- if not date_string is defined %} {%- set date_string = "26 Jul 2024" %} @@ -17,7 +17,7 @@ {%- set system_message = messages[0]['content']|trim %} {%- set messages = messages[1:] %} {%- else %} - {%- set system_message = "" %} + {%- set system_message = "You are a helpful assistant with tool calling capabilities. Only reply with a tool call if the function exists in the library provided by the user. If it doesn't exist, just reply directly in natural language. When you receive a tool call response, use the output to format an answer to the original user question." %} {%- endif %} {#- System message + builtin tools #} @@ -25,8 +25,8 @@ {%- if tools is not none %} {{- "Environment: ipython\n" }} {%- endif %} -{{- "Knowledge Cutoff Date: December 2023\n" }} -{{- "Today's Date: " + date_string + "\n\n" }} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} {%- if tools is not none and not tools_in_user_message %} {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' 
}} @@ -76,10 +76,10 @@ {{- "<|eot_id|>" }} {%- elif message.role == "tool" or message.role == "ipython" %} {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} - {%- if message.content is mapping or message.content is iterable %} + {%- if message.content is mapping %} {{- message.content | tojson }} {%- else %} - {{- message.content }} + {{- { "output": message.content } | tojson }} {%- endif %} {{- "<|eot_id|>" }} {%- endif %} diff --git a/tests/tool_use/test_chat_completions.py b/tests/tool_use/test_chat_completions.py index 41fe6a87c1a..8e7cb9f5d3d 100644 --- a/tests/tool_use/test_chat_completions.py +++ b/tests/tool_use/test_chat_completions.py @@ -4,7 +4,7 @@ import pytest from .utils import (MESSAGES_WITHOUT_TOOLS, WEATHER_TOOL, ServerConfig, - adapt_prompt_to_model) + ensure_system_prompt) # test: make sure chat completions without tools provided work even when tools @@ -16,7 +16,7 @@ async def test_chat_completion_without_tools(client: openai.AsyncOpenAI, models = await client.models.list() model_name: str = models.data[0].id chat_completion = await client.chat.completions.create( - messages=adapt_prompt_to_model(MESSAGES_WITHOUT_TOOLS, server_config), + messages=ensure_system_prompt(MESSAGES_WITHOUT_TOOLS, server_config), temperature=0, max_tokens=150, model=model_name, @@ -36,7 +36,7 @@ async def test_chat_completion_without_tools(client: openai.AsyncOpenAI, # make the same request, streaming stream = await client.chat.completions.create( - messages=adapt_prompt_to_model(MESSAGES_WITHOUT_TOOLS, server_config), + messages=ensure_system_prompt(MESSAGES_WITHOUT_TOOLS, server_config), temperature=0, max_tokens=150, model=model_name, @@ -84,7 +84,7 @@ async def test_chat_completion_with_tools(client: openai.AsyncOpenAI, models = await client.models.list() model_name: str = models.data[0].id chat_completion = await client.chat.completions.create( - messages=adapt_prompt_to_model(MESSAGES_WITHOUT_TOOLS, server_config), + messages=ensure_system_prompt(MESSAGES_WITHOUT_TOOLS, server_config), temperature=0, max_tokens=150, model=model_name, @@ -105,7 +105,7 @@ async def test_chat_completion_with_tools(client: openai.AsyncOpenAI, # make the same request, streaming stream = await client.chat.completions.create( - messages=adapt_prompt_to_model(MESSAGES_WITHOUT_TOOLS, server_config), + messages=ensure_system_prompt(MESSAGES_WITHOUT_TOOLS, server_config), temperature=0, max_tokens=150, model=model_name, diff --git a/tests/tool_use/test_parallel_tool_calls.py b/tests/tool_use/test_parallel_tool_calls.py index 51d33781639..ed7ac8afe1b 100644 --- a/tests/tool_use/test_parallel_tool_calls.py +++ b/tests/tool_use/test_parallel_tool_calls.py @@ -6,7 +6,7 @@ from .utils import (MESSAGES_ASKING_FOR_PARALLEL_TOOLS, MESSAGES_WITH_PARALLEL_TOOL_RESPONSE, SEARCH_TOOL, - WEATHER_TOOL, ServerConfig, adapt_prompt_to_model) + WEATHER_TOOL, ServerConfig) # test: getting the model to generate parallel tool calls (streaming/not) @@ -24,8 +24,7 @@ async def test_parallel_tool_calls(client: openai.AsyncOpenAI, models = await client.models.list() model_name: str = models.data[0].id chat_completion = await client.chat.completions.create( - messages=adapt_prompt_to_model(MESSAGES_ASKING_FOR_PARALLEL_TOOLS, - server_config), + messages=MESSAGES_ASKING_FOR_PARALLEL_TOOLS, temperature=0, max_tokens=200, model=model_name, @@ -62,8 +61,7 @@ async def test_parallel_tool_calls(client: openai.AsyncOpenAI, # make the same request, streaming stream = await client.chat.completions.create( model=model_name, - 
messages=adapt_prompt_to_model(MESSAGES_ASKING_FOR_PARALLEL_TOOLS, - server_config), + messages=MESSAGES_ASKING_FOR_PARALLEL_TOOLS, temperature=0, max_tokens=200, tools=[WEATHER_TOOL, SEARCH_TOOL], @@ -154,8 +152,7 @@ async def test_parallel_tool_calls_with_results(client: openai.AsyncOpenAI, models = await client.models.list() model_name: str = models.data[0].id chat_completion = await client.chat.completions.create( - messages=adapt_prompt_to_model(MESSAGES_WITH_PARALLEL_TOOL_RESPONSE, - server_config), + messages=MESSAGES_WITH_PARALLEL_TOOL_RESPONSE, temperature=0, max_tokens=200, model=model_name, @@ -173,8 +170,7 @@ async def test_parallel_tool_calls_with_results(client: openai.AsyncOpenAI, assert "78" in choice.message.content # Orlando temp in tool response stream = await client.chat.completions.create( - messages=adapt_prompt_to_model(MESSAGES_WITH_PARALLEL_TOOL_RESPONSE, - server_config), + messages=MESSAGES_WITH_PARALLEL_TOOL_RESPONSE, temperature=0, max_tokens=200, model=model_name, diff --git a/tests/tool_use/test_tool_calls.py b/tests/tool_use/test_tool_calls.py index e2c4bb89999..c3abe9e1f50 100644 --- a/tests/tool_use/test_tool_calls.py +++ b/tests/tool_use/test_tool_calls.py @@ -5,20 +5,17 @@ import pytest from .utils import (MESSAGES_ASKING_FOR_TOOLS, MESSAGES_WITH_TOOL_RESPONSE, - SEARCH_TOOL, WEATHER_TOOL, ServerConfig, - adapt_prompt_to_model) + SEARCH_TOOL, WEATHER_TOOL) # test: request a chat completion that should return tool calls, so we know they # are parsable @pytest.mark.asyncio -async def test_tool_call_and_choice(client: openai.AsyncOpenAI, - server_config: ServerConfig): +async def test_tool_call_and_choice(client: openai.AsyncOpenAI): models = await client.models.list() model_name: str = models.data[0].id chat_completion = await client.chat.completions.create( - messages=adapt_prompt_to_model(MESSAGES_ASKING_FOR_TOOLS, - server_config), + messages=MESSAGES_ASKING_FOR_TOOLS, temperature=0, max_tokens=100, model=model_name, @@ -62,8 +59,7 @@ async def test_tool_call_and_choice(client: openai.AsyncOpenAI, # make the same request, streaming stream = await client.chat.completions.create( model=model_name, - messages=adapt_prompt_to_model(MESSAGES_ASKING_FOR_TOOLS, - server_config), + messages=MESSAGES_ASKING_FOR_TOOLS, temperature=0, max_tokens=100, tools=[WEATHER_TOOL, SEARCH_TOOL], @@ -140,13 +136,11 @@ async def test_tool_call_and_choice(client: openai.AsyncOpenAI, # test: providing tools and results back to model to get a non-tool response # (streaming/not) @pytest.mark.asyncio -async def test_tool_call_with_results(client: openai.AsyncOpenAI, - server_config: ServerConfig): +async def test_tool_call_with_results(client: openai.AsyncOpenAI): models = await client.models.list() model_name: str = models.data[0].id chat_completion = await client.chat.completions.create( - messages=adapt_prompt_to_model(MESSAGES_WITH_TOOL_RESPONSE, - server_config), + messages=MESSAGES_WITH_TOOL_RESPONSE, temperature=0, max_tokens=100, model=model_name, @@ -163,8 +157,7 @@ async def test_tool_call_with_results(client: openai.AsyncOpenAI, assert "98" in choice.message.content # the temperature from the response stream = await client.chat.completions.create( - messages=adapt_prompt_to_model(MESSAGES_WITH_TOOL_RESPONSE, - server_config), + messages=MESSAGES_WITH_TOOL_RESPONSE, temperature=0, max_tokens=100, model=model_name, diff --git a/tests/tool_use/utils.py b/tests/tool_use/utils.py index 20b86fab9e2..d7166b6f3d7 100644 --- a/tests/tool_use/utils.py +++ b/tests/tool_use/utils.py 
@@ -1,6 +1,5 @@ -import json from copy import deepcopy -from typing import Any, Callable, Dict, List, Optional +from typing import Any, Dict, List, Optional from openai.types.chat import (ChatCompletionMessageParam, ChatCompletionToolParam) @@ -14,27 +13,6 @@ class ServerConfig(TypedDict, total=False): arguments: List[str] system_prompt: Optional[str] supports_parallel: Optional[bool] - format_tool_output: Optional[Callable[[str], str]] - - -def format_llama_tool_output(output: str) -> str: - return json.dumps({"output": output}) - - -def format_tool_output_id(output: str) -> str: - return output - - -def patch_tool_output(messages: List[Dict[str, Any]], - config: ServerConfig) -> List[Dict[str, Any]]: - fmt_fun = config.get("format_tool_output") - if not fmt_fun: - return messages - new_messages = deepcopy(messages) - for message in new_messages: - if message["role"] == "tool": - message["content"] = fmt_fun(message["content"]) - return new_messages def patch_system_prompt(messages: List[Dict[str, Any]], @@ -56,11 +34,6 @@ def ensure_system_prompt(messages: List[Dict[str, Any]], return messages -def adapt_prompt_to_model(messages: List[Dict[str, Any]], - config: ServerConfig) -> List[Dict[str, Any]]: - return ensure_system_prompt(patch_tool_output(messages, config), config) - - # universal args for all models go here. also good if you need to test locally # and change type or KV cache quantization or something. ARGS: List[str] = ["--enable-auto-tool-choice", "--max-model-len", "8096"] @@ -72,26 +45,23 @@ def adapt_prompt_to_model(messages: List[Dict[str, Any]], "arguments": [ "--tool-call-parser", "hermes", "--chat-template", str(VLLM_PATH / "examples/tool_chat_template_hermes.jinja") - ] + ], + "system_prompt": + "You are a helpful assistant with access to tools. If a tool" + " that you have would be helpful to answer a user query, " + "call the tool. Otherwise, answer the user's query directly " + "without calling a tool. DO NOT CALL A TOOL THAT IS IRRELEVANT " + "to the user's question - just respond to it normally." }, "llama": { "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "arguments": [ - "--tool-call-parser", "llama", "--chat-template", - str(VLLM_PATH / "examples/tool_chat_template_llama.jinja") + "--tool-call-parser", "llama3_json", "--chat-template", + str(VLLM_PATH / "examples/tool_chat_template_llama3_json.jinja") ], - "system_prompt": - "You are a helpful assistant with tool calling capabilities. " - "Only reply with a tool call if the function exists in the " - "library provided by the user. If it doesn't exist, just " - "reply directly in natural language. When you receive a tool " - "call response, use the output to format an answer to the " - "original user question.", "supports_parallel": False, - "format_tool_output": - format_llama_tool_output }, "mistral": { "model": @@ -100,7 +70,13 @@ def adapt_prompt_to_model(messages: List[Dict[str, Any]], "--tool-call-parser", "mistral", "--chat-template", str(VLLM_PATH / "examples/tool_chat_template_mistral.jinja"), "--ignore-patterns=\"consolidated.safetensors\"" - ] + ], + "system_prompt": + "You are a helpful assistant with access to tools. If a tool" + " that you have would be helpful to answer a user query, " + "call the tool. Otherwise, answer the user's query directly " + "without calling a tool. DO NOT CALL A TOOL THAT IS IRRELEVANT " + "to the user's question - just respond to it normally." 
} } @@ -165,15 +141,6 @@ def adapt_prompt_to_model(messages: List[Dict[str, Any]], } MESSAGES_WITHOUT_TOOLS: List[ChatCompletionMessageParam] = [{ - "role": - "system", - "content": - "You are a helpful assistant with access to tools. If a tool" - " that you have would be helpful to answer a user query, " - "call the tool. Otherwise, answer the user's query directly " - "without calling a tool. DO NOT CALL A TOOL THAT IS IRRELEVANT " - "to the user's question - just respond to it normally." -}, { "role": "user", "content": diff --git a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py index 15579ded18a..ad6eb171b73 100644 --- a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py @@ -60,7 +60,7 @@ def __init__(self, tokenizer: AnyTokenizer): self.streamed_args_for_tool: List[str] = [ ] # map what has been streamed for each tool so far to a list self.bot_token = "<|python_tag|>" - self.bot_token_id = self.model_tokenizer.tokenizer.encode( + self.bot_token_id = self.model_tokenizer.encode( self.bot_token, add_special_tokens=False)[0] self.tool_call_regex = re.compile(r"\[{.*?}\]", re.DOTALL) From f66c0b18e609f6acf342f0de0cbc7b5f07103b9f Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Tue, 24 Sep 2024 22:10:41 -0300 Subject: [PATCH 09/12] correct chat template typo again Signed-off-by: Max de Bayser --- examples/tool_chat_template_llama3_json.jinja | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/tool_chat_template_llama3_json.jinja b/examples/tool_chat_template_llama3_json.jinja index d5f4206422c..805238ba2d7 100644 --- a/examples/tool_chat_template_llama3_json.jinja +++ b/examples/tool_chat_template_llama3_json.jinja @@ -25,8 +25,8 @@ {%- if tools is not none %} {{- "Environment: ipython\n" }} {%- endif %} -{{- "Cutting Knowledge Date: December 2023\n" }} -{{- "Today Date: " + date_string + "\n\n" }} +{{- "Knowledge Cutoff Date: December 2023\n" }} +{{- "Today's Date: " + date_string + "\n\n" }} {%- if tools is not none and not tools_in_user_message %} {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' 
}} From 932a0933ec7debe67dcbcc6c0d8d5bc17e5be605 Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Wed, 25 Sep 2024 10:07:19 -0300 Subject: [PATCH 10/12] fix mypy problem Signed-off-by: Max de Bayser --- vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py index ad6eb171b73..f98dca16674 100644 --- a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py @@ -5,6 +5,7 @@ import partial_json_parser from partial_json_parser.core.options import Allow +from transformers import PreTrainedTokenizerBase from vllm.entrypoints.openai.protocol import (DeltaFunctionCall, DeltaMessage, DeltaToolCall, @@ -14,7 +15,6 @@ ToolParser) from vllm.entrypoints.openai.tool_parsers.utils import find_common_prefix from vllm.logger import init_logger -from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.utils import random_uuid logger = init_logger(__name__) @@ -49,7 +49,7 @@ class Llama3JsonToolParser(ToolParser): Used when --enable-auto-tool-choice --tool-call-parser mistral are all set """ - def __init__(self, tokenizer: AnyTokenizer): + def __init__(self, tokenizer: PreTrainedTokenizerBase): super().__init__(tokenizer) # initialize properties used for state when parsing tool calls in @@ -60,8 +60,8 @@ def __init__(self, tokenizer: AnyTokenizer): self.streamed_args_for_tool: List[str] = [ ] # map what has been streamed for each tool so far to a list self.bot_token = "<|python_tag|>" - self.bot_token_id = self.model_tokenizer.encode( - self.bot_token, add_special_tokens=False)[0] + self.bot_token_id = tokenizer.encode(self.bot_token, + add_special_tokens=False)[0] self.tool_call_regex = re.compile(r"\[{.*?}\]", re.DOTALL) def extract_tool_calls(self, From 669fe675066e3ac29c14325fe0078c2202ab034b Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Thu, 26 Sep 2024 16:45:37 -0300 Subject: [PATCH 11/12] add template version specialized for llama 3.1 Signed-off-by: Max de Bayser --- .../tool_chat_template_llama3.1_json.jinja | 94 +++++++++++++++++++ examples/tool_chat_template_llama3_json.jinja | 17 ++-- tests/tool_use/utils.py | 10 ++ 3 files changed, 114 insertions(+), 7 deletions(-) create mode 100644 examples/tool_chat_template_llama3.1_json.jinja diff --git a/examples/tool_chat_template_llama3.1_json.jinja b/examples/tool_chat_template_llama3.1_json.jinja new file mode 100644 index 00000000000..c24a7e51335 --- /dev/null +++ b/examples/tool_chat_template_llama3.1_json.jinja @@ -0,0 +1,94 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {#- Llama 3.1 doesn't pass all tests if the tools are in the system prompt #} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- if strftime_now is defined %} + {%- set date_string = strftime_now("%d %b %Y") %} + {%- else %} + {%- set date_string = "26 Jul 2024" %} + {%- endif %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. 
#} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "You are a helpful assistant with tool calling capabilities. Only reply with a tool call if the function exists in the library provided by the user. If it doesn't exist, just reply directly in natural language. When you receive a tool call response, use the output to format an answer to the original user question." %} +{%- endif %} + +{#- System message #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} + {%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' 
}} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {{- "<|eot_id|>" }} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping %} + {{- message.content | tojson }} + {%- else %} + {{- { "output": message.content } | tojson }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/examples/tool_chat_template_llama3_json.jinja b/examples/tool_chat_template_llama3_json.jinja index 805238ba2d7..7e24777726a 100644 --- a/examples/tool_chat_template_llama3_json.jinja +++ b/examples/tool_chat_template_llama3_json.jinja @@ -3,10 +3,14 @@ {%- set tools = custom_tools %} {%- endif %} {%- if not tools_in_user_message is defined %} - {%- set tools_in_user_message = true %} + {%- set tools_in_user_message = false %} {%- endif %} {%- if not date_string is defined %} - {%- set date_string = "26 Jul 2024" %} + {%- if strftime_now is defined %} + {%- set date_string = strftime_now("%d %b %Y") %} + {%- else %} + {%- set date_string = "26 Jul 2024" %} + {%- endif %} {%- endif %} {%- if not tools is defined %} {%- set tools = none %} @@ -20,13 +24,13 @@ {%- set system_message = "You are a helpful assistant with tool calling capabilities. Only reply with a tool call if the function exists in the library provided by the user. If it doesn't exist, just reply directly in natural language. When you receive a tool call response, use the output to format an answer to the original user question." %} {%- endif %} -{#- System message + builtin tools #} +{#- System message #} {{- "<|start_header_id|>system<|end_header_id|>\n\n" }} {%- if tools is not none %} {{- "Environment: ipython\n" }} {%- endif %} -{{- "Knowledge Cutoff Date: December 2023\n" }} -{{- "Today's Date: " + date_string + "\n\n" }} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} {%- if tools is not none and not tools_in_user_message %} {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' 
}} @@ -47,7 +51,7 @@ {%- set messages = messages[1:] %} {%- else %} {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} -{%- endif %} + {%- endif %} {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} {{- "Given the following functions, please respond with a JSON for a function call " }} {{- "with its proper arguments that best answers the given prompt.\n\n" }} @@ -87,4 +91,3 @@ {%- if add_generation_prompt %} {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} {%- endif %} - diff --git a/tests/tool_use/utils.py b/tests/tool_use/utils.py index d7166b6f3d7..418e5f3a219 100644 --- a/tests/tool_use/utils.py +++ b/tests/tool_use/utils.py @@ -56,6 +56,16 @@ def ensure_system_prompt(messages: List[Dict[str, Any]], "llama": { "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "arguments": [ + "--tool-call-parser", "llama3_json", "--chat-template", + str(VLLM_PATH / "examples/tool_chat_template_llama3.1_json.jinja") + ], + "supports_parallel": + False, + }, + "llama3.2": { + "model": + "meta-llama/Llama-3.2-3B-Instruct", "arguments": [ "--tool-call-parser", "llama3_json", "--chat-template", str(VLLM_PATH / "examples/tool_chat_template_llama3_json.jinja") From 871c56865e8879ac96bd690c92481a36fc8bf86a Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Thu, 26 Sep 2024 18:02:19 -0300 Subject: [PATCH 12/12] rename chat template Signed-off-by: Max de Bayser --- ...llama3_json.jinja => tool_chat_template_llama3.2_json.jinja} | 0 tests/tool_use/utils.py | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename examples/{tool_chat_template_llama3_json.jinja => tool_chat_template_llama3.2_json.jinja} (100%) diff --git a/examples/tool_chat_template_llama3_json.jinja b/examples/tool_chat_template_llama3.2_json.jinja similarity index 100% rename from examples/tool_chat_template_llama3_json.jinja rename to examples/tool_chat_template_llama3.2_json.jinja diff --git a/tests/tool_use/utils.py b/tests/tool_use/utils.py index 418e5f3a219..1a840f8a51c 100644 --- a/tests/tool_use/utils.py +++ b/tests/tool_use/utils.py @@ -68,7 +68,7 @@ def ensure_system_prompt(messages: List[Dict[str, Any]], "meta-llama/Llama-3.2-3B-Instruct", "arguments": [ "--tool-call-parser", "llama3_json", "--chat-template", - str(VLLM_PATH / "examples/tool_chat_template_llama3_json.jinja") + str(VLLM_PATH / "examples/tool_chat_template_llama3.2_json.jinja") ], "supports_parallel": False,