triton-inference-server · nnshah1 · Jan 12, 2024 · Jan 12, 2024 · Jan 12, 2024 · Jan 12, 2024
diff --git a/python/test/test_api.py b/python/test/test_api.py
@@ -71,6 +71,119 @@ def test_create_request(self):
 
 
 class AllocatorTests(unittest.TestCase):
+    class MockMemoryAllocator(tritonserver.MemoryAllocator):
+        """Allocator stub whose ``allocate`` always raises."""
+
+        def __init__(self):
+            """Do nothing; the base-class initializer is not called."""
+
+        def allocate(self, *args, **kwargs):
+            """Raise ``Exception("foo")`` whatever arguments are passed."""
+            raise Exception("foo")
+
+    @pytest.mark.skipif(cupy is None, reason="Skipping gpu memory, cupy not installed")
+    def test_memory_fallback_to_cpu(self):
+        """Check that outputs land in CPU memory when no GPU allocator is registered.
+
+        The default GPU allocator is removed from
+        ``tritonserver.default_memory_allocators`` for the duration of the
+        test, the "test" model is loaded with ``request_gpu_memory`` set to
+        "True", and every response is expected to report ``MemoryType.CPU``
+        for ``fp16_output``.
+        """
+        server = tritonserver.Server(server_options).start(wait_until_ready=True)
+
+        self.assertTrue(server.ready())
+
+        allocator = tritonserver.default_memory_allocators[tritonserver.MemoryType.GPU]
+
+        del tritonserver.default_memory_allocators[tritonserver.MemoryType.GPU]
+
+        # The registry is module-level state used by other tests as well, so
+        # restore the allocator in ``finally``; otherwise a failed assertion
+        # here would leave the GPU entry missing for the tests that follow.
+        try:
+            server.load(
+                "test",
+                {
+                    "config": json.dumps(
+                        {
+                            "backend": "python",
+                            "parameters": {
+                                "decoupled": {"string_value": "False"},
+                                "request_gpu_memory": {"string_value": "True"},
+                            },
+                        }
+                    )
+                },
+            )
+
+            fp16_input = numpy.random.rand(1, 100).astype(dtype=numpy.float16)
+
+            for response in server.model("test").infer(
+                inputs={"fp16_input": fp16_input},
+            ):
+                self.assertEqual(
+                    response.outputs["fp16_output"].memory_type,
+                    tritonserver.MemoryType.CPU,
+                )
+                fp16_output = numpy.from_dlpack(response.outputs["fp16_output"])
+                self.assertEqual(fp16_input[0][0], fp16_output[0][0])
+        finally:
+            tritonserver.default_memory_allocators[
+                tritonserver.MemoryType.GPU
+            ] = allocator
+
+    def test_memory_allocator_exception(self):
+        """Expect ``InternalError`` when the supplied output allocator raises."""
+        server = tritonserver.Server(server_options).start(wait_until_ready=True)
+        self.assertTrue(server.ready())
+
+        model_config = {
+            "backend": "python",
+            "parameters": {"decoupled": {"string_value": "False"}},
+        }
+        server.load("test", {"config": json.dumps(model_config)})
+
+        with self.assertRaises(tritonserver.InternalError):
+            # Everything that may raise stays inside the assertRaises scope,
+            # in the same evaluation order: model lookup, input tensor,
+            # allocator, then the inference call itself.
+            model = server.model("test")
+            string_input = tritonserver.Tensor.from_string_array([["hello"]])
+            failing_allocator = AllocatorTests.MockMemoryAllocator()
+            responses = model.infer(
+                inputs={"string_input": string_input},
+                output_memory_type="gpu",
+                output_memory_allocator=failing_allocator,
+            )
+            # Drain the response iterator; the responses themselves are unused.
+            for _ in responses:
+                pass
+
+    def test_unsupported_memory_type(self):
+        """Expect ``InvalidArgumentError`` for "gpu" output with no GPU allocator.
+
+        Any default GPU allocator present in
+        ``tritonserver.default_memory_allocators`` is removed before the
+        inference call and put back afterwards.
+        """
+        server = tritonserver.Server(server_options).start(wait_until_ready=True)
+
+        self.assertTrue(server.ready())
+
+        server.load(
+            "test",
+            {
+                "config": json.dumps(
+                    {
+                        "backend": "python",
+                        "parameters": {"decoupled": {"string_value": "False"}},
+                    }
+                )
+            },
+        )
+
+        if tritonserver.MemoryType.GPU in tritonserver.default_memory_allocators:
+            allocator = tritonserver.default_memory_allocators[
+                tritonserver.MemoryType.GPU
+            ]
+
+            del tritonserver.default_memory_allocators[tritonserver.MemoryType.GPU]
+        else:
+            # Nothing was registered, so there is nothing to restore later.
+            allocator = None
+
+        # The registry is module-level state used by other tests as well, so
+        # restore it in ``finally``; otherwise a failure here (for example the
+        # expected exception not being raised) would leave the GPU entry
+        # missing for the tests that follow.
+        try:
+            with self.assertRaises(tritonserver.InvalidArgumentError):
+                for response in server.model("test").infer(
+                    inputs={
+                        "string_input": tritonserver.Tensor.from_string_array(
+                            [["hello"]]
+                        )
+                    },
+                    output_memory_type="gpu",
+                ):
+                    pass
+        finally:
+            if allocator is not None:
+                tritonserver.default_memory_allocators[
+                    tritonserver.MemoryType.GPU
+                ] = allocator
+
+    @pytest.mark.skipif(torch is None, reason="Skipping test, torch not installed")
     def test_allocate_on_cpu_and_reshape(self):
         allocator = tritonserver.default_memory_allocators[tritonserver.MemoryType.CPU]
 
@@ -203,7 +316,8 @@ def test_ready(self):
 
 
 class InferenceTests(unittest.TestCase):
-    def test_basic_inference(self):
+    @pytest.mark.skipif(cupy is None, reason="Skipping gpu memory, cupy not installed")
+    def test_gpu_output(self):
         server = tritonserver.Server(server_options).start(wait_until_ready=True)
 
         self.assertTrue(server.ready())
@@ -222,14 +336,6 @@ def test_basic_inference(self):
 
         fp16_input = numpy.random.rand(1, 100).astype(dtype=numpy.float16)
 
-        for response in server.model("test").infer(
-            inputs={"fp16_input": fp16_input},
-            output_memory_type="cpu",
-            raise_on_error=True,
-        ):
-            fp16_output = numpy.from_dlpack(response.outputs["fp16_output"])
-            numpy.testing.assert_array_equal(fp16_input, fp16_output)
-
         for response in server.model("test").infer(
             inputs={"fp16_input": fp16_input},
             output_memory_type="gpu",
@@ -251,4 +357,32 @@ def test_basic_inference(self):
             text_output = response.outputs["string_output"].to_string_array()
             text_output = response.outputs["string_output"].to_string_array()
             self.assertEqual(text_output[0][0], "hello")
+
+    def test_basic_inference(self):
+        """Run fp16 inference with CPU output memory and compare output to input."""
+        server = tritonserver.Server(server_options).start(wait_until_ready=True)
+        self.assertTrue(server.ready())
+
+        model_config = {
+            "backend": "python",
+            "parameters": {"decoupled": {"string_value": "False"}},
+        }
+        server.load("test", {"config": json.dumps(model_config)})
+
+        fp16_input = numpy.random.rand(1, 100).astype(dtype=numpy.float16)
+
+        responses = server.model("test").infer(
+            inputs={"fp16_input": fp16_input},
+            output_memory_type="cpu",
+            raise_on_error=True,
+        )
+        for response in responses:
+            # Each response's output must equal the input element for element.
+            fp16_output = numpy.from_dlpack(response.outputs["fp16_output"])
+            numpy.testing.assert_array_equal(fp16_input, fp16_output)
+
         server.stop()
diff --git a/python/test/test_api_models/test/1/model.py b/python/test/test_api_models/test/1/model.py
@@ -3,6 +3,11 @@
 import numpy as np
 import triton_python_backend_utils as pb_utils
 
+try:
+    import cupy
+except Exception:
+    # cupy is optional: fall back to None when it cannot be imported.
+    # `except Exception` (not a bare `except:`) lets KeyboardInterrupt and
+    # SystemExit propagate instead of being silently swallowed.
+    cupy = None
+
 
 class TritonPythonModel:
     @staticmethod
@@ -44,6 +49,14 @@ def initialize(self, args):
         self._decoupled = self._model_config.get("model_transaction_policy", {}).get(
             "decoupled"
         )
+        self._request_gpu_memory = False
+        if "parameters" in self._model_config:
+            parameters = self._model_config["parameters"]
+            if (
+                "request_gpu_memory" in parameters
+                and parameters["request_gpu_memory"]["string_value"] == "True"
+            ):
+                self._request_gpu_memory = True
 
     def execute_decoupled(self, requests):
         for request in requests:
@@ -67,9 +80,17 @@ def execute(self, requests):
             output_tensors = []
             for input_tensor in request.inputs():
                 input_value = input_tensor.as_numpy()
-                output_tensor = pb_utils.Tensor(
-                    input_tensor.name().replace("input", "output"), input_value
-                )
+
+                if self._request_gpu_memory:
+                    input_value = cupy.array(input_value)
+
+                    output_tensor = pb_utils.Tensor.from_dlpack(
+                        input_tensor.name().replace("input", "output"), input_value
+                    )
+                else:
+                    output_tensor = pb_utils.Tensor(
+                        input_tensor.name().replace("input", "output"), input_value
+                    )
                 output_tensors.append(output_tensor)
 
             responses.append(pb_utils.InferenceResponse(output_tensors=output_tensors))

diff --git a/python/tritonserver/_api/_allocators.py b/python/tritonserver/_api/_allocators.py
@@ -38,6 +38,7 @@
     DLPackObject,
     parse_device_or_memory_type,
 )
+from tritonserver._api._logging import LogLevel, LogMessage
 from tritonserver._c import TRITONSERVER_ResponseAllocator
 from tritonserver._c.triton_bindings import (
     InvalidArgumentError,
@@ -228,6 +229,14 @@ def __init__(
             self._memory_type, self._memory_type_id = parse_device_or_memory_type(
                 device_or_memory_type
             )
+        if (
+            self._memory_type is not None
+            and self._memory_allocator is None
+            and self._memory_type not in default_memory_allocators
+        ):
+            raise InvalidArgumentError(
+                f"Memory type {self._memory_type} not supported by default_memory_allocators: {default_memory_allocators}"
+            )
 
     def allocate(
         self,
@@ -238,24 +247,38 @@ def allocate(
         memory_type_id,
         _user_object,
     ):
-        if self._memory_type is not None:
-            memory_type = self._memory_type
-            memory_type_id = self._memory_type_id
-
-        memory_allocator = self._memory_allocator
-        if memory_allocator is None:
-            memory_allocator = default_memory_allocators[memory_type]
-
-        memory_buffer = memory_allocator.allocate(
-            byte_size, memory_type, memory_type_id
-        )
+        try:
+            if self._memory_type is not None:
+                memory_type = self._memory_type
+                memory_type_id = self._memory_type_id
+
+            memory_allocator = self._memory_allocator
+            if memory_allocator is None:
+                if memory_type in default_memory_allocators:
+                    memory_allocator = default_memory_allocators[memory_type]
+                else:
+                    LogMessage(
+                        LogLevel.WARN,
+                        f"Requested memory type {memory_type} is not supported, falling back to {MemoryType.CPU}",
+                    )
+                    memory_type = MemoryType.CPU
+                    memory_type_id = 0
+                    memory_allocator = default_memory_allocators[memory_type]
+
+            memory_buffer = memory_allocator.allocate(
+                byte_size, memory_type, memory_type_id
+            )
 
-        return (
-            memory_buffer.data_ptr,
-            memory_buffer,
-            memory_buffer.memory_type,
-            memory_buffer.memory_type_id,
-        )
+            return (
+                memory_buffer.data_ptr,
+                memory_buffer,
+                memory_buffer.memory_type,
+                memory_buffer.memory_type_id,
+            )
+        except Exception as e:
+            message = f"Catastrophic failure in allocator: {e}, returning NULL"
+            LogMessage(LogLevel.ERROR, message)
+            return (0, None, MemoryType.CPU, 0)
 
     def release(
         self,

diff --git a/python/tritonserver/_api/_logging.py b/python/tritonserver/_api/_logging.py
@@ -0,0 +1,61 @@
+# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+"""Logging Utilities"""
+import inspect
+import os
+
+from tritonserver._c.triton_bindings import TRITONSERVER_LogLevel as LogLevel
+from tritonserver._c.triton_bindings import TRITONSERVER_LogMessage
+
+
+def LogMessage(level: LogLevel, message: str):
+    """Log Message using Triton Inference Server Logger
+
+    The file name and line number passed to the logger are those of the
+    code that called ``LogMessage``. If the call site cannot be determined
+    they default to ``"unknown"`` and ``-1``.
+
+    Parameters
+    ----------
+    level : LogLevel
+        log level one of LogLevel.WARN, LogLevel.ERROR, LogLevel.INFO
+    message : str
+        message
+
+    Examples
+    --------
+
+    LogMessage(LogLevel.ERROR,"I've got a bad feeling about this ...")
+
+    """
+
+    filename, line_number = "unknown", -1
+    try:
+        # inspect.stack() is ordered innermost first: index 0 is this
+        # function and index 1 is its caller, whereas index -1 is the
+        # outermost frame of the whole stack. context=0 skips loading
+        # source lines, which are not needed here.
+        caller_frame = inspect.stack(context=0)[1]
+        filename, line_number = (
+            os.path.basename(caller_frame.filename),
+            caller_frame.lineno,
+        )
+    except Exception:
+        # Best effort only: a failed frame lookup must not prevent the
+        # message itself from being logged.
+        pass
+    TRITONSERVER_LogMessage(level, filename, line_number, message)