diff --git a/python/test/test_api.py b/python/test/test_api.py
index 6de69571f..f84385936 100644
--- a/python/test/test_api.py
+++ b/python/test/test_api.py
@@ -71,6 +71,119 @@ def test_create_request(self):
 
 
 class AllocatorTests(unittest.TestCase):
+    class MockMemoryAllocator(tritonserver.MemoryAllocator):
+        def __init__(self):
+            pass
+
+        def allocate(self, *args, **kwargs):
+            raise Exception("foo")
+
+    @pytest.mark.skipif(cupy is None, reason="Skipping gpu memory, cupy not installed")
+    def test_memory_fallback_to_cpu(self):
+        server = tritonserver.Server(server_options).start(wait_until_ready=True)
+
+        self.assertTrue(server.ready())
+
+        allocator = tritonserver.default_memory_allocators[tritonserver.MemoryType.GPU]
+
+        del tritonserver.default_memory_allocators[tritonserver.MemoryType.GPU]
+
+        server.load(
+            "test",
+            {
+                "config": json.dumps(
+                    {
+                        "backend": "python",
+                        "parameters": {
+                            "decoupled": {"string_value": "False"},
+                            "request_gpu_memory": {"string_value": "True"},
+                        },
+                    }
+                )
+            },
+        )
+
+        fp16_input = numpy.random.rand(1, 100).astype(dtype=numpy.float16)
+
+        for response in server.model("test").infer(
+            inputs={"fp16_input": fp16_input},
+        ):
+            self.assertEqual(
+                response.outputs["fp16_output"].memory_type, tritonserver.MemoryType.CPU
+            )
+            fp16_output = numpy.from_dlpack(response.outputs["fp16_output"])
+            self.assertEqual(fp16_input[0][0], fp16_output[0][0])
+
+        tritonserver.default_memory_allocators[tritonserver.MemoryType.GPU] = allocator
+
+    def test_memory_allocator_exception(self):
+        server = tritonserver.Server(server_options).start(wait_until_ready=True)
+
+        self.assertTrue(server.ready())
+
+        server.load(
+            "test",
+            {
+                "config": json.dumps(
+                    {
+                        "backend": "python",
+                        "parameters": {"decoupled": {"string_value": "False"}},
+                    }
+                )
+            },
+        )
+
+        with self.assertRaises(tritonserver.InternalError):
+            for response in server.model("test").infer(
+                inputs={
+                    "string_input": tritonserver.Tensor.from_string_array([["hello"]])
+                },
+                output_memory_type="gpu",
+                output_memory_allocator=AllocatorTests.MockMemoryAllocator(),
+            ):
+                pass
+
+    def test_unsupported_memory_type(self):
+        server = tritonserver.Server(server_options).start(wait_until_ready=True)
+
+        self.assertTrue(server.ready())
+
+        server.load(
+            "test",
+            {
+                "config": json.dumps(
+                    {
+                        "backend": "python",
+                        "parameters": {"decoupled": {"string_value": "False"}},
+                    }
+                )
+            },
+        )
+
+        if tritonserver.MemoryType.GPU in tritonserver.default_memory_allocators:
+            allocator = tritonserver.default_memory_allocators[
+                tritonserver.MemoryType.GPU
+            ]
+
+            del tritonserver.default_memory_allocators[tritonserver.MemoryType.GPU]
+        else:
+            allocator = None
+
+        with self.assertRaises(tritonserver.InvalidArgumentError):
+            for response in server.model("test").infer(
+                inputs={
+                    "string_input": tritonserver.Tensor.from_string_array([["hello"]])
+                },
+                output_memory_type="gpu",
+            ):
+                pass
+
+        if allocator is not None:
+            tritonserver.default_memory_allocators[
+                tritonserver.MemoryType.GPU
+            ] = allocator
+
     @pytest.mark.skipif(torch is None, reason="Skipping test, torch not installed")
     def test_allocate_on_cpu_and_reshape(self):
         allocator = tritonserver.default_memory_allocators[tritonserver.MemoryType.CPU]
@@ -203,7 +316,8 @@ def test_ready(self):
 
 
 class InferenceTests(unittest.TestCase):
-    def test_basic_inference(self):
+    @pytest.mark.skipif(cupy is None, reason="Skipping gpu memory, cupy not installed")
+    def test_gpu_output(self):
         server = tritonserver.Server(server_options).start(wait_until_ready=True)
 
         self.assertTrue(server.ready())
@@ -222,14 +336,6 @@ def test_basic_inference(self):
 
         fp16_input = numpy.random.rand(1, 100).astype(dtype=numpy.float16)
 
-        for response in server.model("test").infer(
-            inputs={"fp16_input": fp16_input},
-            output_memory_type="cpu",
-            raise_on_error=True,
-        ):
-            fp16_output = numpy.from_dlpack(response.outputs["fp16_output"])
-            numpy.testing.assert_array_equal(fp16_input, fp16_output)
-
         for response in server.model("test").infer(
             inputs={"fp16_input": fp16_input},
             output_memory_type="gpu",
@@ -251,4 +357,32 @@ def test_basic_inference(self):
             text_output = response.outputs["string_output"].to_string_array()
             text_output = response.outputs["string_output"].to_string_array()
             self.assertEqual(text_output[0][0], "hello")
+
+    def test_basic_inference(self):
+        server = tritonserver.Server(server_options).start(wait_until_ready=True)
+
+        self.assertTrue(server.ready())
+
+        server.load(
+            "test",
+            {
+                "config": json.dumps(
+                    {
+                        "backend": "python",
+                        "parameters": {"decoupled": {"string_value": "False"}},
+                    }
+                )
+            },
+        )
+
+        fp16_input = numpy.random.rand(1, 100).astype(dtype=numpy.float16)
+
+        for response in server.model("test").infer(
+            inputs={"fp16_input": fp16_input},
+            output_memory_type="cpu",
+            raise_on_error=True,
+        ):
+            fp16_output = numpy.from_dlpack(response.outputs["fp16_output"])
+            numpy.testing.assert_array_equal(fp16_input, fp16_output)
+
         server.stop()
diff --git a/python/test/test_api_models/test/1/model.py b/python/test/test_api_models/test/1/model.py
index 67eaf3769..e107b20bc 100644
--- a/python/test/test_api_models/test/1/model.py
+++ b/python/test/test_api_models/test/1/model.py
@@ -3,6 +3,11 @@
 import numpy as np
 import triton_python_backend_utils as pb_utils
 
+try:
+    import cupy
+except:
+    cupy = None
+
 
 class TritonPythonModel:
     @staticmethod
@@ -44,6 +49,14 @@ def initialize(self, args):
         self._decoupled = self._model_config.get("model_transaction_policy", {}).get(
             "decoupled"
        )
+        self._request_gpu_memory = False
+        if "parameters" in self._model_config:
+            parameters = self._model_config["parameters"]
+            if (
+                "request_gpu_memory" in parameters
+                and parameters["request_gpu_memory"]["string_value"] == "True"
+            ):
+                self._request_gpu_memory = True
 
     def execute_decoupled(self, requests):
         for request in requests:
@@ -67,9 +80,17 @@ def execute(self, requests):
             output_tensors = []
             for input_tensor in request.inputs():
                 input_value = input_tensor.as_numpy()
-                output_tensor = pb_utils.Tensor(
-                    input_tensor.name().replace("input", "output"), input_value
-                )
+
+                if self._request_gpu_memory:
+                    input_value = cupy.array(input_value)
+
+                    output_tensor = pb_utils.Tensor.from_dlpack(
+                        input_tensor.name().replace("input", "output"), input_value
+                    )
+                else:
+                    output_tensor = pb_utils.Tensor(
+                        input_tensor.name().replace("input", "output"), input_value
+                    )
                 output_tensors.append(output_tensor)
             responses.append(pb_utils.InferenceResponse(output_tensors=output_tensors))
 
diff --git a/python/tritonserver/_api/_allocators.py b/python/tritonserver/_api/_allocators.py
index 474a78b8d..cb6fff651 100644
--- a/python/tritonserver/_api/_allocators.py
+++ b/python/tritonserver/_api/_allocators.py
@@ -38,6 +38,7 @@
     DLPackObject,
     parse_device_or_memory_type,
 )
+from tritonserver._api._logging import LogLevel, LogMessage
 from tritonserver._c import TRITONSERVER_ResponseAllocator
 from tritonserver._c.triton_bindings import (
     InvalidArgumentError,
@@ -228,6 +229,14 @@ def __init__(
         self._memory_type, self._memory_type_id = parse_device_or_memory_type(
             device_or_memory_type
         )
+        if (
+            self._memory_type is not None
+            and self._memory_allocator is None
+            and self._memory_type not in default_memory_allocators
+        ):
+            raise InvalidArgumentError(
+                f"Memory type {self._memory_type} not supported by default_memory_allocators: {default_memory_allocators}"
+            )
 
     def allocate(
         self,
@@ -238,24 +247,38 @@ def allocate(
         memory_type_id,
         _user_object,
     ):
-        if self._memory_type is not None:
-            memory_type = self._memory_type
-            memory_type_id = self._memory_type_id
-
-        memory_allocator = self._memory_allocator
-        if memory_allocator is None:
-            memory_allocator = default_memory_allocators[memory_type]
-
-        memory_buffer = memory_allocator.allocate(
-            byte_size, memory_type, memory_type_id
-        )
+        try:
+            if self._memory_type is not None:
+                memory_type = self._memory_type
+                memory_type_id = self._memory_type_id
+
+            memory_allocator = self._memory_allocator
+            if memory_allocator is None:
+                if memory_type in default_memory_allocators:
+                    memory_allocator = default_memory_allocators[memory_type]
+                else:
+                    LogMessage(
+                        LogLevel.WARN,
+                        f"Requested memory type {memory_type} is not supported, falling back to {MemoryType.CPU}",
+                    )
+                    memory_type = MemoryType.CPU
+                    memory_type_id = 0
+                    memory_allocator = default_memory_allocators[memory_type]
+
+            memory_buffer = memory_allocator.allocate(
+                byte_size, memory_type, memory_type_id
+            )
 
-        return (
-            memory_buffer.data_ptr,
-            memory_buffer,
-            memory_buffer.memory_type,
-            memory_buffer.memory_type_id,
-        )
+            return (
+                memory_buffer.data_ptr,
+                memory_buffer,
+                memory_buffer.memory_type,
+                memory_buffer.memory_type_id,
+            )
+        except Exception as e:
+            message = f"Catastrophic failure in allocator: {e}, returning NULL"
+            LogMessage(LogLevel.ERROR, message)
+            return (0, None, MemoryType.CPU, 0)
 
     def release(
         self,
diff --git a/python/tritonserver/_api/_logging.py b/python/tritonserver/_api/_logging.py
new file mode 100644
index 000000000..0b8fd1136
--- /dev/null
+++ b/python/tritonserver/_api/_logging.py
@@ -0,0 +1,61 @@
+# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+"""Logging Utilities"""
+import inspect
+import os
+
+from tritonserver._c.triton_bindings import TRITONSERVER_LogLevel as LogLevel
+from tritonserver._c.triton_bindings import TRITONSERVER_LogMessage
+
+
+def LogMessage(level: LogLevel, message: str):
+    """Log Message using Triton Inference Server Logger
+
+    Parameters
+    ----------
+    level : LogLevel
+        log level one of LogLevel.WARN, LogLevel.ERROR, LogLevel.INFO
+    message : str
+        message
+
+    Examples
+    --------
+
+    LogMessage(LogLevel.ERROR,"I've got a bad feeling about this ...")
+
+    """
+
+    filename, line_number = "unknown", -1
+    try:
+        current_frame = inspect.stack()[-1]
+        filename, line_number = (
+            os.path.basename(current_frame.filename),
+            current_frame.lineno,
+        )
+    except Exception as e:
+        pass
+    TRITONSERVER_LogMessage(level, filename, line_number, message)
diff --git a/python/tritonserver/_api/_response.py b/python/tritonserver/_api/_response.py
index cfc6a298a..8dc505298 100644
--- a/python/tritonserver/_api/_response.py
+++ b/python/tritonserver/_api/_response.py
@@ -39,6 +39,7 @@
 if TYPE_CHECKING:
     from tritonserver._api._model import Model
 
+from tritonserver._api._logging import LogMessage
 from tritonserver._api._tensor import Tensor
 from tritonserver._c.triton_bindings import (
     InternalError,
@@ -46,7 +47,6 @@
     TRITONSERVER_InferenceRequest,
 )
 from tritonserver._c.triton_bindings import TRITONSERVER_LogLevel as LogLevel
-from tritonserver._c.triton_bindings import TRITONSERVER_LogMessage as LogMessage
 from tritonserver._c.triton_bindings import (
     TRITONSERVER_ResponseCompleteFlag,
     TRITONSERVER_Server,
@@ -186,17 +186,8 @@ def _response_callback(self, response, flags, unused):
                 del self._request
                 self._request = None
         except Exception as e:
-            current_frame = inspect.currentframe()
-            if current_frame is not None:
-                line_number = current_frame.f_lineno
-            else:
-                line_number = -1
-            LogMessage(
-                LogLevel.ERROR,
-                __file__,
-                line_number,
-                str(e),
-            )
+            message = f"Catastrophic failure in response callback: {e}"
+            LogMessage(LogLevel.ERROR, message)
             # catastrophic failure
             raise e from None
 
@@ -321,18 +312,8 @@ def _response_callback(self, response, flags, unused):
                 del self._request
                 self._request = None
         except Exception as e:
-            current_frame = inspect.currentframe()
-            if current_frame is not None:
-                line_number = current_frame.f_lineno
-            else:
-                line_number = -1
-
-            LogMessage(
-                LogLevel.ERROR,
-                __file__,
-                line_number,
-                str(e),
-            )
+            message = f"Catastrophic failure in response callback: {e}"
+            LogMessage(LogLevel.ERROR, message)
             # catastrophic failure
             raise e from None
 
@@ -396,7 +377,7 @@ def _from_tritonserver_inference_response(
             try:
                 response.throw_if_response_error()
             except TritonError as error:
-                error.args += tuple(asdict(result).items())
+                error.args += (result,)
                 result.error = error
 
             name, version = response.model
@@ -426,7 +407,7 @@ def _from_tritonserver_inference_response(
             result.outputs = outputs
         except Exception as e:
             error = InternalError(f"Unexpected error in creating response object: {e}")
-            error.args += tuple(asdict(result).items())
+            error.args += (result,)
             result.error = error
 
         # TODO: support classification
diff --git a/python/tritonserver/_api/_server.py b/python/tritonserver/_api/_server.py
index 97ca344b0..13c1fe66f 100644
--- a/python/tritonserver/_api/_server.py
+++ b/python/tritonserver/_api/_server.py
@@ -41,6 +41,7 @@
     TRITONSERVER_InstanceGroupKind as InstanceGroupKind,
 )
 from tritonserver._c.triton_bindings import TRITONSERVER_LogFormat as LogFormat
+from tritonserver._c.triton_bindings import TRITONSERVER_LogLevel as LogLevel
 from tritonserver._c.triton_bindings import TRITONSERVER_Metric
 from tritonserver._c.triton_bindings import TRITONSERVER_MetricFamily as MetricFamily
 from tritonserver._c.triton_bindings import TRITONSERVER_MetricFormat as MetricFormat
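
Taken together, these changes make the allocator path degrade gracefully: a request that explicitly asks for output_memory_type="gpu" when no GPU allocator is registered is rejected up front with InvalidArgumentError, while a model that itself returns GPU tensors is transparently served from CPU memory with a logged warning. A minimal sketch of the fallback path follows; it mirrors test_memory_fallback_to_cpu above, and it assumes server_options and the "test" model repository are set up as in the test suite, with cupy available in the backend environment.

import json

import numpy
import tritonserver

server = tritonserver.Server(server_options).start(wait_until_ready=True)

# Simulate a build without a usable GPU allocator by removing the default one.
if tritonserver.MemoryType.GPU in tritonserver.default_memory_allocators:
    gpu_allocator = tritonserver.default_memory_allocators[tritonserver.MemoryType.GPU]
    del tritonserver.default_memory_allocators[tritonserver.MemoryType.GPU]
else:
    gpu_allocator = None

# The model copies its input into GPU memory (request_gpu_memory=True), so the
# response allocator logs a warning and falls back to CPU rather than failing.
server.load(
    "test",
    {
        "config": json.dumps(
            {
                "backend": "python",
                "parameters": {
                    "decoupled": {"string_value": "False"},
                    "request_gpu_memory": {"string_value": "True"},
                },
            }
        )
    },
)

fp16_input = numpy.random.rand(1, 100).astype(dtype=numpy.float16)
for response in server.model("test").infer(inputs={"fp16_input": fp16_input}):
    # Output lands in CPU memory even though the model produced GPU tensors.
    assert response.outputs["fp16_output"].memory_type == tritonserver.MemoryType.CPU

# Restore the default GPU allocator for any later requests.
if gpu_allocator is not None:
    tritonserver.default_memory_allocators[tritonserver.MemoryType.GPU] = gpu_allocator
server.stop()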