vllm-project · vllm-bot · May 13, 2025 · Feb 24, 2025 · houseroad · May 9, 2025
diff --git a/csrc/quantization/compressed_tensors/int8_quant_kernels.cu b/csrc/quantization/compressed_tensors/int8_quant_kernels.cu
@@ -26,7 +26,13 @@ static inline __device__ int8_t float_to_int8_rn(float x) {
   float dst = std::nearbyint(x);
 
   // saturate
-  dst = std::clamp(dst, i8_min, i8_max);
+
+  // See https://github.com/pytorch/pytorch/issues/127666
+  // See https://github.com/llvm/llvm-project/issues/95183
+  // With hip-clang on Arch/gcc14, std::clamp references the host function
+  // __glibcxx_assert_fail, so std::clamp is replaced with equivalent logic.
+  // dst = std::clamp(dst, i8_min, i8_max);
+  dst = (dst < i8_min) ? i8_min : (dst > i8_max) ? i8_max : dst;
   return static_cast<int8_t>(dst);
 #else
   // CUDA path
@@ -79,7 +85,13 @@ static inline __device__ int8_t int32_to_int8(int32_t x) {
       static_cast<int32_t>(std::numeric_limits<int8_t>::max());
 
   // saturate
-  int32_t dst = std::clamp(x, i8_min, i8_max);
+
+  // See https://github.com/pytorch/pytorch/issues/127666
+  // See https://github.com/llvm/llvm-project/issues/95183
+  // With hip-clang on Arch/gcc14, std::clamp references the host function
+  // __glibcxx_assert_fail, so std::clamp is replaced with equivalent logic.
+  // int32_t dst = std::clamp(x, i8_min, i8_max);
+  int32_t dst = (x < i8_min) ? i8_min : (x > i8_max) ? i8_max : x;
   return static_cast<int8_t>(dst);
 #else
   // CUDA path

diff --git a/csrc/quantization/fused_kernels/quant_conversions.cuh b/csrc/quantization/fused_kernels/quant_conversions.cuh
@@ -21,7 +21,13 @@ static __device__ __forceinline__ int8_t float_to_int8_rn(float const x) {
   // round
   float dst = std::nearbyint(x);
   // saturate
-  dst = std::clamp(dst, i8_min, i8_max);
+
+  // See https://github.com/pytorch/pytorch/issues/127666
+  // See https://github.com/llvm/llvm-project/issues/95183
+  // With hip-clang on Arch/gcc14, std::clamp references the host function
+  // __glibcxx_assert_fail, so std::clamp is replaced with equivalent logic.
+  // dst = std::clamp(dst, i8_min, i8_max);
+  dst = (dst < i8_min) ? i8_min : (dst > i8_max) ? i8_max : dst;
   return static_cast<int8_t>(dst);
 #else
   // CUDA path