@@ -26,7 +26,13 @@ static inline __device__ int8_t float_to_int8_rn(float x) {
   float dst = std::nearbyint(x);
 
   // saturate
-  dst = std::clamp(dst, i8_min, i8_max);
+
+  // See https://github.com/pytorch/pytorch/issues/127666
+  // See https://github.com/llvm/llvm-project/issues/95183
+  // hip-clang std::clamp __glibcxx_assert_fail host function when building on
+  // Arch/gcc14. The following replaces std::clamp usage with similar logic
+  // dst = std::clamp(dst, i8_min, i8_max);
+  dst = (dst < i8_min) ? i8_min : (dst > i8_max) ? i8_max : dst;
 
   return static_cast<int8_t>(dst);
 #else
   // CUDA path
@@ -79,7 +85,13 @@ static inline __device__ int8_t int32_to_int8(int32_t x) {
       static_cast<int32_t>(std::numeric_limits<int8_t>::max());
 
   // saturate
-  int32_t dst = std::clamp(x, i8_min, i8_max);
+
+  // See https://github.com/pytorch/pytorch/issues/127666
+  // See https://github.com/llvm/llvm-project/issues/95183
+  // hip-clang std::clamp __glibcxx_assert_fail host function when building on
+  // Arch/gcc14. The following replaces std::clamp usage with similar logic
+  // int32_t dst = std::clamp(x, i8_min, i8_max);
+  int32_t dst = (x < i8_min) ? i8_min : (x > i8_max) ? i8_max : x;
 
   return static_cast<int8_t>(dst);
 #else
   // CUDA path
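
For reference, the ternary expression used in both hunks saturates to the int8 range exactly like the std::clamp call it replaces. A minimal host-side C++ sketch of that equivalence (not part of the patch; the clamp_ternary helper name is illustrative):

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <limits>

// Illustrative helper mirroring the replacement logic from the diff
// (hypothetical name, not part of the patch).
static inline int32_t clamp_ternary(int32_t x, int32_t lo, int32_t hi) {
  return (x < lo) ? lo : (x > hi) ? hi : x;
}

int main() {
  const int32_t i8_min = std::numeric_limits<int8_t>::min();  // -128
  const int32_t i8_max = std::numeric_limits<int8_t>::max();  //  127
  // Values around and beyond the int8 range should clamp identically.
  for (int32_t x : {-1000, -129, -128, -1, 0, 126, 127, 128, 1000}) {
    assert(clamp_ternary(x, i8_min, i8_max) == std::clamp(x, i8_min, i8_max));
  }
  return 0;
}

Per the linked pytorch and llvm issues, std::clamp's precondition check can pull in __glibcxx_assert_fail, a host-only libstdc++ symbol, when device code is built against gcc 14's libstdc++; the plain comparisons avoid that dependency while keeping the same saturation behavior.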