
Commit 5f2cb45

Author: rshaw@neuralmagic.com
Commit message: update condition slightly
1 parent: 9f9d039

File tree: 1 file changed (+2 additions, -3 deletions)

vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py

Lines changed: 2 additions & 3 deletions
@@ -36,9 +36,8 @@ def __init__(self, strategy: str, is_static_input_scheme: bool):
     def process_weights_after_loading(self, layer) -> None:
         # If per tensor, when we have a fused module (e.g. QKV) with per
         # tensor scales (thus N scales being passed to the kernel),
-        # requantize so we can always run per tensor with torch._scaled_mm
-        if (self.strategy == QuantizationStrategy.TENSOR
-                or not self.cutlass_fp8_supported):
+        # requantize so we can always run per tensor
+        if self.strategy == QuantizationStrategy.TENSOR:
             max_w_scale, weight = requantize_with_max_scale(
                 weight=layer.weight,
                 weight_scale=layer.weight_scale,
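For context, the requantization step collapses the N per-shard scales of a fused module (e.g. QKV) into a single per-tensor scale, so the kernel only ever receives one scale. Below is a minimal, hypothetical sketch of that idea; the function name, the logical_widths argument, and the fp8 dtype are illustrative assumptions, not vLLM's actual requantize_with_max_scale implementation:

    import torch

    # Sketch only: shows the idea behind collapsing N per-shard fp8
    # scales into one per-tensor scale. Names, shapes, and the
    # logical_widths argument are assumptions, not vLLM's real helper.
    def requantize_with_max_scale_sketch(
        weight: torch.Tensor,        # fused fp8 weight, shards stacked on dim 0
        weight_scale: torch.Tensor,  # one scale per shard (N scales)
        logical_widths: list[int],   # rows belonging to each shard (assumed)
    ) -> tuple[torch.Tensor, torch.Tensor]:
        # The shared per-tensor scale is the largest per-shard scale, so
        # no shard's values overflow the fp8 range after requantization.
        max_w_scale = weight_scale.max()
        finfo = torch.finfo(torch.float8_e4m3fn)
        requantized = torch.empty_like(weight)
        start = 0
        for shard_scale, width in zip(weight_scale, logical_widths):
            end = start + width
            # Dequantize the shard with its own scale, then requantize
            # with the shared max scale so one scale serves the whole tensor.
            dequant = weight[start:end].to(torch.float32) * shard_scale
            requant = (dequant / max_w_scale).clamp(finfo.min, finfo.max)
            requantized[start:end] = requant.to(torch.float8_e4m3fn)
            start = end
        return max_w_scale, requantized

Requantizing with the maximum shard scale keeps every value representable in fp8; the trade-off is slightly coarser resolution for shards whose original scales were smaller.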
