
Commit 3eeb148

[Misc] Pass cutlass_fp8_supported correctly in fbgemm_fp8 (#6871)
1 parent b1366a9 commit 3eeb148

vllm/model_executor/layers/quantization/fbgemm_fp8.py

Lines changed: 11 additions & 8 deletions
@@ -9,6 +9,7 @@
                                                UnquantizedLinearMethod)
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig, QuantizeMethodBase)
+from vllm.model_executor.layers.quantization.fp8 import cutlass_fp8_supported
 from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
     apply_fp8_marlin_linear, prepare_fp8_layer_for_marlin)
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
@@ -72,6 +73,7 @@ class FBGEMMFp8LinearMethod(LinearMethodBase):
 
     def __init__(self, quant_config: FBGEMMFp8Config):
         self.quant_config = quant_config
+        self.cutlass_fp8_supported = cutlass_fp8_supported()
 
     def create_weights(
         self,
@@ -139,11 +141,12 @@ def apply(self,
                 size_k=layer.input_size_per_partition,
                 bias=bias)
 
-        return apply_fp8_linear(input=x,
-                                weight=layer.weight,
-                                weight_scale=layer.weight_scale,
-                                input_scale=None,
-                                input_scale_ub=layer.input_scale_ub,
-                                bias=bias,
-                                cutlass_fp8_supported=True,
-                                use_per_token_if_dynamic=True)
+        return apply_fp8_linear(
+            input=x,
+            weight=layer.weight,
+            weight_scale=layer.weight_scale,
+            input_scale=None,
+            input_scale_ub=layer.input_scale_ub,
+            bias=bias,
+            cutlass_fp8_supported=self.cutlass_fp8_supported,
+            use_per_token_if_dynamic=True)
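
For context, here is a minimal, self-contained Python sketch of the pattern this commit applies: probe CUTLASS FP8 support once when the linear method is constructed, cache the result, and forward the cached flag to the kernel call instead of hard-coding cutlass_fp8_supported=True. The names FBGEMMFp8LinearMethod, cutlass_fp8_supported, and apply_fp8_linear come from the diff above; everything suffixed with _probe or _sketch below is a hypothetical stand-in, not the actual vLLM API.

# Minimal sketch of the pattern in this commit: cache a capability check at
# construction time and forward it to the kernel dispatch, rather than
# hard-coding cutlass_fp8_supported=True.  All names are illustrative.

def cutlass_fp8_supported_probe() -> bool:
    # Hypothetical capability check; the real vLLM helper inspects the CUDA
    # toolkit and GPU, not merely whether torch sees a CUDA device.
    try:
        import torch
        return torch.cuda.is_available()
    except ImportError:
        return False


def apply_fp8_linear_sketch(x, cutlass_fp8_supported: bool) -> str:
    # Illustrative dispatch only: pick a backend label based on the flag.
    return "cutlass_fp8_mm" if cutlass_fp8_supported else "torch_scaled_mm"


class FBGEMMFp8LinearMethodSketch:
    def __init__(self) -> None:
        # Evaluated once at construction; every apply() call reuses it.
        self.cutlass_fp8_supported = cutlass_fp8_supported_probe()

    def apply(self, x) -> str:
        # Forward the cached capability flag rather than hard-coding True.
        return apply_fp8_linear_sketch(
            x, cutlass_fp8_supported=self.cutlass_fp8_supported)


if __name__ == "__main__":
    method = FBGEMMFp8LinearMethodSketch()
    print(method.apply(x=None))  # prints whichever backend the flag selects

Per the commit title, the point of the change is that the flag passed to apply_fp8_linear now reflects the actual capability check performed in __init__ instead of an unconditional True.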

0 commit comments