Commit e7aecdc

robertgshaw2-redhat authored and fialhocoelho committed
[ Kernel ] Fp8 Channelwise Weight Support (vllm-project#6487)
1 parent e57a59c commit e7aecdc

4 files changed: +76 −35 lines
vllm/config.py

Lines changed: 2 additions & 1 deletion
@@ -238,7 +238,8 @@ def _verify_quantization(self) -> None:
                 f"{self.quantization} quantization is currently not "
                 f"supported in ROCm.")
         if (self.quantization
-                not in ("fp8", "marlin", "gptq_marlin_24", "gptq_marlin")):
+                not in ("fp8", "marlin", "gptq_marlin_24", "gptq_marlin",
+                        "compressed_tensors")):
             logger.warning(
                 "%s quantization is not fully "
                 "optimized yet. The speed can be slower than "

vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py

Lines changed: 11 additions & 7 deletions
@@ -13,7 +13,8 @@
     CompressedTensorsWNA16)
 from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
     CompressionFormat, QuantizationArgs, QuantizationStrategy,
-    QuantizationType, find_first_name_or_class_match)
+    QuantizationType, find_first_name_or_class_match,
+    is_activation_quantization_format)
 from vllm.platforms import current_platform


@@ -132,10 +133,11 @@ def _is_fp8_w8a8(self, weight_quant: BaseModel,
         # Confirm weight scheme is supported.
         is_symmetric_weight = weight_quant.symmetric
         is_static_weight = not weight_quant.dynamic
-        is_per_tensor_weight = (
-            weight_quant.strategy == QuantizationStrategy.TENSOR)
+        is_per_tensor_or_channel_weight = (weight_quant.strategy in [
+            QuantizationStrategy.TENSOR, QuantizationStrategy.CHANNEL
+        ])
         if not (is_symmetric_weight and is_static_weight
-                and is_per_tensor_weight):
+                and is_per_tensor_or_channel_weight):
             return False

         # Dynamic quantization is always supported if weights supported.
@@ -167,6 +169,7 @@ def _is_wNa16_group_channel(self, weight_quant: BaseModel,
     def _get_schema(self, weight_quant: BaseModel,
                     input_quant: BaseModel) -> "CompressedTensorsScheme":

+        # Detect If Mixed Precision
         if self._is_wNa16_group_channel(weight_quant, input_quant):
             self._check_gptq_and_marlin_can_run()
             if (self.quant_format == CompressionFormat.marlin_24.value
@@ -182,11 +185,12 @@ def _get_schema(self, weight_quant: BaseModel,
                 strategy=weight_quant.strategy,
                 group_size=weight_quant.group_size)

-        if (self.quant_format == CompressionFormat.int_quantized.value or
-                self.quant_format == CompressionFormat.float_quantized.value):
+        # Detect If Activation Quantization.
+        if is_activation_quantization_format(self.quant_format):
             if self._is_fp8_w8a8(weight_quant, input_quant):
                 return CompressedTensorsW8A8Fp8(
-                    input_dynamic=input_quant.dynamic)
+                    strategy=weight_quant.strategy,
+                    is_static_input_scheme=(not input_quant.dynamic))

             if self._is_static_tensor_w8a8(weight_quant, input_quant):
                 return CompressedTensorsW8A8Int8(
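To make the dispatch above easier to follow, here is a hedged, standalone sketch of the weight check that `_is_fp8_w8a8` now performs (the enum and function below are simplified stand-ins, not the real pydantic models): a weight scheme qualifies for the fp8 W8A8 path if it is symmetric, static, and uses either per-tensor or, new in this commit, per-channel scales.

```python
# Hypothetical, simplified stand-ins just to show the strategy check
# added in this commit; not the real vLLM classes.
from enum import Enum


class QuantizationStrategy(str, Enum):
    TENSOR = "tensor"
    CHANNEL = "channel"
    GROUP = "group"


def fp8_w8a8_weight_supported(symmetric: bool, dynamic: bool,
                              strategy: QuantizationStrategy) -> bool:
    # Mirrors _is_fp8_w8a8's weight check: static, symmetric, and either
    # per-tensor or (new in this commit) per-channel scales are accepted.
    is_per_tensor_or_channel = strategy in (QuantizationStrategy.TENSOR,
                                            QuantizationStrategy.CHANNEL)
    return symmetric and (not dynamic) and is_per_tensor_or_channel


print(fp8_w8a8_weight_supported(True, False, QuantizationStrategy.CHANNEL))  # True
print(fp8_w8a8_weight_supported(True, False, QuantizationStrategy.GROUP))    # False
```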

vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py

Lines changed: 53 additions & 27 deletions
@@ -1,11 +1,15 @@
 from typing import Callable, List, Optional

 import torch
+from torch.nn import Parameter

 from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
     CompressedTensorsScheme)
+from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
+    QuantizationStrategy)
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
-    apply_fp8_linear, create_per_tensor_scale_param, cutlass_fp8_supported,
+    apply_fp8_linear, create_per_channel_scale_param,
+    create_per_tensor_scale_param, cutlass_fp8_supported,
     requantize_with_max_scale)
 from vllm.model_executor.utils import set_weight_attrs

@@ -14,39 +18,56 @@

 class CompressedTensorsW8A8Fp8(CompressedTensorsScheme):

-    def __init__(self, input_dynamic: bool):
-        self.input_dynamic = input_dynamic
+    def __init__(self, strategy: str, is_static_input_scheme: bool):
+        self.strategy = strategy
+        self.is_static_input_scheme = is_static_input_scheme
         self.cutlass_fp8_supported = cutlass_fp8_supported()

-    # W8A8-Fp8 kernels support only per-tensor and per-channel cases.
-    # So if we have a fused module (QKV, MLP) with per tensor scales (thus N
-    # scales being passed to the kernel), we requantize with a single scale.
+        # On Lovelace, fail for now if channelwise.
+        # TODO: (@tms) fallback
+        if (not self.cutlass_fp8_supported
+                and self.strategy == QuantizationStrategy.CHANNEL):
+            raise ValueError(
+                "Channelwise fp8 quantization requires vLLM's custom "
+                "cutlass kernels, which are not supported on your device."
+                "Consider quantizing with per tensor scales or upgrading "
+                "to Hopper.")
+
     def process_weights_after_loading(self, layer) -> None:
-        # Dequant -> Quant with max scale.
-        max_w_scale, weight = requantize_with_max_scale(
-            weight=layer.weight,
-            weight_scale=layer.weight_scale,
-            logical_widths=layer.logical_widths,
-        )
-
-        # Update layer with new values.
-        layer.weight = torch.nn.Parameter(weight.t(), requires_grad=False)
-        layer.weight_scale = torch.nn.Parameter(max_w_scale,
-                                                requires_grad=False)
-        if self.input_dynamic:
-            layer.input_scale = None
+        # If per tensor, when we have a fused module (e.g. QKV) with per
+        # tensor scales (thus N scales being passed to the kernel),
+        # requantize so we can always run per tensor
+        if self.strategy == QuantizationStrategy.TENSOR:
+            max_w_scale, weight = requantize_with_max_scale(
+                weight=layer.weight,
+                weight_scale=layer.weight_scale,
+                logical_widths=layer.logical_widths,
+            )
+
+            layer.weight = Parameter(weight.t(), requires_grad=False)
+            layer.weight_scale = Parameter(max_w_scale, requires_grad=False)
+
+        # If channelwise, scales are already lined up, so just transpose.
+        elif self.strategy == QuantizationStrategy.CHANNEL:
+            assert self.cutlass_fp8_supported
+            weight = layer.weight
+            layer.weight = Parameter(weight.t(), requires_grad=False)
+
+        else:
+            raise ValueError(f"Unknown quantization strategy {self.strategy}")
+
+        # INPUT SCALE
+        if self.is_static_input_scheme:
+            layer.input_scale = Parameter(layer.input_scale.max(),
+                                          requires_grad=False)
         else:
-            layer.input_scale = torch.nn.Parameter(layer.input_scale.max(),
-                                                   requires_grad=False)
+            layer.input_scale = None

     def create_weights(self, layer: torch.nn.Module,
                        output_partition_sizes: List[int],
                        input_size_per_partition: int,
                        params_dtype: torch.dtype, weight_loader: Callable,
                        **kwargs):
-
-        del params_dtype
-
         output_size_per_partition = sum(output_partition_sizes)
         layer.logical_widths = output_partition_sizes

@@ -63,12 +84,17 @@ def create_weights(self, layer: torch.nn.Module,
         })

         # WEIGHT SCALE
-        weight_scale = create_per_tensor_scale_param(
-            output_partition_sizes, weight_loader=weight_loader)
+        if self.strategy == QuantizationStrategy.CHANNEL:
+            weight_scale = create_per_channel_scale_param(
+                output_partition_sizes, weight_loader=weight_loader)
+        else:
+            assert self.strategy == QuantizationStrategy.TENSOR
+            weight_scale = create_per_tensor_scale_param(
+                output_partition_sizes, weight_loader=weight_loader)
         layer.register_parameter("weight_scale", weight_scale)

         # INPUT SCALE
-        if not self.input_dynamic:
+        if self.is_static_input_scheme:
             input_scale = create_per_tensor_scale_param(
                 output_partition_sizes, weight_loader=weight_loader)
             layer.register_parameter("input_scale", input_scale)

vllm/model_executor/layers/quantization/compressed_tensors/utils.py

Lines changed: 10 additions & 0 deletions
@@ -9,6 +9,7 @@
 class CompressionFormat(Enum):
     dense = "dense"
     sparse_bitmask = "sparse-bitmask"
+    naive_quantized = "naive-quantized"
     float_quantized = "float-quantized"
     int_quantized = "int-quantized"
     pack_quantized = "pack-quantized"
@@ -76,6 +77,15 @@ class QuantizationArgs(BaseModel):
     )


+def is_activation_quantization_format(format: str) -> bool:
+    _ACTIVATION_QUANTIZATION_FORMATS = [
+        CompressionFormat.naive_quantized.value,
+        CompressionFormat.int_quantized.value,
+        CompressionFormat.float_quantized.value
+    ]
+    return format in _ACTIVATION_QUANTIZATION_FORMATS
+
+
 def find_first_name_or_class_match(
     name: str,
     module: Module,
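For completeness, a quick usage sketch of the new helper (this assumes a vLLM build that includes this commit, so the import path below resolves): the three quantized formats report True, while dense, sparse-bitmask and pack-quantized report False.

```python
# Check the new helper against every CompressionFormat value.
from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
    CompressionFormat, is_activation_quantization_format)

for fmt in CompressionFormat:
    print(fmt.value, is_activation_quantization_format(fmt.value))
# naive-quantized, int-quantized and float-quantized -> True
# dense, sparse-bitmask and pack-quantized -> False
```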
