
Commit 0654ea1

Update torchao api reference and add contributor guide

Summary:
1. Updated the torchao API reference for quantization to include the APIs we want to expose, renamed torchao/quantization/linear_activation_weight_observer.py to torchao/quantization/linear_activation_weight_observed_tensor.py, and removed safe_int_mm and int_scaled_matmul from quant_primitives.py (they are now exposed through torchao.kernel).
2. Added pytorch#391 (the contributor guide) to the torchao docs.

Test Plan: CI

1 parent f96e5ec commit 0654ea1

16 files changed: +807 −56 lines
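
The net effect on imports runs through most of the files below; a sketch of the before/after, grounded in the diffs that follow (the kernels themselves are unchanged):

# before this commit:
#   from torchao.quantization.quant_primitives import safe_int_mm, int_scaled_matmul
# after it, the kernels live under torchao.kernel:
from torchao.kernel import safe_int_mm, int_scaled_matmul
# and are also re-exported from the quantization namespace:
from torchao.quantization import safe_int_mm, int_scaled_matmul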

docs/source/api_ref_dtypes.rst (+3 −1)

@@ -12,9 +12,11 @@ torchao.dtypes
 
     to_nf4
     to_affine_quantized_intx
-    to_affine_quantized_floatx
     to_affine_quantized_intx_static
+    to_affine_quantized_floatx
     to_affine_quantized_floatx_static
+    to_affine_quantized_fpx
+    NF4Tensor
     AffineQuantizedTensor
 
 ..
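
As a quick illustration of the dtypes surface documented here, a minimal sketch of the NF4 conversion helper (the block_size/scaler_block_size values and the get_original_weight call reflect my reading of the NF4Tensor API and are assumptions, not part of this diff):

import torch
from torchao.dtypes import to_nf4

w = torch.randn(64, 1024, dtype=torch.bfloat16)
# two-level blocking: block_size for quantization, scaler_block_size for the scales
w_nf4 = to_nf4(w, block_size=64, scaler_block_size=256)
w_back = w_nf4.get_original_weight()  # dequantize back to bfloat16 for inspection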

docs/source/api_ref_intro.rst (+3 −6)

@@ -1,16 +1,13 @@
 ``torchao`` API Reference
 =========================
 
-This section introduces the torchao API reference.
-Dive into the details of how torchao integrates with PyTorch to
-optimize your machine learning models.
+This section introduces the torchao API reference. Dive into the details of how torchao integrates with PyTorch to optimize your machine learning models.
 
 .. toctree::
    :glob:
    :maxdepth: 1
    :caption: Python API Reference
 
-   api_ref_sparsity
-   api_ref_quantization
    api_ref_dtypes
-   api_ref_kernel
+   api_ref_quantization
+   api_ref_sparsity

docs/source/api_ref_quantization.rst (+32 −7)

@@ -9,15 +9,40 @@ torchao.quantization
 .. autosummary::
     :toctree: generated/
     :nosignatures:
-
-    SmoothFakeDynQuantMixin
-    SmoothFakeDynamicallyQuantizedLinear
-    swap_linear_with_smooth_fq_linear
-    smooth_fq_linear_to_inference
-    Int4WeightOnlyGPTQQuantizer
-    Int4WeightOnlyQuantizer
+    autoquant
+
     quantize_
     int8_dynamic_activation_int4_weight
     int8_dynamic_activation_int8_weight
     int4_weight_only
     int8_weight_only
+    float8_weight_only
+    float8_dynamic_activation_float8_weight
+    float8_static_activation_float8_weight
+    uintx_weight_only
+    fpx_weight_only
+
+    to_linear_activation_quantized
+    to_linear_activation_weight_observed
+
+    swap_linear_with_smooth_fq_linear
+    smooth_fq_linear_to_inference
+
+    choose_qparams_affine
+    choose_qparams_affine_with_min_max
+    choose_qparams_affine_floatx
+    quantize_affine
+    quantize_affine_floatx
+    dequantize_affine
+    dequantize_affine_floatx
+    choose_qparams_and_quantize_affine_hqq
+    fake_quantize_affine
+    fake_quantize_affine_cachemask
+
+    safe_int_mm
+    int_scaled_matmul
+
+    MappingType
+    ZeroPointDomain
+    TorchAODType
+
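
The workflow these docs now lead with is quantize_ plus a config constructor; a minimal sketch in the style of the torchao README (the group_size value is an arbitrary choice, and int4 weight-only currently targets bfloat16 CUDA models):

import torch
import torch.nn as nn
from torchao.quantization import quantize_, int4_weight_only

model = nn.Sequential(nn.Linear(1024, 1024)).cuda().to(torch.bfloat16)
# swaps each Linear weight for an int4 weight-only quantized tensor, in place
quantize_(model, int4_weight_only(group_size=32))
out = model(torch.randn(1, 1024, device="cuda", dtype=torch.bfloat16))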

docs/source/contributor_guide.rst (+674)

Large diffs are not rendered by default.

docs/source/index.rst (+11 −7)

@@ -1,9 +1,7 @@
 Welcome to the torchao Documentation
 =======================================
 
-**torchao** is an open-source library that provides the functionality
-to quantize and prune your models using native PyTorch. Our documentation is under development
-with more content coming soon.
+`**torchao** <https://github.com/pytorch/ao>`__ is a library for custom data types & optimizations. Quantize and sparsify weights, gradients, optimizers & activations for inference and training using native PyTorch. Please check out the torchao `README <https://github.com/pytorch/ao#torchao-pytorch-architecture-optimization>`__ for an overall introduction to the library and recent highlights and updates. The documentation here will focus on: 1. API Reference 2. Developer / Researcher Contribution Guide 3. Tutorials.
 
 ..
 .. grid:: 3
@@ -81,13 +79,19 @@ with more content coming soon.
    :maxdepth: 1
    :caption: API Reference
 
-   api_ref_sparsity
-   api_ref_intro
-   api_ref_quantization
    api_ref_dtypes
+   api_ref_quantization
+   api_ref_sparsity
 ..
    api_ref_kernel
-
+
+
+.. toctree::
+   :glob:
+   :maxdepth: 1
+   :caption: Contributor Guide
+
+   contributor_guide
+
 .. toctree::
    :glob:
    :maxdepth: 1

test/integration/test_integration.py (+3 −1)

@@ -34,8 +34,10 @@
     change_linear_weights_to_int8_woqtensors,
     change_linear_weights_to_int4_woqtensors,
 )
-from torchao.quantization.quant_primitives import (
+from torchao.quantization import (
     safe_int_mm,
+)
+from torchao.quantization.quant_primitives import (
     choose_qparams_affine,
     quantize_affine,
     dequantize_affine,

torchao/dtypes/affine_quantized_tensor.py (+3 −1)

@@ -31,10 +31,12 @@
     choose_qparams_and_quantize_affine_hqq,
     dequantize_affine,
     dequantize_affine_floatx,
-    int_scaled_matmul,
     quantize_affine,
     quantize_affine_floatx,
 )
+from torchao.kernel import (
+    int_scaled_matmul,
+)
 from torchao.quantization.utils import (
     pack_tinygemm_scales_and_zeros,
 )

torchao/kernel/__init__.py (+7)

@@ -0,0 +1,7 @@
+from torchao.kernel.intmm import int_scaled_matmul
+from torchao.kernel.intmm import safe_int_mm
+
+__all__ = [
+    "safe_int_mm",
+    "int_scaled_matmul",
+]
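
With the new package entry point the kernels can be exercised directly; a small sketch (shapes are arbitrary, and treating int_scaled_matmul's third argument as per-row scales is my assumption from its use in the int8 dynamic-quant path):

import torch
from torchao.kernel import safe_int_mm, int_scaled_matmul

a = torch.randint(-128, 127, (32, 64), dtype=torch.int8)
b = torch.randint(-128, 127, (64, 16), dtype=torch.int8)

# int8 x int8 -> int32 matmul, with a fallback on backends where
# torch._int_mm is unavailable or shape-constrained
c = safe_int_mm(a, b)
assert c.dtype == torch.int32

# additionally applies floating-point scales to the int32 result
scales = torch.rand(32, 1)
d = int_scaled_matmul(a, b, scales)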

torchao/quantization/__init__.py (+56 −25)

@@ -24,6 +24,10 @@
     PerTensor,
     PerToken,
 )
+from torchao.kernel import (
+    safe_int_mm,
+    int_scaled_matmul,
+)
 from .linear_activation_quantized_tensor import (
     LinearActivationQuantizedTensor,
     to_linear_activation_quantized,
@@ -70,52 +74,79 @@
     compute_error,
 )
 from .weight_only import WeightOnlyInt8QuantLinear
+from .linear_activation_weight_observed_tensor import (
+    to_linear_activation_weight_observed,
+)
 
 __all__ = [
-    "swap_conv2d_1x1_to_linear",
+    # top level API - auto
     "autoquant",
     "DEFAULT_AUTOQUANT_CLASS_LIST",
     "DEFAULT_INT4_AUTOQUANT_CLASS_LIST",
     "OTHER_AUTOQUANT_CLASS_LIST",
-    "get_scale",
-    "SmoothFakeDynQuantMixin",
-    "SmoothFakeDynamicallyQuantizedLinear",
-    "swap_linear_with_smooth_fq_linear",
-    "smooth_fq_linear_to_inference",
-    "set_smooth_fq_attribute",
-    "compute_error",
-    "Int4WeightOnlyGPTQQuantizer",
-    "Int4WeightOnlyQuantizer",
-    "quantize_affine",
-    "dequantize_affine",
-    "choose_qparams_affine",
+
+    # top level API - manual
     "quantize_",
     "int8_dynamic_activation_int4_weight",
     "int8_dynamic_activation_int8_weight",
     "int8_dynamic_activation_int8_semi_sparse_weight",
     "int4_weight_only",
     "int8_weight_only",
+    "float8_weight_only",
+    "float8_dynamic_activation_float8_weight",
+    "float8_static_activation_float8_weight",
    "uintx_weight_only",
    "fpx_weight_only",
-    "LinearActivationQuantizedTensor",
+
+    # smooth quant - subject to change
+    "swap_conv2d_1x1_to_linear",
+    "get_scale",
+    "SmoothFakeDynQuantMixin",
+    "SmoothFakeDynamicallyQuantizedLinear",
+    "swap_linear_with_smooth_fq_linear",
+    "smooth_fq_linear_to_inference",
+    "set_smooth_fq_attribute",
+    "compute_error",
+
+    # building blocks
     "to_linear_activation_quantized",
     "to_weight_tensor_with_linear_activation_scale_metadata",
-    "float8_weight_only",
-    "float8_dynamic_activation_float8_weight",
-    "float8_static_activation_float8_weight",
-    "Int8DynActInt4WeightGPTQQuantizer",
-    "Int8DynActInt4WeightQuantizer",
-    "Int8DynActInt4WeightLinear",
-    "WeightOnlyInt8QuantLinear",
-    "TwoStepQuantizer",
-    "Quantizer",
-    "ZeroPointDomain",
-    "MappingType",
     "AffineQuantizedMinMaxObserver",
     "AffineQuantizedObserverBase",
+
+    # quant primitive ops
+    "choose_qparams_affine",
+    "choose_qparams_affine_with_min_max",
+    "choose_qparams_affine_floatx",
+    "quantize_affine",
+    "quantize_affine_floatx",
+    "dequantize_affine",
+    "dequantize_affine_floatx",
+    "choose_qparams_and_quantize_affine_hqq",
+    "fake_quantize_affine",
+    "fake_quantize_affine_cachemask",
+
+    # operators/kernels
+    "safe_int_mm",
+    "int_scaled_matmul",
+
+    # dataclasses and types
+    "MappingType",
+    "ZeroPointDomain",
+    "TorchAODType",
     "PerTensor",
     "PerAxis",
     "PerGroup",
     "PerRow",
     "PerToken",
+
+    "LinearActivationQuantizedTensor",
+    "Int4WeightOnlyGPTQQuantizer",
+    "Int4WeightOnlyQuantizer",
+    "Int8DynActInt4WeightGPTQQuantizer",
+    "Int8DynActInt4WeightQuantizer",
+    "Int8DynActInt4WeightLinear",
+    "WeightOnlyInt8QuantLinear",
+    "TwoStepQuantizer",
+    "Quantizer",
 ]
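
Since the affine quant primitives are now part of the documented top-level namespace, a round-trip sketch (the argument layout follows the signatures in quant_primitives.py; the per-row block size is an arbitrary choice):

import torch
from torchao.quantization import (
    MappingType,
    choose_qparams_affine,
    quantize_affine,
    dequantize_affine,
)

x = torch.randn(4, 8)
block_size = (1, 8)  # one scale/zero_point per row

scale, zero_point = choose_qparams_affine(
    x, MappingType.ASYMMETRIC, block_size, torch.int8,
    quant_min=-128, quant_max=127,
)
xq = quantize_affine(x, block_size, scale, zero_point, torch.int8,
                     quant_min=-128, quant_max=127)
xdq = dequantize_affine(xq, block_size, scale, zero_point, torch.int8,
                        quant_min=-128, quant_max=127)
print((x - xdq).abs().max())  # small quantization error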

torchao/quantization/autoquant.py (+2 −1)

@@ -24,13 +24,14 @@
     PerRow,
     PerTensor,
 )
-from .quant_primitives import safe_int_mm
+from torchao.kernel import safe_int_mm
 from .subclass import ( # noqa
     Int8DynamicallyQuantizedLinearWeight,
     Int8WeightOnlyQuantizedLinearWeight,
     QuantizedLinearWeightBase,
 )
 
+
 __all__ = [
     "AutoQuantizableLinearWeight",
     "autoquant",

torchao/quantization/linear_activation_quantized_tensor.py (+2 −2)

@@ -19,7 +19,7 @@
 class LinearActivationQuantizedTensor(TorchAOBaseTensor):
     """
     Applies activation quantization for linear operator, this is used to support
-    dynamic quantization or static quantization, user can pass in a `input_quant_func`
+    dynamic quantization; the user can pass in an `input_quant_func`
     that is used to quantize the activation
 
     Args:
@@ -60,7 +60,7 @@ def __init__(
         self.quant_kwargs = quant_kwargs
 
     def __repr__(self):
-        return f"LinearActivationQuantizedTensor({self.original_weight_tensor}, {self.input_quant_func}, quant_kwargs={self.quant_kwargs}))"
+        return f"{self.__class__.__name__}({self.original_weight_tensor}, {self.input_quant_func}, quant_kwargs={self.quant_kwargs})"
 
     def __tensor_flatten__(self):
         return ["original_weight_tensor"], [self.input_quant_func, self.quant_kwargs]

torchao/quantization/linear_activation_weight_observer.py renamed to torchao/quantization/linear_activation_weight_observed_tensor.py (+3)

@@ -11,6 +11,7 @@
 
 __all__ = [
     "LinearActivationWeightObservedTensor",
+    "to_linear_activation_weight_observed",
 ]
 
 aten = torch.ops.aten
@@ -147,6 +148,8 @@ def _(func, types, args, kwargs):
         args[0].to(*args[1:], **kwargs)._apply_fn_to_data(torch.clone),
     )
 
+to_linear_activation_weight_observed = LinearActivationWeightObservedTensor.from_float
+
 
 if TORCH_VERSION_AT_LEAST_2_5:
     # Allow a model with LinearActivationQuantizedTensor weights to be loaded with `weights_only=True`
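
The rename plus the new alias only change how the tensor subclass is reached; a sketch of the before/after import paths:

# before this commit:
#   from torchao.quantization.linear_activation_weight_observer import (
#       LinearActivationWeightObservedTensor,
#   )
# after it:
from torchao.quantization.linear_activation_weight_observed_tensor import (
    LinearActivationWeightObservedTensor,
    to_linear_activation_weight_observed,
)

# the helper is just the classmethod constructor under a functional name
assert (
    to_linear_activation_weight_observed
    == LinearActivationWeightObservedTensor.from_float
)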

torchao/quantization/quant_api.py (+1 −1)

@@ -385,7 +385,7 @@ def insert_observers_(
     def convert_to_linear_observer(linear_module: nn.Linear):
         # Wrap the weight with LinearActivationWeightObservedTensor and then with nn.Parameter
         linear_module.weight = nn.Parameter(
-            LinearActivationWeightObservedTensor.from_float(
+            to_linear_activation_weight_observed(
                 linear_module.weight,
                 input_observer=input_observer,
                 weight_observer=weight_observer,

torchao/quantization/quant_primitives.py (+3 −2)

@@ -24,8 +24,6 @@
 )
 
 __all__ = [
-    "safe_int_mm",
-    "int_scaled_matmul",
     "choose_qparams_affine",
     "choose_qparams_affine_with_min_max",
     "choose_qparams_affine_floatx",
@@ -36,6 +34,9 @@
     "fake_quantize_affine",
     "fake_quantize_affine_cachemask",
     "choose_qparams_and_quantize_affine_hqq",
+    "MappingType",
+    "ZeroPointDomain",
+    "TorchAODType",
 ]
 
 
torchao/quantization/utils.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -9,12 +9,14 @@
99
import torch
1010
from torch.utils._python_dispatch import TorchDispatchMode
1111

12+
from torchao.kernel import (
13+
int_scaled_matmul,
14+
)
1215
from torchao.quantization.quant_primitives import (
1316
MappingType,
1417
ZeroPointDomain,
1518
choose_qparams_affine,
1619
dequantize_affine,
17-
int_scaled_matmul,
1820
quantize_affine,
1921
)
2022
from torchao.utils import TORCH_VERSION_AT_LEAST_2_5

torchao/quantization/weight_tensor_linear_activation_quantization.py (+1 −1)

@@ -70,7 +70,7 @@ def __init__(
         self.quant_kwargs = quant_kwargs
 
     def __repr__(self):
-        return f"LinearActivationQuantizedTensor({self.original_weight_tensor}, {self.input_quant_func_static}, scale={self.scale}, zero_point={self.zero_point}, quant_kwargs={self.quant_kwargs})"
+        return f"{self.__class__.__name__}({self.original_weight_tensor}, {self.input_quant_func_static}, scale={self.scale}, zero_point={self.zero_point}, quant_kwargs={self.quant_kwargs})"
 
     def __tensor_flatten__(self):
         tensor_data = ["original_weight_tensor", "scale"]
