@@ -32,153 +32,88 @@
 
 
 @_aten_lowering_pass
-def lower_scaled_dot_product_attention(
+def replace_variants_of_sdpa(
     gm: torch.fx.GraphModule, settings: CompilationSettings
 ) -> torch.fx.GraphModule:
-    """Replace specific versions of scaled_dot_product_attention with an equivalent
-    implementation which can be easily converted to TRT
+    """Replace scaled_dot_product_attention with an equivalent
+    implementation which can be accurately converted to TRT
     """
-    original_fns, replacement = scaled_dot_product_attention_replacement()
-    replaced_nodes = []
-    # For each original function, search for it in the graph and replace
-    for original in original_fns:
-        replaced_nodes += torch.fx.subgraph_rewriter.replace_pattern_with_filters(
-            gm,
-            original,
-            replacement,
-            ignore_literals=True,
-        )
-
-    if replaced_nodes:
-        # Repair instances which use the kwargs field (specifically the "scale" kwarg)
-        # Also repair instances which specified the is_causal or attn_bias fields
-        for match in replaced_nodes:
-            attention_node_replaced = None
-            # Seek the attention operator being replaced
-            for node in match.nodes_map:
-                if node.target in REPLACEABLE_ATEN_OPS:
-                    attention_node_replaced = match.nodes_map[node]
-                    break
-
-            assert attention_node_replaced is not None
-            assert len(match.replacements) == 1
-
-            new_attention_node = match.replacements[0]
-
-            assert (
-                new_attention_node.target
-                == torch.nn.functional.scaled_dot_product_attention
-            )
-
-            # Copy the metadata of the replaced attention node to the new node
-            # TODO: Investigate why there are multiple FakeTensors in the metadata.
-            # We only use the first one as it contains the output shape information for this node.
-            if "val" in attention_node_replaced.meta:
-                new_attention_node.meta["val"] = copy.copy(
-                    attention_node_replaced.meta["val"][0]
-                )
-
-            # If the attention operator had keyword-args, copy them to the new node
-            if attention_node_replaced.kwargs:
-                new_attention_node.kwargs = {**attention_node_replaced.kwargs}
-
-            # Set default args in new node:
-            # Tensor? attn_mask=None, float dropout_p=0.0, bool is_causal=False
-            new_attention_node.args = new_attention_node.args + (None, 0.0, False)
-
-            # The `is_causal` argument was specified
+    attn_mask = None
+    is_causal = True
+    for node in gm.graph.nodes:
+        if node.op == "call_function" and node.target in REPLACEABLE_ATEN_OPS:
             if (
-                (
-                    attention_node_replaced.target
-                    == torch.ops.aten._scaled_dot_product_flash_attention.default
-                )
-                and args_bounds_check(attention_node_replaced.args, 4, False)
-            ) or (
-                (
-                    attention_node_replaced.target
-                    == torch.ops.aten._scaled_dot_product_efficient_attention.default
-                )
-                and args_bounds_check(attention_node_replaced.args, 6, False)
+                node.target
+                == torch.ops.aten._scaled_dot_product_efficient_attention.default
+            ):
+                if len(node.args) == 7:
+                    (
+                        query,
+                        key,
+                        value,
+                        attn_bias,
+                        compute_log_sumexp,
+                        dropout_p,
+                        is_causal,
+                    ) = node.args
+                elif len(node.args) == 5:
+                    query, key, value, attn_mask, is_causal = node.args
+                    dropout_p = 0.0
+                else:
+                    raise ValueError(
+                        f"Unexpected number of arguments for {node.target} in the graph"
+                    )
+            elif (
+                node.target
+                == torch.ops.aten._scaled_dot_product_flash_attention.default
             ):
-                new_attention_node.args = (
-                    new_attention_node.args[:5] + (True,) + new_attention_node.args[6:]
+                if len(node.args) == 6:
+                    query, key, value, dropout_p, is_causal, return_debug_mask = (
+                        node.args
+                    )
+                elif len(node.args) == 3:
+                    query, key, value = node.args
+                    dropout_p = 0.0
+                    is_causal = True
+                else:
+                    raise ValueError(
+                        f"Unexpected number of arguments for {node.target} in the graph"
+                    )
+            if attn_mask is not None:
+                logger.warning(
+                    f"This current version of SDPA converter does not support attn_mask for {node.target} in the graph. Ignoring it and using is_causal=True configuration."
                 )
 
-            # The `attn_bias` argument was specified
-            if (
-                attention_node_replaced.target
-                == torch.ops.aten._scaled_dot_product_efficient_attention.default
-            ) and args_bounds_check(attention_node_replaced.args, 3) is not None:
-                new_attention_node.args = (
-                    new_attention_node.args[:3]
-                    + attention_node_replaced.args[3]
-                    + new_attention_node.args[4:]
-                )
+            modified_input_args = (query, key, value, None, dropout_p, is_causal)
 
-    gm = clean_up_graph_after_modifications(gm)
-    logger.debug(f"Graph after lowering scaled dot product attention:\n{gm.graph}")
+            # Create a new node with torch.nn.functional.scaled_dot_product_attention
+            # The input args is (query, key, value, is_causal). kwargs has scale
+            with gm.graph.inserting_after(node):
+                new_node = gm.graph.call_function(
+                    torch.nn.functional.scaled_dot_product_attention,
+                    args=modified_input_args,
+                    kwargs={"scale": node.kwargs.get("scale", None)},
+                )
 
+                # Deep copy encounters RuntimeError: Cannot access data pointer of Tensor (e.g. FakeTensor, FunctionalTensor). So we use copy instead.
+                new_node.meta = copy.copy(node.meta)
+                # Check if there's a getitem node following this attention node
+                for user in list(node.users):
+                    if user.op == "call_function" and user.target == operator.getitem:
+                        # If the getitem is extracting the first element (the output tensor)
+                        if user.args[1] == 0:
+                            # Replace all uses of the getitem with the new attention node
+                            user.replace_all_uses_with(new_node)
+                            new_node.meta["val"] = new_node.meta["val"][0]
+                # Replace all uses of the original node with the new node
+                node.replace_all_uses_with(new_node)
+
+            gm.graph.erase_node(node)
+
+    # Clean up the graph
+    clean_up_graph_after_modifications(gm)
+
+    logger.info(
+        "Replaced variants of scaled_dot_product_attention with torch.nn.functional.scaled_dot_product_attention"
+    )
     return gm
-
-
-def scaled_dot_product_attention_replacement() -> Tuple[
-    Sequence[Callable[[torch.Tensor, torch.Tensor, torch.Tensor], torch.Tensor]],
-    Callable[[torch.Tensor, torch.Tensor, torch.Tensor], torch.Tensor],
-]:
-    """Constructs the original and replacement functions for efficient attention"""
-
-    # Efficient Attention original graph
-    def efficient(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor) -> torch.Tensor:
-        outputs = torch.ops.aten._scaled_dot_product_efficient_attention.default(
-            q,
-            k,
-            v,
-            None,
-            False,
-        )
-        out = operator.getitem(outputs, 0)
-        return out
-
-    # Flash Attention original graph
-    def flash(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor) -> torch.Tensor:
-        outputs = torch.ops.aten._scaled_dot_product_flash_attention.default(
-            q,
-            k,
-            v,
-        )
-        out = operator.getitem(outputs, 0)
-        return out
-
-    # Efficient Attention w/Scale original graph
-    def efficient_scale(
-        q: torch.Tensor, k: torch.Tensor, v: torch.Tensor
-    ) -> torch.Tensor:
-        outputs = torch.ops.aten._scaled_dot_product_efficient_attention.default(
-            q,
-            k,
-            v,
-            None,
-            False,
-            scale=1.0,
-        )
-        out = operator.getitem(outputs, 0)
-        return out
-
-    # Flash Attention w/Scale original graph
-    def flash_scale(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor) -> torch.Tensor:
-        outputs = torch.ops.aten._scaled_dot_product_flash_attention.default(
-            q,
-            k,
-            v,
-            scale=1.0,
-        )
-        out = operator.getitem(outputs, 0)
-        return out
-
-    # Replacement graph consists of the functional version of scaled_dot_product_attention
-    def replacement(
-        query: torch.Tensor, key: torch.Tensor, value: torch.Tensor
-    ) -> torch.Tensor:
-        return torch.nn.functional.scaled_dot_product_attention(query, key, value)
-
-    return (efficient, flash, efficient_scale, flash_scale), replacement
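
For readers less familiar with torch.fx, below is a minimal, standalone sketch of the rewrite idiom the new pass relies on: the aten SDPA variants return a tuple, so the exported graph pairs the op with an operator.getitem(..., 0); the pass inserts a torch.nn.functional.scaled_dot_product_attention node after the variant, redirects the getitem's users to it, and erases the old nodes. Everything Torch-TensorRT-specific (the @_aten_lowering_pass registration, CompilationSettings, REPLACEABLE_ATEN_OPS, clean_up_graph_after_modifications, and the meta["val"] bookkeeping) is intentionally omitted, and the helper names build_toy_graph and rewrite_sdpa_variant are illustrative only, not part of the change.

import operator

import torch
import torch.fx as fx


def build_toy_graph() -> fx.GraphModule:
    # Hand-build the shape of graph an exporter typically produces for SDPA:
    # a tuple-returning aten variant followed by getitem(..., 0) for the output tensor.
    graph = fx.Graph()
    q = graph.placeholder("query")
    k = graph.placeholder("key")
    v = graph.placeholder("value")
    attn = graph.call_function(
        torch.ops.aten._scaled_dot_product_efficient_attention.default,
        args=(q, k, v, None, False),
    )
    out = graph.call_function(operator.getitem, args=(attn, 0))
    graph.output(out)
    return fx.GraphModule(torch.nn.Module(), graph)


def rewrite_sdpa_variant(gm: fx.GraphModule) -> fx.GraphModule:
    # Same rewrite idiom as the pass above, stripped of Torch-TensorRT plumbing.
    for node in list(gm.graph.nodes):
        if (
            node.op == "call_function"
            and node.target
            == torch.ops.aten._scaled_dot_product_efficient_attention.default
        ):
            query, key, value = node.args[:3]
            with gm.graph.inserting_after(node):
                new_node = gm.graph.call_function(
                    torch.nn.functional.scaled_dot_product_attention,
                    args=(query, key, value, None, 0.0, True),
                )
            # Redirect the getitem that extracted element 0, then drop the old nodes.
            for user in list(node.users):
                if user.op == "call_function" and user.target == operator.getitem:
                    if user.args[1] == 0:
                        user.replace_all_uses_with(new_node)
                        gm.graph.erase_node(user)
            gm.graph.erase_node(node)
    gm.graph.lint()
    gm.recompile()
    return gm


gm = rewrite_sdpa_variant(build_toy_graph())
print(gm.graph)  # now a single call to torch.nn.functional.scaled_dot_product_attention
q = k = v = torch.randn(1, 4, 8, 16)
print(gm(q, k, v).shape)

After the rewrite the toy graph holds a single functional SDPA call and can be executed directly, which is the form the TRT converter consumes.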