From 50049459317be80e2a27bec81758a0312f05fb2b Mon Sep 17 00:00:00 2001
From: DannyYuyang-quic
Date: Fri, 2 May 2025 17:55:45 +0800
Subject: [PATCH] Qualcomm AI Engine Direct - multi-method support in
 to_edge_transform_and_lower_to_qnn

Summary:
- add multi-method support to to_edge_transform_and_lower_to_qnn
- replace deprecated capture_program usage in llama.py
---
 backends/qualcomm/tests/test_qnn_delegate.py |  66 ++++-----
 backends/qualcomm/utils/utils.py             | 142 ++++++++++++++-----
 examples/qualcomm/oss_scripts/llama/llama.py |  58 ++++----
 3 files changed, 156 insertions(+), 110 deletions(-)

diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py
index 71d953a5a7b..6ca14beef59 100644
--- a/backends/qualcomm/tests/test_qnn_delegate.py
+++ b/backends/qualcomm/tests/test_qnn_delegate.py
@@ -47,7 +47,6 @@
     generate_htp_compiler_spec,
     generate_qnn_executorch_compiler_spec,
     PyQnnManagerAdaptor,
-    QnnPartitioner,
     rewrite_prepared_observer,
     skip_annotation,
     to_edge_transform_and_lower_to_qnn,
@@ -89,12 +88,8 @@
 from executorch.examples.models.torchvision_vit.model import TorchVisionViTModel
 from executorch.examples.models.wav2letter import Wav2LetterModel
 
-from executorch.exir import EdgeProgramManager, to_edge
-from executorch.exir.backend.backend_api import (
-    disable_validation,
-    MethodProgramsPartitionerSpec,
-    to_backend,
-)
+from executorch.exir import to_edge
+from executorch.exir.backend.backend_api import disable_validation
 
 
 class TestQNNFloatingPointOperator(TestQNN):
@@ -2701,22 +2696,18 @@ def test_qnn_backend_multi_graphs(self):
             )
             for graph_name in graph_names
         ]
-        # TODO: retire capture_program once we figure out how to extract
-        # intermediate graph from official lowering API
-        edge_progs = {
-            graph_name: capture_program(module, sample_input).exported_program
-            for graph_name, module, sample_input in zip(
-                graph_names, modules, sample_inputs
-            )
-        }
-        partitioners = {
-            graph_name: QnnPartitioner(compiler_spec)
-            for graph_name, compiler_spec in zip(graph_names, compiler_specs)
-        }
-        lowered_ep_dict = to_backend(
-            MethodProgramsPartitionerSpec(edge_progs, partitioners)
+
+        modules_dict = {}
+        sample_inputs_dict = {}
+        compiler_specs_dict = {}
+        for i, graph_name in enumerate(graph_names):
+            modules_dict[graph_name] = modules[i]
+            sample_inputs_dict[graph_name] = sample_inputs[i]
+            compiler_specs_dict[graph_name] = compiler_specs[i]
+        delegated_program = to_edge_transform_and_lower_to_qnn(
+            modules_dict, sample_inputs_dict, compiler_specs_dict
         )
-        executorch_prog = EdgeProgramManager(lowered_ep_dict).to_executorch()
+        executorch_prog = delegated_program.to_executorch()
         for index, module in enumerate(modules):
             self.verify_output(
                 module=module,
@@ -3375,28 +3366,21 @@ def test_qnn_backend_multi_graphs(self):
             )
             for graph_name in graph_names
         ]
-        # TODO: retire capture_program once we figure out how to extract
-        # intermediate graph from official lowering API
-        for i, module in enumerate(modules):
-            module_exported = torch.export.export(module, sample_inputs[i]).module()
+        modules_dict = {}
+        sample_inputs_dict = {}
+        compiler_specs_dict = {}
+        for i, graph_name in enumerate(graph_names):
+            module_exported = torch.export.export(modules[i], sample_inputs[i]).module()
             module_prepared = prepare_pt2e(module_exported, make_quantizer())
             module_prepared(*sample_inputs[i])
-            modules[i] = convert_pt2e(module_prepared)
-
-        edge_progs = {
-            graph_name: capture_program(module, sample_input).exported_program
-            for graph_name, module, sample_input in zip(
-                graph_names, modules, sample_inputs
-            )
-        }
-        partitioners = {
-            graph_name: QnnPartitioner(compiler_spec)
-            for graph_name, compiler_spec in zip(graph_names, compiler_specs)
-        }
-        lowered_ep_dict = to_backend(
-            MethodProgramsPartitionerSpec(edge_progs, partitioners)
+            modules_dict[graph_name] = convert_pt2e(module_prepared)
+            sample_inputs_dict[graph_name] = sample_inputs[i]
+            compiler_specs_dict[graph_name] = compiler_specs[i]
+        delegated_program = to_edge_transform_and_lower_to_qnn(
+            modules_dict, sample_inputs_dict, compiler_specs_dict
         )
-        executorch_prog = EdgeProgramManager(lowered_ep_dict).to_executorch()
+
+        executorch_prog = delegated_program.to_executorch()
         for index, module in enumerate(modules):
             self.verify_output(
                 module=module,
diff --git a/backends/qualcomm/utils/utils.py b/backends/qualcomm/utils/utils.py
index 7ecef7ababe..24c90efaa64 100644
--- a/backends/qualcomm/utils/utils.py
+++ b/backends/qualcomm/utils/utils.py
@@ -317,60 +317,126 @@ def get_decomp_table(passes_job) -> Dict[torch._ops.OperatorBase, Callable]:
 
 
 def to_edge_transform_and_lower_to_qnn(
-    module: Union[torch.nn.Module, torch.fx.GraphModule],
-    inputs: Tuple[torch.Tensor],
-    compiler_specs: List[CompileSpec],
+    module: Union[
+        torch.nn.Module,
+        torch.fx.GraphModule,
+        Dict[str, torch.nn.Module],
+        Dict[str, torch.fx.GraphModule],
+    ],
+    inputs: Union[Tuple[torch.Tensor], Dict[str, Tuple[torch.Tensor]]],
+    compiler_specs: Union[List[Any], Dict[str, List[Any]]],
     constant_methods: Optional[Dict[str, Any]] = None,
     dynamic_shapes: Optional[Dict] = None,
     dep_table: Optional[Dict] = None,
-    passes_job: Optional[OrderedDict] = None,
+    passes_job: Optional[Union[OrderedDict, Dict[str, OrderedDict]]] = None,
     skip_node_id_set: Optional[set] = None,
     skip_node_op_set: Optional[set] = None,
 ) -> EdgeProgramManager:
     """
-    Transforms and lowers a given PyTorch module to QNN backend.
+    Transforms and lowers a given PyTorch module, or a dictionary of named
+    modules, to the QNN backend.
 
     Args:
-        module (Union[torch.nn.Module, torch.fx.GraphModule]): The PyTorch module or fx.GraphModule to be transformed.
-        inputs (Tuple[torch.Tensor]): The input tensors for the module.
-        compiler_specs (List[CompileSpec]): Compiler specs for Qualcomm AI Engine Direct.
-        constant_methods (Optional[Dict[str, Any]]): An optional dictionary of method name to the constant value
-            returned by that method in eager mode. Often used to store config information on
-            Edge models.
-        dynamic_shapes (Optional[Dict]): Information about dynamic shapes.
-        dep_table (Optional[Dict]): Dependency table for the transformation passes.
-        passes_job (Optional[OrderedDict]): Ordered dictionary of transformation passes.
-        skip_node_id_set (Optional[set]): Set of node IDs to skip during partitioning.
-        skip_node_op_set (Optional[set]): Set of node operations to skip during partitioning.
+        module (Union[torch.nn.Module, torch.fx.GraphModule, Dict[str, torch.nn.Module], Dict[str, torch.fx.GraphModule]]):
+            The PyTorch module or fx.GraphModule to be transformed, or a
+            dictionary mapping graph names to modules for multi-method export.
+        inputs (Union[Tuple[torch.Tensor], Dict[str, Tuple[torch.Tensor]]]):
+            The input tensors for the module, or a dictionary mapping graph
+            names to their input tensors.
+        compiler_specs (Union[List[Any], Dict[str, List[Any]]]):
+            Compiler specifications for Qualcomm AI Engine Direct, shared
+            across graphs or given per graph.
+        constant_methods (Optional[Dict[str, Any]]):
+            An optional dictionary mapping method names to constant values returned by those methods in eager mode.
+            Often used to store configuration information on Edge models.
+        dynamic_shapes (Optional[Dict]):
+            Dynamic-shape specification forwarded to torch.export.export,
+            shared across graphs or given per graph.
+        dep_table (Optional[Dict]):
+            Dependency table for the transformation passes, shared across
+            graphs or given per graph.
+        passes_job (Optional[Union[OrderedDict, Dict[str, OrderedDict]]]):
+            Ordered dictionary of transformation passes, shared across graphs
+            or given per graph.
+        skip_node_id_set (Optional[set]):
+            Set of node IDs to skip during partitioning.
+        skip_node_op_set (Optional[set]):
+            Set of node operations to skip during partitioning.
 
     Returns:
-        EdgeProgramManager: The manager for the edge program after transformation and lowering.
+        EdgeProgramManager:
+            The manager for the edge program after transformation and lowering.
     """
-    ep = torch.export.export(module, inputs, dynamic_shapes=dynamic_shapes, strict=True)
-    # This transformation is primarily intended for the LiftConstantScalarOperands pass
-    # to avoid creating temporary tensors in the operation builder.
-    # However, this pass will create a get_attr node, which should be converted
-    # into a lifted tensor constant by the lift_constant_tensor_pass.
-    # If placed in the to_edge_transform_passes, it will be executed
-    # after the lift_constant_tensor_pass, causing the operation builder
-    # to fail to correctly retrieve the parameter by the get_parameter.
-    ep = QnnPassManager().transform_for_export_pipeline(ep)
-    transform_passes = QnnPassManager().get_to_edge_transform_passes(
-        ep, passes_job=passes_job, dep_table=dep_table
-    )
-    qnn_partitioner = QnnPartitioner(
-        compiler_specs,
-        skip_node_id_set=skip_node_id_set,
-        skip_node_op_set=skip_node_op_set,
-    )
-    edge_program_manager = to_edge_transform_and_lower(
-        ep,
+
+    def ensure_graph_specific_dict(value, graph_names):
+        """
+        Ensures the input value is a dictionary whose keys match the provided
+        graph names. If it is not, a new dictionary is created with the graph
+        names as keys and the input value assigned to each key.
+
+        Examples:
+            1. Input is None:
+                >>> ensure_graph_specific_dict(None, ["forward1", "forward2"])
+                {'forward1': None, 'forward2': None}
+
+            2. Input is a single value x:
+                >>> ensure_graph_specific_dict(x, ["forward1", "forward2"])
+                {'forward1': x, 'forward2': x}
+
+            3. Input is a dict not keyed by graph names:
+                >>> ensure_graph_specific_dict({Any: x}, ["forward1", "forward2"])
+                {'forward1': {Any: x}, 'forward2': {Any: x}}
+        """
+        if value is None:
+            return {graph_name: None for graph_name in graph_names}
+        if isinstance(value, dict) and graph_names == value.keys():
+            return value
+        return {graph_name: value for graph_name in graph_names}
+
+    if not isinstance(module, dict):
+        module = {"forward": module}
+
+    # Ensure the per-graph arguments are dictionaries keyed by graph name
+    graph_names = module.keys()
+    inputs = ensure_graph_specific_dict(inputs, graph_names)
+    compiler_specs = ensure_graph_specific_dict(compiler_specs, graph_names)
+    dynamic_shapes = ensure_graph_specific_dict(dynamic_shapes, graph_names)
+    dep_table = ensure_graph_specific_dict(dep_table, graph_names)
+    passes_job = ensure_graph_specific_dict(passes_job, graph_names)
+
+    # Prepare programs and partitioners
+    aten_programs = {}
+    transform_passes = {}
+    qnn_partitioners = {
+        graph_name: [
+            QnnPartitioner(
+                compiler_specs[graph_name],
+                skip_node_id_set=skip_node_id_set,
+                skip_node_op_set=skip_node_op_set,
+            )
+        ]
+        for graph_name in graph_names
+    }
+
+    for graph_name, m in module.items():
+        ep = torch.export.export(
+            m,
+            inputs[graph_name],
+            dynamic_shapes=dynamic_shapes[graph_name],
+            strict=True,
+        )
+        # This transformation is primarily intended for the LiftConstantScalarOperands pass
+        # to avoid creating temporary tensors in the operation builder.
+        # However, this pass will create a get_attr node, which should be converted
+        # into a lifted tensor constant by the lift_constant_tensor_pass.
+        # If placed in the to_edge_transform_passes, it would run after the
+        # lift_constant_tensor_pass, causing the operation builder to fail to
+        # retrieve the parameter via get_parameter.
+        aten_programs[graph_name] = QnnPassManager().transform_for_export_pipeline(ep)
+        transform_passes[graph_name] = QnnPassManager().get_to_edge_transform_passes(
+            ep, passes_job=passes_job[graph_name], dep_table=dep_table[graph_name]
+        )
+
+    return to_edge_transform_and_lower(
+        aten_programs,
         transform_passes=transform_passes,
-        partitioner=[qnn_partitioner],
+        partitioner=qnn_partitioners,
         constant_methods=constant_methods,
         compile_config=qnn_edge_config(),
     )
-    return edge_program_manager
 
 
 def capture_program(
diff --git a/examples/qualcomm/oss_scripts/llama/llama.py b/examples/qualcomm/oss_scripts/llama/llama.py
index c97aadc79a9..2ff50cced27 100755
--- a/examples/qualcomm/oss_scripts/llama/llama.py
+++ b/examples/qualcomm/oss_scripts/llama/llama.py
@@ -692,44 +692,43 @@ def permute(w, heads):
             )
             for graph_name in graph_names
         ]
-
-        # TODO: retire capture_program once we figure out how to extract
-        # intermediate graph from official lowering API
-        edge_progs = {
-            graph_name: capture_program(
-                module=llama_instance.llama_graph_module,
-                inputs=sample_input,
-                dep_table=llama_instance.dep_table,
-                passes_job=llama_instance.passes_job,
-            ).exported_program
-            for graph_name, llama_instance, sample_input in zip(
-                graph_names, llama_instance_list, sample_inputs_list
-            )
-        }
-        for n in edge_progs[graph_names[0]].graph.nodes:
+        edge_prog_mgr = to_edge_transform_and_lower_to_qnn(
+            {
+                graph_name: instance.llama_graph_module
+                for graph_name, instance in zip(graph_names, llama_instance_list)
+            },
+            {
+                graph_name: inputs
+                for graph_name, inputs in zip(graph_names, sample_inputs_list)
+            },
+            {
+                graph_name: compiler_spec
+                for graph_name, compiler_spec in zip(graph_names, compiler_specs)
+            },
+            constant_methods=llama_instance_list[1].llama_meta,
+            dep_table={
+                graph_name: instance.dep_table
+                for graph_name, instance in zip(graph_names, llama_instance_list)
+            },
+            passes_job={
+                graph_name: instance.passes_job
+                for graph_name, instance in zip(graph_names, llama_instance_list)
+            },
+            skip_node_op_set={"llama.fallback.default"},
+        )
+        for n in list(edge_prog_mgr._edge_programs.values())[0].graph.nodes:
             if n.op == "output":
                 for node, output_encoding in n.meta[QCOM_QUANT_ATTRS_MAP].items():
                     if node.meta["val"].size() in llama_instance_list[0].io_shape:
                         quant_attrs = output_encoding
 
-        partitioners = {
-            graph_name: QnnPartitioner(
-                compiler_spec, skip_node_op_set={"llama.fallback.default"}
-            )
-            for graph_name, compiler_spec in zip(graph_names, compiler_specs)
-        }
-
-        lowered_ep_dict = to_backend(
-            MethodProgramsPartitionerSpec(edge_progs, partitioners)
-        )
-
         if args.num_sharding > 1:
             # TODO: add arg parser of spill_fill_size since weight-sharing based
             #       context binaries cannot be opened in x86 host
             pass
 
         if args.verbose:
-            for ep in lowered_ep_dict.values():
+            for ep in edge_prog_mgr._edge_programs.values():
                 print_delegation_info(ep.graph_module)
 
         executorch_config = ExecutorchBackendConfig(
@@ -743,10 +742,7 @@ def permute(w, heads):
             ),
             extract_delegate_segments=True,
         )
-        exec_prog_mgr = EdgeProgramManager(
-            edge_programs=lowered_ep_dict,
-            constant_methods=llama_instance_list[1].llama_meta,
-        ).to_executorch(executorch_config)
+        exec_prog_mgr = edge_prog_mgr.to_executorch(executorch_config)
 
         with open(f"{args.artifact}/{pte_filename}.pte", "wb") as file:
             exec_prog_mgr.write_to_file(file)
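--
Example usage of the new multi-method API (a minimal sketch for reviewers, not
part of the patch; the "prefill"/"decode" graph names and the *_module,
*_inputs, and *_specs variables are illustrative placeholders):

    from executorch.backends.qualcomm.utils.utils import (
        to_edge_transform_and_lower_to_qnn,
    )

    # One entry per method; the dict keys become the graph names in the
    # resulting program. Non-dict arguments are broadcast to every graph by
    # ensure_graph_specific_dict, and a bare module is wrapped as
    # {"forward": module}, preserving the old single-method behavior.
    modules = {"prefill": prefill_module, "decode": decode_module}
    sample_inputs = {"prefill": prefill_inputs, "decode": decode_inputs}
    compiler_specs = {"prefill": prefill_specs, "decode": decode_specs}

    edge_prog_mgr = to_edge_transform_and_lower_to_qnn(
        modules, sample_inputs, compiler_specs
    )
    exec_prog_mgr = edge_prog_mgr.to_executorch()
    with open("multi_method.pte", "wb") as f:
        exec_prog_mgr.write_to_file(f)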