Commit 4a875d8

Update on "Use llm_config instead of args in export_llama functions"
Differential Revision: [D75484927](https://our.internmc.facebook.com/intern/diff/D75484927) [ghstack-poisoned]
2 parents d7d33d7 + 41c4e81 commit 4a875d8

2 files changed: +75 −75 lines changed

examples/models/llama/config/llm_config.py

Lines changed: 49 additions & 6 deletions
@@ -41,15 +41,23 @@ class ModelType(str, Enum):


 class PreqMode(str, Enum):
+    """
+    If you are dealing with pre-quantized checkpoints, this used to
+    be the way to specify them. Now you don't need to specify these
+    options if you use a TorchAo-prequantized checkpoint, but they
+    are still around to preserve backward compatibility.
+    """
+
     PREQ_8DA4W = "8da4w"
     PREQ_8DA4W_OUT_8DA8W = "8da4w_output_8da8w"


 @dataclass
 class BaseConfig:
     """
-    These are specific to the specific model, e.g. whether it’s Qwen3 0.6B or Phi-4-mini.
-    For each of these different models, you can expect each of these fields to change.
+    Configurations specific to the model, e.g. whether it’s Qwen3 or Phi-4-mini,
+    and are the minimal set of parameters needed to load the pretrained
+    eager model and its weights.
     """

     model_class: ModelType = ModelType.LLAMA3
@@ -73,6 +81,12 @@ class BaseConfig:


 class DtypeOverride(str, Enum):
+    """
+    DType of the model. Highly recommended to use "fp32", unless you want to
+    export without a backend, in which case you can also use "bf16". "fp16"
+    is not recommended.
+    """
+
     FP32 = "fp32"
     FP16 = "fp16"
     BF16 = "bf16"
@@ -81,10 +95,10 @@ class DtypeOverride(str, Enum):
 @dataclass
 class ModelConfig:
     """
-    These are not necessarily specific to the model, but are needed to finish off
-    the rest of the model configuration in eager. You can think of these like
-    optimizations / actual configurations. The same ModelConfig can be applied
-    to different models.
+    Configurations not necessarily specific to the model, but are needed to
+    finish off the rest of the model configuration in eager. You can think
+    of these like optimizations / actual configurations. The same ModelConfig
+    can be applied to multiple models.
     """

     dtype_override: DtypeOverride = DtypeOverride.FP32
@@ -109,6 +123,10 @@ class ModelConfig:

 @dataclass
 class ExportConfig:
+    """
+    Configures properties relevant to the export process.
+    """
+
     max_seq_length: int = 128
     max_context_length: int = 128
     output_dir: Optional[str] = None
@@ -124,6 +142,10 @@ class ExportConfig:

 @dataclass
 class DebugConfig:
+    """
+    Configures options to debug the export process.
+    """
+
     profile_memory: bool = False
     profile_path: Optional[str] = None
     generate_etrecord: bool = False
@@ -137,6 +159,14 @@ class DebugConfig:


 class Pt2eQuantize(str, Enum):
+    """
+    Type of backend-specific Pt2e quantization strategy to use.
+
+    Pt2e uses a different quantization library that is graph-based
+    compared to `qmode`, which is also specified in the QuantizationConfig
+    and is source transform-based.
+    """
+
     XNNPACK_DYNAMIC = "xnnpack_dynamic"
     XNNPACK_DYNAMIC_QC4 = "xnnpack_dynamic_qc4"
     QNN_8A8W = "qnn_8a8w"
@@ -157,6 +187,10 @@ class SpinQuant(str, Enum):

 @dataclass
 class QuantizationConfig:
+    """
+    Configures how the model should be quantized (PTQ).
+    """
+
     qmode: Optional[str] = None
     embedding_quantize: Optional[str] = None
     pt2e_quantize: Optional[Pt2eQuantize] = None
@@ -248,6 +282,11 @@ class MPSConfig:

 @dataclass
 class BackendConfig:
+    """
+    Configures which backends should be used and how the backends
+    should be set up.
+    """
+
     xnnpack: XNNPackConfig = field(default_factory=XNNPackConfig)
     coreml: CoreMLConfig = field(default_factory=CoreMLConfig)
     vulkan: VulkanConfig = field(default_factory=VulkanConfig)
@@ -262,6 +301,10 @@ class BackendConfig:

 @dataclass
 class LlmConfig:
+    """
+    The overall configuration for customizing the LLM export process.
+    """
+
     base: BaseConfig = field(default_factory=BaseConfig)
     model: ModelConfig = field(default_factory=ModelConfig)
     export: ExportConfig = field(default_factory=ExportConfig)
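
For reference, a minimal sketch of how these nested dataclasses compose into an LlmConfig, using only fields that appear in this diff. The import path is assumed from the file location, and the overridden values are illustrative, not defaults taken from the commit:

    # Sketch only: module path inferred from examples/models/llama/config/llm_config.py.
    from examples.models.llama.config.llm_config import (
        BaseConfig,
        DtypeOverride,
        ExportConfig,
        LlmConfig,
        ModelConfig,
        ModelType,
    )

    # Each section keeps its dataclass defaults unless overridden here.
    llm_config = LlmConfig(
        base=BaseConfig(model_class=ModelType.LLAMA3),
        model=ModelConfig(
            dtype_override=DtypeOverride.FP32,  # "fp32" is the recommended dtype per the new docstring
            use_kv_cache=True,  # read as llm_config.model.use_kv_cache in export_llama_lib.py below
        ),
        export=ExportConfig(max_seq_length=256, max_context_length=256),
    )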

examples/models/llama/export_llama_lib.py

Lines changed: 26 additions & 69 deletions
@@ -661,36 +661,16 @@ def _prepare_for_llama_export(llm_config: LlmConfig) -> LLMEdgeManager:
         canonical_path(llm_config.base.params) if llm_config.base.params else None
     )
     output_dir_path = canonical_path(llm_config.export.output_dir, dir=True)
-    weight_type = WeightType.FAIRSEQ2 if llm_config.base.fairseq2 else WeightType.LLAMA

-    # Convert dtype override string to actual type
+    llm_config.base.checkpoint = checkpoint_path
+    llm_config.base.checkpoint_dir = checkpoint_dir
+    llm_config.base.params = params_path
+    llm_config.export.output_dir = output_dir_path
+
+    # Convert dtype override string to actual type.
     dtype_override = DType[llm_config.model.dtype_override]

-    edge_manager = _load_llama_model(
-        llm_config,
-        checkpoint=checkpoint_path,
-        checkpoint_dir=checkpoint_dir,
-        params_path=params_path,
-        use_kv_cache=llm_config.model.use_kv_cache,
-        use_sdpa_with_kv_cache=llm_config.model.use_sdpa_with_kv_cache,
-        generate_full_logits=llm_config.debug.generate_full_logits,
-        weight_type=weight_type,
-        enable_dynamic_shape=llm_config.model.enable_dynamic_shape,
-        calibration_tasks=llm_config.quantization.calibration_tasks,
-        calibration_limit=llm_config.quantization.calibration_limit,
-        calibration_seq_length=llm_config.quantization.calibration_seq_length,
-        calibration_data=llm_config.quantization.calibration_data,
-        tokenizer_path=llm_config.base.tokenizer_path,
-        verbose=llm_config.debug.verbose,
-        max_seq_len=llm_config.export.max_seq_length,
-        max_context_len=llm_config.export.max_context_length,
-        input_prune_map_path=llm_config.model.input_prune_map,
-        output_prune_map_path=llm_config.model.output_prune_map,
-        metadata_str=llm_config.base.metadata,
-        dtype_override=dtype_override,
-        use_qnn=llm_config.backend.qnn.enabled,
-        export_only=llm_config.export.export_only,
-    )
+    edge_manager = _load_llama_model(llm_config)

     # At this point, the model is loaded in the default fp32.

@@ -1167,32 +1147,7 @@ def _load_llama_model_metadata(
     return metadata


-def _load_llama_model(
-    llm_config: LlmConfig,
-    *,
-    checkpoint: Optional[str] = None,
-    checkpoint_dir: Optional[str] = None,
-    params_path: Optional[str] = None,
-    use_kv_cache: bool = False,
-    use_sdpa_with_kv_cache: bool = False,
-    generate_full_logits: bool = False,
-    weight_type: WeightType = WeightType.LLAMA,
-    enable_dynamic_shape: bool = False,
-    calibration_tasks: Optional[List[str]] = None,
-    calibration_limit: Optional[int] = None,
-    calibration_seq_length: Optional[int] = None,
-    calibration_data: Optional[str] = None,
-    tokenizer_path: Optional[str] = None,
-    verbose: bool = False,
-    max_seq_len: int = 128,
-    max_context_len: int = 128,
-    input_prune_map_path: Optional[str] = None,
-    output_prune_map_path: Optional[str] = None,
-    metadata_str: Optional[str] = None,
-    dtype_override: Optional[DType] = None,
-    use_qnn: bool = False,
-    export_only: bool = False,
-) -> "LLMEdgeManager":
+def _load_llama_model(llm_config: LlmConfig) -> "LLMEdgeManager":
     """
     A helper util that builds a Llama2 model. It returns a LLMEdgeManager that
     can help further lower the model to ExecuTorch.
@@ -1220,31 +1175,33 @@ def _load_llama_model(
             llm_config=llm_config,
         )
     )
+    # Convert dtype override string to actual type.
+    dtype_override = DType[llm_config.model.dtype_override]

     return LLMEdgeManager(
         model=model,
         modelname=modelname,
         max_seq_len=model.max_seq_len,  # type: ignore
         dtype=dtype_override,
-        use_kv_cache=use_kv_cache,
-        generate_full_logits=generate_full_logits,
+        use_kv_cache=llm_config.model.use_kv_cache,
+        generate_full_logits=llm_config.debug.generate_full_logits,
         example_inputs=example_inputs,
         example_kwarg_inputs=example_kwarg_inputs,
         dynamic_shapes=dynamic_shapes,
-        enable_dynamic_shape=enable_dynamic_shape,
-        calibration_tasks=calibration_tasks,
-        calibration_limit=calibration_limit,
-        calibration_seq_length=calibration_seq_length,
-        calibration_data=calibration_data,
-        tokenizer_path=tokenizer_path,
-        use_legacy_export=use_qnn,
-        save_exported_program=export_only,
-        verbose=verbose,
+        enable_dynamic_shape=llm_config.model.enable_dynamic_shape,
+        calibration_tasks=llm_config.quantization.calibration_tasks,
+        calibration_limit=llm_config.quantization.calibration_limit,
+        calibration_seq_length=llm_config.quantization.calibration_seq_length,
+        calibration_data=llm_config.quantization.calibration_data,
+        tokenizer_path=llm_config.base.tokenizer_path,
+        use_legacy_export=llm_config.backend.qnn.enabled,
+        save_exported_program=llm_config.export.export_only,
+        verbose=llm_config.debug.verbose,
         metadata=_load_llama_model_metadata(
-            weight_type,
-            use_kv_cache,
-            use_sdpa_with_kv_cache,
-            enable_dynamic_shape,
+            WeightType.FAIRSEQ2 if llm_config.base.fairseq2 else WeightType.LLAMA,
+            llm_config.model.use_kv_cache,
+            llm_config.model.use_sdpa_with_kv_cache,
+            llm_config.model.enable_dynamic_shape,
             # pyre-fixme[6]: For 5th argument expected `ModelArgs` but got
             # `Union[Tensor, Module]`.
             model.max_seq_len,
@@ -1257,7 +1214,7 @@ def _load_llama_model(
             # pyre-fixme[6]: For 8th argument expected `int` but got `Union[Tensor,
             # Module]`.
             model.vocab_size,
-            metadata_str,
+            llm_config.base.metadata,
         ),
     )

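
The shape of this refactor, reduced to a self-contained toy (the Toy* names below are illustrative only, not ExecuTorch APIs): rather than unpacking every field into keyword arguments at the call site and re-declaring them in the callee's signature, the single nested config object is passed through and each callee reads the fields it needs, as _load_llama_model(llm_config) now does.

    from dataclasses import dataclass, field


    @dataclass
    class ToyExportConfig:
        max_seq_length: int = 128


    @dataclass
    class ToyLlmConfig:
        export: ToyExportConfig = field(default_factory=ToyExportConfig)
        verbose: bool = False


    # Before: each field is threaded through as its own keyword argument.
    def load_model_old(*, max_seq_len: int = 128, verbose: bool = False) -> str:
        return f"seq={max_seq_len} verbose={verbose}"


    # After: the config object travels as one argument; fields are read where used.
    def load_model_new(cfg: ToyLlmConfig) -> str:
        return f"seq={cfg.export.max_seq_length} verbose={cfg.verbose}"


    cfg = ToyLlmConfig(export=ToyExportConfig(max_seq_length=256), verbose=True)
    assert load_model_old(max_seq_len=cfg.export.max_seq_length, verbose=cfg.verbose) == load_model_new(cfg)

The trade-off is the usual one for config objects: fewer signatures to keep in sync when a field is added, at the cost of callees depending on the full config type.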
