Fix scripts

cg123 · cg123 · commit 6125c51ae2e6 · 2025-03-13T13:57:53.000-07:00
diff --git a/mergekit/moe/mixtral.py b/mergekit/moe/mixtral.py
@@ -8,7 +8,8 @@
 import tqdm
 import transformers
 
-from mergekit.architecture import MISTRAL_INFO, WeightInfo
+from mergekit.architecture import WeightInfo
+from mergekit.architecture.mixtral import MISTRAL_INFO
 from mergekit.moe.arch import MoEOutputArchitecture
 from mergekit.moe.common import copy_tensor_out, initialize_io, select_dtype
 from mergekit.moe.config import MoEMergeConfig
diff --git a/mergekit/moe/qwen.py b/mergekit/moe/qwen.py
@@ -12,12 +12,14 @@
 # if the transformers version installed is too old
 from transformers.models.qwen2_moe import Qwen2MoeConfig
 
-from mergekit.architecture import QWEN2_INFO
+from mergekit.architecture.json_definitions import NAME_TO_ARCH
 from mergekit.moe.arch import MoEOutputArchitecture
 from mergekit.moe.common import copy_tensor_out, initialize_io, select_dtype
 from mergekit.moe.config import MoEMergeConfig
 from mergekit.options import MergeOptions
 
+QWEN2_INFO = NAME_TO_ARCH["Qwen2ForCausalLM"][0]
+
 
 class QwenMoE(MoEOutputArchitecture):
     def name(self) -> str:
diff --git a/mergekit/scripts/fill_missing_params.py b/mergekit/scripts/fill_missing_params.py
@@ -3,13 +3,14 @@
 import logging
 import shutil
 from pathlib import Path
+from typing import List, Optional, Tuple
 
 import click
 import torch
+from huggingface_hub import snapshot_download
 from safetensors import safe_open
 from tqdm import tqdm
 
-from mergekit.architecture import ParameterNamesUtils
 from mergekit.io.lazy_tensor_loader import ShardedTensorIndex
 from mergekit.io.tensor_writer import TensorWriter
 
@@ -197,3 +198,226 @@ def main(
 
+class ParameterNamesUtils:
+    """Helpers for comparing tensor (parameter) names across model checkpoints.
+
+    Every member is a static method; the class only serves as a namespace. It is
+    defined above the ``__main__`` guard so the name already exists before
+    ``main()`` runs when this file is executed as a script.
+    """
+
+    @staticmethod
+    def resolve_model_directory(repo_id: str) -> Path:
+        """Return ``repo_id`` as a ``Path`` when it is an existing local directory,
+        otherwise the path returned by ``snapshot_download(repo_id)``."""
+        local_dir = Path(repo_id)
+        if local_dir.is_dir():
+            return local_dir
+        return Path(snapshot_download(repo_id))
+
+    @staticmethod
+    def get_model_parameter_names(repo_id: str) -> List[str]:
+        """Return the tensor names recorded in the model's ``ShardedTensorIndex``."""
+        model_dir = ParameterNamesUtils.resolve_model_directory(repo_id)
+        return list(ShardedTensorIndex.from_disk(str(model_dir)).tensor_paths.keys())
+
+    @staticmethod
+    def strip_prefix(name: str, prefix: str) -> str:
+        """Remove ``prefix + "."`` from the start of ``name``.
+
+        ``name`` is returned unchanged when ``prefix`` is empty or ``name`` does
+        not start with the dotted prefix.
+        """
+        if prefix != "" and name.startswith(prefix + "."):
+            return name[len(prefix) + 1 :]
+        return name
+
+    @staticmethod
+    def _candidate_prefixes(names: List[str]) -> List[str]:
+        """Return ``""`` followed by each distinct first dotted component of
+        ``names`` in first-seen order.
+
+        An ordered list (instead of a set) keeps the choice between equally good
+        prefixes identical from run to run.
+        """
+        first_components = (name.split(".")[0] for name in names if "." in name)
+        return list(dict.fromkeys(["", *first_components]))
+
+    @staticmethod
+    def find_prefix(list1: List[str], list2: List[str]) -> Optional[str]:
+        """Pick the ``list1`` prefix whose removal matches the most ``list2`` names.
+
+        Each first dotted component found in ``list1`` is tried. A prefix scores
+        the number of ``list2`` entries present in ``list1`` once that prefix is
+        stripped. ``""`` is returned unless some prefix scores strictly higher
+        than using no prefix; ties go to the prefix that appears first.
+        """
+        assert len(list1) >= len(list2), "params name list1 can't be shorter than list2"
+
+        prefix_matches = {}
+        for prefix in ParameterNamesUtils._candidate_prefixes(list1):
+            # Set membership keeps the scoring linear in len(list1) + len(list2).
+            stripped = {
+                ParameterNamesUtils.strip_prefix(item, prefix) for item in list1
+            }
+            prefix_matches[prefix] = sum(1 for item in list2 if item in stripped)
+
+        # max() yields the first maximal key and "" was inserted first, so the
+        # empty prefix wins whenever no other prefix strictly beats it.
+        return max(prefix_matches, key=prefix_matches.get)
+
+    @staticmethod
+    def find_common_ordered_names(
+        param_names: List[List[str]], prefixes: List[str]
+    ) -> List[str]:
+        """Return the names of ``param_names[0]`` present in every model, in order.
+
+        Names of model ``i >= 1`` are compared after prepending
+        ``prefixes[i] + "."`` when ``prefixes[i]`` is non-empty.
+        """
+        common_names = set(param_names[0])
+        for i in range(1, len(param_names)):
+            prefix = f"{prefixes[i]}." if prefixes[i] else ""
+            common_names.intersection_update({prefix + name for name in param_names[i]})
+        return [name for name in param_names[0] if name in common_names]
+
+    @staticmethod
+    def remove_size_conflicts(common_names, referenced_models, prefixes):
+        """Return ``common_names`` without tensors whose shape differs across models.
+
+        ``referenced_models[0]`` is the reference model. For model ``i >= 1`` the
+        tensor is looked up with ``prefixes[i] + "."`` removed when the name
+        starts with it. Each dropped name is logged together with both shapes.
+        The result is a new list; ``common_names`` itself is not modified.
+        """
+        model_dirs = [
+            ParameterNamesUtils.resolve_model_directory(m.model.path)
+            for m in referenced_models
+        ]
+        model_indices = [
+            ShardedTensorIndex.from_disk(str(model_dir)) for model_dir in model_dirs
+        ]
+
+        kept_names = []
+        removed_names = []
+
+        for name in common_names:
+            base_shape = ParameterNamesUtils.tensor_shape(name, model_indices[0])
+
+            for i in range(1, len(referenced_models)):
+                prefix = f"{prefixes[i]}." if prefixes[i] else ""
+                other_name = name
+                if prefix and name.startswith(prefix):
+                    other_name = name[len(prefix) :]
+                shape = ParameterNamesUtils.tensor_shape(other_name, model_indices[i])
+
+                if base_shape != shape:
+                    removed_names.append((name, base_shape, shape, i))
+                    break
+            else:
+                # No model disagreed with the reference shape.
+                kept_names.append(name)
+
+        if removed_names:
+            # common_names still holds every candidate, so its length is the total.
+            logging.warning(
+                f"Size mismatch detected for {len(removed_names)}/{len(common_names)} tensors. "
+                "These names were removed from the merge list."
+            )
+            logging.info(
+                "The following tensors have different shapes across models and were removed from the merge list:"
+            )
+            for name, base_shape, shape, i in removed_names:
+                logging.info(
+                    f"Tensor name: {name}, Base model shape: {base_shape}, Mismatched shape: {shape} in model {referenced_models[i].model.path}"
+                )
+
+        return kept_names
+
+    @staticmethod
+    def are_common_params_ordered(list1: List[str], list2: List[str]) -> bool:
+        """Check that names shared by both lists keep the same relative order.
+
+        ``list2`` is walked in order; the result is ``False`` as soon as a shared
+        name sits earlier in ``list1`` than the previous shared name did.
+        """
+        # First-occurrence positions, i.e. what list1.index(name) would return.
+        first_position = {}
+        for position, name in enumerate(list1):
+            first_position.setdefault(name, position)
+
+        last_index = -1
+        for param in list2:
+            current_index = first_position.get(param)
+            if current_index is None:
+                continue
+            if current_index < last_index:
+                return False
+            last_index = current_index
+        return True
+
+    @staticmethod
+    def ordered_sublist(list1: List[str], list2: List[str]) -> bool:
+        """Check whether ``list2`` occurs in ``list1`` as a contiguous run."""
+        n, m = len(list1), len(list2)
+        return any(list1[i : i + m] == list2 for i in range(n - m + 1))
+
+    @staticmethod
+    def report_names_similarity(
+        base_names: List[str], other_names: List[str]
+    ) -> Tuple[Optional[str], str]:
+        """Describe how ``other_names`` relate to ``base_names`` for the best prefix.
+
+        Returns:
+            best_prefix: The first candidate prefix whose removal from
+                ``base_names`` makes ``other_names`` a contiguous run of it, or
+                otherwise the prefix sharing the most names with ``other_names``.
+            case_message: Human-readable summary of the relationship.
+        """
+        overlap_by_prefix = {}
+        for prefix in ParameterNamesUtils._candidate_prefixes(base_names):
+            base_names_stripped = [
+                ParameterNamesUtils.strip_prefix(name, prefix) for name in base_names
+            ]
+
+            if ParameterNamesUtils.ordered_sublist(base_names_stripped, other_names):
+                return prefix, "All params in model have exact match in base model."
+
+            overlap_by_prefix[prefix] = set(base_names_stripped).intersection(
+                other_names
+            )
+
+        # "" is always a candidate, so there is at least one entry to choose from.
+        best_prefix = max(overlap_by_prefix, key=lambda p: len(overlap_by_prefix[p]))
+        base_names_stripped = [
+            ParameterNamesUtils.strip_prefix(name, best_prefix) for name in base_names
+        ]
+        base_names_stripped_set = set(base_names_stripped)
+
+        overlap = len(overlap_by_prefix[best_prefix])
+        ordered = ParameterNamesUtils.are_common_params_ordered(
+            base_names_stripped, other_names
+        )
+        mismatched = "\n    ".join(
+            item for item in other_names if item not in base_names_stripped_set
+        )
+        case_message = (
+            f"{overlap}/{len(other_names)} ({100 * overlap / len(other_names):.2f}%) "
+            f"of model parameters are in the base model. \n"
+            f"  Name ordering is {'preserved' if ordered else 'not preserved'}.\n"
+            f"  Missing parameters:\n    {mismatched}"
+        )
+        return best_prefix, case_message
+
+    @staticmethod
+    def tensor_shape(name, index) -> List[int]:
+        """Return the shape of tensor ``name`` as reported by its safetensors shard.
+
+        The shard file is ``index.base_path / index.tensor_paths[name]``.
+        """
+        with safe_open(
+            Path(index.base_path) / index.tensor_paths[name], framework="pt"
+        ) as f:
+            return f.get_slice(name).get_shape()
+
+
 if __name__ == "__main__":
     main()
diff --git a/mergekit/scripts/moe.py b/mergekit/scripts/moe.py
@@ -163,9 +163,6 @@ def select_output_arch(
     help="Device to use to compute embeddings",
     show_default=True,
 )
-@click.option(
-    "--verbose", "-v", type=bool, default=False, is_flag=True, help="Verbose logging"
-)
 @click.option(
     "--i-understand-this-is-not-useful-without-training",
     type=bool,
@@ -180,7 +177,6 @@ def main(
     load_in_4bit: bool,
     load_in_8bit: bool,
     device: str,
-    verbose: bool,
     i_understand_this_is_not_useful_without_training: bool,
     merge_options: MergeOptions,
 ):
@@ -204,7 +200,7 @@ def main(
         load_in_8bit=load_in_8bit,
         device=device,
         allow_all_same=i_understand_this_is_not_useful_without_training,
-        verbose=verbose,
+        verbose=merge_options.verbose,
     )
 
     if merge_options.write_model_card:
diff --git a/mergekit/scripts/tokensurgeon.py b/mergekit/scripts/tokensurgeon.py
@@ -132,6 +132,7 @@ def main(
         barycentric=barycentric,
         cosine_similarity=cosine_similarity,
         name=embed_info.name,
+        log_reconstruction_error=verbosity > 0,
     )
 
     if lm_head_info:
@@ -469,12 +470,14 @@ def get_embeddings(
 
         if log_reconstruction_error:
             # compute reconstruction error in donor_embed space
-            knn_reconstruction_error.append(
-                torch.nn.functional.mse_loss(
-                    (knn_embeddings.T.to(weights.dtype) @ weights).squeeze(),
-                    token_embedding,
-                ).item()
+            reconstructed = (
+                (knn_embeddings.T.to(weights.dtype) @ weights)
+                .squeeze()
+                .to(token_embedding.dtype)
             )
+            diff = token_embedding - reconstructed
+            mse = diff.square().mean().item()
+            knn_reconstruction_error.append(mse)
 
         # Reconstruct the embedding in original_embed space
         res[idx_1] = (e_c_0[indices].T @ weights).squeeze()
@@ -591,7 +594,7 @@ def validate_architecture(
     donor_arch_info = arch_info_for_config(donor_cfg)
     if donor_arch_info != model_arch_info:
         report_issue(
-            f"Model architectures do not match: {model_arch_info.name()} vs {donor_arch_info.name()}",
+            f"Model architectures do not match: {model_arch_info.expected_model_type} vs {donor_arch_info.expected_model_type}",
             error=not options.allow_crimes,
         )