add example for custom registered preprocessing func

RemyLau · RemyLau · commit aba9fb649dab · 2024-01-29T19:33:43.000-05:00
diff --git a/examples/tuning/cta_svm/main.py b/examples/tuning/cta_svm/main.py
@@ -3,14 +3,39 @@
 from typing import get_args
 
 import wandb
+from sklearn.random_projection import GaussianRandomProjection
 
 from dance import logger
 from dance.datasets.singlemodality import CellTypeAnnotationDataset
 from dance.modules.single_modality.cell_type_annotation.svm import SVM
 from dance.pipeline import PipelinePlaner
+from dance.registry import register_preprocessor
+from dance.transforms.base import BaseTransform
 from dance.typing import LogLevel
 from dance.utils import set_seed
 
+
+@register_preprocessor("feature", "cell")  # NOTE: register any custom preprocessing function to be used for tuning
+class GaussRandProjFeature(BaseTransform):
+    """Custom preprocessing that extracts cell features via Gaussian random projection (example for tuning)."""
+
+    _DISPLAY_ATTRS = ("n_components", "eps")
+
+    def __init__(self, n_components: int = 400, eps: float = 0.1, **kwargs):
+        super().__init__(**kwargs)
+        self.n_components = n_components
+        self.eps = eps  # NOTE: per the sklearn docs, eps is only used when n_components="auto"
+
+    def __call__(self, data):
+        """Project the features read from ``data``, store them in ``data.data.obsm[self.out]``, return ``data``."""
+        feat = data.get_feature(return_type="numpy")
+        grp = GaussianRandomProjection(n_components=self.n_components, eps=self.eps)
+        self.logger.info(f"Start generating cell feature via Gaussian random projection (d={self.n_components}).")
+        data.data.obsm[self.out] = grp.fit_transform(feat)
+
+        return data
+
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
     parser.add_argument("--cache", action="store_true", help="Cache processed data.")
diff --git a/examples/tuning/cta_svm/pipeline_tuning_config.yaml b/examples/tuning/cta_svm/pipeline_tuning_config.yaml
@@ -6,9 +6,11 @@ pipeline:
       - WeightedFeaturePCA
       - CellPCA
       - CellSVD
+      - GaussRandProjFeature  # Registered custom preprocessing func
     params:
       n_components: 400
       out: feature.cell
+      log_level: INFO
     default_params:
       WeightedFeaturePCA:
         split_name: train