step2 cluster

xingzhongyu · xingzhongyu · commit 96fe34a0acd5 · 2024-01-20T17:11:40.000+08:00
diff --git a/test_automl/fun2code.py b/test_automl/fun2code.py
@@ -3,6 +3,7 @@
 from dance.transforms.cell_feature import CellPCA, CellSVD, WeightedFeaturePCA
 from dance.transforms.filter import FilterGenesPercentile, FilterGenesRegression
 from dance.transforms.interface import AnnDataTransform
+from dance.transforms.misc import SaveRaw
 from dance.transforms.normalize import ScaleFeature, ScTransformR
 
 #TODO register more functions
@@ -19,5 +20,6 @@
     "cell_svd": CellSVD(),
     "cell_weighted_pca": WeightedFeaturePCA(split_name="train"),
     "cell_pca": CellPCA(),
-    # "filter_cell_by_count":AnnDataTransform(sc.pp.filter_cells,min_genes=1)
+    "filter_cell_by_count": AnnDataTransform(sc.pp.filter_cells, min_genes=1),
+    "save_raw": SaveRaw()
 }  #funcion 2 code
diff --git a/test_automl/step2_cell_type_annotation_actinn_example.py b/test_automl/step2_cell_type_annotation_actinn_example.py
@@ -2,10 +2,12 @@
 
 import numpy as np
 import torch
-from step2_config import get_preprocessing_pipeline, log_in_wandb, setStep2
+from step2_config import get_transforms, log_in_wandb, setStep2
 
+from dance import logger
 from dance.datasets.singlemodality import CellTypeAnnotationDataset
 from dance.modules.single_modality.cell_type_annotation.actinn import ACTINN
+from dance.transforms.misc import Compose
 from dance.utils import set_seed
 
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -15,9 +17,11 @@
 def train(config):
 
     model = ACTINN(hidden_dims=config.hidden_dims, lambd=config.lambd, device=device)
-    preprocessing_pipeline = get_preprocessing_pipeline(config=config)
-    if preprocessing_pipeline is None:
+    transforms = get_transforms(config=config)
+    if transforms is None:
+        logger.warning("skip transforms")
         return {"scores": 0}
+    preprocessing_pipeline = Compose(*transforms, log_level="INFO")
     train_dataset = [753, 3285]
     test_dataset = [2695]
     tissue = "Brain"
@@ -75,6 +79,6 @@ def startSweep(parameters_dict) -> Tuple[Dict[str, Any], Callable[..., Any]]:
 
 if __name__ == "__main__":
     """get_function_combinations."""
-    function_list = setStep2(startSweep, original_list=["normalize_total", "gene_filter", "gene_dim_reduction"])
+    function_list = setStep2(startSweep, original_list=["normalize", "gene_filter", "gene_dim_reduction"])
     for func in function_list:
         func()
diff --git a/test_automl/step2_clustering_scdcc.py b/test_automl/step2_clustering_scdcc.py
@@ -0,0 +1,153 @@
+# normalize_per_cell must always be selected, because n_counts is required
+import os
+from typing import Any, Callable, Dict, Tuple
+
+import numpy as np
+import torch
+from step2_config import get_transforms, log_in_wandb, setStep2
+
+from dance import logger
+from dance.datasets.singlemodality import CellTypeAnnotationDataset, ClusteringDataset
+from dance.modules.single_modality.cell_type_annotation.actinn import ACTINN
+from dance.modules.single_modality.clustering.scdcc import ScDCC
+from dance.transforms.misc import Compose, SetConfig
+from dance.transforms.preprocess import generate_random_pair
+from dance.utils import set_seed
+
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+
+@log_in_wandb(config=None)
+def train(config):
+    """Train ScDCC on 10X_PBMC with the swept preprocessing pipeline and report the mean score.
+
+    Trains one model per seed in ``range(config.seed, config.seed + config.num_runs)`` and returns
+    ``{"scores": mean}``; skipped combinations (no ``normalize_total`` or no transforms) return ``{"scores": 0}``.
+    """
+    aris = []
+    for seed in range(config.seed, config.seed + config.num_runs):
+        set_seed(seed)
+        # Load data and perform necessary preprocessing
+        dataloader = ClusteringDataset("./test_automl/data", "10X_PBMC")
+        transforms = get_transforms(config=config, set_data_config=False, save_raw=True)
+        if ("normalize" not in config.keys() or config.normalize != "normalize_total") or transforms is None:
+            logger.warning("skip transforms")
+            return {"scores": 0}
+        data_config = {"feature_channel": [None, None, "n_counts"], "feature_channel_type": ["X", "raw_X", "obs"],
+                       "label_channel": "Group"}
+        transforms.append(SetConfig(data_config))
+        preprocessing_pipeline = Compose(*transforms, log_level="INFO")
+        data = dataloader.load_data(transform=preprocessing_pipeline, cache=config.cache)
+
+        # inputs: x, x_raw, n_counts
+        inputs, y = data.get_train_data()
+        n_clusters = len(np.unique(y))
+        in_dim = inputs[0].shape[1]
+
+        # Pick the labeled cells: sample a random fraction unless a file with their indices already exists
+        if not os.path.exists(config.label_cells_files):
+            indx = np.arange(len(y))
+            np.random.shuffle(indx)
+            label_cell_indx = indx[0:int(np.ceil(config.label_cells * len(y)))]
+        else:
+            label_cell_indx = np.loadtxt(config.label_cells_files, dtype=int)  # np.int was removed in NumPy 1.24
+
+        if config.n_pairwise > 0:
+            ml_ind1, ml_ind2, cl_ind1, cl_ind2, error_num = generate_random_pair(y, label_cell_indx, config.n_pairwise,
+                                                                                 config.n_pairwise_error)
+            print("Must link pairs: %d" % ml_ind1.shape[0])
+            print("Cannot link pairs: %d" % cl_ind1.shape[0])
+            print("Number of error pairs: %d" % error_num)
+        else:
+            ml_ind1, ml_ind2, cl_ind1, cl_ind2 = np.array([]), np.array([]), np.array([]), np.array([])
+
+        # Build and train model
+        model = ScDCC(input_dim=in_dim, z_dim=config.z_dim, n_clusters=n_clusters, encodeLayer=config.encodeLayer,
+                      decodeLayer=config.encodeLayer[::-1], sigma=config.sigma, gamma=config.gamma,
+                      ml_weight=config.ml_weight, cl_weight=config.cl_weight, device=config.device,
+                      pretrain_path=f"scdcc_{config.dataset}_pre.pkl")
+        model.fit(inputs, y, lr=config.lr, batch_size=config.batch_size, epochs=config.epochs, ml_ind1=ml_ind1,
+                  ml_ind2=ml_ind2, cl_ind1=cl_ind1, cl_ind2=cl_ind2, update_interval=config.update_interval,
+                  tol=config.tol, pt_batch_size=config.batch_size, pt_lr=config.pretrain_lr,
+                  pt_epochs=config.pretrain_epochs)
+
+        # Evaluate model predictions
+        score = model.score(None, y)
+        print(f"{score=:.4f}")
+        aris.append(score)
+
+    print('scdcc')
+    print(config.dataset)
+    print(f'aris: {aris}')
+    print(f'aris: {np.mean(aris)} +/- {np.std(aris)}')
+    return ({"scores": np.mean(aris)})
+
+
+def startSweep(parameters_dict) -> Tuple[Dict[str, Any], Callable[..., Any]]:
+    """Assemble the grid-search sweep configuration for the ScDCC clustering run.
+
+    Fixed (non-searched) settings for the run are merged into the caller's parameters.
+
+    Parameters
+    ----------
+    parameters_dict
+        Sweep parameters supplied by the caller. It is updated in place with the fixed settings below,
+        each wrapped as ``{'value': ...}``.
+
+    Returns
+    -------
+    Tuple[Dict[str, Any], Callable[..., Any]]
+        The sweep configuration (``grid`` method, the merged parameters, and the ``scores`` metric to
+        maximize) together with the ``train`` function.
+
+    Notes
+    -----
+    Only fixed settings are added here; the preprocessing choices to search over are presumably already
+    present in ``parameters_dict`` when this function is called (confirm against ``setStep2``).
+
+    """
+    fixed_settings = {
+        # Run control and data caching
+        'seed': 0,
+        'num_runs': 1,
+        'cache': True,
+        # Labeled cells and pairwise constraints
+        'label_cells_files': 'label_10X_PBMC.txt',
+        'label_cells': 0.1,
+        'n_pairwise': 0,
+        'n_pairwise_error': 0,
+        # Model construction arguments
+        'z_dim': 32,
+        'encodeLayer': [256, 64],
+        'sigma': 2.5,
+        'gamma': 1.0,
+        'ml_weight': 1.0,
+        'cl_weight': 1.0,
+        # Arguments forwarded to model.fit
+        'update_interval': 1.0,
+        'tol': 0.00001,
+        # Autoencoder-weight settings (not referenced by train in this file)
+        'ae_weights': None,
+        'ae_weight_file': "AE_weights.pth.tar",
+    }
+
+    # Every constant goes in as {'value': ...}, i.e. a single fixed value rather than a list of candidates.
+    parameters_dict.update({name: {'value': setting} for name, setting in fixed_settings.items()})
+
+    # Grid search over the parameters, maximizing the ``scores`` entry returned by ``train``.
+    sweep_config = {
+        'method': 'grid',
+        'parameters': parameters_dict,
+        'metric': {
+            'name': 'scores',
+            'goal': 'maximize',
+        },
+    }
+    return sweep_config, train  # Return function configuration and training function
+
+
+if __name__ == "__main__":
+    # setStep2 returns callables (presumably one sweep per combination of the listed steps); run each in turn.
+    sweep_launchers = setStep2(startSweep, original_list=["gene_filter", "cell_filter", "normalize"])
+    for launch in sweep_launchers:
+        launch()
diff --git a/test_automl/step2_config.py b/test_automl/step2_config.py
@@ -1,9 +1,9 @@
 import functools
 from itertools import combinations
 
-import wandb
 from fun2code import fun2code_dict
 
+import wandb
 from dance.transforms.misc import Compose, SetConfig
 
 #TODO register more functions and add more examples
@@ -17,6 +17,9 @@
     },
     "gene_dim_reduction": {
         "values": ["cell_svd", "cell_weighted_pca", "cell_pca"]
+    },
+    "cell_filter": {
+        "values": ["filter_cell_by_count"]
     }
 }  #Functions registered in the preprocessing process
 
@@ -25,29 +28,34 @@ def getFunConfig(selected_keys=None):
     """Get the config that needs to be optimized and the number of rounds."""
     global pipline2fun_dict
     pipline2fun_dict_subset = {key: pipline2fun_dict[key] for key in selected_keys}
+    print(pipline2fun_dict)
     count = 1
     for _, pipline_values in pipline2fun_dict_subset.items():
         count *= len(pipline_values['values'])
     return pipline2fun_dict_subset, count
 
 
-def get_preprocessing_pipeline(config=None):
+def get_transforms(config=None, set_data_config=True, save_raw=False):
     """Obtain the Compose of the preprocessing function according to the preprocessing
     process."""
     if ("normalize" not in config.keys() or config.normalize
             != "log1p") and ("gene_filter" in config.keys() and config.gene_filter == "highly_variable_genes"):
 
         return None
     transforms = []
-    transforms.append(fun2code_dict[config.normalize]) if "normalize" in config.keys() else None
     transforms.append(fun2code_dict[config.gene_filter]) if "gene_filter" in config.keys() else None
+    transforms.append(fun2code_dict[config.cell_filter]) if "cell_filter" in config.keys() else None
+    if save_raw:
+        transforms.append(fun2code_dict["save_raw"])
+    transforms.append(fun2code_dict[config.normalize]) if "normalize" in config.keys() else None
     transforms.append(fun2code_dict[config.gene_dim_reduction]) if "gene_dim_reduction" in config.keys() else None
-    data_config = {"label_channel": "cell_type"}
-    if "gene_dim_reduction" in config.keys():
-        data_config.update({"feature_channel": fun2code_dict[config.gene_dim_reduction].name})
-    transforms.append(SetConfig(data_config))
-    preprocessing_pipeline = Compose(*transforms, log_level="INFO")
-    return preprocessing_pipeline
+
+    if set_data_config:
+        data_config = {"label_channel": "cell_type"}
+        if "gene_dim_reduction" in config.keys():
+            data_config.update({"feature_channel": fun2code_dict[config.gene_dim_reduction].name})
+        transforms.append(SetConfig(data_config))
+    return transforms
 
 
 def sweepDecorator(selected_keys=None, project="pytorch-cell_type_annotation_ACTINN"):
diff --git a/test_automl/step2_test.py b/test_automl/step2_test.py
@@ -0,0 +1,2 @@
+def test_get_preprocessing_pipeline():
+    pass  # Not necessarily needed, since these are mostly decorator functions
diff --git a/test_automl/step3_cell_type_annotation_actinn_example.py b/test_automl/step3_cell_type_annotation_actinn_example.py
@@ -1,10 +1,12 @@
 import numpy as np
 import optuna
 import torch
-from step3_config import get_optimizer, get_preprocessing_pipeline
+from step3_config import get_optimizer, get_transforms
 
+from dance import logger
 from dance.datasets.singlemodality import CellTypeAnnotationDataset
 from dance.modules.single_modality.cell_type_annotation.actinn import ACTINN
+from dance.transforms.misc import Compose
 from dance.utils import set_seed
 
 fun_list = ["log1p", "filter_gene_by_count"]
@@ -30,7 +32,11 @@ def objective(trial):
     species = "mouse"
     dataloader = CellTypeAnnotationDataset(train_dataset=train_dataset, test_dataset=test_dataset, tissue=tissue,
                                            species=species, data_dir="./test_automl/data")
-    preprocessing_pipeline = get_preprocessing_pipeline(trial=trial, fun_list=fun_list)
+    transforms = get_transforms(trial=trial, fun_list=fun_list)
+    if transforms is None:
+        logger.warning("skip transforms")
+        return {"scores": 0}
+    preprocessing_pipeline = Compose(*transforms, log_level="INFO")
     data = dataloader.load_data(transform=preprocessing_pipeline, cache=True)
 
     # Obtain training and testing data
diff --git a/test_automl/step3_clustering_scdcc.py b/test_automl/step3_clustering_scdcc.py
diff --git a/test_automl/step3_config.py b/test_automl/step3_config.py
@@ -3,10 +3,10 @@
 
 import optuna
 import scanpy as sc
-import wandb
 from fun2code import fun2code_dict
 from optuna.integration.wandb import WeightsAndBiasesCallback
 
+import wandb
 from dance.transforms.cell_feature import CellPCA, CellSVD, WeightedFeaturePCA
 from dance.transforms.filter import FilterGenesPercentile, FilterGenesRegression
 from dance.transforms.interface import AnnDataTransform
@@ -115,6 +115,20 @@ def normalize_total(method_name: str, trial: optuna.Trial):
                                 exclude_highly_expressed=exclude_highly_expressed, max_fraction=max_fraction)
 
 
+@set_method_name
+def filter_cell_by_count(method_name: str, trial: optuna.Trial):
+    """Sample a ``sc.pp.filter_cells`` criterion and its integer threshold from ``trial``.
+
+    ``min_*`` thresholds are suggested between 2 and 10, ``max_*`` thresholds between 500 and 1000; the result
+    is an ``AnnDataTransform`` wrapping ``sc.pp.filter_cells`` with that single keyword argument.
+    """
+    # (low, high) bounds of the suggested threshold per criterion; key order defines the categorical choices.
+    bounds = {"min_counts": (2, 10), "min_genes": (2, 10), "max_counts": (500, 1000), "max_genes": (500, 1000)}
+    method = trial.suggest_categorical(method_name + "method", list(bounds))
+    num = trial.suggest_int(method_name + "num", *bounds[method])
+    return AnnDataTransform(sc.pp.filter_cells, **{method: num})
+
+
 # # 获取当前文件中的所有函数
 # functions = [(name,obj) for name, obj in inspect.getmembers(
 #     sys.modules[__name__]) if inspect.isfunction(obj)]
@@ -127,20 +141,28 @@ def normalize_total(method_name: str, trial: optuna.Trial):
 #         setattr(__name__, name, set_method_name(function))
 
 
-def get_preprocessing_pipeline(trial, fun_list):
-    """Obtain the Compose of the preprocessing function according to the preprocessing
-    function."""
+def get_transforms(trial, fun_list, set_data_config=True):
+    """Build the list of preprocessing transforms for the functions named in ``fun_list``.
+
+    Each name is resolved with ``eval`` and called with ``trial``, so ``fun_list`` must come from trusted code.
+    Returns ``None`` when ``highly_variable_genes`` is requested without an earlier ``log1p``; otherwise
+    returns the transforms, ending with a ``SetConfig`` step when ``set_data_config`` is true.
+    """
     transforms = []
     for f_str in fun_list:
         fun_i = eval(f_str)
         transforms.append(fun_i(trial))
-    data_config = {"label_channel": "cell_type"}
-    feature_name = {"cell_svd", "cell_weighted_pca", "cell_pca"} & set(fun_list)
-    if feature_name:
-        data_config.update({"feature_channel": fun2code_dict[feature_name].name})
-    transforms.append(SetConfig(data_config))
-    preprocessing_pipeline = Compose(*transforms, log_level="INFO")
-    return preprocessing_pipeline
+    # highly_variable_genes is only accepted when log1p comes earlier in the pipeline.
+    if "highly_variable_genes" in fun_list and "log1p" not in fun_list[:fun_list.index("highly_variable_genes")]:
+        return None
+    if set_data_config:
+        data_config = {"label_channel": "cell_type"}
+        # Look up a single name (a set is not a valid dict key); the last reduction step provides the features.
+        reducers = [name for name in fun_list if name in ("cell_svd", "cell_weighted_pca", "cell_pca")]
+        if reducers:
+            data_config.update({"feature_channel": fun2code_dict[reducers[-1]].name})
+        transforms.append(SetConfig(data_config))
+    return transforms
 
 
 def log_in_wandb(wandbc=None):
diff --git a/test_automl/step3_test.py b/test_automl/step3_test.py
@@ -0,0 +1,2 @@
+def test_get_preprocessing_pipeline():
+    pass  # Not necessarily needed, since these are mostly decorator functions

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+def test_get_preprocessing_pipeline():`
	`2`	`+ pass # Not necessarily needed, since these are mostly decorator functions`