|
12 | 12 | from torch.utils.data import TensorDataset
|
13 | 13 |
|
14 | 14 | from dance.atlas.sc_similarity.anndata_similarity import AnnDataSimilarity, get_anndata
|
| 15 | +from dance.settings import DANCEDIR, METADIR |
15 | 16 | from dance.utils import set_seed
|
16 | 17 |
|
17 | 18 | # target_files = [
|
|
25 | 26 | # "eeacb0c1-2217-4cf6-b8ce-1f0fedf1b569"
|
26 | 27 | # ]
|
# --- CLI configuration ---------------------------------------------------
# --tissue selects the sheet of the atlas metadata workbook; --data_dir points
# at the preprocessed datasets shipped with the repository's tuning examples.
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument("--tissue", type=str, default="heart")
# Plain string path segment — no interpolation needed (dropped stray f-prefix).
parser.add_argument("--data_dir", default=DANCEDIR / "examples/tuning/temp_data")
args = parser.parse_args()

data_dir = args.data_dir
tissue = args.tissue
# Per-tissue sheet of the atlas metadata; the "queryed" column marks datasets
# that have already been used as queries (True) vs. atlas members (False).
# NOTE(review): "queryed" is assumed to be a boolean column — confirm in the workbook.
conf_data = pd.read_excel("Cell Type Annotation Atlas.xlsx", sheet_name=tissue)
atlas_datasets = list(conf_data[conf_data["queryed"].eq(False)]["dataset_id"])
query_datasets = list(conf_data[conf_data["queryed"].eq(True)]["dataset_id"])
40 | 41 |
|
41 | 42 |
|
42 | 43 | class CustomEncoder(json.JSONEncoder):
|
@@ -117,46 +118,74 @@ def run_test_case(source_file):
|
117 | 118 |
|
118 | 119 | """
|
119 | 120 | ans = {}
|
120 |
| - for target_file in target_files: |
| 121 | + source_data = get_anndata(train_dataset=[f"{source_file}"], data_dir=data_dir, tissue=tissue.capitalize()) |
| 122 | + |
| 123 | + for target_file in atlas_datasets: |
121 | 124 | # source_data=sc.read_h5ad(f"{data_root}/{source_file}.h5ad")
|
122 | 125 | # target_data=sc.read_h5ad(f"{data_root}/{target_file}.h5ad")
|
123 |
| - source_data = get_anndata(train_dataset=[f"{source_file}"], data_dir=data_dir) |
124 |
| - target_data = get_anndata(train_dataset=[f"{target_file}"], data_dir=data_dir) |
| 126 | + target_data = get_anndata(train_dataset=[f"{target_file}"], data_dir=data_dir, tissue=tissue.capitalize()) |
125 | 127 |
|
126 | 128 | # Initialize similarity calculator with multiple metrics
|
127 | 129 | similarity_calculator = AnnDataSimilarity(adata1=source_data, adata2=target_data, sample_size=10,
|
128 | 130 | init_random_state=42, n_runs=1,
|
129 | 131 | ground_truth_conf_path="Cell Type Annotation Atlas.xlsx",
|
130 |
| - adata1_name=source_file, adata2_name=target_file) |
| 132 | + adata1_name=source_file, adata2_name=target_file, tissue=tissue) |
131 | 133 |
|
132 | 134 | # Calculate similarity using multiple methods
|
133 | 135 | ans[target_file] = similarity_calculator.get_similarity_matrix_A2B(methods=[
|
134 |
| - "wasserstein", "Hausdorff", "chamfer", "energy", "sinkhorn2", "bures", "spectral", "common_genes_num", |
135 |
| - "ground_truth", "mmd", "metadata_sim" |
| 136 | + "wasserstein", |
| 137 | + "Hausdorff", |
| 138 | + "chamfer", |
| 139 | + "energy", |
| 140 | + "sinkhorn2", |
| 141 | + "bures", |
| 142 | + "spectral", |
| 143 | + "common_genes_num", |
| 144 | + # "ground_truth", |
| 145 | + "mmd", |
| 146 | + "metadata_sim" |
136 | 147 | ])
|
137 | 148 |
|
138 | 149 | # Convert results to DataFrame and save
|
139 | 150 | ans = pd.DataFrame(ans)
|
140 |
| - ans.to_csv(f'sim_{source_file}.csv') |
| 151 | + ans_to_path = f'sims/{tissue}/sim_{source_file}.csv' |
| 152 | + os.makedirs(os.path.dirname(ans_to_path), exist_ok=True) |
| 153 | + ans.to_csv(ans_to_path) |
141 | 154 | return ans
|
142 | 155 |
|
143 | 156 |
|
144 |
| -query_data = os.listdir(file_root / "query_data") |
145 |
| -with pd.ExcelWriter(file_root / f"{tissue}_similarity.xlsx", engine='openpyxl') as writer: |
146 |
| - for source_file in source_files: |
147 |
| - query_ans = [ |
148 |
| - pd.read_csv(file_root / "query_data" / element, index_col=0) for element in query_data |
149 |
| - if element.split("_")[-3] == source_file |
150 |
| - ] |
151 |
| - ans = run_test_case(source_file) |
152 |
| - merged_df = pd.concat(query_ans + [ans], join='inner') |
153 |
| - try: |
# --- Merge cached query results with fresh similarities and export ------
# For each query dataset, combine its cached in-atlas CSV results with the
# similarities computed by run_test_case, then write/update one sheet
# (named by the dataset id's first 4 chars) in <tissue>_similarity.xlsx.
query_data = os.listdir(file_root / "in_atlas_datas" / f"{tissue}")
excel_path = file_root / f"{tissue}_similarity.xlsx"
for source_file in query_datasets:
    # Cached per-query CSVs belonging to this source dataset (the dataset id
    # is the third-from-last "_"-separated token of the file name).
    query_ans = pd.concat([
        pd.read_csv(file_root / "in_atlas_datas" / f"{tissue}" / element, index_col=0) for element in query_data
        if element.split("_")[-3] == source_file
    ])
    # Cached columns carry a "_from_cache" suffix; strip it so they align
    # with the freshly computed columns.
    rename_dict = {col: col.replace('_from_cache', '') for col in query_ans.columns if '_from_cache' in col}
    query_ans = query_ans.rename(columns=rename_dict)
    ans = run_test_case(source_file)
    merged_df = pd.concat([query_ans, ans], join='inner')

    if os.path.exists(excel_path):
        # Context manager guarantees the workbook handle is released even if
        # reading the existing sheet raises.
        with pd.ExcelFile(excel_path, engine='openpyxl') as excel:
            if source_file[:4] in excel.sheet_names:
                # Merge with the sheet written by a previous run, keeping the
                # most recent row for duplicated entries.
                existing_df = pd.read_excel(file_root / f"{tissue}_similarity.xlsx", sheet_name=source_file[:4],
                                            engine="openpyxl", index_col=0)
                merged_df = pd.concat([existing_df, merged_df])
                # NOTE(review): when the index has no name this is subset=None,
                # i.e. dedup over all columns rather than the index — confirm
                # that is the intended behavior.
                merged_df = merged_df.drop_duplicates(subset=merged_df.index.name, keep='last')

    # Append (replacing this dataset's sheet) when the workbook exists;
    # otherwise create it from scratch.
    if os.path.exists(excel_path):
        mode, if_sheet_exists = 'a', "replace"
    else:
        mode, if_sheet_exists = 'w', None
    with pd.ExcelWriter(excel_path, engine='openpyxl', mode=mode, if_sheet_exists=if_sheet_exists) as writer:
        merged_df.to_excel(writer, sheet_name=source_file[:4])
0 commit comments