Skip to content

Commit f23b1ca

Browse files
committed
update ans
1 parent 9de47a1 commit f23b1ca

File tree

1 file changed

+56
-27
lines changed

1 file changed

+56
-27
lines changed

examples/atlas/sc_similarity_examples/example_usage_anndata.py

+56-27
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
from torch.utils.data import TensorDataset
1313

1414
from dance.atlas.sc_similarity.anndata_similarity import AnnDataSimilarity, get_anndata
15+
from dance.settings import DANCEDIR, METADIR
1516
from dance.utils import set_seed
1617

1718
# target_files = [
@@ -25,8 +26,8 @@
2526
# "eeacb0c1-2217-4cf6-b8ce-1f0fedf1b569"
2627
# ]
2728
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
28-
parser.add_argument("--tissue", type=str, default="blood")
29-
parser.add_argument("--data_dir", default="../../tuning/temp_data")
29+
parser.add_argument("--tissue", type=str, default="heart")
30+
parser.add_argument("--data_dir", default=DANCEDIR / f"examples/tuning/temp_data")
3031
args = parser.parse_args()
3132

3233
data_dir = args.data_dir
@@ -35,8 +36,8 @@
3536
tissue = args.tissue
3637
# conf_data = pd.read_csv(f"results/{tissue}_result.csv", index_col=0)
3738
conf_data = pd.read_excel("Cell Type Annotation Atlas.xlsx", sheet_name=tissue)
38-
target_files = list(conf_data[conf_data["queryed"] == False]["dataset_id"])
39-
source_files = list(conf_data[conf_data["queryed"] == True]["dataset_id"])
39+
atlas_datasets = list(conf_data[conf_data["queryed"] == False]["dataset_id"])
40+
query_datasets = list(conf_data[conf_data["queryed"] == True]["dataset_id"])
4041

4142

4243
class CustomEncoder(json.JSONEncoder):
@@ -117,46 +118,74 @@ def run_test_case(source_file):
117118
118119
"""
119120
ans = {}
120-
for target_file in target_files:
121+
source_data = get_anndata(train_dataset=[f"{source_file}"], data_dir=data_dir, tissue=tissue.capitalize())
122+
123+
for target_file in atlas_datasets:
121124
# source_data=sc.read_h5ad(f"{data_root}/{source_file}.h5ad")
122125
# target_data=sc.read_h5ad(f"{data_root}/{target_file}.h5ad")
123-
source_data = get_anndata(train_dataset=[f"{source_file}"], data_dir=data_dir)
124-
target_data = get_anndata(train_dataset=[f"{target_file}"], data_dir=data_dir)
126+
target_data = get_anndata(train_dataset=[f"{target_file}"], data_dir=data_dir, tissue=tissue.capitalize())
125127

126128
# Initialize similarity calculator with multiple metrics
127129
similarity_calculator = AnnDataSimilarity(adata1=source_data, adata2=target_data, sample_size=10,
128130
init_random_state=42, n_runs=1,
129131
ground_truth_conf_path="Cell Type Annotation Atlas.xlsx",
130-
adata1_name=source_file, adata2_name=target_file)
132+
adata1_name=source_file, adata2_name=target_file, tissue=tissue)
131133

132134
# Calculate similarity using multiple methods
133135
ans[target_file] = similarity_calculator.get_similarity_matrix_A2B(methods=[
134-
"wasserstein", "Hausdorff", "chamfer", "energy", "sinkhorn2", "bures", "spectral", "common_genes_num",
135-
"ground_truth", "mmd", "metadata_sim"
136+
"wasserstein",
137+
"Hausdorff",
138+
"chamfer",
139+
"energy",
140+
"sinkhorn2",
141+
"bures",
142+
"spectral",
143+
"common_genes_num",
144+
# "ground_truth",
145+
"mmd",
146+
"metadata_sim"
136147
])
137148

138149
# Convert results to DataFrame and save
139150
ans = pd.DataFrame(ans)
140-
ans.to_csv(f'sim_{source_file}.csv')
151+
ans_to_path = f'sims/{tissue}/sim_{source_file}.csv'
152+
os.makedirs(os.path.dirname(ans_to_path), exist_ok=True)
153+
ans.to_csv(ans_to_path)
141154
return ans
142155

143156

144-
query_data = os.listdir(file_root / "query_data")
145-
with pd.ExcelWriter(file_root / f"{tissue}_similarity.xlsx", engine='openpyxl') as writer:
146-
for source_file in source_files:
147-
query_ans = [
148-
pd.read_csv(file_root / "query_data" / element, index_col=0) for element in query_data
149-
if element.split("_")[-3] == source_file
150-
]
151-
ans = run_test_case(source_file)
152-
merged_df = pd.concat(query_ans + [ans], join='inner')
153-
try:
157+
start = False
158+
query_data = os.listdir(file_root / "in_atlas_datas" / f"{tissue}")
159+
excel_path = file_root / f"{tissue}_similarity.xlsx"
160+
# with pd.ExcelWriter(file_root / f"{tissue}_similarity.xlsx", engine='openpyxl') as writer:
161+
for source_file in query_datasets:
162+
# if source_file[:4]=='c777':
163+
# start=True
164+
# if not start:
165+
# continue
166+
query_ans = pd.concat([
167+
pd.read_csv(file_root / "in_atlas_datas" / f"{tissue}" / element, index_col=0) for element in query_data
168+
if element.split("_")[-3] == source_file
169+
])
170+
rename_dict = {col: col.replace('_from_cache', '') for col in query_ans.columns if '_from_cache' in col}
171+
query_ans = query_ans.rename(columns=rename_dict)
172+
ans = run_test_case(source_file)
173+
merged_df = pd.concat([query_ans, ans], join='inner')
174+
if os.path.exists(excel_path):
175+
excel = pd.ExcelFile(excel_path, engine='openpyxl')
176+
if source_file[:4] in excel.sheet_names:
154177
# Try to read the specified sheet
155-
existing_df = pd.read_excel(file_root / f"{tissue}_similarity.xlsx", sheet_name=source_file[:4])
178+
existing_df = pd.read_excel(file_root / f"{tissue}_similarity.xlsx", sheet_name=source_file[:4],
179+
engine="openpyxl", index_col=0)
156180
# Find the rows that exist in the new DataFrame but not in the existing sheet
157181
merged_df = pd.concat([existing_df, merged_df])
158-
merged_df = merged_df.drop_duplicates(keep='first')
159-
# 使用 ExcelWriter 更新特定分表
160-
merged_df.to_excel(writer, sheet_name=source_file[:4], index=False)
161-
except ValueError:
162-
merged_df.to_excel(writer, sheet_name=source_file[:4], index=True)
182+
merged_df = merged_df.drop_duplicates(subset=merged_df.index.name, keep='last')
183+
excel.close()
184+
if os.path.exists(excel_path):
185+
mode = 'a'
186+
if_sheet_exists = "replace"
187+
else:
188+
mode = 'w'
189+
if_sheet_exists = None
190+
with pd.ExcelWriter(excel_path, engine='openpyxl', mode=mode, if_sheet_exists=if_sheet_exists) as writer:
191+
merged_df.to_excel(writer, sheet_name=source_file[:4])

0 commit comments

Comments
 (0)