Skip to content

Commit c0ed755

Browse files
author
Gregory Johnson
authored Feb 3, 2020
return fov data, record fov ids on data splits (#65)
1 parent 4b8a8a1 commit c0ed755

File tree

4 files changed

+37
-88
lines changed

4 files changed

+37
-88
lines changed
 

‎demo.ipynb

Lines changed: 0 additions & 72 deletions
This file was deleted.

‎fov_processing_pipeline/bin/process.py

Lines changed: 18 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -97,35 +97,40 @@ def process(
9797
###########
9898
# QC data based on previous thresholds, etc
9999
###########
100-
df_stats = wrappers.qc_stats(df_stats, save_dir)
100+
df_stats_qc = wrappers.qc_stats(df_stats, save_dir)
101101

102-
###########
103-
# Make Plots
104-
###########
105-
wrappers.stats2plots(df_stats, parent_dir=save_dir, upstream_tasks=[df_stats])
102+
if not use_current_results:
106103

107-
###########
108-
# Make diagnostic images
109-
###########
110-
wrappers.im2diagnostics(
111-
fov_data, proj_paths, parent_dir=save_dir, upstream_tasks=[df_stats]
112-
)
104+
###########
105+
# Make Plots
106+
###########
107+
wrappers.stats2plots(
108+
df_stats_qc, parent_dir=save_dir, upstream_tasks=[df_stats_qc]
109+
)
110+
111+
###########
112+
# Make diagnostic images
113+
###########
114+
wrappers.im2diagnostics(
115+
fov_data, proj_paths, parent_dir=save_dir, upstream_tasks=[df_stats]
116+
)
113117

114118
###########
115119
# Do data splits for the data that survived QC
116120
###########
117121
splits_dict = wrappers.data_splits(
118-
df_stats, parent_dir=save_dir, upstream_tasks=[df_stats]
122+
df_stats_qc, parent_dir=save_dir, upstream_tasks=[df_stats_qc]
119123
)
120124

121125
state = flow.run(executor=executor)
122126

127+
fov_data = state.result[flow.get_tasks(name="save_load_data")[0]].result[1]
123128
df_stats = state.result[flow.get_tasks(name="load_stats")[0]].result
124129
splits_dict = state.result[flow.get_tasks(name="data_splits")[0]].result
125130

126131
log.info("Done!")
127132

128-
return df_stats, splits_dict
133+
return fov_data, df_stats, splits_dict
129134

130135

131136
###############################################################################

‎fov_processing_pipeline/tests/test_wrappers.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -70,9 +70,14 @@ def test_save_load_data(demo_cell_data, demo_fov_data, tmpdir):
7070

7171
def test_data_splits(demo_fov_data, tmpdir):
7272

73+
n_rows = 100
74+
id_column = "FOVId"
75+
7376
# Data Prep
7477
# the demo_fov_data is only one element, so we repeat it a few times, and rebuild the random numbers
75-
demo_fov_data = pd.concat([demo_fov_data] * 100, 0)
78+
79+
demo_fov_data = pd.concat([demo_fov_data] * n_rows, 0)
80+
demo_fov_data[id_column] = np.arange(0, n_rows)
7681
demo_fov_data["FOVId_rng"] = np.random.rand(demo_fov_data.shape[0])
7782

7883
# have two structures
@@ -92,6 +97,7 @@ def test_data_splits(demo_fov_data, tmpdir):
9297
split_amounts=split_amounts,
9398
group_column="ProteinDisplayName",
9499
split_column="FOVId_rng",
100+
id_column=id_column,
95101
)
96102

97103
assert len(splits_dict) == len(np.unique(demo_fov_data["ProteinDisplayName"]))
@@ -106,7 +112,7 @@ def test_data_splits(demo_fov_data, tmpdir):
106112

107113
assert os.path.exists(splits_dict[u_group][split_name]["save_path"])
108114

109-
inds_list.append(splits_dict[u_group][split_name]["split_inds"])
115+
inds_list.append(splits_dict[u_group][split_name][id_column])
110116

111117
inds_list = np.hstack(inds_list)
112118

@@ -126,6 +132,7 @@ def test_data_splits(demo_fov_data, tmpdir):
126132
split_amounts=split_amounts_wrong,
127133
group_column="ProteinDisplayName",
128134
split_column="FOVId_rng",
135+
id_column=id_column,
129136
)
130137

131138
demo_fov_data_wrong = demo_fov_data.copy()
@@ -152,6 +159,7 @@ def test_data_splits(demo_fov_data, tmpdir):
152159
split_amounts=split_amounts,
153160
group_column="this column doesnt exist",
154161
split_column="FOVId_rng",
162+
id_column=id_column,
155163
)
156164

157165
# Not existing split column
@@ -163,4 +171,5 @@ def test_data_splits(demo_fov_data, tmpdir):
163171
split_amounts=split_amounts,
164172
group_column="ProteinDisplayName",
165173
split_column="this column doesnt exist",
174+
id_column=id_column,
166175
)

‎fov_processing_pipeline/wrappers.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -328,6 +328,7 @@ def data_splits(
328328
split_amounts=[0.8, 0.1, 0.1],
329329
group_column="ProteinDisplayName",
330330
split_column="FOVId_rng",
331+
id_column="FOVId",
331332
):
332333
"""
333334
Given a stats dataframe, split each unique entry of `group_column` into groups based on the `split_column` random
@@ -354,6 +355,9 @@ def data_splits(
354355
split_column: str
355356
Column with values that we split on. All values must be in the range [0, 1). See: numpy.random.rand
356357
358+
id_column: str
359+
Column corresponding to unique ids
360+
357361
Returns
358362
-------
359363
splits_dict
@@ -410,6 +414,9 @@ def data_splits(
410414

411415
splits_dict[u_group][split_name] = {}
412416
splits_dict[u_group][split_name]["save_path"] = save_path
413-
splits_dict[u_group][split_name]["split_inds"] = np.where(split_inds)[0]
417+
418+
splits_dict[u_group][split_name][id_column] = df_stats[id_column][
419+
split_inds
420+
]
414421

415422
return splits_dict

0 commit comments

Comments
 (0)