return fov data, record fov ids on data splits (#65)

Gregory Johnson · web-flow · commit c0ed755b46d2 · 2020-02-03T15:52:37.000-08:00
diff --git a/demo.ipynb b/demo.ipynb
diff --git a/fov_processing_pipeline/bin/process.py b/fov_processing_pipeline/bin/process.py
@@ -97,35 +97,40 @@ def process(
         ###########
         # QC data based on previous thresholds, etc
         ###########
-        df_stats = wrappers.qc_stats(df_stats, save_dir)
+        df_stats_qc = wrappers.qc_stats(df_stats, save_dir)
 
-        ###########
-        # Make Plots
-        ###########
-        wrappers.stats2plots(df_stats, parent_dir=save_dir, upstream_tasks=[df_stats])
+        if not use_current_results:
 
-        ###########
-        # Make diagnostic images
-        ###########
-        wrappers.im2diagnostics(
-            fov_data, proj_paths, parent_dir=save_dir, upstream_tasks=[df_stats]
-        )
+            ###########
+            # Make Plots
+            ###########
+            wrappers.stats2plots(
+                df_stats_qc, parent_dir=save_dir, upstream_tasks=[df_stats_qc]
+            )
+
+            ###########
+            # Make diagnostic images
+            ###########
+            wrappers.im2diagnostics(
+                fov_data, proj_paths, parent_dir=save_dir, upstream_tasks=[df_stats]
+            )
 
         ###########
         # Do data splits for the data that survived QC
         ###########
         splits_dict = wrappers.data_splits(
-            df_stats, parent_dir=save_dir, upstream_tasks=[df_stats]
+            df_stats_qc, parent_dir=save_dir, upstream_tasks=[df_stats_qc]
         )
 
     state = flow.run(executor=executor)
 
+    fov_data = state.result[flow.get_tasks(name="save_load_data")[0]].result[1]
     df_stats = state.result[flow.get_tasks(name="load_stats")[0]].result
     splits_dict = state.result[flow.get_tasks(name="data_splits")[0]].result
 
     log.info("Done!")
 
-    return df_stats, splits_dict
+    return fov_data, df_stats, splits_dict
 
 
 ###############################################################################
diff --git a/fov_processing_pipeline/tests/test_wrappers.py b/fov_processing_pipeline/tests/test_wrappers.py
@@ -70,9 +70,14 @@ def test_save_load_data(demo_cell_data, demo_fov_data, tmpdir):
 
 def test_data_splits(demo_fov_data, tmpdir):
 
+    n_rows = 100
+    id_column = "FOVId"
+
     # Data Prep
     # the demo_fov_data is only one element, so we repeat it a few times, and rebuild the random numbers
-    demo_fov_data = pd.concat([demo_fov_data] * 100, 0)
+
+    demo_fov_data = pd.concat([demo_fov_data] * n_rows, 0)
+    demo_fov_data[id_column] = np.arange(0, n_rows)
     demo_fov_data["FOVId_rng"] = np.random.rand(demo_fov_data.shape[0])
 
     # have two structures
@@ -92,6 +97,7 @@ def test_data_splits(demo_fov_data, tmpdir):
         split_amounts=split_amounts,
         group_column="ProteinDisplayName",
         split_column="FOVId_rng",
+        id_column=id_column,
     )
 
     assert len(splits_dict) == len(np.unique(demo_fov_data["ProteinDisplayName"]))
@@ -106,7 +112,7 @@ def test_data_splits(demo_fov_data, tmpdir):
 
             assert os.path.exists(splits_dict[u_group][split_name]["save_path"])
 
-            inds_list.append(splits_dict[u_group][split_name]["split_inds"])
+            inds_list.append(splits_dict[u_group][split_name][id_column])
 
     inds_list = np.hstack(inds_list)
 
@@ -126,6 +132,7 @@ def test_data_splits(demo_fov_data, tmpdir):
             split_amounts=split_amounts_wrong,
             group_column="ProteinDisplayName",
             split_column="FOVId_rng",
+            id_column=id_column,
         )
 
     demo_fov_data_wrong = demo_fov_data.copy()
@@ -152,6 +159,7 @@ def test_data_splits(demo_fov_data, tmpdir):
             split_amounts=split_amounts,
             group_column="this column doesnt exist",
             split_column="FOVId_rng",
+            id_column=id_column,
         )
 
     # Not existing split column
@@ -163,4 +171,5 @@ def test_data_splits(demo_fov_data, tmpdir):
             split_amounts=split_amounts,
             group_column="ProteinDisplayName",
             split_column="this column doesnt exist",
+            id_column=id_column,
         )
diff --git a/fov_processing_pipeline/wrappers.py b/fov_processing_pipeline/wrappers.py
@@ -328,6 +328,7 @@ def data_splits(
     split_amounts=[0.8, 0.1, 0.1],
     group_column="ProteinDisplayName",
     split_column="FOVId_rng",
+    id_column="FOVId",
 ):
     """
     Given a stats dataframe, split each unique entry of `group_column` into groups based on the `split_column` random
@@ -354,6 +355,9 @@ def data_splits(
     split_column: str
         Column with values that we split on. All values must be in the range [0, 1). See: numpy.random.rand
 
+    id_column: str
+        Column of unique row ids; the ids selected for each split are stored in `splits_dict` under this key
+
     Returns
     -------
     splits_dict
@@ -410,6 +414,9 @@ def data_splits(
 
             splits_dict[u_group][split_name] = {}
             splits_dict[u_group][split_name]["save_path"] = save_path
-            splits_dict[u_group][split_name]["split_inds"] = np.where(split_inds)[0]
+
+            splits_dict[u_group][split_name][id_column] = df_stats[id_column][
+                split_inds
+            ]
 
     return splits_dict