updated readme with flipbooks and scraping
conradry committed Dec 13, 2022
1 parent c3a2b96 commit 79ad7dd
Showing 2 changed files with 87 additions and 35 deletions.
69 changes: 40 additions & 29 deletions dataset/3d/reconstruct3d.py
@@ -13,14 +13,27 @@
'-ROI-' identifier exists in each subvolume name.
Volumes must be in a format readable by SimpleITK (primarily .mrc, .nrrd, .tif, etc.). THIS
SCRIPT DOES NOT SUPPORT (OME)ZARRs. As a rule, such datasets are usually created because of
their large size. Sparsely sampled ROIs from such NGFF-style datasets can be saved in one of
the supported formats using the ../scraping/ngff_download.py script.
SCRIPT DOES NOT SUPPORT NGFFs. As a rule, such datasets are usually created because of
their large size. Sparsely sampled ROIs from such NGFF datasets can be downloaded and saved
in one of the supported formats using the ../scraping/ngff_download.py script.
Example usage:
--------------
python reconstruct3d.py {impaths_file} {volume_dir} {savedir} -nz 224 -p 4 --cross-plane
python reconstruct3d.py {filtered_dir} \
-vd {volume_dir1} {volume_dir2} {volume_dir3} \
-sd {savedir} -nz 224 -p 4 --limit 100
Reconstruct a maximum of 100 subvolumes with 224 z-slices from each
dataset represented in {filtered_dir}. Save them in {savedir}, which
will contain a separate subdirectory corresponding to each dataset.
Note1: For generating flipbooks, -nz should always be odd. Even numbers
can technically be used, but they're likely to cause confusion at
annotation time because there isn't a "real" middle slice.
Note2: Z-slices will always be the first dimension in the subvolume
(this is essential for generating flipbooks).
For help with arguments:
------------------------
@@ -99,9 +112,6 @@ def find_children(vol_fpath):
else:
dirname = volname

# lookup the img_source
#assert(dirname in img_fpaths_dict), \
#f"Directory {dirname} not found in image paths!"
if dirname not in img_fpaths_dict:
return [], dirname

@@ -114,6 +124,12 @@ def find_children(vol_fpath):
return vol_img_fpaths, dirname

def extract_subvolume(volume, img_fpath):
"""
Extracts the correct subvolume from the
full volumetric dataset based on the name
of a given image which must include the -LOC-
identifier.
"""
# extract location of image from filename
img_fpath = os.path.basename(img_fpath)
volname, loc = img_fpath.split('-LOC-')
@@ -135,29 +151,28 @@ def extract_subvolume(volume, img_fpath):
lowz = index - span
highz = index + span + 1

# pass images that don't have enough
# context to be annotated as a flipbook
# pass images that don't have enough context
if lowz < 0 or highz >= volume.shape[axis]:
return None, None
else:
axis_span = slice(lowz, highz)

if axis == 0:
flipbook = volume[axis_span, yslice, xslice]
subvol = volume[axis_span, yslice, xslice]
elif axis == 1:
flipbook = volume[yslice, axis_span, xslice]
flipbook = flipbook.transpose(1, 0, 2)
subvol = volume[yslice, axis_span, xslice]
subvol = subvol.transpose(1, 0, 2)
elif axis == 2:
flipbook = volume[yslice, xslice, axis_span]
flipbook = flipbook.transpose(2, 0, 1)
subvol = volume[yslice, xslice, axis_span]
subvol = subvol.transpose(2, 0, 1)
else:
raise Exception(f'Axis cannot be {axis}, must be in [0, 1, 2]')

flipbook_fname = f'{volname}-LOC-{axis}_{lowz}-{highz}_{yrange}_{xrange}'
subvol_fname = f'{volname}-LOC-{axis}_{lowz}-{highz}_{yrange}_{xrange}'

return flipbook, flipbook_fname
return subvol, subvol_fname

def create_flipbooks(vp):
def create_subvols(vp):
children, dirname = find_children(vp)

vol_savedir = os.path.join(savedir, dirname)
@@ -174,23 +189,19 @@ def create_flipbooks(vp):
volume = volume[..., 0]

if np.any(np.array(volume.shape) < numberz):
raise Exception(f'Flipbooks of size {numberz} cannot be created from {vp} with size {volume.shape}')
raise Exception(f'Subvolume of size {numberz} cannot be created from {vp} with size {volume.shape}')

# directory in which to save flipbooks
# directory in which to save subvols
# from this volume dataset
if not os.path.isdir(vol_savedir):
os.makedirs(vol_savedir, exist_ok=True)

# extract and save flipbooks
count = 0
# extract and save subvols
for child in children:
if count >= 50:
break
flipbook, flipbook_fname = extract_subvolume(volume, child)
if flipbook_fname is not None:
io.imsave(os.path.join(vol_savedir, flipbook_fname + '.tif'),
flipbook, check_contrast=False)
count += 1
subvol, subvol_fname = extract_subvolume(volume, child)
if subvol_fname is not None:
io.imsave(os.path.join(vol_savedir, subvol_fname + '.tif'),
subvol, check_contrast=False)

with Pool(processes) as pool:
output = pool.map(create_flipbooks, volume_fpaths)
output = pool.map(create_subvols, volume_fpaths)
53 changes: 47 additions & 6 deletions dataset/README.md
@@ -39,31 +39,72 @@ python preprocess/vid2stack.py {dir_of_videos}

## 3D Data Preparation

3D datasets are expected to be in a single directory (this includes any video stacks created in the previous section).
Supported formats are anything that can be [read by SimpleITK](https://simpleitk.readthedocs.io/en/v1.2.3/Documentation/docs/source/IO.html). It's important that any volumes in
```.mrc``` format be converted to unsigned bytes. With IMOD installed, this can be done using:

```bash
python preprocess/mrc2byte.py {dir_of_mrc_files}
```

Next, cross-section, patch, and deduplicate volume files. If processing a combination of isotropic and anisotropic volumes,
it's crucial that each dataset has a correct header recording the voxel size. If the Z resolution differs from the xy
resolution by more than 25%, then cross-sections will only be cut from the xy plane, even if axes 0, 1, 2 are passed to
the script (see usage example below).

```bash
python patchify3d.py {dir_of_3d_datasets} {patch_dir} -cs 224 --axes 0 1 2 --processes 4
```
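
The 25% rule above is based on the voxel spacing recorded in each file's header. The snippet below is only a rough sketch of that check (not the script's exact implementation), assuming the spacing can be read with SimpleITK:

```python
import SimpleITK as sitk

def axes_to_cut(fpath, requested_axes=(0, 1, 2), tol=0.25):
    # Read only the header to get voxel spacing; SimpleITK returns (x, y, z).
    reader = sitk.ImageFileReader()
    reader.SetFileName(fpath)
    reader.ReadImageInformation()
    sx, sy, sz = reader.GetSpacing()

    xy = (sx + sy) / 2
    if abs(sz - xy) / xy > tol:
        return (0,)  # anisotropic volume: only cut xy cross-sections
    return tuple(requested_axes)  # roughly isotropic: cut all requested planes
```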

The ```patchify3d.py``` script will save a ```.pkl``` file with the name of each volume file. Pickle files contain a
dictionary of patches along with corresponding filenames. These files are ready for filtering (see below).
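
A quick way to sanity-check the output is to open one of the pickle files directly. This is only a sketch; the exact dictionary keys are whatever ```patchify3d.py``` writes, so treat the layout here as an assumption:

```python
import pickle

# Hypothetical path; substitute one of the .pkl files written by patchify3d.py.
with open('patch_dir/example_volume.pkl', 'rb') as f:
    patch_dict = pickle.load(f)

print(type(patch_dict), len(patch_dict))
# Print a few entries to see how patches and filenames are paired.
for key, value in list(patch_dict.items())[:3]:
    print(key, getattr(value, 'shape', type(value)))
```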

## Filtering

2D, video, and 3D datasets can be filtered with the same script; just put all the ```.pkl``` files in the same directory.
By default, filtering uses a ResNet34 model that was trained on 12,000 manually annotated patches. The weights for this
model are downloaded from [Zenodo](https://zenodo.org/record/6458015#.YlmNaS-cbTR) automatically. A new model can be
trained, if needed, using the ```train_patch_classifier.py``` script.

Filtering will be fastest with a GPU installed, but it's not required.

```bash
python classify_patches.py {patch_dir} {save_dir}
```

After running filtering, the ```save_dir``` will have one subdirectory for each of the ```.pkl``` files that were
processed. Each subdirectory contains single-channel grayscale, unsigned 8-bit tiff images.
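
As a quick spot check of the filtered output (purely illustrative, not part of the pipeline):

```python
import os
from glob import glob
from skimage import io

save_dir = 'filtered_patches'  # hypothetical; use the save_dir passed to classify_patches.py
for subdir in sorted(os.listdir(save_dir)):
    tiffs = glob(os.path.join(save_dir, subdir, '*.tif*'))
    if tiffs:
        img = io.imread(tiffs[0])
        # Expect a 2D array with dtype uint8 for each curated patch.
        print(subdir, len(tiffs), img.shape, img.dtype)
```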

# Reconstructing subvolumes and flipbooks

Although the curation process always results in 2D image patches, it's possible to retrieve 3D subvolumes as long as one
has access to the original 3D datasets. Patch filenames from 3D datasets always include a suffix denoted by '-LOC-' that
records the slicing plane, the index of the slice, and the x and y positions of the patch. To extract a subvolume around
a patch, use the ```3d/reconstruct3d.py``` script.
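
For reference, the suffix can be pulled apart with ordinary string splitting. The exact pattern below is an assumption (see ```3d/reconstruct3d.py``` for the authoritative parsing), but it illustrates what the '-LOC-' identifier encodes:

```python
import os

def parse_loc_suffix(fname):
    # Assumed layout: {volume}-LOC-{axis}_{slice_index}_{ystart}-{yend}_{xstart}-{xend}
    stem = os.path.splitext(os.path.basename(fname))[0]
    volname, loc = stem.split('-LOC-')
    axis, index, yrange, xrange = loc.split('_')
    ystart, yend = map(int, yrange.split('-'))
    xstart, xend = map(int, xrange.split('-'))
    return volname, int(axis), int(index), (ystart, yend), (xstart, xend)

# Hypothetical filename following the assumed layout.
print(parse_loc_suffix('my_volume-LOC-0_150_1024-1248_512-736.tiff'))
```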

For example, to create short flipbooks of 5 consecutive images from a directory of curated patches:

```bash
python reconstruct3d.py {filtered_patch_dir} \
-vd {volume_dir1} {volume_dir2} {volume_dir3} \
    -sd {savedir} -nz 5 -p 4
```

See the script header for more details.

# Scraping large online datasets

The patching, deduplication, and filtering pipeline works for volumes in nrrd, mrc, and tif formats. However, very large
datasets like those generated for connectomics research are often too large to practically download and store in memory.
Instead, they are commonly stored as NGFFs. Our workflow assumes that these datasets will be sparsely sampled.
The ```scraping/ngff_download.py``` script will download sparsely cropped cubes of image data and save them in the
nrrd format for compatibility with the rest of this workflow.

For example, to download 5 gigabytes of image data from a list of datasets:

```bash
python ngff_download.py ngff_datasets.csv {save_path} -gb 5
```

Similarly, large datasets that are not stored in NGFF but are over some size threshold (we've used 5 GB in our work)
can be cropped into smaller ROIs with the ```crop_rois_from_volume.py``` script.
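
As a rough illustration of that kind of cropping (not the interface of ```crop_rois_from_volume.py```), random cubes can be cut from a loaded volume and written back out in a supported format:

```python
import numpy as np
import SimpleITK as sitk

# Illustrative only: sample a few random cubic ROIs from a large volume
# and save them as .nrrd files readable by the rest of the pipeline.
rng = np.random.default_rng(0)
volume = sitk.GetArrayFromImage(sitk.ReadImage('big_volume.nrrd'))  # (z, y, x)

roi_size, n_rois = 224, 4
for i in range(n_rois):
    z, y, x = (rng.integers(0, s - roi_size) for s in volume.shape)
    roi = volume[z:z + roi_size, y:y + roi_size, x:x + roi_size]
    sitk.WriteImage(sitk.GetImageFromArray(roi), f'roi_{i:02d}.nrrd')
```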
