BXC-4557 - Phase 1 processing fixes (#32)
* Filter out files to process based on a minimum byte size, which defaults to 128 KB (128000 bytes)

* Record the class as 2 when an image yields an invalid bounding box. Display a readable label for the predicted class, and display 'extended' for cases where the color bar box only had to be extended

* Remove a log statement, and add a warning about skipped files

* Overly large bounding boxes are now picked up by is_problematic_box
bbpennel authored May 7, 2024
1 parent e3b6f4b commit cc540af
Showing 9 changed files with 90 additions and 21 deletions.
4 changes: 3 additions & 1 deletion segmenter_predict.py
@@ -19,6 +19,8 @@
help='If provided, then the progress log and CSV report will be discarded and processing will start from the beginning')
parser.add_argument('--refresh', action="store_true",
help='If provided, then the list of files to process will be refreshed from disk')
parser.add_argument('-b', '--minimum-bytes', type=int, default=128000,
help='Minimum size of files to process, in bytes. Default: 128000')


args = parser.parse_args()
@@ -31,7 +33,7 @@

path = None
if args.file_list or args.src_path.is_dir():
paths = CachedFileList(args.src_path, extensions, args.refresh)
paths = CachedFileList(args.src_path, extensions, args.refresh, minimum_bytes = args.minimum_bytes)
print(f'Found {len(paths)} paths for processing')
else:
paths = [args.src_path]
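The new flag is passed straight through to CachedFileList, so undersized files are dropped before any prediction work. As a hypothetical invocation (the positional source-path argument comes from parts of the parser not shown in this hunk), raising the threshold to 256 KB might look like `python segmenter_predict.py /path/to/images --refresh --minimum-bytes 256000`.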
12 changes: 10 additions & 2 deletions src/reports/seg_report.html
@@ -65,11 +65,19 @@
columns: [
{ title: 'Image', data: 'image', width: "25%", render: (d,t,r,m) => '<a href="' + d + '" target="_blank"><img src="'+d+'" style=height:200px; loading="lazy" /></a>'},
{ title: 'Path', data: 'original'},
{ title: 'Class', data: 'pred_class'},
{ title: 'Class', data: 'pred_class', render: function (data, type, row, meta) {
if (data == 0) {
return 'no color bar';
} else if (data == 1) {
return 'has color bar';
} else {
return 'invalid box';
}
}},
{ title: 'Confidence', data: 'pred_conf', render: $.fn.dataTable.render.number(',', '.', 3, '')},
{ title: 'Problem', data: 'problem', render: function (data, type, row, meta) {
if (data) {
return 'problem detected';
return 'extended';
} else {
return '';
}
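For reference, the class codes written to the CSV now render as human-readable labels in the report. A minimal sketch of the same mapping in Python (the report does this in the DataTables render callback above; the dictionary and function here are illustrative, not part of the codebase):

```python
# Class codes emitted by the segmentation workflow and the labels the report shows.
CLASS_LABELS = {
    0: "no color bar",   # model predicted no color bar
    1: "has color bar",  # model predicted a usable color bar box
    2: "invalid box",    # bounding box was invalid and could not be extended
}

def label_for_class(pred_class: int) -> str:
    # Anything other than 0 or 1 falls through to "invalid box",
    # mirroring the template's else branch.
    return CLASS_LABELS.get(pred_class, "invalid box")
```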
4 changes: 4 additions & 0 deletions src/tests/test_bounding_box_utils.py
@@ -56,6 +56,10 @@ def test_is_problematic_box_none(self):
coords = None
assert not is_problematic_box(coords)

def test_is_problematic_box_three_edges_too_big(self):
coords = [0.0, 0.0616408920288086, 1.0, 1.0]
assert is_problematic_box(coords)

def test_extend_bounding_box_to_edges_one_edge_left(self):
coords = [0.0, 0.10288684844970702, 0.0860845947265625, 0.88]
result = extend_bounding_box_to_edges(coords)
26 changes: 23 additions & 3 deletions src/tests/test_cached_file_list.py
@@ -37,6 +37,23 @@ def test_with_directory_from_cache(self, tmp_path):
finally:
subject.cache_path.unlink()

def test_with_directory_files_too_small(self, tmp_path):
self.setup_test_dir(tmp_path)
subject = CachedFileList(self.base_path, { ".jpg", ".tiff" }, minimum_bytes = 20000)
try:
assert len(subject) == 0

# Increase size of the first file to above the threshold
self.write_file(self.file_path1, num_kb = 32)
subject = CachedFileList(self.base_path, { ".jpg", ".tiff" }, True, minimum_bytes = 20000)

assert len(subject) == 1
paths = self.collect_paths(subject)
assert len(subject) == 1
assert self.file_path1 in paths
finally:
subject.cache_path.unlink()

def test_with_file_list(self, tmp_path):
self.setup_test_dir(tmp_path)
file_path5 = tmp_path / "file5.tiff"
@@ -102,9 +119,12 @@ def setup_test_dir(self, tmp_path):
self.write_file(self.file_path2)
self.write_file(self.file_path3)

def write_file(self, file_path):
with open(file_path, "w") as text_file:
text_file.write(str(file_path))
def write_file(self, file_path, num_kb = 8):
content = "content!" * 128
with open(file_path, "a") as binary_file:
for _ in range(num_kb):
binary_file.write(content)
print(f"Wrote file {file_path.stat()}")

def collect_paths(self, subject):
paths = []
17 changes: 15 additions & 2 deletions src/tests/test_segmentation_workflow_service.py
@@ -36,6 +36,11 @@ def mock_model():
'boxes': torch.tensor([[ 0.0000, 100.5813, 150.0, 1333.0]]),
'labels': torch.tensor([1]),
'scores': torch.tensor([0.8920])
},
{
'boxes': torch.tensor([[ 111.0000, 100.5813, 400.0, 400.0]]),
'labels': torch.tensor([1]),
'scores': torch.tensor([0.76])
}]
# each call to model(image) will return a different set of outputs
model_mock.side_effect = [value1, value3]
@@ -53,14 +58,15 @@ def test_process(self, config, tmp_path, mock_load_from_checkpoint):
service = SegmentationWorkflowService(config, report_path)
service.process([Path('./fixtures/normalized_images/gilmer/00276_op0204_0001.jpg'),
Path('./fixtures/normalized_images/gilmer/00276_op0204_0001_tiny.jpg'),
Path('./fixtures/normalized_images/gilmer/00276_op0226a_0001.jpg')])
Path('./fixtures/normalized_images/gilmer/00276_op0226a_0001.jpg'),
Path('./fixtures/normalized_images/gilmer/00276_op0217_0001_e.jpg')])

assert report_path.exists()
with open(report_path, newline='') as f:
reader = csv.reader(f)
data = list(reader)

assert len(data) == 4
assert len(data) == 5
assert data[1][0].endswith('fixtures/normalized_images/gilmer/00276_op0204_0001.jpg')
assert 'output/00276_op0204_0001.jpg' in data[1][1]
assert data[1][2] == '1'
@@ -81,3 +87,10 @@ def test_process(self, config, tmp_path, mock_load_from_checkpoint):
assert data[3][3] == '0.8920'
assert data[3][4] == '[0.0, 0.19644784927368164, 0.29296875, 1.0]'
assert data[3][5] == '[0.0, 0.0, 0.29296875, 1.0]'

assert data[4][0].endswith('fixtures/normalized_images/gilmer/00276_op0217_0001_e.jpg')
assert 'output/00276_op0217_0001_e.jpg' in data[4][1]
assert data[4][2] == '2'
assert data[4][3] == '0.7600'
assert data[4][4] == '[0.216796875, 0.19644784927368164, 0.78125, 0.78125]'
assert data[4][5] == ''
22 changes: 16 additions & 6 deletions src/utils/bounding_box_utils.py
@@ -48,7 +48,7 @@ def is_problematic_box(coords):
if coords == None:
return False
count = number_sides_at_image_edge(coords)
return count != 3
return count != 3 or bounding_box_is_too_large(bounding_box_lengths(coords))

def number_sides_at_image_edge(coords):
count = 0
@@ -65,24 +65,34 @@ def get_box_coords(row, index = 4):
return box_coords
return None

def bounding_box_is_too_large(lengths):
return lengths[0] >= 0.5 and lengths[1] >= 0.5

def bounding_box_is_square(lengths):
return lengths[0] == lengths[1]

def bounding_box_lengths(coords):
horizontal_length = coords[2] - coords[0]
vertical_length = coords[3] - coords[1]
return (horizontal_length, vertical_length)

# Used to extend a bounding box that is only touching 1 or 2 image edges, so that it touches 3
# edges so that it is usable for cropping
def extend_bounding_box_to_edges(box_coords):
coords = box_coords.copy()
horizontal_length = coords[2] - coords[0]
vertical_length = coords[3] - coords[1]
lengths = bounding_box_lengths(coords)
# don't extend if it'll produce a bounding box greater than or equal to half the image
if vertical_length >= 0.5 and horizontal_length >= 0.5:
if bounding_box_is_too_large(lengths):
raise InvalidBoundingBoxException("Cannot extend bounding box to image edges, total size of bounding box is too large")
if vertical_length == horizontal_length:
if bounding_box_is_square(lengths):
raise InvalidBoundingBoxException("Cannot extend bounding box to image edges, sides are equal length")
left_edge = coords[0] == 0
right_edge = coords[2] == 1
top_edge = coords[1] == 0
bottom_edge = coords[3] == 1
if (left_edge or right_edge) and (bottom_edge or top_edge):
# bounding box touches two edges, so extend longest edge
if vertical_length > horizontal_length:
if lengths[1] > lengths[0]:
coords[1] = 0.0
coords[3] = 1.0
else:
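The effect of the new size check is easier to see in isolation. Below is a minimal, runnable sketch assuming the same normalized [x1, y1, x2, y2] coordinates (0.0–1.0) used throughout bounding_box_utils; number_sides_at_image_edge is reconstructed from the edge comparisons visible in extend_bounding_box_to_edges, since its body is not shown in this hunk:

```python
# Illustrative sketch of the updated checks; coordinates are normalized [x1, y1, x2, y2].

def bounding_box_lengths(coords):
    # (horizontal length, vertical length)
    return (coords[2] - coords[0], coords[3] - coords[1])

def bounding_box_is_too_large(lengths):
    # Covers at least half of the image in both dimensions.
    return lengths[0] >= 0.5 and lengths[1] >= 0.5

def number_sides_at_image_edge(coords):
    # A side sits on an image edge when x1 or y1 is 0.0, or x2 or y2 is 1.0.
    edges = [0.0, 0.0, 1.0, 1.0]
    return sum(1 for value, edge in zip(coords, edges) if value == edge)

def is_problematic_box(coords):
    if coords is None:
        return False
    count = number_sides_at_image_edge(coords)
    return count != 3 or bounding_box_is_too_large(bounding_box_lengths(coords))

# Box from the new test: it touches three edges, but spans the full width and
# nearly the full height, so the size check now flags it as problematic.
print(is_problematic_box([0.0, 0.0616408920288086, 1.0, 1.0]))  # True
```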
20 changes: 15 additions & 5 deletions src/utils/cached_file_list.py
@@ -10,11 +10,12 @@ class CachedFileList(list):
a file, then each line in the file will be treated as a path to be added, where any
directories in the file will be expanded.
"""
def __init__(self, file_path, extensions, refresh = False):
def __init__(self, file_path, extensions, refresh = False, minimum_bytes = 4000):
super().__init__()
self.file_path = file_path
self.extensions = extensions
self.cache_path = Path.cwd() / (file_path.stem + "-cache.txt")
self.minimum_bytes = minimum_bytes
if not self.cache_path.exists() or refresh:
self.populate_cache()
with open(self.cache_path, "r") as file:
@@ -33,9 +34,9 @@ def populate_cache(self):
self.recursive_paths_from_file_list()

def add_expanded_dir(self, dir_path):
for p in Path(dir_path).glob("**/*"):
if p.suffix in self.extensions:
print(str(p), file=self.file)
for path in Path(dir_path).glob("**/*"):
if path.suffix in self.extensions:
self.add_file_path(path)

def recursive_paths_from_file_list(self):
with open(self.file_path) as f:
@@ -45,7 +46,16 @@ def recursive_paths_from_file_list(self):
print(f"Expanding path {path}")
self.add_expanded_dir(path)
else:
print(str(path), file=self.file)
self.add_file_path(path)

# Adds the given file path to the cache file if it has an acceptable extension and is above the min size
def add_file_path(self, path):
if path.suffix in self.extensions:
file_size = path.stat().st_size
if file_size >= self.minimum_bytes:
print(str(path), file=self.file)
else:
print(f"Skipping small file {path}")

def __iter__(self):
self.file = open(self.cache_path, "r")
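The extension and size checks now live in one place, add_file_path, so both the directory walk and the file-list expansion share them. A standalone sketch of the same filtering, using a hypothetical helper name (collect_paths_over_threshold is not part of the module) and the constructor's default 4000-byte floor:

```python
from pathlib import Path

def collect_paths_over_threshold(root, extensions, minimum_bytes=4000):
    """Yield files under root whose suffix matches and whose size meets the floor.

    Hypothetical helper mirroring what CachedFileList.add_file_path does while
    populating its cache file.
    """
    for path in Path(root).glob("**/*"):
        if path.suffix not in extensions:
            continue
        if path.stat().st_size >= minimum_bytes:
            yield path
        else:
            print(f"Skipping small file {path}")

# Example: keep only JPEG/TIFF files of at least 128 KB.
# paths = list(collect_paths_over_threshold("/data/images", {".jpg", ".tiff"}, minimum_bytes=128000))
```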
4 changes: 2 additions & 2 deletions src/utils/segmentation_report_service.py
@@ -2,7 +2,7 @@
import os
from pathlib import Path
from src.utils.json_utils import to_json
from src.utils.bounding_box_utils import draw_bounding_boxes, is_problematic_box, get_box_coords
from src.utils.bounding_box_utils import draw_bounding_boxes, get_box_coords
from src.utils.common_utils import log
from PIL import Image
import shutil
@@ -63,7 +63,7 @@ def csv_to_data(self, row, image_path):
'original' : row[0],
'pred_class' : row[2],
'pred_conf' : row[3],
'problem' : is_problematic_box(boxes),
'problem' : bool(row[5]),
'image' : str(rel_path)
}

2 changes: 2 additions & 0 deletions src/utils/segmentation_workflow_service.py
@@ -89,6 +89,8 @@ def process(self, paths):
print(f" Problem detected with bounding box, extending to edges.")
except InvalidBoundingBoxException as e:
print(e.message)
# Set the predicted class to 2, to indicate it's an invalid prediction
predicted_class = 2
csv_writer.writerow([orig_path, normalized_path, predicted_class, "{:.4f}".format(top_score), box_norms, extended_box])
self.progress_tracker.record_completed(orig_path)
except (KeyboardInterrupt, SystemExit) as e:
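The behavior behind the new class code is small enough to isolate. A minimal sketch, using a stand-in exception and an injected extend function rather than the real bounding_box_utils imports, and not the service's full processing loop:

```python
class InvalidBoundingBoxException(Exception):
    """Stand-in for the exception raised by extend_bounding_box_to_edges."""
    def __init__(self, message):
        super().__init__(message)
        self.message = message

def resolve_row_class(predicted_class, box_norms, extend_fn):
    """Return (class_code, extended_box) for one CSV row.

    extend_fn stands in for extend_bounding_box_to_edges. Class code 2 marks a
    prediction whose bounding box was invalid and could not be extended, and the
    extended-box column is left empty for that row (as in the updated test).
    """
    try:
        extended_box = extend_fn(box_norms)
    except InvalidBoundingBoxException as e:
        print(e.message)
        # Mirror the commit: downgrade the row to the "invalid prediction" class.
        predicted_class = 2
        extended_box = ""
    return predicted_class, extended_box
```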
