diff --git a/segmenter_predict.py b/segmenter_predict.py index 6cd497d..c1aaca3 100644 --- a/segmenter_predict.py +++ b/segmenter_predict.py @@ -19,6 +19,8 @@ help='If provided, then the progress log and CSV report will be discarded and processing will start from the beginning') parser.add_argument('--refresh', action="store_true", help='If provided, then the list of files to process will be refreshed from disk') +parser.add_argument('-b', '--minimum-bytes', type=int, default=128000, + help='Minimum size of files to process, in bytes. Default: 128000') args = parser.parse_args() @@ -31,7 +33,7 @@ path = None if args.file_list or args.src_path.is_dir(): - paths = CachedFileList(args.src_path, extensions, args.refresh) + paths = CachedFileList(args.src_path, extensions, args.refresh, minimum_bytes = args.minimum_bytes) print(f'Found {len(paths)} paths for processing') else: paths = [args.src_path] diff --git a/src/reports/seg_report.html b/src/reports/seg_report.html index a06ccd0..836aa3b 100644 --- a/src/reports/seg_report.html +++ b/src/reports/seg_report.html @@ -65,11 +65,19 @@ columns: [ { title: 'Image', data: 'image', width: "25%", render: (d,t,r,m) => ''}, { title: 'Path', data: 'original'}, - { title: 'Class', data: 'pred_class'}, + { title: 'Class', data: 'pred_class', render: function (data, type, row, meta) { + if (data == 0) { + return 'no color bar'; + } else if (data == 1) { + return 'has color bar'; + } else { + return 'invalid box'; + } + }}, { title: 'Confidence', data: 'pred_conf', render: $.fn.dataTable.render.number(',', '.', 3, '')}, { title: 'Problem', data: 'problem', render: function (data, type, row, meta) { if (data) { - return 'problem detected'; + return 'extended'; } else { return ''; } diff --git a/src/tests/test_bounding_box_utils.py b/src/tests/test_bounding_box_utils.py index 7846a84..98fcc68 100644 --- a/src/tests/test_bounding_box_utils.py +++ b/src/tests/test_bounding_box_utils.py @@ -56,6 +56,10 @@ def 
test_is_problematic_box_none(self): coords = None assert not is_problematic_box(coords) + def test_is_problematic_box_three_edges_too_big(self): + coords = [0.0, 0.0616408920288086, 1.0, 1.0] + assert is_problematic_box(coords) + def test_extend_bounding_box_to_edges_one_edge_left(self): coords = [0.0, 0.10288684844970702, 0.0860845947265625, 0.88] result = extend_bounding_box_to_edges(coords) diff --git a/src/tests/test_cached_file_list.py b/src/tests/test_cached_file_list.py index 268e12a..f4f599e 100644 --- a/src/tests/test_cached_file_list.py +++ b/src/tests/test_cached_file_list.py @@ -37,6 +37,23 @@ def test_with_directory_from_cache(self, tmp_path): finally: subject.cache_path.unlink() + def test_with_directory_files_too_small(self, tmp_path): + self.setup_test_dir(tmp_path) + subject = CachedFileList(self.base_path, { ".jpg", ".tiff" }, minimum_bytes = 20000) + try: + assert len(subject) == 0 + + # Increase size of the first file to above the threshold + self.write_file(self.file_path1, num_kb = 32) + subject = CachedFileList(self.base_path, { ".jpg", ".tiff" }, True, minimum_bytes = 20000) + + assert len(subject) == 1 + paths = self.collect_paths(subject) + assert len(subject) == 1 + assert self.file_path1 in paths + finally: + subject.cache_path.unlink() + def test_with_file_list(self, tmp_path): self.setup_test_dir(tmp_path) file_path5 = tmp_path / "file5.tiff" @@ -102,9 +119,12 @@ def setup_test_dir(self, tmp_path): self.write_file(self.file_path2) self.write_file(self.file_path3) - def write_file(self, file_path): - with open(file_path, "w") as text_file: - text_file.write(str(file_path)) + def write_file(self, file_path, num_kb = 8): + content = "content!" 
* 128 + with open(file_path, "a") as binary_file: + for _ in range(num_kb): + binary_file.write(content) + print(f"Wrote file {file_path.stat()}") def collect_paths(self, subject): paths = [] diff --git a/src/tests/test_segmentation_workflow_service.py b/src/tests/test_segmentation_workflow_service.py index f72c99a..838129c 100644 --- a/src/tests/test_segmentation_workflow_service.py +++ b/src/tests/test_segmentation_workflow_service.py @@ -36,6 +36,11 @@ def mock_model(): 'boxes': torch.tensor([[ 0.0000, 100.5813, 150.0, 1333.0]]), 'labels': torch.tensor([1]), 'scores': torch.tensor([0.8920]) + }, + { + 'boxes': torch.tensor([[ 111.0000, 100.5813, 400.0, 400.0]]), + 'labels': torch.tensor([1]), + 'scores': torch.tensor([0.76]) }] # each call to model(image) will return a different set of outputs model_mock.side_effect = [value1, value3] @@ -53,14 +58,15 @@ def test_process(self, config, tmp_path, mock_load_from_checkpoint): service = SegmentationWorkflowService(config, report_path) service.process([Path('./fixtures/normalized_images/gilmer/00276_op0204_0001.jpg'), Path('./fixtures/normalized_images/gilmer/00276_op0204_0001_tiny.jpg'), - Path('./fixtures/normalized_images/gilmer/00276_op0226a_0001.jpg')]) + Path('./fixtures/normalized_images/gilmer/00276_op0226a_0001.jpg'), + Path('./fixtures/normalized_images/gilmer/00276_op0217_0001_e.jpg')]) assert report_path.exists() with open(report_path, newline='') as f: reader = csv.reader(f) data = list(reader) - assert len(data) == 4 + assert len(data) == 5 assert data[1][0].endswith('fixtures/normalized_images/gilmer/00276_op0204_0001.jpg') assert 'output/00276_op0204_0001.jpg' in data[1][1] assert data[1][2] == '1' @@ -81,3 +87,10 @@ def test_process(self, config, tmp_path, mock_load_from_checkpoint): assert data[3][3] == '0.8920' assert data[3][4] == '[0.0, 0.19644784927368164, 0.29296875, 1.0]' assert data[3][5] == '[0.0, 0.0, 0.29296875, 1.0]' + + assert 
data[4][0].endswith('fixtures/normalized_images/gilmer/00276_op0217_0001_e.jpg') + assert 'output/00276_op0217_0001_e.jpg' in data[4][1] + assert data[4][2] == '2' + assert data[4][3] == '0.7600' + assert data[4][4] == '[0.216796875, 0.19644784927368164, 0.78125, 0.78125]' + assert data[4][5] == '' diff --git a/src/utils/bounding_box_utils.py b/src/utils/bounding_box_utils.py index 5b19334..676f7ef 100644 --- a/src/utils/bounding_box_utils.py +++ b/src/utils/bounding_box_utils.py @@ -48,7 +48,7 @@ def is_problematic_box(coords): if coords == None: return False count = number_sides_at_image_edge(coords) - return count != 3 + return count != 3 or bounding_box_is_too_large(bounding_box_lengths(coords)) def number_sides_at_image_edge(coords): count = 0 @@ -65,16 +65,26 @@ def get_box_coords(row, index = 4): return box_coords return None +def bounding_box_is_too_large(lengths): + return lengths[0] >= 0.5 and lengths[1] >= 0.5 + +def bounding_box_is_square(lengths): + return lengths[0] == lengths[1] + +def bounding_box_lengths(coords): + horizontal_length = coords[2] - coords[0] + vertical_length = coords[3] - coords[1] + return (horizontal_length, vertical_length) + # Used to extend a bounding box that is only touching 1 or 2 image edges, so that it touches 3 # edges so that it is usable for cropping def extend_bounding_box_to_edges(box_coords): coords = box_coords.copy() - horizontal_length = coords[2] - coords[0] - vertical_length = coords[3] - coords[1] + lengths = bounding_box_lengths(coords) # don't extend if it'll produce a bounding box greater than or equal to half the image - if vertical_length >= 0.5 and horizontal_length >= 0.5: + if bounding_box_is_too_large(lengths): raise InvalidBoundingBoxException("Cannot extend bounding box to image edges, total size of bounding box is too large") - if vertical_length == horizontal_length: + if bounding_box_is_square(lengths): raise InvalidBoundingBoxException("Cannot extend bounding box to image edges, sides are equal 
length") left_edge = coords[0] == 0 right_edge = coords[2] == 1 @@ -82,7 +92,7 @@ def extend_bounding_box_to_edges(box_coords): bottom_edge = coords[3] == 1 if (left_edge or right_edge) and (bottom_edge or top_edge): # bounding box touches two edges, so extend longest edge - if vertical_length > horizontal_length: + if lengths[1] > lengths[0]: coords[1] = 0.0 coords[3] = 1.0 else: diff --git a/src/utils/cached_file_list.py b/src/utils/cached_file_list.py index 0ef2f3f..accc4b0 100644 --- a/src/utils/cached_file_list.py +++ b/src/utils/cached_file_list.py @@ -10,11 +10,12 @@ class CachedFileList(list): a file, then each line in the file will be treated as a path to be added, where any directories in the file will be expanded. """ - def __init__(self, file_path, extensions, refresh = False): + def __init__(self, file_path, extensions, refresh = False, minimum_bytes = 4000): super().__init__() self.file_path = file_path self.extensions = extensions self.cache_path = Path.cwd() / (file_path.stem + "-cache.txt") + self.minimum_bytes = minimum_bytes if not self.cache_path.exists() or refresh: self.populate_cache() with open(self.cache_path, "r") as file: @@ -33,9 +34,9 @@ def populate_cache(self): self.recursive_paths_from_file_list() def add_expanded_dir(self, dir_path): - for p in Path(dir_path).glob("**/*"): - if p.suffix in self.extensions: - print(str(p), file=self.file) + for path in Path(dir_path).glob("**/*"): + if path.suffix in self.extensions: + self.add_file_path(path) def recursive_paths_from_file_list(self): with open(self.file_path) as f: @@ -45,7 +46,16 @@ def recursive_paths_from_file_list(self): print(f"Expanding path {path}") self.add_expanded_dir(path) else: - print(str(path), file=self.file) + self.add_file_path(path) + + # Adds the given file path to the cache file if it has an acceptable extension and is at least the minimum size + def add_file_path(self, path): + if path.suffix in self.extensions: + file_size = path.stat().st_size + if file_size >=
self.minimum_bytes: + print(str(path), file=self.file) + else: + print(f"Skipping small file {path}") def __iter__(self): self.file = open(self.cache_path, "r") diff --git a/src/utils/segmentation_report_service.py b/src/utils/segmentation_report_service.py index 149c7c3..d1a464a 100644 --- a/src/utils/segmentation_report_service.py +++ b/src/utils/segmentation_report_service.py @@ -2,7 +2,7 @@ import os from pathlib import Path from src.utils.json_utils import to_json -from src.utils.bounding_box_utils import draw_bounding_boxes, is_problematic_box, get_box_coords +from src.utils.bounding_box_utils import draw_bounding_boxes, get_box_coords from src.utils.common_utils import log from PIL import Image import shutil @@ -63,7 +63,7 @@ def csv_to_data(self, row, image_path): 'original' : row[0], 'pred_class' : row[2], 'pred_conf' : row[3], - 'problem' : is_problematic_box(boxes), + 'problem' : bool(row[5]), 'image' : str(rel_path) } diff --git a/src/utils/segmentation_workflow_service.py b/src/utils/segmentation_workflow_service.py index 9819e5a..15148f7 100644 --- a/src/utils/segmentation_workflow_service.py +++ b/src/utils/segmentation_workflow_service.py @@ -89,6 +89,8 @@ def process(self, paths): print(f" Problem detected with bounding box, extending to edges.") except InvalidBoundingBoxException as e: print(e.message) + # Set the predicted class to 2, to indicate it's an invalid prediction + predicted_class = 2 csv_writer.writerow([orig_path, normalized_path, predicted_class, "{:.4f}".format(top_score), box_norms, extended_box]) self.progress_tracker.record_completed(orig_path) except (KeyboardInterrupt, SystemExit) as e: