BXC-4557 - Phase 1 processing fixes (#32)
* Filter out files to process based on a minimum byte size, which defaults to 128 KB (128000 bytes)

* Record the class as 2 when an image yields an invalid bounding box. Display a readable label for the predicted class, and display 'extended' for cases where the color bar box only had to be extended

* Remove a log statement, and add a warning about skipped files

* Overly large bounding boxes are now picked up by is_problematic_box
bbpennel authored May 7, 2024
1 parent e3b6f4b commit cc540af
Showing 9 changed files with 90 additions and 21 deletions.
4 changes: 3 additions & 1 deletion segmenter_predict.py
@@ -19,6 +19,8 @@
help='If provided, then the progress log and CSV report will be discarded and processing will start from the beginning')
parser.add_argument('--refresh', action="store_true",
help='If provided, then the list of files to process will be refreshed from disk')
parser.add_argument('-b', '--minimum-bytes', type=int, default=128000,
help='Minimum size of files to process, in bytes. Default: 128000')


args = parser.parse_args()
@@ -31,7 +33,7 @@

path = None
if args.file_list or args.src_path.is_dir():
paths = CachedFileList(args.src_path, extensions, args.refresh)
paths = CachedFileList(args.src_path, extensions, args.refresh, minimum_bytes = args.minimum_bytes)
print(f'Found {len(paths)} paths for processing')
else:
paths = [args.src_path]
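The new flag is passed straight through to CachedFileList, so undersized files are dropped before any prediction work. As a hypothetical invocation (the positional source-path argument comes from parts of the parser not shown in this hunk), raising the threshold to 256 KB might look like `python segmenter_predict.py /path/to/images --refresh --minimum-bytes 256000`.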
12 changes: 10 additions & 2 deletions src/reports/seg_report.html
@@ -65,11 +65,19 @@
columns: [
{ title: 'Image', data: 'image', width: "25%", render: (d,t,r,m) => '<a href="' + d + '" target="_blank"><img src="'+d+'" style=height:200px; loading="lazy" /></a>'},
{ title: 'Path', data: 'original'},
{ title: 'Class', data: 'pred_class'},
{ title: 'Class', data: 'pred_class', render: function (data, type, row, meta) {
if (data == 0) {
return 'no color bar';
} else if (data == 1) {
return 'has color bar';
} else {
return 'invalid box';
}
}},
{ title: 'Confidence', data: 'pred_conf', render: $.fn.dataTable.render.number(',', '.', 3, '')},
{ title: 'Problem', data: 'problem', render: function (data, type, row, meta) {
if (data) {
return 'problem detected';
return 'extended';
} else {
return '';
}
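For reference, the class codes written to the CSV now render as human-readable labels in the report. A minimal sketch of the same mapping in Python (the report does this in the DataTables render callback above; the dictionary and function here are illustrative, not part of the codebase):

```python
# Class codes emitted by the segmentation workflow and the labels the report shows.
CLASS_LABELS = {
    0: "no color bar",   # model predicted no color bar
    1: "has color bar",  # model predicted a usable color bar box
    2: "invalid box",    # bounding box was invalid and could not be extended
}

def label_for_class(pred_class: int) -> str:
    # Anything other than 0 or 1 falls through to "invalid box",
    # mirroring the template's else branch.
    return CLASS_LABELS.get(pred_class, "invalid box")
```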
4 changes: 4 additions & 0 deletions src/tests/test_bounding_box_utils.py
@@ -56,6 +56,10 @@ def test_is_problematic_box_none(self):
coords = None
assert not is_problematic_box(coords)

def test_is_problematic_box_three_edges_too_big(self):
coords = [0.0, 0.0616408920288086, 1.0, 1.0]
assert is_problematic_box(coords)

def test_extend_bounding_box_to_edges_one_edge_left(self):
coords = [0.0, 0.10288684844970702, 0.0860845947265625, 0.88]
result = extend_bounding_box_to_edges(coords)
26 changes: 23 additions & 3 deletions src/tests/test_cached_file_list.py
@@ -37,6 +37,23 @@ def test_with_directory_from_cache(self, tmp_path):
finally:
subject.cache_path.unlink()

def test_with_directory_files_too_small(self, tmp_path):
self.setup_test_dir(tmp_path)
subject = CachedFileList(self.base_path, { ".jpg", ".tiff" }, minimum_bytes = 20000)
try:
assert len(subject) == 0

# Increase size of the first file to above the threshold
self.write_file(self.file_path1, num_kb = 32)
subject = CachedFileList(self.base_path, { ".jpg", ".tiff" }, True, minimum_bytes = 20000)

assert len(subject) == 1
paths = self.collect_paths(subject)
assert len(subject) == 1
assert self.file_path1 in paths
finally:
subject.cache_path.unlink()

def test_with_file_list(self, tmp_path):
self.setup_test_dir(tmp_path)
file_path5 = tmp_path / "file5.tiff"
@@ -102,9 +119,12 @@ def setup_test_dir(self, tmp_path):
self.write_file(self.file_path2)
self.write_file(self.file_path3)

def write_file(self, file_path):
with open(file_path, "w") as text_file:
text_file.write(str(file_path))
def write_file(self, file_path, num_kb = 8):
content = "content!" * 128
with open(file_path, "a") as binary_file:
for _ in range(num_kb):
binary_file.write(content)
print(f"Wrote file {file_path.stat()}")

def collect_paths(self, subject):
paths = []
17 changes: 15 additions & 2 deletions src/tests/test_segmentation_workflow_service.py
@@ -36,6 +36,11 @@ def mock_model():
'boxes': torch.tensor([[ 0.0000, 100.5813, 150.0, 1333.0]]),
'labels': torch.tensor([1]),
'scores': torch.tensor([0.8920])
},
{
'boxes': torch.tensor([[ 111.0000, 100.5813, 400.0, 400.0]]),
'labels': torch.tensor([1]),
'scores': torch.tensor([0.76])
}]
# each call to model(image) will return a different set of outputs
model_mock.side_effect = [value1, value3]
@@ -53,14 +58,15 @@ def test_process(self, config, tmp_path, mock_load_from_checkpoint):
service = SegmentationWorkflowService(config, report_path)
service.process([Path('./fixtures/normalized_images/gilmer/00276_op0204_0001.jpg'),
Path('./fixtures/normalized_images/gilmer/00276_op0204_0001_tiny.jpg'),
Path('./fixtures/normalized_images/gilmer/00276_op0226a_0001.jpg')])
Path('./fixtures/normalized_images/gilmer/00276_op0226a_0001.jpg'),
Path('./fixtures/normalized_images/gilmer/00276_op0217_0001_e.jpg')])

assert report_path.exists()
with open(report_path, newline='') as f:
reader = csv.reader(f)
data = list(reader)

assert len(data) == 4
assert len(data) == 5
assert data[1][0].endswith('fixtures/normalized_images/gilmer/00276_op0204_0001.jpg')
assert 'output/00276_op0204_0001.jpg' in data[1][1]
assert data[1][2] == '1'
@@ -81,3 +87,10 @@ def test_process(self, config, tmp_path, mock_load_from_checkpoint):
assert data[3][3] == '0.8920'
assert data[3][4] == '[0.0, 0.19644784927368164, 0.29296875, 1.0]'
assert data[3][5] == '[0.0, 0.0, 0.29296875, 1.0]'

assert data[4][0].endswith('fixtures/normalized_images/gilmer/00276_op0217_0001_e.jpg')
assert 'output/00276_op0217_0001_e.jpg' in data[4][1]
assert data[4][2] == '2'
assert data[4][3] == '0.7600'
assert data[4][4] == '[0.216796875, 0.19644784927368164, 0.78125, 0.78125]'
assert data[4][5] == ''
22 changes: 16 additions & 6 deletions src/utils/bounding_box_utils.py
@@ -48,7 +48,7 @@ def is_problematic_box(coords):
if coords == None:
return False
count = number_sides_at_image_edge(coords)
return count != 3
return count != 3 or bounding_box_is_too_large(bounding_box_lengths(coords))

def number_sides_at_image_edge(coords):
count = 0
@@ -65,24 +65,34 @@ def get_box_coords(row, index = 4):
return box_coords
return None

def bounding_box_is_too_large(lengths):
return lengths[0] >= 0.5 and lengths[1] >= 0.5

def bounding_box_is_square(lengths):
return lengths[0] == lengths[1]

def bounding_box_lengths(coords):
horizontal_length = coords[2] - coords[0]
vertical_length = coords[3] - coords[1]
return (horizontal_length, vertical_length)

# Used to extend a bounding box that is only touching 1 or 2 image edges, so that it touches 3
# edges so that it is usable for cropping
def extend_bounding_box_to_edges(box_coords):
coords = box_coords.copy()
horizontal_length = coords[2] - coords[0]
vertical_length = coords[3] - coords[1]
lengths = bounding_box_lengths(coords)
# don't extend if it'll produce a bounding box greater than or equal to half the image
if vertical_length >= 0.5 and horizontal_length >= 0.5:
if bounding_box_is_too_large(lengths):
raise InvalidBoundingBoxException("Cannot extend bounding box to image edges, total size of bounding box is too large")
if vertical_length == horizontal_length:
if bounding_box_is_square(lengths):
raise InvalidBoundingBoxException("Cannot extend bounding box to image edges, sides are equal length")
left_edge = coords[0] == 0
right_edge = coords[2] == 1
top_edge = coords[1] == 0
bottom_edge = coords[3] == 1
if (left_edge or right_edge) and (bottom_edge or top_edge):
# bounding box touches two edges, so extend longest edge
if vertical_length > horizontal_length:
if lengths[1] > lengths[0]:
coords[1] = 0.0
coords[3] = 1.0
else:
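The effect of the new size check is easier to see in isolation. Below is a minimal, runnable sketch assuming the same normalized [x1, y1, x2, y2] coordinates (0.0–1.0) used throughout bounding_box_utils; number_sides_at_image_edge is reconstructed from the edge comparisons visible in extend_bounding_box_to_edges, since its body is not shown in this hunk:

```python
# Illustrative sketch of the updated checks; coordinates are normalized [x1, y1, x2, y2].

def bounding_box_lengths(coords):
    # (horizontal length, vertical length)
    return (coords[2] - coords[0], coords[3] - coords[1])

def bounding_box_is_too_large(lengths):
    # Covers at least half of the image in both dimensions.
    return lengths[0] >= 0.5 and lengths[1] >= 0.5

def number_sides_at_image_edge(coords):
    # A side sits on an image edge when x1 or y1 is 0.0, or x2 or y2 is 1.0.
    edges = [0.0, 0.0, 1.0, 1.0]
    return sum(1 for value, edge in zip(coords, edges) if value == edge)

def is_problematic_box(coords):
    if coords is None:
        return False
    count = number_sides_at_image_edge(coords)
    return count != 3 or bounding_box_is_too_large(bounding_box_lengths(coords))

# Box from the new test: it touches three edges, but spans the full width and
# nearly the full height, so the size check now flags it as problematic.
print(is_problematic_box([0.0, 0.0616408920288086, 1.0, 1.0]))  # True
```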
20 changes: 15 additions & 5 deletions src/utils/cached_file_list.py
@@ -10,11 +10,12 @@ class CachedFileList(list):
a file, then each line in the file will be treated as a path to be added, where any
directories in the file will be expanded.
"""
def __init__(self, file_path, extensions, refresh = False):
def __init__(self, file_path, extensions, refresh = False, minimum_bytes = 4000):
super().__init__()
self.file_path = file_path
self.extensions = extensions
self.cache_path = Path.cwd() / (file_path.stem + "-cache.txt")
self.minimum_bytes = minimum_bytes
if not self.cache_path.exists() or refresh:
self.populate_cache()
with open(self.cache_path, "r") as file:
@@ -33,9 +34,9 @@ def populate_cache(self):
self.recursive_paths_from_file_list()

def add_expanded_dir(self, dir_path):
for p in Path(dir_path).glob("**/*"):
if p.suffix in self.extensions:
print(str(p), file=self.file)
for path in Path(dir_path).glob("**/*"):
if path.suffix in self.extensions:
self.add_file_path(path)

def recursive_paths_from_file_list(self):
with open(self.file_path) as f:
@@ -45,7 +46,16 @@ def recursive_paths_from_file_list(self):
print(f"Expanding path {path}")
self.add_expanded_dir(path)
else:
print(str(path), file=self.file)
self.add_file_path(path)

# Adds the given file path to the cache file if it has an acceptable extension and is above the min size
def add_file_path(self, path):
if path.suffix in self.extensions:
file_size = path.stat().st_size
if file_size >= self.minimum_bytes:
print(str(path), file=self.file)
else:
print(f"Skipping small file {path}")

def __iter__(self):
self.file = open(self.cache_path, "r")
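The extension and size checks now live in one place, add_file_path, so both the directory walk and the file-list expansion share them. A standalone sketch of the same filtering, using a hypothetical helper name (collect_paths_over_threshold is not part of the module) and the constructor's default 4000-byte floor:

```python
from pathlib import Path

def collect_paths_over_threshold(root, extensions, minimum_bytes=4000):
    """Yield files under root whose suffix matches and whose size meets the floor.

    Hypothetical helper mirroring what CachedFileList.add_file_path does while
    populating its cache file.
    """
    for path in Path(root).glob("**/*"):
        if path.suffix not in extensions:
            continue
        if path.stat().st_size >= minimum_bytes:
            yield path
        else:
            print(f"Skipping small file {path}")

# Example: keep only JPEG/TIFF files of at least 128 KB.
# paths = list(collect_paths_over_threshold("/data/images", {".jpg", ".tiff"}, minimum_bytes=128000))
```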
4 changes: 2 additions & 2 deletions src/utils/segmentation_report_service.py
@@ -2,7 +2,7 @@
import os
from pathlib import Path
from src.utils.json_utils import to_json
from src.utils.bounding_box_utils import draw_bounding_boxes, is_problematic_box, get_box_coords
from src.utils.bounding_box_utils import draw_bounding_boxes, get_box_coords
from src.utils.common_utils import log
from PIL import Image
import shutil
@@ -63,7 +63,7 @@ def csv_to_data(self, row, image_path):
'original' : row[0],
'pred_class' : row[2],
'pred_conf' : row[3],
'problem' : is_problematic_box(boxes),
'problem' : bool(row[5]),
'image' : str(rel_path)
}

2 changes: 2 additions & 0 deletions src/utils/segmentation_workflow_service.py
@@ -89,6 +89,8 @@ def process(self, paths):
print(f" Problem detected with bounding box, extending to edges.")
except InvalidBoundingBoxException as e:
print(e.message)
# Set the predicted class to 2, to indicate it's an invalid prediction
predicted_class = 2
csv_writer.writerow([orig_path, normalized_path, predicted_class, "{:.4f}".format(top_score), box_norms, extended_box])
self.progress_tracker.record_completed(orig_path)
except (KeyboardInterrupt, SystemExit) as e:
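The behavior behind the new class code is small enough to isolate. A minimal sketch, using a stand-in exception and an injected extend function rather than the real bounding_box_utils imports, and not the service's full processing loop:

```python
class InvalidBoundingBoxException(Exception):
    """Stand-in for the exception raised by extend_bounding_box_to_edges."""
    def __init__(self, message):
        super().__init__(message)
        self.message = message

def resolve_row_class(predicted_class, box_norms, extend_fn):
    """Return (class_code, extended_box) for one CSV row.

    extend_fn stands in for extend_bounding_box_to_edges. Class code 2 marks a
    prediction whose bounding box was invalid and could not be extended, and the
    extended-box column is left empty for that row (as in the updated test).
    """
    try:
        extended_box = extend_fn(box_norms)
    except InvalidBoundingBoxException as e:
        print(e.message)
        # Mirror the commit: downgrade the row to the "invalid prediction" class.
        predicted_class = 2
        extended_box = ""
    return predicted_class, extended_box
```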
