Fix inconsistent anchor box generation between training and inference

tensorflower-gardener · tensorflower-gardener · commit 302e71ac379a · 2024-03-18T17:33:55.000-07:00
* Currently, `AnchorGenerator` and `Anchor` can generate different anchor boxes when the input image size is not divisible by 2^max_level. This results in inconsistent box predictions between training and inference. This change fixes the inconsistency by using only the `Anchor` class to generate anchors.
* Fix the case when input image is not square.
* Refactor the `Anchor` class to generate and store only `multilevel_boxes`, since the flattened boxes are never used.

PiperOrigin-RevId: 616997347
diff --git a/official/vision/ops/anchor.py b/official/vision/ops/anchor.py
@@ -22,7 +22,6 @@
 
 import tensorflow as tf, tf_keras
 
-from official.vision.ops import anchor_generator
 from official.vision.ops import box_matcher
 from official.vision.ops import iou_similarity
 from official.vision.ops import target_gather
@@ -32,7 +31,38 @@
 
 
 class Anchor(object):
-  """Anchor class for anchor-based object detectors."""
+  """Anchor class for anchor-based object detectors.
+
+  Example:
+  ```python
+  anchor_boxes = Anchor(
+      min_level=3,
+      max_level=4,
+      num_scales=2,
+      aspect_ratios=[0.5, 1., 2.],
+      anchor_size=4.,
+      image_size=[256, 256],
+  ).multilevel_boxes
+  ```
+
+  Attributes:
+    min_level: integer number of minimum level of the output feature pyramid.
+    max_level: integer number of maximum level of the output feature pyramid.
+    num_scales: integer number representing intermediate scales added on each
+      level. For instances, num_scales=2 adds one additional intermediate
+      anchor scales [2^0, 2^0.5] on each level.
+    aspect_ratios: list of float numbers representing the aspect ratio anchors
+      added on each level. The number indicates the ratio of width to height.
+      For instances, aspect_ratios=[1.0, 2.0, 0.5] adds three anchors on each
+      scale level.
+    anchor_size: float number representing the scale of size of the base
+      anchor to the feature stride 2^level.
+    image_size: a list of integer numbers or Tensors representing [height,
+      width] of the input image size.
+    multilevel_boxes: an OrderedDict from level to the generated anchor boxes of
+      shape [height_l, width_l, num_anchors_per_location * 4].
+    anchors_per_location: number of anchors per pixel location.
+  """
 
   def __init__(
       self,
@@ -43,57 +73,40 @@ def __init__(
       anchor_size,
       image_size,
   ):
-    """Constructs multi-scale anchors.
-
-    Args:
-      min_level: integer number of minimum level of the output feature pyramid.
-      max_level: integer number of maximum level of the output feature pyramid.
-      num_scales: integer number representing intermediate scales added on each
-        level. For instances, num_scales=2 adds one additional intermediate
-        anchor scales [2^0, 2^0.5] on each level.
-      aspect_ratios: list of float numbers representing the aspect ratio anchors
-        added on each level. The number indicates the ratio of width to height.
-        For instances, aspect_ratios=[1.0, 2.0, 0.5] adds three anchors on each
-        scale level.
-      anchor_size: float number representing the scale of size of the base
-        anchor to the feature stride 2^level.
-      image_size: a list of integer numbers or Tensors representing [height,
-        width] of the input image size.The image_size should be divided by the
-        largest feature stride 2^max_level.
-    """
+    """Initializes the instance."""
     self.min_level = min_level
     self.max_level = max_level
     self.num_scales = num_scales
     self.aspect_ratios = aspect_ratios
     self.anchor_size = anchor_size
     self.image_size = image_size
-    self.boxes = self._generate_boxes()
+    self.multilevel_boxes = self._generate_multilevel_boxes()
 
-  def _generate_boxes(self) -> tf.Tensor:
+  def _generate_multilevel_boxes(self) -> Dict[str, tf.Tensor]:
     """Generates multi-scale anchor boxes.
 
     Returns:
-      a Tensor of shape [N, 4], representing anchor boxes of all levels
-      concatenated together.
+      An OrderedDict from level to anchor boxes of shape [height_l, width_l,
+      num_anchors_per_location * 4].
     """
-    boxes_all = []
+    multilevel_boxes = collections.OrderedDict()
     for level in range(self.min_level, self.max_level + 1):
       boxes_l = []
-      feat_size = math.ceil(self.image_size[0] / 2**level)
-      stride = tf.cast(self.image_size[0] / feat_size, tf.float32)
+      feat_size_y = math.ceil(self.image_size[0] / 2**level)
+      feat_size_x = math.ceil(self.image_size[1] / 2**level)
+      stride_y = tf.cast(self.image_size[0] / feat_size_y, tf.float32)
+      stride_x = tf.cast(self.image_size[1] / feat_size_x, tf.float32)
+      x = tf.range(stride_x / 2, self.image_size[1], stride_x)
+      y = tf.range(stride_y / 2, self.image_size[0], stride_y)
+      xv, yv = tf.meshgrid(x, y)
       for scale in range(self.num_scales):
         for aspect_ratio in self.aspect_ratios:
-          intermidate_scale = 2 ** (scale / float(self.num_scales))
-          base_anchor_size = self.anchor_size * stride * intermidate_scale
+          intermidate_scale = 2 ** (scale / self.num_scales)
+          base_anchor_size = self.anchor_size * 2**level * intermidate_scale
           aspect_x = aspect_ratio**0.5
           aspect_y = aspect_ratio**-0.5
           half_anchor_size_x = base_anchor_size * aspect_x / 2.0
           half_anchor_size_y = base_anchor_size * aspect_y / 2.0
-          x = tf.range(stride / 2, self.image_size[1], stride)
-          y = tf.range(stride / 2, self.image_size[0], stride)
-          xv, yv = tf.meshgrid(x, y)
-          xv = tf.cast(tf.reshape(xv, [-1]), dtype=tf.float32)
-          yv = tf.cast(tf.reshape(yv, [-1]), dtype=tf.float32)
           # Tensor shape Nx4.
           boxes = tf.stack(
               [
@@ -102,41 +115,18 @@ def _generate_boxes(self) -> tf.Tensor:
                   yv + half_anchor_size_y,
                   xv + half_anchor_size_x,
               ],
-              axis=1,
+              axis=-1,
           )
           boxes_l.append(boxes)
-      # Concat anchors on the same level to tensor shape NxAx4.
-      boxes_l = tf.stack(boxes_l, axis=1)
-      boxes_l = tf.reshape(boxes_l, [-1, 4])
-      boxes_all.append(boxes_l)
-    return tf.concat(boxes_all, axis=0)
-
-  def unpack_labels(self, labels: tf.Tensor) -> Dict[str, tf.Tensor]:
-    """Unpacks an array of labels into multi-scales labels."""
-    unpacked_labels = collections.OrderedDict()
-    count = 0
-    for level in range(self.min_level, self.max_level + 1):
-      feat_size_y = tf.cast(
-          math.ceil(self.image_size[0] / 2**level), tf.int32
-      )
-      feat_size_x = tf.cast(
-          math.ceil(self.image_size[1] / 2**level), tf.int32
-      )
-      steps = feat_size_y * feat_size_x * self.anchors_per_location
-      unpacked_labels[str(level)] = tf.reshape(
-          labels[count : count + steps], [feat_size_y, feat_size_x, -1]
-      )
-      count += steps
-    return unpacked_labels
+      # Concat anchors on the same level to tensor shape HxWx(Ax4).
+      boxes_l = tf.concat(boxes_l, axis=-1)
+      multilevel_boxes[str(level)] = boxes_l
+    return multilevel_boxes
 
   @property
-  def anchors_per_location(self):
+  def anchors_per_location(self) -> int:
     return self.num_scales * len(self.aspect_ratios)
 
-  @property
-  def multilevel_boxes(self):
-    return self.unpack_labels(self.boxes)
-
 
 class AnchorLabeler(object):
   """Labeler for dense object detector."""
@@ -420,24 +410,68 @@ def label_anchors(  # pytype: disable=signature-mismatch  # overriding-parameter
     return score_targets_dict, box_targets_dict
 
 
+class AnchorGeneratorv2:
+  """Utility to generate anchors for a multiple feature maps.
+
+  Attributes:
+    min_level: integer number of minimum level of the output feature pyramid.
+    max_level: integer number of maximum level of the output feature pyramid.
+    num_scales: integer number representing intermediate scales added on each
+      level. For instances, num_scales=2 adds one additional intermediate
+      anchor scales [2^0, 2^0.5] on each level.
+    aspect_ratios: list of float numbers representing the aspect ratio anchors
+      added on each level. The number indicates the ratio of width to height.
+      For instances, aspect_ratios=[1.0, 2.0, 0.5] adds three anchors on each
+      scale level.
+    anchor_size: float number representing the scale of size of the base
+      anchor to the feature stride 2^level.
+  """
+
+  def __init__(
+      self,
+      min_level,
+      max_level,
+      num_scales,
+      aspect_ratios,
+      anchor_size,
+    ):
+    """Initializes the instance."""
+    self.min_level = min_level
+    self.max_level = max_level
+    self.num_scales = num_scales
+    self.aspect_ratios = aspect_ratios
+    self.anchor_size = anchor_size
+
+  def __call__(self, image_size):
+    """Generate multilevel anchor boxes.
+
+    Args:
+      image_size: a list of integer numbers or Tensors representing [height,
+        width] of the input image size.
+    Returns:
+      An ordered dictionary from level to anchor boxes of shape [height_l,
+      width_l, num_anchors_per_location * 4].
+    """
+    return Anchor(
+        min_level=self.min_level,
+        max_level=self.max_level,
+        num_scales=self.num_scales,
+        aspect_ratios=self.aspect_ratios,
+        anchor_size=self.anchor_size,
+        image_size=image_size,
+    ).multilevel_boxes
+
+
 def build_anchor_generator(
     min_level, max_level, num_scales, aspect_ratios, anchor_size
 ):
   """Build anchor generator from levels."""
-  anchor_sizes = collections.OrderedDict()
-  strides = collections.OrderedDict()
-  scales = []
-  for scale in range(num_scales):
-    scales.append(2 ** (scale / float(num_scales)))
-  for level in range(min_level, max_level + 1):
-    stride = 2**level
-    strides[str(level)] = stride
-    anchor_sizes[str(level)] = anchor_size * stride
-  anchor_gen = anchor_generator.AnchorGenerator(
-      anchor_sizes=anchor_sizes,
-      scales=scales,
+  anchor_gen = AnchorGeneratorv2(
+      min_level=min_level,
+      max_level=max_level,
+      num_scales=num_scales,
       aspect_ratios=aspect_ratios,
-      strides=strides,
+      anchor_size=anchor_size,
   )
   return anchor_gen
 
diff --git a/official/vision/ops/anchor_generator.py b/official/vision/ops/anchor_generator.py
@@ -109,7 +109,7 @@ def __call__(self, image_size):
     return tf.reshape(result, [shape[0], shape[1], shape[2] * shape[3]])
 
 
-class AnchorGenerator():
+class AnchorGeneratorv1():
   """Utility to generate anchors for a multiple feature maps.
 
   Example:
diff --git a/official/vision/ops/anchor_generator_test.py b/official/vision/ops/anchor_generator_test.py
@@ -77,7 +77,7 @@ def testAnchorGeneration(self, min_level, max_level, aspect_ratios,
     levels = range(min_level, max_level + 1)
     anchor_sizes = [2**(level + 1) for level in levels]
     strides = [2**level for level in levels]
-    anchor_gen = anchor_generator.AnchorGenerator(
+    anchor_gen = anchor_generator.AnchorGeneratorv1(
         anchor_sizes=anchor_sizes,
         scales=[1.],
         aspect_ratios=aspect_ratios,
@@ -98,7 +98,7 @@ def testAnchorGenerationClipped(self, min_level, max_level, aspect_ratios,
     levels = range(min_level, max_level + 1)
     anchor_sizes = [2**(level + 1) for level in levels]
     strides = [2**level for level in levels]
-    anchor_gen = anchor_generator.AnchorGenerator(
+    anchor_gen = anchor_generator.AnchorGeneratorv1(
         anchor_sizes=anchor_sizes,
         scales=[1.],
         aspect_ratios=aspect_ratios,
@@ -122,7 +122,7 @@ def testAnchorGenerationDict(self, min_level, max_level, aspect_ratios,
     levels = range(min_level, max_level + 1)
     anchor_sizes = dict((str(level), 2**(level + 1)) for level in levels)
     strides = dict((str(level), 2**level) for level in levels)
-    anchor_gen = anchor_generator.AnchorGenerator(
+    anchor_gen = anchor_generator.AnchorGeneratorv1(
         anchor_sizes=anchor_sizes,
         scales=[1.],
         aspect_ratios=aspect_ratios,
diff --git a/official/vision/ops/anchor_test.py b/official/vision/ops/anchor_test.py
@@ -58,40 +58,52 @@ def testAnchorRpnSample(self, num_anchors, num_positives,
     self.assertEqual(negatives, expected_negatives)
 
   @parameterized.parameters(
-      # Single scale anchor.
-      (5, 5, 1, [1.0], 2.0,
-       [[-16, -16, 48, 48], [-16, 16, 48, 80],
-        [16, -16, 80, 48], [16, 16, 80, 80]]),
-      # Multi scale anchor.
-      (5, 6, 1, [1.0], 2.0,
-       [[-16, -16, 48, 48], [-16, 16, 48, 80],
-        [16, -16, 80, 48], [16, 16, 80, 80], [-32, -32, 96, 96]]),
-      # # Multi aspect ratio anchor.
-      (6, 6, 1, [1.0, 4.0, 0.25], 2.0,
-       [[-32, -32, 96, 96], [-0, -96, 64, 160], [-96, -0, 160, 64]]),
-
+      # Single scale anchor
+      (5, 5, 1, [1.0], 2.0, [64, 64],
+       {'5': [[[-16, -16, 48, 48], [-16, 16, 48, 80]],
+              [[16, -16, 80, 48], [16, 16, 80, 80]]]}),
+      # Multi scale anchor
+      (5, 6, 1, [1.0], 2.0, [64, 64],
+       {'5': [[[-16, -16, 48, 48], [-16, 16, 48, 80]],
+              [[16, -16, 80, 48], [16, 16, 80, 80]]],
+        '6': [[[-32, -32, 96, 96]]]}),
+      # Multi aspect ratio anchor
+      (6, 6, 1, [1.0, 4.0, 0.25], 2.0, [64, 64],
+       {'6': [[[-32, -32, 96, 96, -0, -96, 64, 160, -96, -0, 160, 64]]]}),
+      # Intermediate scales
+      (5, 5, 2, [1.0], 1.0, [32, 32],
+       {'5': [[[0, 0, 32, 32,
+                16 - 16 * 2**0.5, 16 - 16 * 2**0.5,
+                16 + 16 * 2**0.5, 16 + 16 * 2**0.5]]]}),
+      # Non-square
+      (5, 5, 1, [1.0], 1.0, [64, 32],
+       {'5': [[[0, 0, 32, 32]],
+              [[32, 0, 64, 32]]]}),
+      # Indivisible by 2^level
+      (5, 5, 1, [1.0], 1.0, [40, 32],
+       {'5': [[[-6, 0, 26, 32]],
+              [[14, 0, 46, 32]]]}),
   )
   def testAnchorGeneration(self, min_level, max_level, num_scales,
-                           aspect_ratios, anchor_size, expected_boxes):
-    image_size = [64, 64]
+                           aspect_ratios, anchor_size, image_size,
+                           expected_boxes):
     anchors = anchor.Anchor(min_level, max_level, num_scales, aspect_ratios,
                             anchor_size, image_size)
-    boxes = anchors.boxes.numpy()
-    self.assertEqual(expected_boxes, boxes.tolist())
+    self.assertAllClose(expected_boxes, anchors.multilevel_boxes)
 
   @parameterized.parameters(
       # Single scale anchor.
       (5, 5, 1, [1.0], 2.0,
-       [[-16, -16, 48, 48], [-16, 16, 48, 80],
-        [16, -16, 80, 48], [16, 16, 80, 80]]),
+       {'5': [[[-16, -16, 48, 48], [-16, 16, 48, 80]],
+              [[16, -16, 80, 48], [16, 16, 80, 80]]]}),
       # Multi scale anchor.
       (5, 6, 1, [1.0], 2.0,
-       [[-16, -16, 48, 48], [-16, 16, 48, 80],
-        [16, -16, 80, 48], [16, 16, 80, 80], [-32, -32, 96, 96]]),
-      # # Multi aspect ratio anchor.
+       {'5': [[[-16, -16, 48, 48], [-16, 16, 48, 80]],
+              [[16, -16, 80, 48], [16, 16, 80, 80]]],
+        '6': [[[-32, -32, 96, 96]]]}),
+      # Multi aspect ratio anchor.
       (6, 6, 1, [1.0, 4.0, 0.25], 2.0,
-       [[-32, -32, 96, 96], [-0, -96, 64, 160], [-96, -0, 160, 64]]),
-
+       {'6': [[[-32, -32, 96, 96, -0, -96, 64, 160, -96, -0, 160, 64]]]}),
   )
   def testAnchorGenerationWithImageSizeAsTensor(self,
                                                 min_level,
@@ -103,8 +115,25 @@ def testAnchorGenerationWithImageSizeAsTensor(self,
     image_size = tf.constant([64, 64], tf.int32)
     anchors = anchor.Anchor(min_level, max_level, num_scales, aspect_ratios,
                             anchor_size, image_size)
-    boxes = anchors.boxes.numpy()
-    self.assertEqual(expected_boxes, boxes.tolist())
+    self.assertAllClose(expected_boxes, anchors.multilevel_boxes)
+
+  @parameterized.parameters(
+      (6, 8, 2, [1.0, 2.0, 0.5], 3.0, [320, 256]),
+  )
+  def testAnchorGenerationAreCentered(self, min_level, max_level, num_scales,
+                                      aspect_ratios, anchor_size, image_size):
+    anchors = anchor.Anchor(min_level, max_level, num_scales, aspect_ratios,
+                            anchor_size, image_size)
+    multilevel_boxes = anchors.multilevel_boxes
+    image_size = np.array(image_size)
+    for boxes in multilevel_boxes.values():
+      boxes = boxes.numpy()
+      box_centers = boxes.mean(axis=0).mean(axis=0)
+      box_centers = [
+          (box_centers[0] + box_centers[2]) / 2,
+          (box_centers[1] + box_centers[3]) / 2,
+      ]
+      self.assertAllClose(image_size / 2, box_centers)
 
   @parameterized.parameters(
       (3, 6, 2, [1.0], 2.0, False),
@@ -164,6 +193,7 @@ def testLabelAnchors(self, min_level, max_level, num_scales, aspect_ratios,
       (3, 7, [.5, 1., 2.], 2, 8, (256, 256)),
       (3, 8, [1.], 3, 32, (512, 512)),
       (3, 3, [1.], 2, 4, (32, 32)),
+      (4, 8, [.5, 1., 2.], 2, 3, (320, 256)),
   )
   def testEquivalentResult(self, min_level, max_level, aspect_ratios,
                            num_scales, anchor_size, image_size):