Support custom anchors with different anchors per location by level

tensorflower-gardener · tensorflower-gardener · commit 247331deb4d3 · 2024-02-29T15:35:02.000-08:00
PiperOrigin-RevId: 611616542
diff --git a/official/vision/modeling/factory.py b/official/vision/modeling/factory.py
@@ -14,7 +14,7 @@
 
 """Factory methods to build models."""
 
-from typing import Optional
+from typing import Mapping, Optional
 
 import tensorflow as tf, tf_keras
 
@@ -262,9 +262,28 @@ def build_retinanet(
     model_config: retinanet_cfg.RetinaNet,
     l2_regularizer: Optional[tf_keras.regularizers.Regularizer] = None,
     backbone: Optional[tf_keras.Model] = None,
-    decoder: Optional[tf_keras.Model] = None
+    decoder: Optional[tf_keras.Model] = None,
+    num_anchors_per_location: int | dict[str, int] | None = None,
+    anchor_boxes: Mapping[str, tf.Tensor] | None = None,
 ) -> tf_keras.Model:
-  """Builds RetinaNet model."""
+  """Builds a RetinaNet model.
+
+  Args:
+    input_specs: The InputSpec of the input image tensor to the model.
+    model_config: The RetinaNet model configuration to build from.
+    l2_regularizer: Optional l2 regularizer to use for building the backbone,
+      decoder, and head.
+    backbone: Optional instance of the backbone model.
+    decoder: Optional instance of the decoder model.
+    num_anchors_per_location: Optional number of anchors per pixel location for
+      building the RetinaNetHead. If an `int`, the same number is used for all
+      levels. If a `dict`, it specifies the number at each level. If `None`, it
+      uses `len(aspect_ratios) * num_scales` from the anchor config by default.
+    anchor_boxes: Optional fixed multilevel anchor boxes for inference.
+
+  Returns:
+    RetinaNet model.
+  """
   norm_activation_config = model_config.norm_activation
   if not backbone:
     backbone = backbones.factory.build_backbone(
@@ -282,7 +301,7 @@ def build_retinanet(
 
   head_config = model_config.head
   generator_config = model_config.detection_generator
-  num_anchors_per_location = (
+  num_anchors_per_location = num_anchors_per_location or (
       len(model_config.anchor.aspect_ratios) * model_config.anchor.num_scales)
 
   head = dense_prediction_heads.RetinaNetHead(
@@ -333,16 +352,26 @@ def build_retinanet(
       box_coder_weights=generator_config.box_coder_weights,
   )
 
+  num_scales = None
+  aspect_ratios = None
+  anchor_size = None
+  if anchor_boxes is None:
+    num_scales = model_config.anchor.num_scales
+    aspect_ratios = model_config.anchor.aspect_ratios
+    anchor_size = model_config.anchor.anchor_size
+
   model = retinanet_model.RetinaNetModel(
       backbone,
       decoder,
       head,
       detection_generator_obj,
+      anchor_boxes=anchor_boxes,
       min_level=model_config.min_level,
       max_level=model_config.max_level,
-      num_scales=model_config.anchor.num_scales,
-      aspect_ratios=model_config.anchor.aspect_ratios,
-      anchor_size=model_config.anchor.anchor_size)
+      num_scales=num_scales,
+      aspect_ratios=aspect_ratios,
+      anchor_size=anchor_size,
+  )
   return model
 
 
diff --git a/official/vision/modeling/factory_test.py b/official/vision/modeling/factory_test.py
@@ -13,13 +13,15 @@
 # limitations under the License.
 
 """Tests for factory.py."""
+import collections
 
 # Import libraries
 from absl.testing import parameterized
 import tensorflow as tf, tf_keras
 
 from official.vision.configs import backbones
 from official.vision.configs import backbones_3d
+from official.vision.configs import decoders
 from official.vision.configs import image_classification as classification_cfg
 from official.vision.configs import maskrcnn as maskrcnn_cfg
 from official.vision.configs import retinanet as retinanet_cfg
@@ -123,6 +125,47 @@ def test_builder(self, backbone_type, input_size, has_att_heads):
           ),
       )
 
+  def test_build_model_with_custom_anchors_can_run(self):
+    image_size = (16, 16)
+    input_specs = tf_keras.layers.InputSpec(shape=[None, *image_size, 3])
+    model_config = retinanet_cfg.RetinaNet(
+        num_classes=5,
+        min_level=3,
+        max_level=4,
+        decoder=decoders.Decoder(type='identity'),
+        head=retinanet_cfg.RetinaNetHead(
+            num_convs=0, share_level_convs=False,
+        )
+    )
+    anchor_boxes = collections.OrderedDict()
+    anchor_boxes['3'] = tf.constant(
+        [
+            [[3, 4, 5, 6], [3, 4, 5, 6]],
+            [[3, 4, 5, 6], [3, 4, 5, 6]],
+        ],
+        dtype=tf.float32,
+    )
+    anchor_boxes['4'] = tf.constant(
+        [[[3, 4, 5, 6, 3, 4, 5, 6]]], dtype=tf.float32
+    )
+    model = factory.build_retinanet(
+        input_specs=input_specs,
+        model_config=model_config,
+        anchor_boxes=anchor_boxes,
+        num_anchors_per_location={'3': 1, '4': 2},
+    )
+    test_input = tf.zeros([2, *image_size, 3])
+    outputs = model.call(test_input)
+    self.assertIn('box_outputs', outputs)
+    self.assertIn('3', outputs['box_outputs'])
+    self.assertIn('4', outputs['box_outputs'])
+    self.assertAllEqual(
+        outputs['box_outputs']['3'].numpy().shape, [2, 2, 2, 4 * 1]
+    )
+    self.assertAllEqual(
+        outputs['box_outputs']['4'].numpy().shape, [2, 1, 1, 4 * 2]
+    )
+
 
 class VideoClassificationModelBuilderTest(parameterized.TestCase,
                                           tf.test.TestCase):
diff --git a/official/vision/modeling/heads/dense_prediction_heads.py b/official/vision/modeling/heads/dense_prediction_heads.py
@@ -33,7 +33,7 @@ def __init__(
       min_level: int,
       max_level: int,
       num_classes: int,
-      num_anchors_per_location: int,
+      num_anchors_per_location: int | dict[str, int],
       num_convs: int = 4,
       num_filters: int = 256,
       attribute_heads: Optional[List[Dict[str, Any]]] = None,
@@ -55,7 +55,9 @@ def __init__(
       min_level: An `int` number of minimum feature level.
       max_level: An `int` number of maximum feature level.
       num_classes: An `int` number of classes to predict.
-      num_anchors_per_location: An `int` number of anchors per pixel location.
+      num_anchors_per_location: Number of anchors per pixel location. If an
+        `int`, the same number is used for all levels. If a `dict`, it specifies
+        the number at each level.
       num_convs: An `int` number that represents the number of the intermediate
         conv layers before the prediction.
       num_filters: An `int` number that represents the number of filters of the
@@ -134,15 +136,21 @@ def __init__(
     }
 
     self._classifier_kwargs = {
-        'filters': (
-            self._config_dict['num_classes']
-            * self._config_dict['num_anchors_per_location']
-        ),
         'kernel_size': 3,
         'padding': 'same',
         'bias_initializer': tf.constant_initializer(-np.log((1 - 0.01) / 0.01)),
         'bias_regularizer': self._config_dict['bias_regularizer'],
     }
+    if isinstance(self._config_dict['num_anchors_per_location'], dict):
+      self._classifier_kwargs['filters'] = {
+          level: v * self._config_dict['num_classes']
+          for level, v in self._config_dict['num_anchors_per_location'].items()
+      }
+    else:
+      self._classifier_kwargs['filters'] = (
+          self._config_dict['num_classes']
+          * self._config_dict['num_anchors_per_location']
+      )
     if self._config_dict['use_separable_conv']:
       self._classifier_kwargs.update({
           'depthwise_initializer': tf_keras.initializers.RandomNormal(
@@ -161,15 +169,21 @@ def __init__(
       })
 
     self._box_regressor_kwargs = {
-        'filters': (
-            self._config_dict['num_params_per_anchor']
-            * self._config_dict['num_anchors_per_location']
-        ),
         'kernel_size': 3,
         'padding': 'same',
         'bias_initializer': tf.zeros_initializer(),
         'bias_regularizer': self._config_dict['bias_regularizer'],
     }
+    if isinstance(self._config_dict['num_anchors_per_location'], dict):
+      self._box_regressor_kwargs['filters'] = {
+          level: v * self._config_dict['num_params_per_anchor']
+          for level, v in self._config_dict['num_anchors_per_location'].items()
+      }
+    else:
+      self._box_regressor_kwargs['filters'] = (
+          self._config_dict['num_params_per_anchor']
+          * self._config_dict['num_anchors_per_location']
+      )
     if self._config_dict['use_separable_conv']:
       self._box_regressor_kwargs.update({
           'depthwise_initializer': tf_keras.initializers.RandomNormal(
@@ -341,9 +355,16 @@ def _build_prediction_tower(
       for level in range(
           self._config_dict['min_level'], self._config_dict['max_level'] + 1
       ):
-        predictor_kwargs = self._conv_kwargs_new_kernel_init(predictor_kwargs)
+        predictor_kwargs_level = predictor_kwargs.copy()
+        if isinstance(predictor_kwargs_level['filters'], dict):
+          predictor_kwargs_level['filters'] = predictor_kwargs_level['filters'][
+              str(level)
+          ]
+        predictor_kwargs_level = self._conv_kwargs_new_kernel_init(
+            predictor_kwargs_level
+        )
         predictors.append(
-            conv_op(name=f'{predictor_name}-{level}', **predictor_kwargs)
+            conv_op(name=f'{predictor_name}-{level}', **predictor_kwargs_level)
         )
 
     return convs, norms, predictors
diff --git a/official/vision/modeling/heads/dense_prediction_heads_test.py b/official/vision/modeling/heads/dense_prediction_heads_test.py
@@ -181,6 +181,28 @@ def test_forward_shared_prediction_tower_with_share_classification_heads(
     }
     retinanet_head(features)
 
+  def test_forward_with_num_anchors_per_location_by_level(self):
+    bs = 2
+    retinanet_head = dense_prediction_heads.RetinaNetHead(
+        min_level=3,
+        max_level=4,
+        num_classes=7,
+        num_anchors_per_location={'3': 2, '4': 5},
+        num_convs=0,
+        num_filters=123,
+        attribute_heads=None,
+        share_level_convs=False,
+    )
+    features = {
+        '3': np.random.rand(bs, 32, 32, 11),
+        '4': np.random.rand(bs, 16, 16, 13),
+    }
+    scores, boxes, _ = retinanet_head(features)
+    self.assertAllEqual(scores['3'].numpy().shape, [bs, 32, 32, 2 * 7])
+    self.assertAllEqual(boxes['3'].numpy().shape, [bs, 32, 32, 2 * 4])
+    self.assertAllEqual(scores['4'].numpy().shape, [bs, 16, 16, 5 * 7])
+    self.assertAllEqual(boxes['4'].numpy().shape, [bs, 16, 16, 5 * 4])
+
   def test_serialize_deserialize(self):
     retinanet_head = dense_prediction_heads.RetinaNetHead(
         min_level=3,
diff --git a/official/vision/modeling/retinanet_model.py b/official/vision/modeling/retinanet_model.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 """RetinaNet."""
+import collections
 from typing import Any, Mapping, List, Optional, Union, Sequence
 
 # Import libraries
@@ -30,6 +31,7 @@ def __init__(self,
                decoder: tf_keras.Model,
                head: tf_keras.layers.Layer,
                detection_generator: tf_keras.layers.Layer,
+               anchor_boxes: Mapping[str, tf.Tensor] | None = None,
                min_level: Optional[int] = None,
                max_level: Optional[int] = None,
                num_scales: Optional[int] = None,
@@ -43,6 +45,12 @@ def __init__(self,
       decoder: `tf_keras.Model` a decoder network.
       head: `RetinaNetHead`, the RetinaNet head.
       detection_generator: the detection generator.
+      anchor_boxes: a dict of tensors which includes multilevel anchors.
+        - key: `str`, the level of the multilevel predictions.
+        - values: `Tensor`, the anchor coordinates of a particular feature
+            level, whose shape is [height_l, width_l, 4 *
+            num_anchors_per_location_l].
+        If provided, these anchors will be used for inference (training=False).
       min_level: Minimum level in output feature maps.
       max_level: Maximum level in output feature maps.
       num_scales: A number representing intermediate scales added
@@ -72,11 +80,12 @@ def __init__(self,
     self._decoder = decoder
     self._head = head
     self._detection_generator = detection_generator
+    self._anchor_boxes = anchor_boxes
 
   def call(self,
            images: Union[tf.Tensor, Sequence[tf.Tensor]],
            image_shape: Optional[tf.Tensor] = None,
-           anchor_boxes: Optional[Mapping[str, tf.Tensor]] = None,
+           anchor_boxes: Mapping[str, tf.Tensor] | None = None,
            output_intermediate_features: bool = False,
            training: bool = None) -> Mapping[str, tf.Tensor]:
     """Forward pass of the RetinaNet model.
@@ -91,10 +100,8 @@ def call(self,
         this is the actual image shape excluding paddings. For example, images
         in the batch may be resized into different shapes before padding to the
         fixed size.
-      anchor_boxes: a dict of tensors which includes multilevel anchors.
-        - key: `str`, the level of the multilevel predictions.
-        - values: `Tensor`, the anchor coordinates of a particular feature
-            level, whose shape is [height_l, width_l, num_anchors_per_location].
+      anchor_boxes: the anchor boxes to use for inference (training=False).
+        Ignored when anchor boxes were already provided in the init.
       output_intermediate_features: `bool` indicating whether to return the
         intermediate feature maps generated by backbone and decoder.
       training: `bool`, indicating whether it is in training mode.
@@ -131,18 +138,23 @@ def call(self,
 
     # Dense prediction. `raw_attributes` can be empty.
     raw_scores, raw_boxes, raw_attributes = self.head(features)
+    outputs.update({
+        'cls_outputs': raw_scores,
+        'box_outputs': raw_boxes,
+    })
 
     if training:
-      outputs.update({
-          'cls_outputs': raw_scores,
-          'box_outputs': raw_boxes,
-      })
       if raw_attributes:
         outputs.update({'attribute_outputs': raw_attributes})
       return outputs
     else:
-      # Generate anchor boxes for this batch if not provided.
-      if anchor_boxes is None:
+      if self._anchor_boxes is not None:
+        batch_size = tf.shape(raw_boxes[str(self._config_dict['min_level'])])[0]
+        anchor_boxes = collections.OrderedDict()
+        for level, boxes in self._anchor_boxes.items():
+          anchor_boxes[level] = tf.tile(boxes[None, ...], [batch_size, 1, 1, 1])
+      elif anchor_boxes is None:
+        # Generate anchor boxes for this batch if not provided.
         if isinstance(images, Sequence):
           primary_images = images[0]
         elif isinstance(images, tf.Tensor):
@@ -169,10 +181,6 @@ def call(self,
       final_results = self.detection_generator(raw_boxes, raw_scores,
                                                anchor_boxes, image_shape,
                                                raw_attributes)
-      outputs.update({
-          'cls_outputs': raw_scores,
-          'box_outputs': raw_boxes,
-      })
 
       def _update_decoded_results():
         outputs.update({