
Commit 84f3eeb

iindyk authored and tf-transform-team committed
Adding tf.RaggedTensor support to tft.tukey_scale, tft.tukey_h_params, tft.scale_to_gaussian.
PiperOrigin-RevId: 404391810
1 parent 0927fdb commit 84f3eeb

File tree

4 files changed: +153 -17 lines changed


RELEASE.md

Lines changed: 2 additions & 1 deletion
@@ -5,7 +5,8 @@
 ## Major Features and Improvements
 
 *   Added `tf.RaggedTensor` support to `tft.bucketize`,
-    `tft.compute_and_apply_vocabulary` and related analyzers and mappers.
+    `tft.compute_and_apply_vocabulary`, `tft.scale_to_gaussian` and related
+    analyzers and mappers with `reduce_instance_dims=True`.
 
 ## Bug Fixes and Other Changes
 
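For context, a minimal sketch (not part of this commit) of how the newly supported mapper call might look in a user's `preprocessing_fn`. The feature name `'x'` and its schema are illustrative assumptions; like all tf.Transform mappers that rely on analyzers, this only executes as part of an analyze/transform pass (e.g. `tft_beam.AnalyzeAndTransformDataset`), not eagerly.

```python
import tensorflow as tf
import tensorflow_transform as tft


def preprocessing_fn(inputs):
  # 'x' is assumed to be declared in the schema with tf.io.RaggedFeature,
  # so it arrives here as a tf.RaggedTensor.
  x = inputs['x']
  # With elementwise=False (the default), a single Tukey location/scale is
  # estimated over all values in the column, matching the
  # reduce_instance_dims=True behavior called out in the release note.
  return {'x_gaussianized': tft.scale_to_gaussian(x)}
```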
tensorflow_transform/analyzers.py

Lines changed: 7 additions & 7 deletions
@@ -856,7 +856,7 @@ def _mean_and_var(x, reduce_instance_dims=True, output_dtype=None):
 
 
 @common.log_api_use(common.ANALYZER_COLLECTION)
-def tukey_location(x: common_types.TensorType,
+def tukey_location(x: common_types.InputTensorType,
                    reduce_instance_dims: Optional[bool] = True,
                    output_dtype: Optional[tf.DType] = None,
                    name: Optional[str] = None) -> tf.Tensor:
@@ -872,7 +872,7 @@ def tukey_location(x: common_types.TensorType,
      Mathematics, vol. 2012, 2012. doi:10.5402/2012/980153
 
   Args:
-    x: A `Tensor` or `SparseTensor`. Its type must be floating point
+    x: A `Tensor` or `CompositeTensor`. Its type must be floating point
         (float{16|32|64}), or integral ([u]int{8|16|32|64}).
     reduce_instance_dims: By default collapses the batch and instance dimensions
       to arrive at a single scalar output. If False, only collapses the batch
@@ -893,7 +893,7 @@ def tukey_location(x: common_types.TensorType,
 
 
 @common.log_api_use(common.ANALYZER_COLLECTION)
-def tukey_scale(x: common_types.TensorType,
+def tukey_scale(x: common_types.InputTensorType,
                 reduce_instance_dims: Optional[bool] = True,
                 output_dtype: Optional[tf.DType] = None,
                 name: Optional[str] = None) -> tf.Tensor:
@@ -910,7 +910,7 @@ def tukey_scale(x: common_types.TensorType,
 
 
   Args:
-    x: A `Tensor` or `SparseTensor`. Its type must be floating point
+    x: A `Tensor` or `CompositeTensor`. Its type must be floating point
         (float{16|32|64}), or integral ([u]int{8|16|32|64}).
     reduce_instance_dims: By default collapses the batch and instance dimensions
       to arrive at a single scalar output. If False, only collapses the batch
@@ -931,7 +931,7 @@ def tukey_scale(x: common_types.TensorType,
 
 
 @common.log_api_use(common.ANALYZER_COLLECTION)
-def tukey_h_params(x: common_types.TensorType,
+def tukey_h_params(x: common_types.InputTensorType,
                    reduce_instance_dims: bool = True,
                    output_dtype: Optional[tf.DType] = None,
                    name: Optional[str] = None) -> Tuple[tf.Tensor, tf.Tensor]:
@@ -947,7 +947,7 @@ def tukey_h_params(x: common_types.TensorType,
      Mathematics, vol. 2012, 2012. doi:10.5402/2012/980153
 
   Args:
-    x: A `Tensor` or `SparseTensor`. Its type must be floating point
+    x: A `Tensor` or `CompositeTensor`. Its type must be floating point
         (float{16|32|64}), or integral ([u]int{8|16|32|64}).
     reduce_instance_dims: By default collapses the batch and instance dimensions
       to arrive at a single scalar output. If False, only collapses the batch
@@ -968,7 +968,7 @@ def tukey_h_params(x: common_types.TensorType,
 
 
 def _tukey_parameters(
-    x: common_types.TensorType,
+    x: common_types.InputTensorType,
     reduce_instance_dims: bool = True,
     output_dtype: Optional[tf.DType] = None
 ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor]:
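The widened annotations above mean the Tukey analyzers now accept the same composite inputs as other tft analyzers. A hedged sketch of calling them on a ragged feature inside a `preprocessing_fn`, mirroring the new integration test below; the feature name `'a'` is an assumption, and these calls only materialize inside a tf.Transform analysis pass:

```python
import tensorflow as tf
import tensorflow_transform as tft


def preprocessing_fn(inputs):
  # 'a' is an assumed ragged feature (tf.RaggedTensor) used for illustration.
  a = inputs['a']
  # Each analyzer collapses the batch and instance dimensions by default
  # (reduce_instance_dims=True) and returns dense tensors.
  location = tft.tukey_location(a)
  scale = tft.tukey_scale(a)
  h_left, h_right = tft.tukey_h_params(a)  # tail-heaviness params, unused below
  # Analyzer outputs behave like constants in the transform graph, so a plain
  # (non-Tukey) standardization of the ragged column is just elementwise math:
  return {'a_standardized': (tf.cast(a, location.dtype) - location) / scale}
```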

tensorflow_transform/beam/tukey_hh_params_integration_test.py

Lines changed: 130 additions & 0 deletions
@@ -193,6 +193,67 @@ def preprocessing_fn(inputs):
         desired_batch_size=20,
         beam_pipeline=beam.Pipeline())
 
+  @tft_unit.parameters(
+      (tf.int16,),
+      (tf.int32,),
+      (tf.int64,),
+      (tf.float32,),
+      (tf.float64,),
+  )
+  def testGaussianizeRagged(self, input_dtype):
+    tft_unit.skip_if_not_tf2('RaggedFeature is not available in TF 1.x.')
+
+    def preprocessing_fn(inputs):
+      x_gaussianized = tft.scale_to_gaussian(tf.cast(inputs['x'], input_dtype))
+      self.assertEqual(x_gaussianized.dtype,
+                       impl_test._mean_output_dtype(input_dtype))
+      return {'x_gaussianized': tf.cast(x_gaussianized, tf.float32)}
+
+    input_data_values = [
+        516, -871, 737, 415, 584, 583, 152, 479, 576, 409, 591, 844, -16, 508,
+        669, 617, 502, 532, 517, 479
+    ]
+    input_data = []
+    for idx, v in enumerate(input_data_values):
+      input_data.append({
+          'val': [v, -input_data_values[-1 - idx]],
+          'row_lengths_1': [2, 1, 0],
+          'row_lengths_2': [1, 0, 1],
+      })
+    input_metadata = tft_unit.metadata_from_feature_spec({
+        'x':
+            tf.io.RaggedFeature(
+                tft_unit.canonical_numeric_dtype(input_dtype),
+                value_key='val',
+                partitions=[
+                    tf.io.RaggedFeature.RowLengths('row_lengths_1'),  # pytype: disable=attribute-error
+                    tf.io.RaggedFeature.RowLengths('row_lengths_2')  # pytype: disable=attribute-error
+                ]),
+    })
+    expected_data_values = [
+        0.91555131, -1.54543642, 1.30767697, 0.73634456, 1.03620536, 1.03443104,
+        0.26969729, 0.84990131, 1.02201077, 0.72569862, 1.04862563, 1.49752966,
+        -0.02838919, 0.90135672, 1.18702292, 1.09475806, 0.89071077, 0.9439405,
+        0.91732564, 0.84990131
+    ]
+    expected_data = []
+    for idx, v in enumerate(expected_data_values):
+      expected_data.append({
+          'x_gaussianized$ragged_values': ([v,
+                                            -expected_data_values[-1 - idx]]),
+          'x_gaussianized$row_lengths_1': [2, 1, 0],
+          'x_gaussianized$row_lengths_2': [1, 0, 1]
+      })
+
+    self.assertAnalyzeAndTransformResults(
+        input_data,
+        input_metadata,
+        preprocessing_fn,
+        expected_data,
+        desired_batch_size=20,
+        # Runs the test deterministically on the whole batch.
+        beam_pipeline=beam.Pipeline())
+
   @tft_unit.named_parameters(
       dict(
           testcase_name='tukey_int64in',
@@ -497,5 +558,74 @@ def assert_and_cast_dtype(tensor, out_dtype):
         # Runs the test deterministically on the whole batch.
         beam_pipeline=beam.Pipeline())
 
+  @tft_unit.parameters(
+      (tf.int16,),
+      (tf.int32,),
+      (tf.int64,),
+      (tf.float32,),
+      (tf.float64,),
+  )
+  def testTukeyHHAnalyzersWithRaggedInputs(self, input_dtype):
+    tft_unit.skip_if_not_tf2('RaggedFeature is not available in TF 1.x.')
+
+    output_dtype = impl_test._mean_output_dtype(input_dtype)
+    canonical_output_dtype = tft_unit.canonical_numeric_dtype(output_dtype)
+
+    def analyzer_fn(inputs):
+      a = tf.cast(inputs['a'], input_dtype)
+
+      def assert_and_cast_dtype(tensor):
+        self.assertEqual(tensor.dtype, output_dtype)
+        return tf.cast(tensor, canonical_output_dtype)
+
+      return {
+          'tukey_location': assert_and_cast_dtype(tft.tukey_location(a)),
+          'tukey_scale': assert_and_cast_dtype(tft.tukey_scale(a)),
+          'tukey_hl': assert_and_cast_dtype(tft.tukey_h_params(a)[0]),
+          'tukey_hr': assert_and_cast_dtype(tft.tukey_h_params(a)[1]),
+      }
+
+    input_data_values = [
+        516, -871, 737, 415, 584, 583, 152, 479, 576, 409, 591, 844, -16, 508,
+        669, 617, 502, 532, 517, 479
+    ]
+    input_data = []
+    for idx, v in enumerate(input_data_values):
+      input_data.append({
+          'val': [v, -input_data_values[-1 - idx]],
+          'row_lengths_1': [2, 0, 1],
+          'row_lengths_2': [0, 1, 1]
+      })
+    input_metadata = tft_unit.metadata_from_feature_spec({
+        'a':
+            tf.io.RaggedFeature(
+                tft_unit.canonical_numeric_dtype(input_dtype),
+                value_key='val',
+                partitions=[
+                    tf.io.RaggedFeature.RowLengths('row_lengths_1'),  # pytype: disable=attribute-error
+                    tf.io.RaggedFeature.RowLengths('row_lengths_2')  # pytype: disable=attribute-error
+                ]),
+    })
+
+    expected_outputs = {
+        'tukey_location':
+            np.array(0.0, canonical_output_dtype.as_numpy_dtype),
+        'tukey_scale':
+            np.array(572.2776, canonical_output_dtype.as_numpy_dtype),
+        'tukey_hl':
+            np.array(0.0, canonical_output_dtype.as_numpy_dtype),
+        'tukey_hr':
+            np.array(0.0, canonical_output_dtype.as_numpy_dtype),
+    }
+
+    self.assertAnalyzerOutputs(
+        input_data,
+        input_metadata,
+        analyzer_fn,
+        expected_outputs,
+        desired_batch_size=20,
+        # Runs the test deterministically on the whole batch.
+        beam_pipeline=beam.Pipeline())
+
 if __name__ == '__main__':
   tft_unit.main()
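To make the ragged test fixtures easier to read, here is a small runnable sketch (not part of the commit) of how the first instance in testGaussianizeRagged decodes: `tf.io.RaggedFeature` with two `RowLengths` partitions yields a `tf.RaggedTensor` with two ragged dimensions per example, and its flat values are what gets gaussianized.

```python
import tensorflow as tf

# First instance of the test above: val=[516, -479] plus the two
# row-lengths partitions declared in the feature spec.
example = tf.RaggedTensor.from_nested_row_lengths(
    flat_values=tf.constant([516, -479], dtype=tf.int64),  # 'val'
    nested_row_lengths=[
        tf.constant([2, 1, 0], dtype=tf.int64),  # 'row_lengths_1' (outer)
        tf.constant([1, 0, 1], dtype=tf.int64),  # 'row_lengths_2' (inner)
    ])
print(example)                      # <tf.RaggedTensor [[[516], []], [[-479]], []]>
print(example.flat_values.numpy())  # [ 516 -479] -- the values tft normalizes
```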

tensorflow_transform/mappers.py

Lines changed: 14 additions & 9 deletions
@@ -70,11 +70,11 @@ def preprocessing_fn(inputs):
 
 @common.log_api_use(common.MAPPER_COLLECTION)
 def scale_to_gaussian(
-    x: common_types.ConsistentTensorType,
+    x: common_types.ConsistentInputTensorType,
     elementwise: bool = False,
     name: Optional[str] = None,
     output_dtype: Optional[tf.DType] = None
-) -> common_types.ConsistentTensorType:
+) -> common_types.ConsistentInputTensorType:
   """Returns an (approximately) normal column with mean to 0 and variance 1.
 
   We transform the column to values that are approximately distributed
@@ -105,17 +105,17 @@ def scale_to_gaussian(
     input vaules unchanged.
 
   Args:
-    x: A numeric `Tensor` or `SparseTensor`.
+    x: A numeric `Tensor` or `CompositeTensor`.
     elementwise: If true, scales each element of the tensor independently;
       otherwise uses the parameters of the whole tensor.
     name: (Optional) A name for this operation.
     output_dtype: (Optional) If not None, casts the output tensor to this type.
 
   Returns:
-    A `Tensor` or `SparseTensor` containing the input column transformed to be
-    approximately standard distributed (i.e. a Gaussian with mean 0 and variance
-    1). If `x` is floating point, the mean will have the same type as `x`. If
-    `x` is integral, the output is cast to tf.float32.
+    A `Tensor` or `CompositeTensor` containing the input column transformed to
+    be approximately standard distributed (i.e. a Gaussian with mean 0 and
+    variance 1). If `x` is floating point, the mean will have the same type as
+    `x`. If `x` is integral, the output is cast to tf.float32.
 
   Note that TFLearn generally permits only tf.int64 and tf.float32, so casting
   this scaler's output may be necessary.
@@ -128,10 +128,10 @@ def scale_to_gaussian(
 
 
 def _scale_to_gaussian_internal(
-    x: common_types.ConsistentTensorType,
+    x: common_types.ConsistentInputTensorType,
     elementwise: bool = False,
     output_dtype: Optional[tf.DType] = None
-) -> common_types.ConsistentTensorType:
+) -> common_types.ConsistentInputTensorType:
   """Implementation for scale_to_gaussian."""
   # x_mean will be float16, float32, or float64, depending on type of x.
   x_loc, x_scale, hl, hr = analyzers._tukey_parameters(  # pylint: disable=protected-access
@@ -151,6 +151,11 @@ def _scale_to_gaussian_internal(
       hl = tf.gather_nd(hl, x.indices[:, 1:])
       hr = tf.gather_nd(hr, x.indices[:, 1:])
       x_var = tf.gather_nd(x_var, x.indices[:, 1:])
+  elif isinstance(x, tf.RaggedTensor):
+    if elementwise:
+      raise NotImplementedError(
+          'Elementwise scale_to_gaussian does not support RaggedTensors.')
+    x_values = x.flat_values
 
   numerator = tf.cast(x_values, x_loc.dtype) - x_loc
   is_long_tailed = tf.math.logical_or(hl > 0.0, hr > 0.0)
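The new branch above only extracts `x.flat_values`: since elementwise scaling is rejected for ragged inputs, the Tukey parameters are scalars and can be applied to the flat values directly, with the rest of the mapper (not shown in this hunk) presumably re-attaching the row partitions to the transformed values. A rough, hypothetical sketch of that idea, not the library implementation:

```python
import tensorflow as tf


def standardize_ragged(x: tf.RaggedTensor, loc: tf.Tensor,
                       scale: tf.Tensor) -> tf.RaggedTensor:
  """Illustrative helper: center/scale a ragged column's flat values.

  The real _scale_to_gaussian_internal additionally applies the inverse
  Tukey HH transform when the estimated h parameters indicate long tails.
  """
  values = tf.cast(x.flat_values, loc.dtype)
  standardized = (values - loc) / scale
  # Reattach the original ragged structure to the transformed values.
  return x.with_flat_values(standardized)


# Example usage with concrete scalars:
ragged = tf.ragged.constant([[1.0, 2.0], [], [3.0]])
print(standardize_ragged(ragged, tf.constant(2.0), tf.constant(1.0)))
# <tf.RaggedTensor [[-1.0, 0.0], [], [1.0]]>
```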

0 commit comments
