
Commit 84f3eeb

iindyk authored and tf-transform-team committed
Adding tf.RaggedTensor support to tft.tukey_scale, tft.tukey_h_params, tft.scale_to_gaussian.
PiperOrigin-RevId: 404391810
1 parent 0927fdb commit 84f3eeb

File tree

4 files changed: +153 -17 lines changed


RELEASE.md

Lines changed: 2 additions & 1 deletion
@@ -5,7 +5,8 @@
 ## Major Features and Improvements
 
 *   Added `tf.RaggedTensor` support to `tft.bucketize`,
-    `tft.compute_and_apply_vocabulary` and related analyzers and mappers.
+    `tft.compute_and_apply_vocabulary`, `tft.scale_to_gaussian` and related
+    analyzers and mappers with `reduce_instance_dims=True`.
 
 ## Bug Fixes and Other Changes
 
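For context, a minimal sketch (not part of this commit) of how the newly supported mapper call might look in a user's `preprocessing_fn`. The feature name `'x'` and its schema are illustrative assumptions; like all tf.Transform mappers that rely on analyzers, this only executes as part of an analyze/transform pass (e.g. `tft_beam.AnalyzeAndTransformDataset`), not eagerly.

```python
import tensorflow as tf
import tensorflow_transform as tft


def preprocessing_fn(inputs):
  # 'x' is assumed to be declared in the schema with tf.io.RaggedFeature,
  # so it arrives here as a tf.RaggedTensor.
  x = inputs['x']
  # With elementwise=False (the default), a single Tukey location/scale is
  # estimated over all values in the column, matching the
  # reduce_instance_dims=True behavior called out in the release note.
  return {'x_gaussianized': tft.scale_to_gaussian(x)}
```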
tensorflow_transform/analyzers.py

Lines changed: 7 additions & 7 deletions
@@ -856,7 +856,7 @@ def _mean_and_var(x, reduce_instance_dims=True, output_dtype=None):
 
 
 @common.log_api_use(common.ANALYZER_COLLECTION)
-def tukey_location(x: common_types.TensorType,
+def tukey_location(x: common_types.InputTensorType,
                    reduce_instance_dims: Optional[bool] = True,
                    output_dtype: Optional[tf.DType] = None,
                    name: Optional[str] = None) -> tf.Tensor:
@@ -872,7 +872,7 @@ def tukey_location(x: common_types.TensorType,
      Mathematics, vol. 2012, 2012. doi:10.5402/2012/980153
 
   Args:
-    x: A `Tensor` or `SparseTensor`. Its type must be floating point
+    x: A `Tensor` or `CompositeTensor`. Its type must be floating point
         (float{16|32|64}), or integral ([u]int{8|16|32|64}).
     reduce_instance_dims: By default collapses the batch and instance dimensions
       to arrive at a single scalar output. If False, only collapses the batch
@@ -893,7 +893,7 @@ def tukey_location(x: common_types.TensorType,
 
 
 @common.log_api_use(common.ANALYZER_COLLECTION)
-def tukey_scale(x: common_types.TensorType,
+def tukey_scale(x: common_types.InputTensorType,
                 reduce_instance_dims: Optional[bool] = True,
                 output_dtype: Optional[tf.DType] = None,
                 name: Optional[str] = None) -> tf.Tensor:
@@ -910,7 +910,7 @@ def tukey_scale(x: common_types.TensorType,
 
 
   Args:
-    x: A `Tensor` or `SparseTensor`. Its type must be floating point
+    x: A `Tensor` or `CompositeTensor`. Its type must be floating point
         (float{16|32|64}), or integral ([u]int{8|16|32|64}).
     reduce_instance_dims: By default collapses the batch and instance dimensions
       to arrive at a single scalar output. If False, only collapses the batch
@@ -931,7 +931,7 @@ def tukey_scale(x: common_types.TensorType,
 
 
 @common.log_api_use(common.ANALYZER_COLLECTION)
-def tukey_h_params(x: common_types.TensorType,
+def tukey_h_params(x: common_types.InputTensorType,
                    reduce_instance_dims: bool = True,
                    output_dtype: Optional[tf.DType] = None,
                    name: Optional[str] = None) -> Tuple[tf.Tensor, tf.Tensor]:
@@ -947,7 +947,7 @@ def tukey_h_params(x: common_types.TensorType,
      Mathematics, vol. 2012, 2012. doi:10.5402/2012/980153
 
   Args:
-    x: A `Tensor` or `SparseTensor`. Its type must be floating point
+    x: A `Tensor` or `CompositeTensor`. Its type must be floating point
         (float{16|32|64}), or integral ([u]int{8|16|32|64}).
     reduce_instance_dims: By default collapses the batch and instance dimensions
       to arrive at a single scalar output. If False, only collapses the batch
@@ -968,7 +968,7 @@ def tukey_h_params(x: common_types.TensorType,
 
 
 def _tukey_parameters(
-    x: common_types.TensorType,
+    x: common_types.InputTensorType,
     reduce_instance_dims: bool = True,
     output_dtype: Optional[tf.DType] = None
 ) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor]:
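The widened annotations above mean the Tukey analyzers now accept the same composite inputs as other tft analyzers. A hedged sketch of calling them on a ragged feature inside a `preprocessing_fn`, mirroring the new integration test below; the feature name `'a'` is an assumption, and these calls only materialize inside a tf.Transform analysis pass:

```python
import tensorflow as tf
import tensorflow_transform as tft


def preprocessing_fn(inputs):
  # 'a' is an assumed ragged feature (tf.RaggedTensor) used for illustration.
  a = inputs['a']
  # Each analyzer collapses the batch and instance dimensions by default
  # (reduce_instance_dims=True) and returns dense tensors.
  location = tft.tukey_location(a)
  scale = tft.tukey_scale(a)
  h_left, h_right = tft.tukey_h_params(a)  # tail-heaviness params, unused below
  # Analyzer outputs behave like constants in the transform graph, so a plain
  # (non-Tukey) standardization of the ragged column is just elementwise math:
  return {'a_standardized': (tf.cast(a, location.dtype) - location) / scale}
```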

tensorflow_transform/beam/tukey_hh_params_integration_test.py

Lines changed: 130 additions & 0 deletions
@@ -193,6 +193,67 @@ def preprocessing_fn(inputs):
         desired_batch_size=20,
         beam_pipeline=beam.Pipeline())
 
+  @tft_unit.parameters(
+      (tf.int16,),
+      (tf.int32,),
+      (tf.int64,),
+      (tf.float32,),
+      (tf.float64,),
+  )
+  def testGaussianizeRagged(self, input_dtype):
+    tft_unit.skip_if_not_tf2('RaggedFeature is not available in TF 1.x.')
+
+    def preprocessing_fn(inputs):
+      x_gaussianized = tft.scale_to_gaussian(tf.cast(inputs['x'], input_dtype))
+      self.assertEqual(x_gaussianized.dtype,
+                       impl_test._mean_output_dtype(input_dtype))
+      return {'x_gaussianized': tf.cast(x_gaussianized, tf.float32)}
+
+    input_data_values = [
+        516, -871, 737, 415, 584, 583, 152, 479, 576, 409, 591, 844, -16, 508,
+        669, 617, 502, 532, 517, 479
+    ]
+    input_data = []
+    for idx, v in enumerate(input_data_values):
+      input_data.append({
+          'val': [v, -input_data_values[-1 - idx]],
+          'row_lengths_1': [2, 1, 0],
+          'row_lengths_2': [1, 0, 1],
+      })
+    input_metadata = tft_unit.metadata_from_feature_spec({
+        'x':
+            tf.io.RaggedFeature(
+                tft_unit.canonical_numeric_dtype(input_dtype),
+                value_key='val',
+                partitions=[
+                    tf.io.RaggedFeature.RowLengths('row_lengths_1'),  # pytype: disable=attribute-error
+                    tf.io.RaggedFeature.RowLengths('row_lengths_2')  # pytype: disable=attribute-error
+                ]),
+    })
+    expected_data_values = [
+        0.91555131, -1.54543642, 1.30767697, 0.73634456, 1.03620536, 1.03443104,
+        0.26969729, 0.84990131, 1.02201077, 0.72569862, 1.04862563, 1.49752966,
+        -0.02838919, 0.90135672, 1.18702292, 1.09475806, 0.89071077, 0.9439405,
+        0.91732564, 0.84990131
+    ]
+    expected_data = []
+    for idx, v in enumerate(expected_data_values):
+      expected_data.append({
+          'x_gaussianized$ragged_values': ([v,
+                                            -expected_data_values[-1 - idx]]),
+          'x_gaussianized$row_lengths_1': [2, 1, 0],
+          'x_gaussianized$row_lengths_2': [1, 0, 1]
+      })
+
+    self.assertAnalyzeAndTransformResults(
+        input_data,
+        input_metadata,
+        preprocessing_fn,
+        expected_data,
+        desired_batch_size=20,
+        # Runs the test deterministically on the whole batch.
+        beam_pipeline=beam.Pipeline())
+
   @tft_unit.named_parameters(
       dict(
           testcase_name='tukey_int64in',
@@ -497,5 +558,74 @@ def assert_and_cast_dtype(tensor, out_dtype):
         # Runs the test deterministically on the whole batch.
         beam_pipeline=beam.Pipeline())
 
+  @tft_unit.parameters(
+      (tf.int16,),
+      (tf.int32,),
+      (tf.int64,),
+      (tf.float32,),
+      (tf.float64,),
+  )
+  def testTukeyHHAnalyzersWithRaggedInputs(self, input_dtype):
+    tft_unit.skip_if_not_tf2('RaggedFeature is not available in TF 1.x.')
+
+    output_dtype = impl_test._mean_output_dtype(input_dtype)
+    canonical_output_dtype = tft_unit.canonical_numeric_dtype(output_dtype)
+
+    def analyzer_fn(inputs):
+      a = tf.cast(inputs['a'], input_dtype)
+
+      def assert_and_cast_dtype(tensor):
+        self.assertEqual(tensor.dtype, output_dtype)
+        return tf.cast(tensor, canonical_output_dtype)
+
+      return {
+          'tukey_location': assert_and_cast_dtype(tft.tukey_location(a)),
+          'tukey_scale': assert_and_cast_dtype(tft.tukey_scale(a)),
+          'tukey_hl': assert_and_cast_dtype(tft.tukey_h_params(a)[0]),
+          'tukey_hr': assert_and_cast_dtype(tft.tukey_h_params(a)[1]),
+      }
+
+    input_data_values = [
+        516, -871, 737, 415, 584, 583, 152, 479, 576, 409, 591, 844, -16, 508,
+        669, 617, 502, 532, 517, 479
+    ]
+    input_data = []
+    for idx, v in enumerate(input_data_values):
+      input_data.append({
+          'val': [v, -input_data_values[-1 - idx]],
+          'row_lengths_1': [2, 0, 1],
+          'row_lengths_2': [0, 1, 1]
+      })
+    input_metadata = tft_unit.metadata_from_feature_spec({
+        'a':
+            tf.io.RaggedFeature(
+                tft_unit.canonical_numeric_dtype(input_dtype),
+                value_key='val',
+                partitions=[
+                    tf.io.RaggedFeature.RowLengths('row_lengths_1'),  # pytype: disable=attribute-error
+                    tf.io.RaggedFeature.RowLengths('row_lengths_2')  # pytype: disable=attribute-error
+                ]),
+    })
+
+    expected_outputs = {
+        'tukey_location':
+            np.array(0.0, canonical_output_dtype.as_numpy_dtype),
+        'tukey_scale':
+            np.array(572.2776, canonical_output_dtype.as_numpy_dtype),
+        'tukey_hl':
+            np.array(0.0, canonical_output_dtype.as_numpy_dtype),
+        'tukey_hr':
+            np.array(0.0, canonical_output_dtype.as_numpy_dtype),
+    }
+
+    self.assertAnalyzerOutputs(
+        input_data,
+        input_metadata,
+        analyzer_fn,
+        expected_outputs,
+        desired_batch_size=20,
+        # Runs the test deterministically on the whole batch.
+        beam_pipeline=beam.Pipeline())
+
 if __name__ == '__main__':
   tft_unit.main()
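To make the ragged test fixtures easier to read, here is a small runnable sketch (not part of the commit) of how the first instance in testGaussianizeRagged decodes: `tf.io.RaggedFeature` with two `RowLengths` partitions yields a `tf.RaggedTensor` with two ragged dimensions per example, and its flat values are what gets gaussianized.

```python
import tensorflow as tf

# First instance of the test above: val=[516, -479] plus the two
# row-lengths partitions declared in the feature spec.
example = tf.RaggedTensor.from_nested_row_lengths(
    flat_values=tf.constant([516, -479], dtype=tf.int64),  # 'val'
    nested_row_lengths=[
        tf.constant([2, 1, 0], dtype=tf.int64),  # 'row_lengths_1' (outer)
        tf.constant([1, 0, 1], dtype=tf.int64),  # 'row_lengths_2' (inner)
    ])
print(example)                      # <tf.RaggedTensor [[[516], []], [[-479]], []]>
print(example.flat_values.numpy())  # [ 516 -479] -- the values tft normalizes
```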

tensorflow_transform/mappers.py

Lines changed: 14 additions & 9 deletions
@@ -70,11 +70,11 @@ def preprocessing_fn(inputs):
 
 @common.log_api_use(common.MAPPER_COLLECTION)
 def scale_to_gaussian(
-    x: common_types.ConsistentTensorType,
+    x: common_types.ConsistentInputTensorType,
     elementwise: bool = False,
     name: Optional[str] = None,
     output_dtype: Optional[tf.DType] = None
-) -> common_types.ConsistentTensorType:
+) -> common_types.ConsistentInputTensorType:
   """Returns an (approximately) normal column with mean to 0 and variance 1.
 
   We transform the column to values that are approximately distributed
@@ -105,17 +105,17 @@ def scale_to_gaussian(
     input vaules unchanged.
 
   Args:
-    x: A numeric `Tensor` or `SparseTensor`.
+    x: A numeric `Tensor` or `CompositeTensor`.
     elementwise: If true, scales each element of the tensor independently;
       otherwise uses the parameters of the whole tensor.
     name: (Optional) A name for this operation.
     output_dtype: (Optional) If not None, casts the output tensor to this type.
 
   Returns:
-    A `Tensor` or `SparseTensor` containing the input column transformed to be
-    approximately standard distributed (i.e. a Gaussian with mean 0 and variance
-    1). If `x` is floating point, the mean will have the same type as `x`. If
-    `x` is integral, the output is cast to tf.float32.
+    A `Tensor` or `CompositeTensor` containing the input column transformed to
+    be approximately standard distributed (i.e. a Gaussian with mean 0 and
+    variance 1). If `x` is floating point, the mean will have the same type as
+    `x`. If `x` is integral, the output is cast to tf.float32.
 
   Note that TFLearn generally permits only tf.int64 and tf.float32, so casting
   this scaler's output may be necessary.
@@ -128,10 +128,10 @@ def scale_to_gaussian(
 
 
 def _scale_to_gaussian_internal(
-    x: common_types.ConsistentTensorType,
+    x: common_types.ConsistentInputTensorType,
     elementwise: bool = False,
     output_dtype: Optional[tf.DType] = None
-) -> common_types.ConsistentTensorType:
+) -> common_types.ConsistentInputTensorType:
   """Implementation for scale_to_gaussian."""
   # x_mean will be float16, float32, or float64, depending on type of x.
   x_loc, x_scale, hl, hr = analyzers._tukey_parameters(  # pylint: disable=protected-access
@@ -151,6 +151,11 @@ def _scale_to_gaussian_internal(
       hl = tf.gather_nd(hl, x.indices[:, 1:])
       hr = tf.gather_nd(hr, x.indices[:, 1:])
       x_var = tf.gather_nd(x_var, x.indices[:, 1:])
+  elif isinstance(x, tf.RaggedTensor):
+    if elementwise:
+      raise NotImplementedError(
+          'Elementwise scale_to_gaussian does not support RaggedTensors.')
+    x_values = x.flat_values
 
   numerator = tf.cast(x_values, x_loc.dtype) - x_loc
   is_long_tailed = tf.math.logical_or(hl > 0.0, hr > 0.0)
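The new branch above only extracts `x.flat_values`: since elementwise scaling is rejected for ragged inputs, the Tukey parameters are scalars and can be applied to the flat values directly, with the rest of the mapper (not shown in this hunk) presumably re-attaching the row partitions to the transformed values. A rough, hypothetical sketch of that idea, not the library implementation:

```python
import tensorflow as tf


def standardize_ragged(x: tf.RaggedTensor, loc: tf.Tensor,
                       scale: tf.Tensor) -> tf.RaggedTensor:
  """Illustrative helper: center/scale a ragged column's flat values.

  The real _scale_to_gaussian_internal additionally applies the inverse
  Tukey HH transform when the estimated h parameters indicate long tails.
  """
  values = tf.cast(x.flat_values, loc.dtype)
  standardized = (values - loc) / scale
  # Reattach the original ragged structure to the transformed values.
  return x.with_flat_values(standardized)


# Example usage with concrete scalars:
ragged = tf.ragged.constant([[1.0, 2.0], [], [3.0]])
print(standardize_ragged(ragged, tf.constant(2.0), tf.constant(1.0)))
# <tf.RaggedTensor [[-1.0, 0.0], [], [1.0]]>
```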

0 commit comments
