Skip to content

Commit b9f060a

Browse files
paulgc17 authored and tf-data-validation-team committed
Directly convert a batch of tf.Examples to Arrow tables.
- Avoids converting tf.Example to intermediate Dict representation.
- Adds dependency on tfx_bsl.
- Deletes fast example decoder.

PiperOrigin-RevId: 273417600
1 parent 17f869f commit b9f060a

12 files changed

+164
-237
lines changed

RELEASE.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44
## Major Features and Improvements
55

66
* Generate statistics for sparse features.
7+
* Directly convert a batch of tf.Examples to Arrow tables. Avoids conversion of
8+
tf.Example to intermediate Dict representation.
79

810
## Bug Fixes and Other Changes
911

setup.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,7 @@ def has_ext_modules(self):
102102
# 'tensorflow>=1.14,<2',
103103
'tensorflow-metadata>=0.14,<0.15',
104104
'tensorflow-transform>=0.14,<0.15',
105+
'tfx-bsl>=0.15.0.dev0,<0.16',
105106

106107
# Dependencies needed for visualization.
107108
# Note that we don't add a max version for IPython as it introduces a

tensorflow_data_validation/coders/cc/BUILD

Lines changed: 0 additions & 19 deletions
This file was deleted.

tensorflow_data_validation/coders/cc/fast_example_decoder.cc

Lines changed: 0 additions & 125 deletions
This file was deleted.

tensorflow_data_validation/coders/cc/fast_example_decoder.h

Lines changed: 0 additions & 35 deletions
This file was deleted.

tensorflow_data_validation/coders/tf_example_decoder.py

Lines changed: 5 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -23,22 +23,18 @@
2323
from tensorflow_data_validation import constants
2424
from tensorflow_data_validation import types
2525
from tensorflow_data_validation.pyarrow_tf import pyarrow as pa
26-
from tensorflow_data_validation.pywrap import pywrap_tensorflow_data_validation
2726
from tensorflow_data_validation.utils import batch_util
27+
from tfx_bsl.coders import example_coder
2828

2929

30-
DecodeExample = pywrap_tensorflow_data_validation.TFDV_DecodeExample # pylint: disable=invalid-name
31-
32-
33-
# TODO(pachristopher): This fast coder can also benefit TFT. Consider moving
34-
# this coder to tf.Beam once it is available.
30+
# TODO(pachristopher): Deprecate this in 0.16.
3531
class TFExampleDecoder(object):
3632
"""A decoder for decoding TF examples into tf data validation datasets.
3733
"""
3834

3935
def decode(self, serialized_example_proto: bytes) -> types.Example:
4036
"""Decodes serialized tf.Example to tf data validation input dict."""
41-
return DecodeExample(serialized_example_proto)
37+
return example_coder.ExampleToNumpyDict(serialized_example_proto)
4238

4339

4440
@beam.ptransform_fn
@@ -58,9 +54,7 @@ def DecodeTFExample(
5854
Returns:
5955
A PCollection of Arrow tables.
6056
"""
61-
decoder = TFExampleDecoder()
6257
return (examples
63-
| 'ParseTFExamples' >> beam.Map(decoder.decode)
64-
| 'BatchExamplesToArrowTables' >>
65-
batch_util.BatchExamplesToArrowTables(
58+
| 'BatchSerializedExamplesToArrowTables' >>
59+
batch_util.BatchSerializedExamplesToArrowTables(
6660
desired_batch_size=desired_batch_size))

tensorflow_data_validation/pywrap/BUILD

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@ tf_py_wrap_cc(
2424
srcs = ["tensorflow_data_validation.i"],
2525
swig_includes = [
2626
"arrow.i",
27-
"fast_example_decoder.i",
2827
"validation_api.i",
2928
],
3029
# Since we are building a python extension, we tell the linker to only
@@ -44,7 +43,6 @@ tf_py_wrap_cc(
4443
"//tensorflow_data_validation/arrow/cc:arrow_util",
4544
"//tensorflow_data_validation/arrow/cc:decoded_examples_to_arrow",
4645
"//tensorflow_data_validation/arrow/cc:merge",
47-
"//tensorflow_data_validation/coders/cc:fast_example_decoder",
4846
"@local_config_python//:python_headers",
4947
"@org_tensorflow//tensorflow/core:lib",
5048
],

tensorflow_data_validation/pywrap/fast_example_decoder.i

Lines changed: 0 additions & 24 deletions
This file was deleted.

tensorflow_data_validation/pywrap/tensorflow_data_validation.i

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,5 +14,4 @@ limitations under the License.
1414
==============================================================================*/
1515

1616
%include "tensorflow_data_validation/pywrap/validation_api.i"
17-
%include "tensorflow_data_validation/pywrap/arrow.i"
18-
%include "tensorflow_data_validation/pywrap/fast_example_decoder.i"
17+
%include "tensorflow_data_validation/pywrap/arrow.i"

tensorflow_data_validation/tools/windows/pip/build_tfdv_windows.sh

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,17 @@ pip install setuptools --upgrade
129129
pip install wheel --upgrade
130130
pip freeze --all
131131

132+
echo "Installing TFX-BSL at head"
133+
pushd tfx_bsl_at_head
134+
source "tfx_bsl/tools/windows/pip/build_tfx_bsl_windows.sh" \
135+
|| { echo "Failed to source build_tfx_bsl_windows.sh" >&2; exit 1; }
136+
137+
(tfx_bsl::build_from_head_windows) && wheel=$(ls dist/*.whl) \
138+
|| { echo "Failed to build tfx_bsl."; exit 1; }
139+
140+
pip install ${wheel}
141+
popd
142+
132143
PYARROW_REQUIREMENT=$(python -c "fp = open('third_party/pyarrow_version.bzl', 'r'); d = {}; exec(fp.read(), d); fp.close(); print(d['PY_DEP'])")
133144
pip install "${PYARROW_REQUIREMENT}"
134145
./configure.sh
@@ -143,19 +154,8 @@ pip uninstall -y Cython
143154
pip install dist/*.whl
144155
pip install ${TENSORFLOW}
145156

146-
# If running with tf-nightly, install TFT at head. If installing TFT at head,
147-
# also install TFX-BSL at head.
157+
# If running with tf-nightly, install TFT at head.
148158
if [[ ${TENSORFLOW}==tf-nightly ]]; then
149-
echo "Installing TFX-BSL at head"
150-
pushd tfx_bsl_at_head
151-
PYARROW_REQUIREMENT=$(python -c "fp = open('third_party/pyarrow_version.bzl', 'r'); d = {}; exec(fp.read(), d); fp.close(); print(d['PY_DEP'])")
152-
pip install "${PYARROW_REQUIREMENT}"
153-
./configure.sh
154-
bazel run -c opt --copt=-DWIN32_LEAN_AND_MEAN tfx_bsl:build_pip_package -- --python_bin_path ${PYTHON_BIN_PATH}
155-
BSL_WHEEL_PATH=$(find dist -name "*.whl")
156-
pip install ${BSL_WHEEL_PATH}
157-
popd # pop tfx_bsl_at_head
158-
159159
pip uninstall -y tensorflow-transform
160160
echo "Installing TFT at head"
161161
pushd tft_at_head

0 commit comments

Comments
 (0)