From fa304a251a31c5f0efe20dee7997f521eb0b9e39 Mon Sep 17 00:00:00 2001 From: hmlee245 Date: Tue, 13 May 2025 10:54:38 -0700 Subject: [PATCH 1/8] First draft of Korean Cardinal ITN Sparrowhawk testing is not done yet. Signed-off-by: hmlee245 --- .../inverse_normalize.py | 7 +- .../inverse_text_normalization/ko/__init__.py | 17 + .../ko/clean_eval_data.py | 361 ++++++++++++++++++ .../ko/data/__init__.py | 13 + .../ko/data/numbers/__init__.py | 13 + .../ko/data/numbers/digit.tsv | 9 + .../ko/data/numbers/thousands.tsv | 11 + .../ko/data/numbers/zero.tsv | 1 + .../ko/graph_utils.py | 292 ++++++++++++++ .../ko/taggers/__init__.py | 17 + .../ko/taggers/cardinal.py | 104 +++++ .../ko/taggers/tokenize_and_classify.py | 76 ++++ .../ko/taggers/word.py | 32 ++ .../inverse_text_normalization/ko/utils.py | 23 ++ .../ko/verbalizers/__init__.py | 17 + .../ko/verbalizers/cardinal.py | 54 +++ .../ko/verbalizers/verbalize.py | 36 ++ .../ko/verbalizers/verbalize_final.py | 49 +++ .../ko/verbalizers/word.py | 34 ++ .../run_evaluate.py | 2 +- tests/nemo_text_processing/ko/__init__.py | 13 + .../test_cases_cardinal.txt | 27 ++ .../nemo_text_processing/ko/test_cardinal.py | 39 ++ ..._sparrowhawk_inverse_text_normalization.sh | 34 ++ .../pynini_export.py | 8 + 25 files changed, 1287 insertions(+), 2 deletions(-) create mode 100644 nemo_text_processing/inverse_text_normalization/ko/__init__.py create mode 100644 nemo_text_processing/inverse_text_normalization/ko/clean_eval_data.py create mode 100644 nemo_text_processing/inverse_text_normalization/ko/data/__init__.py create mode 100644 nemo_text_processing/inverse_text_normalization/ko/data/numbers/__init__.py create mode 100644 nemo_text_processing/inverse_text_normalization/ko/data/numbers/digit.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/ko/data/numbers/thousands.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/ko/data/numbers/zero.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/ko/graph_utils.py create mode 100644 nemo_text_processing/inverse_text_normalization/ko/taggers/__init__.py create mode 100644 nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py create mode 100644 nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py create mode 100644 nemo_text_processing/inverse_text_normalization/ko/taggers/word.py create mode 100644 nemo_text_processing/inverse_text_normalization/ko/utils.py create mode 100644 nemo_text_processing/inverse_text_normalization/ko/verbalizers/__init__.py create mode 100644 nemo_text_processing/inverse_text_normalization/ko/verbalizers/cardinal.py create mode 100644 nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py create mode 100644 nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py create mode 100644 nemo_text_processing/inverse_text_normalization/ko/verbalizers/word.py create mode 100644 tests/nemo_text_processing/ko/__init__.py create mode 100644 tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_cardinal.txt create mode 100644 tests/nemo_text_processing/ko/test_cardinal.py create mode 100644 tests/nemo_text_processing/ko/test_sparrowhawk_inverse_text_normalization.sh diff --git a/nemo_text_processing/inverse_text_normalization/inverse_normalize.py b/nemo_text_processing/inverse_text_normalization/inverse_normalize.py index c10819908..e505a8ad0 100644 --- a/nemo_text_processing/inverse_text_normalization/inverse_normalize.py +++ 
b/nemo_text_processing/inverse_text_normalization/inverse_normalize.py @@ -131,6 +131,11 @@ def __init__( from nemo_text_processing.inverse_text_normalization.ja.verbalizers.verbalize_final import ( VerbalizeFinalFst, ) + elif lang == 'ko': # Korean + from nemo_text_processing.inverse_text_normalization.ko.taggers.tokenize_and_classify import ClassifyFst + from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize_final import ( + VerbalizeFinalFst, + ) self.tagger = ClassifyFst( cache_dir=cache_dir, whitelist=whitelist, overwrite_cache=overwrite_cache, input_case=input_case @@ -175,7 +180,7 @@ def parse_args(): parser.add_argument( "--language", help="language", - choices=['en', 'de', 'es', 'pt', 'ru', 'fr', 'sv', 'vi', 'ar', 'es_en', 'zh', 'hi', 'hy', 'mr', 'ja'], + choices=['en', 'de', 'es', 'pt', 'ru', 'fr', 'sv', 'vi', 'ar', 'es_en', 'zh', 'hi', 'hy', 'mr', 'ja','ko'], default="en", type=str, ) diff --git a/nemo_text_processing/inverse_text_normalization/ko/__init__.py b/nemo_text_processing/inverse_text_normalization/ko/__init__.py new file mode 100644 index 000000000..f541211af --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from nemo_text_processing.inverse_text_normalization.ko.taggers.tokenize_and_classify import ClassifyFst +from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize import VerbalizeFst +from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize_final import VerbalizeFinalFst diff --git a/nemo_text_processing/inverse_text_normalization/ko/clean_eval_data.py b/nemo_text_processing/inverse_text_normalization/ko/clean_eval_data.py new file mode 100644 index 000000000..3c1193333 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/clean_eval_data.py @@ -0,0 +1,361 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from argparse import ArgumentParser +from typing import List + +import regex as re + +from nemo_text_processing.text_normalization.data_loader_utils import ( + EOS_TYPE, + Instance, + load_files, + training_data_to_sentences, +) + +""" +This file is for evaluation purposes. +filter_loaded_data() cleans data (list of instances) for inverse text normalization. Filters and cleaners can be specified for each semiotic class individually. 
+For example, normalized text should only include characters and whitespace characters but no punctuation. + Cardinal unnormalized instances should contain at least one integer and all other characters are removed. +""" + + +class Filter: + """ + Filter class + + Args: + class_type: semiotic class used in dataset + process_func: function to transform text + filter_func: function to filter text + + """ + + def __init__(self, class_type: str, process_func: object, filter_func: object): + self.class_type = class_type + self.process_func = process_func + self.filter_func = filter_func + + def filter(self, instance: Instance) -> bool: + """ + filter function + + Args: + filters given instance with filter function + + Returns: True if given instance fulfills criteria or does not belong to class type + """ + if instance.token_type != self.class_type: + return True + return self.filter_func(instance) + + def process(self, instance: Instance) -> Instance: + """ + process function + + Args: + processes given instance with process function + + Returns: processed instance if instance belongs to expected class type or original instance + """ + if instance.token_type != self.class_type: + return instance + return self.process_func(instance) + + +def filter_cardinal_1(instance: Instance) -> bool: + ok = re.search(r"[0-9]", instance.un_normalized) + return ok + + +def process_cardinal_1(instance: Instance) -> Instance: + un_normalized = instance.un_normalized + normalized = instance.normalized + un_normalized = re.sub(r"[^0-9]", "", un_normalized) + normalized = re.sub(r"[^a-z ]", "", normalized) + return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) + + +def filter_ordinal_1(instance: Instance) -> bool: + ok = re.search(r"(st|nd|rd|th)\s*$", instance.un_normalized) + return ok + + +def process_ordinal_1(instance: Instance) -> Instance: + un_normalized = instance.un_normalized + normalized = instance.normalized + un_normalized = re.sub(r"[,\s]", "", un_normalized) + normalized = re.sub(r"[^a-z ]", "", normalized) + return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) + + +def filter_decimal_1(instance: Instance) -> bool: + ok = re.search(r"[0-9]", instance.un_normalized) + return ok + + +def process_decimal_1(instance: Instance) -> Instance: + un_normalized = instance.un_normalized + un_normalized = re.sub(r",", "", un_normalized) + normalized = instance.normalized + normalized = re.sub(r"[^a-z ]", "", normalized) + return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) + + +def filter_measure_1(instance: Instance) -> bool: + ok = True + return ok + + +def process_measure_1(instance: Instance) -> Instance: + un_normalized = instance.un_normalized + normalized = instance.normalized + un_normalized = re.sub(r",", "", un_normalized) + un_normalized = re.sub(r"m2", "m²", un_normalized) + un_normalized = re.sub(r"(\d)([^\d.\s])", r"\1 \2", un_normalized) + normalized = re.sub(r"[^a-z\s]", "", normalized) + normalized = re.sub(r"per ([a-z\s]*)s$", r"per \1", normalized) + normalized = re.sub(r"[^a-z ]", "", normalized) + return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) + + +def filter_money_1(instance: Instance) -> bool: + ok = re.search(r"[0-9]", instance.un_normalized) + return ok + + +def process_money_1(instance: Instance) -> Instance: + un_normalized = instance.un_normalized + normalized = instance.normalized + 
un_normalized = re.sub(r",", "", un_normalized) + un_normalized = re.sub(r"a\$", r"$", un_normalized) + un_normalized = re.sub(r"us\$", r"$", un_normalized) + un_normalized = re.sub(r"(\d)m\s*$", r"\1 million", un_normalized) + un_normalized = re.sub(r"(\d)bn?\s*$", r"\1 billion", un_normalized) + normalized = re.sub(r"[^a-z ]", "", normalized) + return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) + + +def filter_time_1(instance: Instance) -> bool: + ok = re.search(r"[0-9]", instance.un_normalized) + return ok + + +def process_time_1(instance: Instance) -> Instance: + un_normalized = instance.un_normalized + un_normalized = re.sub(r": ", ":", un_normalized) + un_normalized = re.sub(r"(\d)\s?a\s?m\s?", r"\1 a.m.", un_normalized) + un_normalized = re.sub(r"(\d)\s?p\s?m\s?", r"\1 p.m.", un_normalized) + normalized = instance.normalized + normalized = re.sub(r"[^a-z ]", "", normalized) + return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) + + +def filter_plain_1(instance: Instance) -> bool: + ok = True + return ok + + +def process_plain_1(instance: Instance) -> Instance: + un_normalized = instance.un_normalized + normalized = instance.normalized + return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) + + +def filter_punct_1(instance: Instance) -> bool: + ok = True + return ok + + +def process_punct_1(instance: Instance) -> Instance: + un_normalized = instance.un_normalized + normalized = instance.normalized + return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) + + +def filter_date_1(instance: Instance) -> bool: + ok = True + return ok + + +def process_date_1(instance: Instance) -> Instance: + un_normalized = instance.un_normalized + un_normalized = re.sub(r",", "", un_normalized) + normalized = instance.normalized + normalized = re.sub(r"[^a-z ]", "", normalized) + return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) + + +def filter_letters_1(instance: Instance) -> bool: + ok = True + return ok + + +def process_letters_1(instance: Instance) -> Instance: + un_normalized = instance.un_normalized + normalized = instance.normalized + normalized = re.sub(r"[^a-z ]", "", normalized) + return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) + + +def filter_verbatim_1(instance: Instance) -> bool: + ok = True + return ok + + +def process_verbatim_1(instance: Instance) -> Instance: + un_normalized = instance.un_normalized + normalized = instance.normalized + return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) + + +def filter_digit_1(instance: Instance) -> bool: + ok = re.search(r"[0-9]", instance.un_normalized) + return ok + + +def process_digit_1(instance: Instance) -> Instance: + un_normalized = instance.un_normalized + normalized = instance.normalized + normalized = re.sub(r"[^a-z ]", "", normalized) + return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) + + +def filter_telephone_1(instance: Instance) -> bool: + ok = re.search(r"[0-9]", instance.un_normalized) + return ok + + +def process_telephone_1(instance: Instance) -> Instance: + un_normalized = instance.un_normalized + normalized = instance.normalized + normalized = re.sub(r"[^a-z ]", "", normalized) + return Instance(token_type=instance.token_type, 
un_normalized=un_normalized, normalized=normalized) + + +def filter_electronic_1(instance: Instance) -> bool: + ok = re.search(r"[0-9]", instance.un_normalized) + return ok + + +def process_electronic_1(instance: Instance) -> Instance: + un_normalized = instance.un_normalized + normalized = instance.normalized + normalized = re.sub(r"[^a-z ]", "", normalized) + return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) + + +def filter_fraction_1(instance: Instance) -> bool: + ok = re.search(r"[0-9]", instance.un_normalized) + return ok + + +def process_fraction_1(instance: Instance) -> Instance: + un_normalized = instance.un_normalized + normalized = instance.normalized + normalized = re.sub(r"[^a-z ]", "", normalized) + return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) + + +def filter_address_1(instance: Instance) -> bool: + ok = True + return ok + + +def process_address_1(instance: Instance) -> Instance: + un_normalized = instance.un_normalized + normalized = instance.normalized + normalized = re.sub(r"[^a-z ]", "", normalized) + return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) + + +filters = [] +filters.append(Filter(class_type="CARDINAL", + process_func=process_cardinal_1, filter_func=filter_cardinal_1)) +filters.append(Filter(class_type="ORDINAL", + process_func=process_ordinal_1, filter_func=filter_ordinal_1)) +filters.append(Filter(class_type="DECIMAL", + process_func=process_decimal_1, filter_func=filter_decimal_1)) +filters.append(Filter(class_type="MEASURE", + process_func=process_measure_1, filter_func=filter_measure_1)) +filters.append(Filter(class_type="MONEY", + process_func=process_money_1, filter_func=filter_money_1)) +filters.append(Filter(class_type="TIME", + process_func=process_time_1, filter_func=filter_time_1)) + +filters.append(Filter(class_type="DATE", + process_func=process_date_1, filter_func=filter_date_1)) +filters.append(Filter(class_type="PLAIN", + process_func=process_plain_1, filter_func=filter_plain_1)) +filters.append(Filter(class_type="PUNCT", + process_func=process_punct_1, filter_func=filter_punct_1)) +filters.append(Filter(class_type="LETTERS", + process_func=process_letters_1, filter_func=filter_letters_1)) +filters.append(Filter(class_type="VERBATIM", + process_func=process_verbatim_1, filter_func=filter_verbatim_1)) +filters.append(Filter(class_type="DIGIT", + process_func=process_digit_1, filter_func=filter_digit_1)) +filters.append(Filter(class_type="TELEPHONE", + process_func=process_telephone_1, filter_func=filter_telephone_1)) +filters.append(Filter(class_type="ELECTRONIC", + process_func=process_electronic_1, filter_func=filter_electronic_1)) +filters.append(Filter(class_type="FRACTION", + process_func=process_fraction_1, filter_func=filter_fraction_1)) +filters.append(Filter(class_type="ADDRESS", + process_func=process_address_1, filter_func=filter_address_1)) +filters.append(Filter(class_type=EOS_TYPE, + process_func=lambda x: x, filter_func=lambda x: True)) + + +def filter_loaded_data(data: List[Instance], verbose: bool = False) -> List[Instance]: + """ + Filters list of instances + + Args: + data: list of instances + + Returns: filtered and transformed list of instances + """ + updates_instances = [] + for instance in data: + updated_instance = False + for fil in filters: + if fil.class_type == instance.token_type and fil.filter(instance): + instance = fil.process(instance) + updated_instance = True 
+ if updated_instance: + if verbose: + print(instance) + updates_instances.append(instance) + return updates_instances + + +def parse_args(): + parser = ArgumentParser() + parser.add_argument("--input", help="input file path", + type=str, default='./en_with_types/output-00001-of-00100') + parser.add_argument( + "--verbose", help="print filtered instances", action='store_true') + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + file_path = args.input + + print("Loading training data: " + file_path) + instance_list = load_files([file_path]) # List of instances + filtered_instance_list = filter_loaded_data(instance_list, args.verbose) + training_data_to_sentences(filtered_instance_list) diff --git a/nemo_text_processing/inverse_text_normalization/ko/data/__init__.py b/nemo_text_processing/inverse_text_normalization/ko/data/__init__.py new file mode 100644 index 000000000..341a77c5b --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/data/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/inverse_text_normalization/ko/data/numbers/__init__.py b/nemo_text_processing/inverse_text_normalization/ko/data/numbers/__init__.py new file mode 100644 index 000000000..341a77c5b --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/data/numbers/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
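A minimal sketch of how the filter pipeline in clean_eval_data.py above is meant to be driven (hedged: Instance is assumed to be the token_type/un_normalized/normalized named tuple from data_loader_utils, and the sample strings are illustrative only):

    from nemo_text_processing.text_normalization.data_loader_utils import Instance
    from nemo_text_processing.inverse_text_normalization.ko.clean_eval_data import filter_loaded_data

    # A CARDINAL instance passes filter_cardinal_1 (it contains a digit); process_cardinal_1
    # then strips non-digits from the written side and non-letters from the spoken side.
    data = [Instance(token_type="CARDINAL", un_normalized="1,234", normalized="one thousand two hundred thirty four")]
    print(filter_loaded_data(data, verbose=False))
    # [Instance(token_type='CARDINAL', un_normalized='1234', normalized='one thousand two hundred thirty four')]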
diff --git a/nemo_text_processing/inverse_text_normalization/ko/data/numbers/digit.tsv b/nemo_text_processing/inverse_text_normalization/ko/data/numbers/digit.tsv new file mode 100644 index 000000000..9871cb9cf --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/data/numbers/digit.tsv @@ -0,0 +1,9 @@ +일 1 +이 2 +삼 3 +사 4 +오 5 +육 6 +칠 7 +팔 8 +구 9 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/ko/data/numbers/thousands.tsv b/nemo_text_processing/inverse_text_normalization/ko/data/numbers/thousands.tsv new file mode 100644 index 000000000..541752211 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/data/numbers/thousands.tsv @@ -0,0 +1,11 @@ +억 +조 +경 +해 +자 +양 +구 +간 +정 +재 +극 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/ko/data/numbers/zero.tsv b/nemo_text_processing/inverse_text_normalization/ko/data/numbers/zero.tsv new file mode 100644 index 000000000..43baac7c1 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/data/numbers/zero.tsv @@ -0,0 +1 @@ +영 0 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/ko/graph_utils.py b/nemo_text_processing/inverse_text_normalization/ko/graph_utils.py new file mode 100644 index 000000000..7a9fd8720 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/graph_utils.py @@ -0,0 +1,292 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
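+
+# A hedged usage sketch of the token helpers that the GraphFst base class at
+# the bottom of this module provides (all names come from this file; the
+# printed strings are assumed, not tested output):
+#
+#   import pynini
+#   g = GraphFst(name="cardinal", kind="classify")
+#   tagged = g.add_tokens(pynini.cross("이", "2"))
+#   print(pynini.shortestpath("이" @ tagged).string())  # -> cardinal { 2 }
+#
+#   v = GraphFst(name="cardinal", kind="verbalize")
+#   untagged = v.delete_tokens(pynini.accep("2"))
+#   print(pynini.shortestpath("cardinal { 2 }" @ untagged).string())  # -> 2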
+ +import logging +import os +import string +from pathlib import Path +from typing import Dict + +import pynini +from pynini import Far +from pynini.examples import plurals +from pynini.export import export +from pynini.lib import byte, pynutil, utf8 + +from nemo_text_processing.text_normalization.en.utils import get_abs_path, load_labels + +NEMO_CHAR = utf8.VALID_UTF8_CHAR + +NEMO_NARROW_NON_BREAK_SPACE = "\u202f" +NEMO_DIGIT = byte.DIGIT +NEMO_LOWER = pynini.union(*string.ascii_lowercase).optimize() +NEMO_UPPER = pynini.union(*string.ascii_uppercase).optimize() +NEMO_ALPHA = pynini.union(NEMO_LOWER, NEMO_UPPER).optimize() +NEMO_ALNUM = pynini.union(NEMO_DIGIT, NEMO_ALPHA).optimize() +NEMO_HEX = pynini.union(*string.hexdigits).optimize() +NEMO_NON_BREAKING_SPACE = "\u00a0" +NEMO_SPACE = " " +NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", u"\u00a0").optimize() +NEMO_NOT_SPACE = pynini.difference(NEMO_CHAR, NEMO_WHITE_SPACE).optimize() +NEMO_NOT_QUOTE = pynini.difference(NEMO_CHAR, r'"').optimize() + +NEMO_PUNCT = pynini.union(*map(pynini.escape, string.punctuation)).optimize() +NEMO_GRAPH = pynini.union(NEMO_ALNUM, NEMO_PUNCT).optimize() + +NEMO_SIGMA = pynini.closure(NEMO_CHAR) + +NEMO_NOT_ALPHA = pynini.difference(NEMO_SIGMA, NEMO_ALPHA).optimize() +NEMO_LOWER_NOT_A = pynini.union( + "b", + "c", + "d", + "e", + "f", + "g", + "h", + "i", + "j", + "k", + "l", + "m", + "n", + "o", + "p", + "q", + "r", + "s", + "t", + "u", + "v", + "w", + "x", + "y", + "z", +).optimize() + +delete_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE)) +delete_zero_or_one_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE, 0, 1)) +insert_space = pynutil.insert(" ") +delete_extra_space = pynini.cross(pynini.closure(NEMO_WHITE_SPACE, 1), " ") +delete_preserve_order = pynini.closure( + pynutil.delete(" preserve_order: true") + | (pynutil.delete(" field_order: \"") + NEMO_NOT_QUOTE + pynutil.delete("\"")) +) + +suppletive = pynini.string_file(get_abs_path("data/suppletive.tsv")) +# _v = pynini.union("a", "e", "i", "o", "u") +_c = pynini.union( + "b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p", "q", "r", "s", "t", "v", "w", "x", "y", "z" +) +_ies = NEMO_SIGMA + _c + pynini.cross("y", "ies") +_es = NEMO_SIGMA + pynini.union("s", "sh", "ch", "x", "z") + pynutil.insert("es") +_s = NEMO_SIGMA + pynutil.insert("s") + +graph_plural = plurals._priority_union( + suppletive, plurals._priority_union(_ies, plurals._priority_union(_es, _s, NEMO_SIGMA), NEMO_SIGMA), NEMO_SIGMA +).optimize() + +SINGULAR_TO_PLURAL = graph_plural +PLURAL_TO_SINGULAR = pynini.invert(graph_plural) +TO_LOWER = pynini.union(*[pynini.cross(x, y) for x, y in zip(string.ascii_uppercase, string.ascii_lowercase)]) +TO_UPPER = pynini.invert(TO_LOWER) +MIN_NEG_WEIGHT = -0.0001 +MIN_POS_WEIGHT = 0.0001 +INPUT_CASED = "cased" +INPUT_LOWER_CASED = "lower_cased" +MINUS = pynini.union("minus", "Minus").optimize() + + +def capitalized_input_graph( + graph: 'pynini.FstLike', original_graph_weight: float = None, capitalized_graph_weight: float = None +) -> 'pynini.FstLike': + """ + Allow graph input to be capitalized, e.g. 
for ITN) + + Args: + graph: FstGraph + original_graph_weight: weight to add to the original `graph` + capitalized_graph_weight: weight to add to the capitalized graph + """ + capitalized_graph = pynini.compose(TO_LOWER + NEMO_SIGMA, graph).optimize() + + if original_graph_weight is not None: + graph = pynutil.add_weight(graph, weight=original_graph_weight) + + if capitalized_graph_weight is not None: + capitalized_graph = pynutil.add_weight(capitalized_graph, weight=capitalized_graph_weight) + + graph |= capitalized_graph + return graph + + +def generator_main(file_name: str, graphs: Dict[str, 'pynini.FstLike']): + """ + Exports graph as OpenFst finite state archive (FAR) file with given file name and rule name. + + Args: + file_name: exported file name + graphs: Mapping of a rule name and Pynini WFST graph to be exported + """ + exporter = export.Exporter(file_name) + for rule, graph in graphs.items(): + exporter[rule] = graph.optimize() + exporter.close() + logging.info(f'Created {file_name}') + + +def get_plurals(fst): + """ + Given singular returns plurals + + Args: + fst: Fst + + Returns plurals to given singular forms + """ + return SINGULAR_TO_PLURAL @ fst + + +def get_singulars(fst): + """ + Given plural returns singulars + + Args: + fst: Fst + + Returns singulars to given plural forms + """ + return PLURAL_TO_SINGULAR @ fst + + +def convert_space(fst) -> 'pynini.FstLike': + """ + Converts space to nonbreaking space. + Used only in tagger grammars for transducing token values within quotes, e.g. name: "hello kitty" + This is making transducer significantly slower, so only use when there could be potential spaces within quotes, otherwise leave it. + + Args: + fst: input fst + + Returns output fst where breaking spaces are converted to non breaking spaces + """ + return fst @ pynini.cdrewrite(pynini.cross(NEMO_SPACE, NEMO_NON_BREAKING_SPACE), "", "", NEMO_SIGMA) + + +def string_map_cased(input_file: str, input_case: str = INPUT_LOWER_CASED): + labels = load_labels(input_file) + + if input_case == INPUT_CASED: + additional_labels = [] + for written, spoken, *weight in labels: + written_capitalized = written[0].upper() + written[1:] + additional_labels.extend( + [ + [written_capitalized, spoken.capitalize()], # first letter capitalized + [ + written_capitalized, + spoken.upper().replace(" AND ", " and "), + ], # # add pairs with the all letters capitalized + ] + ) + + spoken_no_space = spoken.replace(" ", "") + # add abbreviations without spaces (both lower and upper case), i.e. "BMW" not "B M W" + if len(spoken) == (2 * len(spoken_no_space) - 1): + logging.debug(f"This is weight {weight}") + if len(weight) == 0: + additional_labels.extend( + [[written, spoken_no_space], [written_capitalized, spoken_no_space.upper()]] + ) + else: + additional_labels.extend( + [ + [written, spoken_no_space, weight[0]], + [written_capitalized, spoken_no_space.upper(), weight[0]], + ] + ) + labels += additional_labels + + whitelist = pynini.string_map(labels).invert().optimize() + return whitelist + + +class GraphFst: + """ + Base class for all grammar fsts. 
+ + Args: + name: name of grammar class + kind: either 'classify' or 'verbalize' + deterministic: if True will provide a single transduction option, + for False multiple transduction are generated (used for audio-based normalization) + """ + + def __init__(self, name: str, kind: str, deterministic: bool = True): + self.name = name + self.kind = kind + self._fst = None + self.deterministic = deterministic + + self.far_path = Path(os.path.dirname(__file__) + '/grammars/' + kind + '/' + name + '.far') + if self.far_exist(): + self._fst = Far(self.far_path, mode="r", arc_type="standard", far_type="default").get_fst() + + def far_exist(self) -> bool: + """ + Returns true if FAR can be loaded + """ + return self.far_path.exists() + + @property + def fst(self) -> 'pynini.FstLike': + return self._fst + + @fst.setter + def fst(self, fst): + self._fst = fst + + def add_tokens(self, fst) -> 'pynini.FstLike': + """ + Wraps class name around to given fst + + Args: + fst: input fst + + Returns: + Fst: fst + """ + return pynutil.insert(f"{self.name} {{ ") + fst + pynutil.insert(" }") + + def delete_tokens(self, fst) -> 'pynini.FstLike': + """ + Deletes class name wrap around output of given fst + + Args: + fst: input fst + + Returns: + Fst: fst + """ + res = ( + pynutil.delete(f"{self.name}") + + delete_space + + pynutil.delete("{") + + delete_space + + fst + + delete_space + + pynutil.delete("}") + ) + return res @ pynini.cdrewrite(pynini.cross(u"\u00a0", " "), "", "", NEMO_SIGMA) diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/__init__.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/__init__.py new file mode 100644 index 000000000..f541211af --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from nemo_text_processing.inverse_text_normalization.ko.taggers.tokenize_and_classify import ClassifyFst +from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize import VerbalizeFst +from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize_final import VerbalizeFinalFst diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py new file mode 100644 index 000000000..df5804fc0 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py @@ -0,0 +1,104 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pynini
+from pynini.lib import pynutil
+
+from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_DIGIT, GraphFst, delete_space
+from nemo_text_processing.inverse_text_normalization.ko.utils import get_abs_path
+
+class CardinalFst(GraphFst):
+    """
+    Finite state transducer for classifying cardinals
+        e.g. 마이너스 이십삼 -> cardinal { negative: "-" integer: "23" }
+
+    Covers Sino-Korean numerals from 영 (zero) up through the "경" (10^16)
+    scale unit; takes no constructor arguments.
+    """
+
+    def __init__(self):
+        super().__init__(name="cardinal", kind="classify")
+
+        graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
+        graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
+        # zero.tsv maps 영 -> 0; digit.tsv maps 일-구 -> 1-9
+
+        graph_negative = pynini.cross("마이너스", "-")
+        graph_negative += delete_space
+
+        ten = pynutil.delete("십")
+        ten_alt = pynini.cross("십", "1")
+        ### Tens digit of a two-digit number, e.g. the 2 in 20
+        graph_ten_component = pynini.union((graph_digit + ten) | ten_alt, pynutil.insert("0"))
+        ### Ones digit of the number, e.g. 1, 2, 3, ...
+        graph_ten_component += graph_digit | pynutil.insert("0")
+
+        hundred = pynutil.delete("백")
+        hundred_alt = pynini.cross("백", "1")
+        graph_hundred_component = pynini.union(((graph_digit + hundred) | hundred_alt), pynutil.insert("0"))
+        graph_hundred_component += graph_ten_component
+
+        thousand = pynutil.delete("천")
+        thousand_alt = pynini.cross("천", "1")
+        graph_thousand_component = pynini.union(((graph_digit + thousand) | thousand_alt), pynutil.insert("0"))
+        graph_thousand_component += graph_hundred_component
+
+        tenthousand = pynutil.delete("만")
+        tenthousand_alt = pynini.cross("만", "1")
+        ### "만" covers the next four digits up to the next unit "억", so insert "0000" to reserve four digits' worth of space
+        ### From "만" on, keep adding four-digit groups (graph_thousand_component, 0000-9999), because Korean units step up every four digits
+        graph_tenthousand_component = pynini.union(((graph_thousand_component + tenthousand) | tenthousand_alt), pynutil.insert("0000"))
+        graph_tenthousand_component += graph_thousand_component
+
+        hundredmillion = pynutil.delete("억")
+        hundredmillion_alt = pynini.cross("억", "1")
+        graph_hundredmillion_component = pynini.union(((graph_thousand_component + hundredmillion) | hundredmillion_alt), pynutil.insert("0000"))
+        graph_hundredmillion_component += graph_tenthousand_component
+
+        trillion = pynutil.delete("조")
+        trillion_alt = pynini.cross("조", "1")
+        graph_trillion_component = pynini.union(((graph_thousand_component + trillion) | trillion_alt), pynutil.insert("0000"))
+        graph_trillion_component += graph_hundredmillion_component
+
+        tenquadrillion = pynutil.delete("경")
+        tenquadrillion_alt = pynini.cross("경", "1")
+        graph_tenquadrillion_component = pynini.union(((graph_thousand_component + tenquadrillion) | tenquadrillion_alt), pynutil.insert("0000"))
+        graph_tenquadrillion_component += graph_trillion_component
+
+
+        graph = pynini.union(
+            ### From biggest unit to smallest, everything is included
+            graph_tenquadrillion_component |
+            graph_zero
+        )
+
+        leading_zero = (
+            pynutil.delete(pynini.closure("0")) + pynini.difference(NEMO_DIGIT, "0") + pynini.closure(NEMO_DIGIT)
+        )
+        # compose nonzero numbers with leading_zero to strip leading zeros,
+        # then re-admit the bare zero form
+        graph_nonzero = graph @ leading_zero
+        graph = pynini.union(graph_nonzero, graph_zero)
+
+        self.just_cardinals = graph
+
+        optional_sign = pynini.closure(
+            (pynini.cross("마이너스", 'negative: "-"') | pynini.cross("-", 'negative: "-"')) + delete_space + pynutil.insert(" "), 0, 1
+        )
+
+        final_graph = optional_sign + pynutil.insert("integer: \"") + graph + pynutil.insert("\"")
+
+        final_graph = self.add_tokens(final_graph)
+        self.fst = final_graph.optimize()
\ No newline at end of file
diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py
new file mode 100644
index 000000000..760ce6829
--- /dev/null
+++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py
@@ -0,0 +1,76 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+# Copyright 2015 and onwards Google, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import os
+
+import pynini
+from pynini.lib import pynutil
+
+from nemo_text_processing.inverse_text_normalization.ko.taggers.cardinal import CardinalFst
+from nemo_text_processing.inverse_text_normalization.ko.taggers.word import WordFst
+from nemo_text_processing.inverse_text_normalization.ko.graph_utils import (
+    INPUT_LOWER_CASED,
+    GraphFst,
+    delete_extra_space,
+    delete_space,
+    generator_main,
+)
+
+
+class ClassifyFst(GraphFst):
+    """
+    Final class that composes all other classification grammars. This class can process an entire sentence, that is lower cased.
+    For deployment, this grammar will be compiled and exported to OpenFst Finite State Archive (FAR) File.
+    More details to deployment at NeMo/tools/text_processing_deployment.
+
+    Args:
+        input_case: accepting either "lower_cased" or "cased" input.
+        cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache.
+        overwrite_cache: set to True to overwrite .far files
+        whitelist: path to a file with whitelist replacements
+    """
+
+    def __init__(
+        self,
+        input_case: str = INPUT_LOWER_CASED,
+        cache_dir: str = None,
+        overwrite_cache: bool = False,
+        whitelist: str = None,
+    ):
+        super().__init__(name="tokenize_and_classify", kind="classify")
+
+        far_file = None
+        if cache_dir is not None and cache_dir != "None":
+            os.makedirs(cache_dir, exist_ok=True)
+            far_file = os.path.join(cache_dir, f"ko_itn_{input_case}.far")
+        if not overwrite_cache and far_file and os.path.exists(far_file):
+            self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
+            logging.info(f"ClassifyFst.fst was restored from {far_file}.")
+        else:
+            logging.info(f"Creating ClassifyFst grammars.")
+            cardinal = CardinalFst()
+            cardinal_graph = cardinal.fst
+            word_graph = WordFst().fst
+            classify = (pynutil.add_weight(cardinal_graph, 1.1)| pynutil.add_weight(word_graph, 100))
+
+            token = pynutil.insert("tokens { ") + classify + pynutil.insert(" } ")
+            tagger = pynini.closure(token, 1)
+
+            self.fst = tagger
+
+            if far_file:
+                generator_main(far_file, {"tokenize_and_classify": self.fst})
+                logging.info(f"ClassifyFst grammars are saved to {far_file}.")
\ No newline at end of file
diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/word.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/word.py
new file mode 100644
index 000000000..0d6ccd5c5
--- /dev/null
+++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/word.py
@@ -0,0 +1,32 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+# Copyright 2015 and onwards Google, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pynini
+from pynini.lib import pynutil
+
+from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_NOT_SPACE, GraphFst
+
+
+class WordFst(GraphFst):
+    """
+    Finite state transducer for classifying plain tokens, that do not belong to any special class. This can be considered as the default class.
+    e.g. sleep -> tokens { name: "sleep" }
+    """
+
+    def __init__(self):
+        super().__init__(name="word", kind="classify")
+        word = pynutil.insert(
+            "name: \"") + pynini.closure(NEMO_NOT_SPACE, 1) + pynutil.insert("\"")
+        self.fst = word.optimize()
diff --git a/nemo_text_processing/inverse_text_normalization/ko/utils.py b/nemo_text_processing/inverse_text_normalization/ko/utils.py
new file mode 100644
index 000000000..0222cc0b8
--- /dev/null
+++ b/nemo_text_processing/inverse_text_normalization/ko/utils.py
@@ -0,0 +1,23 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + + + +def get_abs_path(rel_path): + + return os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path + + diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/__init__.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/__init__.py new file mode 100644 index 000000000..da950f35e --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from nemo_text_processing.inverse_text_normalization.ko.taggers.tokenize_and_classify import ClassifyFst +from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize import VerbalizeFst +from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize_final import VerbalizeFinalFst \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/cardinal.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/cardinal.py new file mode 100644 index 000000000..1800a6dc8 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/cardinal.py @@ -0,0 +1,54 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pynini +from pynini.lib import pynutil + +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import ( + NEMO_NOT_QUOTE, + GraphFst, + delete_space, +) + + +class CardinalFst(GraphFst): + """ + Finite state transducer for verbalizing cardinal + e.g. 
cardinal { negative: "-" integer: "23" } -> -23 + """ + + def __init__(self): + super().__init__(name="cardinal", kind="verbalize") + negative_sign = ( + pynutil.delete("negative:") + + delete_space + + pynutil.delete("\"") + + pynini.accep("-") + + pynutil.delete("\"") + ) + + optional_sign_output = pynini.closure(negative_sign + delete_space, 0, 1) + + digits_from_tag = pynini.closure(NEMO_NOT_QUOTE, 1) + integer_cardinal = ( + pynutil.delete("integer:") + + delete_space + + pynutil.delete("\"") + + digits_from_tag + + pynutil.delete("\"") + ) + + graph = integer_cardinal + final_graph = optional_sign_output + graph + self.fst = self.delete_tokens(final_graph).optimize() \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py new file mode 100644 index 000000000..9d750d757 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py @@ -0,0 +1,36 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from nemo_text_processing.inverse_text_normalization.ko.verbalizers.cardinal import CardinalFst +from nemo_text_processing.inverse_text_normalization.ko.verbalizers.word import WordFst +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst + + +class VerbalizeFst(GraphFst): + """ + Composes other verbalizer grammars. + For deployment, this grammar will be compiled and exported to OpenFst Finite State Archive (FAR) File. + More details to deployment at NeMo/tools/text_processing_deployment. + """ + + def __init__(self): + super().__init__(name="verbalize", kind="verbalize") + cardinal = CardinalFst() + cardinal_graph = cardinal.fst + word_graph = WordFst().fst + + graph = (cardinal_graph|word_graph) + self.fst = graph + diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py new file mode 100644 index 000000000..8554fc161 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py @@ -0,0 +1,49 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# Copyright 2015 and onwards Google, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
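+
+# A hedged end-to-end sketch of how this final verbalizer is driven once built
+# (names from this module; the tagged string follows the cardinal tagger's
+# output format in taggers/cardinal.py):
+#
+#   import pynini
+#   fst = VerbalizeFinalFst(cache_dir=None).fst
+#   tagged = 'tokens { cardinal { integer: "23" } }'
+#   print(pynini.shortestpath(tagged @ fst).string())  # -> 23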
+
+import os
+
+import pynini
+from pynini.lib import pynutil
+
+from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize import VerbalizeFst
+from nemo_text_processing.inverse_text_normalization.ko.verbalizers.word import WordFst
+from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst, generator_main, delete_space
+
+
+class VerbalizeFinalFst(GraphFst):
+    """
+    Finite state transducer that verbalizes an entire sentence, e.g.
+    tokens { cardinal { negative: "-" integer: "23" } } -> -23
+    """
+    def __init__(self, deterministic: bool = True, cache_dir: str = None, overwrite_cache: bool = False):
+        super().__init__(name="verbalize_final", kind="verbalize", deterministic=deterministic)
+        far_file = None
+        if cache_dir is not None and cache_dir != "None":
+            os.makedirs(cache_dir, exist_ok=True)
+            far_file = os.path.join(cache_dir, f"ko_itn_{deterministic}_deterministic_verbalizer.far")
+        if not overwrite_cache and far_file and os.path.exists(far_file):
+            self.fst = pynini.Far(far_file, mode="r")["verbalize"]
+        else:
+            # token_graph = VerbalizeFst(deterministic=deterministic)
+            token_graph = VerbalizeFst().fst
+            token_verbalizer = (
+                pynutil.delete("tokens {") + delete_space + token_graph + delete_space + pynutil.delete(" }")
+            )
+            verbalizer = pynini.closure(delete_space + token_verbalizer + delete_space)
+
+            self.fst = (verbalizer).optimize()
+            if far_file:
+                generator_main(far_file, {"verbalize": self.fst})
diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/word.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/word.py
new file mode 100644
index 000000000..d79957ca8
--- /dev/null
+++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/word.py
@@ -0,0 +1,34 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+# Copyright 2015 and onwards Google, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import pynini
+from pynini.lib import pynutil
+
+from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space
+
+
+
+class WordFst(GraphFst):
+    '''
+    tokens { name: "사과" } -> 사과
+    '''
+
+    def __init__(self, deterministic: bool = True, lm: bool = False):
+        super().__init__(name="word", kind="verbalize", deterministic=deterministic)
+
+        graph = pynutil.delete("name: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")
+
+        self.fst = graph.optimize()
diff --git a/nemo_text_processing/inverse_text_normalization/run_evaluate.py b/nemo_text_processing/inverse_text_normalization/run_evaluate.py
index 0852329d6..7bfdd3399 100644
--- a/nemo_text_processing/inverse_text_normalization/run_evaluate.py
+++ b/nemo_text_processing/inverse_text_normalization/run_evaluate.py
@@ -35,7 +35,7 @@ def parse_args():
     parser.add_argument(
         "--lang",
         help="language",
-        choices=["ar", "de", "en", "es", "es_en", "fr", "hi", "hy", "mr", "pt", "ru", "sv", "vi", "zh", 'ja'],
+        choices=["ar", "de", "en", "es", "es_en", "fr", "hi", "hy", "mr", "pt", "ru", "sv", "vi", "zh", "ja","ko"],
         default="en",
         type=str,
     )
diff --git a/tests/nemo_text_processing/ko/__init__.py b/tests/nemo_text_processing/ko/__init__.py
new file mode 100644
index 000000000..341a77c5b
--- /dev/null
+++ b/tests/nemo_text_processing/ko/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_cardinal.txt b/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_cardinal.txt
new file mode 100644
index 000000000..007273e5e
--- /dev/null
+++ b/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_cardinal.txt
@@ -0,0 +1,27 @@
+영~0
+구~9
+십~10
+십칠~17
+오십삼~53
+백~100
+백오~105
+삼백이십~320
+구백팔십칠~987
+천~1000
+천육~1006
+천오백~1500
+오천사백삼십이~5432
+만~10000
+만천이백~11200
+삼만오천칠백~35700
+십이만~120000
+백오십만삼천~1503000
+천만~10000000
+오천이백칠십만육천백~52706100
+억~100000000
+삼억오천만~350000000
+십이억천만~1210000000
+백오십억칠천만~15070000000
+오천억~500000000000
+일조~1000000000000
+이조오천억~2500000000000
\ No newline at end of file
diff --git a/tests/nemo_text_processing/ko/test_cardinal.py b/tests/nemo_text_processing/ko/test_cardinal.py
new file mode 100644
index 000000000..9fd366ea6
--- /dev/null
+++ b/tests/nemo_text_processing/ko/test_cardinal.py
@@ -0,0 +1,39 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +from parameterized import parameterized + +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from nemo_text_processing.text_normalization.normalize import Normalizer +from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio + +from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file + + +class TestCardinal: + inverse_normalizer_ko = InverseNormalizer(lang='ko', cache_dir=CACHE_DIR, overwrite_cache=False) + + @parameterized.expand(parse_test_case_file('ko/data_inverse_text_normalization/test_cases_cardinal.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer_ko.inverse_normalize(test_input, verbose=False) + assert pred == expected + + normalizer_with_audio_ko = ( + NormalizerWithAudio(lang='ko', cache_dir=CACHE_DIR, overwrite_cache=False) + if RUN_AUDIO_BASED_TESTS + else None + ) \ No newline at end of file diff --git a/tests/nemo_text_processing/ko/test_sparrowhawk_inverse_text_normalization.sh b/tests/nemo_text_processing/ko/test_sparrowhawk_inverse_text_normalization.sh new file mode 100644 index 000000000..c44f4a703 --- /dev/null +++ b/tests/nemo_text_processing/ko/test_sparrowhawk_inverse_text_normalization.sh @@ -0,0 +1,34 @@ +#! /bin/sh + +GRAMMARS_DIR=${1:-"/workspace/sparrowhawk/documentation/grammars"} +TEST_DIR=${2:-"/workspace/tests/ko"} + +runtest () { + input=$1 + echo "INPUT is $input" + cd ${GRAMMARS_DIR} + + # read test file + while read testcase; do + IFS='~' read spoken written <<< $testcase + denorm_pred=$(echo $spoken | normalizer_main --config=sparrowhawk_configuration.ascii_proto 2>&1 | tail -n 1) + + # trim white space + written="$(echo -e "${written}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')" + denorm_pred="$(echo -e "${denorm_pred}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')" + + # input expected actual + assertEquals "$spoken" "$written" "$denorm_pred" + done < "$input" +} + +testITNCardinal() { + input=$TEST_DIR/data_inverse_text_normalization/test_cases_cardinal.txt + runtest $input +} + +# Remove all command-line arguments +shift $# + +# Load shUnit2 +. 
/workspace/shunit2/shunit2 \ No newline at end of file diff --git a/tools/text_processing_deployment/pynini_export.py b/tools/text_processing_deployment/pynini_export.py index 6b82dfbec..0df099774 100644 --- a/tools/text_processing_deployment/pynini_export.py +++ b/tools/text_processing_deployment/pynini_export.py @@ -106,6 +106,7 @@ def parse_args(): 'mr', 'ja', 'rw', + 'ko' ], type=str, default='en', @@ -307,6 +308,13 @@ def parse_args(): PostProcessingFst as TNPostProcessingFst, ) from nemo_text_processing.text_normalization.ja.verbalizers.verbalize import VerbalizeFst as TNVerbalizeFst + elif args.language == 'ko': + from nemo_text_processing.inverse_text_normalization.ko.taggers.tokenize_and_classify import ( + ClassifyFst as ITNClassifyFst, + ) + from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize import ( + VerbalizeFst as ITNVerbalizeFst, + ) elif args.language == 'rw': from nemo_text_processing.text_normalization.rw.taggers.tokenize_and_classify import ( ClassifyFst as TNClassifyFst, From 77da79d12b1378502cc2b382cd6933b02e7c2545 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 16 May 2025 18:46:22 +0000 Subject: [PATCH 2/8] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../inverse_normalize.py | 4 +- .../ko/clean_eval_data.py | 59 +++++++------------ .../ko/taggers/cardinal.py | 38 +++++++----- .../ko/taggers/tokenize_and_classify.py | 12 ++-- .../ko/taggers/word.py | 3 +- .../inverse_text_normalization/ko/utils.py | 3 - .../ko/verbalizers/__init__.py | 2 +- .../ko/verbalizers/cardinal.py | 18 ++---- .../ko/verbalizers/verbalize.py | 7 +-- .../ko/verbalizers/verbalize_final.py | 3 +- .../ko/verbalizers/word.py | 1 - .../run_evaluate.py | 2 +- .../nemo_text_processing/ko/test_cardinal.py | 6 +- .../pynini_export.py | 2 +- 14 files changed, 68 insertions(+), 92 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/inverse_normalize.py b/nemo_text_processing/inverse_text_normalization/inverse_normalize.py index e505a8ad0..acda8b7f9 100644 --- a/nemo_text_processing/inverse_text_normalization/inverse_normalize.py +++ b/nemo_text_processing/inverse_text_normalization/inverse_normalize.py @@ -135,7 +135,7 @@ def __init__( from nemo_text_processing.inverse_text_normalization.ko.taggers.tokenize_and_classify import ClassifyFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize_final import ( VerbalizeFinalFst, - ) + ) self.tagger = ClassifyFst( cache_dir=cache_dir, whitelist=whitelist, overwrite_cache=overwrite_cache, input_case=input_case @@ -180,7 +180,7 @@ def parse_args(): parser.add_argument( "--language", help="language", - choices=['en', 'de', 'es', 'pt', 'ru', 'fr', 'sv', 'vi', 'ar', 'es_en', 'zh', 'hi', 'hy', 'mr', 'ja','ko'], + choices=['en', 'de', 'es', 'pt', 'ru', 'fr', 'sv', 'vi', 'ar', 'es_en', 'zh', 'hi', 'hy', 'mr', 'ja', 'ko'], default="en", type=str, ) diff --git a/nemo_text_processing/inverse_text_normalization/ko/clean_eval_data.py b/nemo_text_processing/inverse_text_normalization/ko/clean_eval_data.py index 3c1193333..bc429e858 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/clean_eval_data.py +++ b/nemo_text_processing/inverse_text_normalization/ko/clean_eval_data.py @@ -282,41 +282,24 @@ def process_address_1(instance: Instance) -> Instance: filters = [] -filters.append(Filter(class_type="CARDINAL", - process_func=process_cardinal_1, 
filter_func=filter_cardinal_1)) -filters.append(Filter(class_type="ORDINAL", - process_func=process_ordinal_1, filter_func=filter_ordinal_1)) -filters.append(Filter(class_type="DECIMAL", - process_func=process_decimal_1, filter_func=filter_decimal_1)) -filters.append(Filter(class_type="MEASURE", - process_func=process_measure_1, filter_func=filter_measure_1)) -filters.append(Filter(class_type="MONEY", - process_func=process_money_1, filter_func=filter_money_1)) -filters.append(Filter(class_type="TIME", - process_func=process_time_1, filter_func=filter_time_1)) - -filters.append(Filter(class_type="DATE", - process_func=process_date_1, filter_func=filter_date_1)) -filters.append(Filter(class_type="PLAIN", - process_func=process_plain_1, filter_func=filter_plain_1)) -filters.append(Filter(class_type="PUNCT", - process_func=process_punct_1, filter_func=filter_punct_1)) -filters.append(Filter(class_type="LETTERS", - process_func=process_letters_1, filter_func=filter_letters_1)) -filters.append(Filter(class_type="VERBATIM", - process_func=process_verbatim_1, filter_func=filter_verbatim_1)) -filters.append(Filter(class_type="DIGIT", - process_func=process_digit_1, filter_func=filter_digit_1)) -filters.append(Filter(class_type="TELEPHONE", - process_func=process_telephone_1, filter_func=filter_telephone_1)) -filters.append(Filter(class_type="ELECTRONIC", - process_func=process_electronic_1, filter_func=filter_electronic_1)) -filters.append(Filter(class_type="FRACTION", - process_func=process_fraction_1, filter_func=filter_fraction_1)) -filters.append(Filter(class_type="ADDRESS", - process_func=process_address_1, filter_func=filter_address_1)) -filters.append(Filter(class_type=EOS_TYPE, - process_func=lambda x: x, filter_func=lambda x: True)) +filters.append(Filter(class_type="CARDINAL", process_func=process_cardinal_1, filter_func=filter_cardinal_1)) +filters.append(Filter(class_type="ORDINAL", process_func=process_ordinal_1, filter_func=filter_ordinal_1)) +filters.append(Filter(class_type="DECIMAL", process_func=process_decimal_1, filter_func=filter_decimal_1)) +filters.append(Filter(class_type="MEASURE", process_func=process_measure_1, filter_func=filter_measure_1)) +filters.append(Filter(class_type="MONEY", process_func=process_money_1, filter_func=filter_money_1)) +filters.append(Filter(class_type="TIME", process_func=process_time_1, filter_func=filter_time_1)) + +filters.append(Filter(class_type="DATE", process_func=process_date_1, filter_func=filter_date_1)) +filters.append(Filter(class_type="PLAIN", process_func=process_plain_1, filter_func=filter_plain_1)) +filters.append(Filter(class_type="PUNCT", process_func=process_punct_1, filter_func=filter_punct_1)) +filters.append(Filter(class_type="LETTERS", process_func=process_letters_1, filter_func=filter_letters_1)) +filters.append(Filter(class_type="VERBATIM", process_func=process_verbatim_1, filter_func=filter_verbatim_1)) +filters.append(Filter(class_type="DIGIT", process_func=process_digit_1, filter_func=filter_digit_1)) +filters.append(Filter(class_type="TELEPHONE", process_func=process_telephone_1, filter_func=filter_telephone_1)) +filters.append(Filter(class_type="ELECTRONIC", process_func=process_electronic_1, filter_func=filter_electronic_1)) +filters.append(Filter(class_type="FRACTION", process_func=process_fraction_1, filter_func=filter_fraction_1)) +filters.append(Filter(class_type="ADDRESS", process_func=process_address_1, filter_func=filter_address_1)) +filters.append(Filter(class_type=EOS_TYPE, process_func=lambda x: x, 
filter_func=lambda x: True)) def filter_loaded_data(data: List[Instance], verbose: bool = False) -> List[Instance]: @@ -344,10 +327,8 @@ def filter_loaded_data(data: List[Instance], verbose: bool = False) -> List[Inst def parse_args(): parser = ArgumentParser() - parser.add_argument("--input", help="input file path", - type=str, default='./en_with_types/output-00001-of-00100') - parser.add_argument( - "--verbose", help="print filtered instances", action='store_true') + parser.add_argument("--input", help="input file path", type=str, default='./en_with_types/output-00001-of-00100') + parser.add_argument("--verbose", help="print filtered instances", action='store_true') return parser.parse_args() diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py index df5804fc0..09cc03909 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py @@ -19,6 +19,7 @@ from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_DIGIT, GraphFst, delete_space from nemo_text_processing.inverse_text_normalization.ko.utils import get_abs_path + class CardinalFst(GraphFst): """ Finite state transducer for classifying cardinals @@ -37,14 +38,14 @@ def __init__(self): graph_negative = pynini.cross("마이너스", "-") graph_negative += delete_space - + ten = pynutil.delete("십") ten_alt = pynini.cross("십", "1") ### Responsible for second digit of two digit number. ex) 20's 2 graph_ten_component = pynini.union((graph_digit + ten) | ten_alt, pynutil.insert("0")) ### Responsible for the first digit of number. ex) 1,2,3,4,5,,, graph_ten_component += graph_digit | pynutil.insert("0") - + hundred = pynutil.delete("백") hundred_alt = pynini.cross("백", "1") graph_hundred_component = pynini.union(((graph_digit + hundred) | hundred_alt), pynutil.insert("0")) @@ -59,29 +60,36 @@ def __init__(self): tenthousand_alt = pynini.cross("만", "1") ### "만" can express next four digits of numbers until the next unit "억", so insert "0000" to allocate four digit worth of space ### From "만", keep adding four digits and graph_thousand_component(0000-9999), because Korean units increase every four digits - graph_tenthousand_component = pynini.union(((graph_thousand_component + tenthousand) | tenthousand_alt), pynutil.insert("0000")) + graph_tenthousand_component = pynini.union( + ((graph_thousand_component + tenthousand) | tenthousand_alt), pynutil.insert("0000") + ) graph_tenthousand_component += graph_thousand_component hundredmillion = pynutil.delete("억") hundredmillion_alt = pynini.cross("억", "1") - graph_hundredmillion_component = pynini.union(((graph_thousand_component + hundredmillion) | hundredmillion_alt), pynutil.insert("0000")) - graph_hundredmillion_component += graph_tenthousand_component - + graph_hundredmillion_component = pynini.union( + ((graph_thousand_component + hundredmillion) | hundredmillion_alt), pynutil.insert("0000") + ) + graph_hundredmillion_component += graph_tenthousand_component + trillion = pynutil.delete("조") trillion_alt = pynini.cross("조", "1") - graph_trillion_component = pynini.union(((graph_thousand_component + trillion) | trillion_alt), pynutil.insert("0000")) + graph_trillion_component = pynini.union( + ((graph_thousand_component + trillion) | trillion_alt), pynutil.insert("0000") + ) graph_trillion_component += graph_hundredmillion_component tenquadrillion = pynutil.delete("경") 
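# Aside: a minimal plain-Python sketch of the four-digit grouping scheme
# these graphs encode, for readers unfamiliar with Sino-Korean numerals.
# Korean large-number units advance every 10^4 (만=10^4, 억=10^8, 조=10^12,
# 경=10^16), which is why each unit above pads a four-digit block via
# pynutil.insert("0000"). Illustrative only, not part of the patch: the
# dictionaries are assumptions based on standard Sino-Korean numerals, not
# values read from the grammar's .tsv files, and 영 (zero) is left to the
# separate graph_zero path just as in the FST.
KO_DIGITS = {"일": 1, "이": 2, "삼": 3, "사": 4, "오": 5, "육": 6, "칠": 7, "팔": 8, "구": 9}
KO_SMALL = {"십": 10, "백": 100, "천": 1000}
KO_BIG = {"만": 10**4, "억": 10**8, "조": 10**12, "경": 10**16}

def ko_to_int(text: str) -> int:
    total = block = digit = 0
    for ch in text:
        if ch in KO_DIGITS:
            digit = KO_DIGITS[ch]
        elif ch in KO_SMALL:
            block += max(digit, 1) * KO_SMALL[ch]  # bare 십/백/천 means 1x
            digit = 0
        elif ch in KO_BIG:
            block += digit
            total += max(block, 1) * KO_BIG[ch]  # bare 만/억/조/경 means 1x
            block = digit = 0
    return total + block + digit

assert ko_to_int("삼만오천") == 35000
assert ko_to_int("백이십삼") == 123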
tenquadrillion_alt = pynini.cross("경", "1") - graph_tenquadrillion_component = pynini.union(((graph_thousand_component + tenquadrillion) | tenquadrillion_alt), pynutil.insert("0000")) + graph_tenquadrillion_component = pynini.union( + ((graph_thousand_component + tenquadrillion) | tenquadrillion_alt), pynutil.insert("0000") + ) graph_tenquadrillion_component += graph_trillion_component - graph = pynini.union( ### From biggest unit to smallest, everything is included - graph_tenquadrillion_component| - graph_zero + graph_tenquadrillion_component + | graph_zero ) leading_zero = ( @@ -89,16 +97,18 @@ def __init__(self): ) graph_nonzero = graph @ leading_zero graph = pynini.union(graph_nonzero, graph_zero) - + graph = graph @ leading_zero | graph_zero self.just_cardinals = graph - optional_sign = pynini.closure((pynini.cross("마이너스", 'negative: "-"') | pynini.cross("-", 'negative: "-"')) + delete_space,0, 1) + optional_sign = pynini.closure( + (pynini.cross("마이너스", 'negative: "-"') | pynini.cross("-", 'negative: "-"')) + delete_space, 0, 1 + ) final_graph = ( optional_sign + pynutil.insert(" ") + pynutil.insert("integer: \"") + graph + pynutil.insert("\"") ) | (pynutil.insert("integer: \"") + graph + pynutil.insert("\"")) final_graph = self.add_tokens(final_graph) - self.fst = final_graph.optimize() \ No newline at end of file + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py index 760ce6829..2842a4167 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py @@ -19,15 +19,15 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.ko.taggers.cardinal import CardinalFst -from nemo_text_processing.inverse_text_normalization.ko.taggers.word import WordFst -from nemo_text_processing.inverse_text_normalization.ko.graph_utils import ( +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import ( INPUT_LOWER_CASED, GraphFst, delete_extra_space, delete_space, generator_main, ) +from nemo_text_processing.inverse_text_normalization.ko.taggers.cardinal import CardinalFst +from nemo_text_processing.inverse_text_normalization.ko.taggers.word import WordFst class ClassifyFst(GraphFst): @@ -64,8 +64,8 @@ def __init__( cardinal = CardinalFst() cardinal_graph = cardinal.fst word_graph = WordFst().fst - classify = (pynutil.add_weight(cardinal_graph, 1.1)| pynutil.add_weight(word_graph, 100)) - + classify = pynutil.add_weight(cardinal_graph, 1.1) | pynutil.add_weight(word_graph, 100) + token = pynutil.insert("tokens { ") + classify + pynutil.insert(" } ") tagger = pynini.closure(token, 1) @@ -73,4 +73,4 @@ def __init__( if far_file: generator_main(far_file, {"tokenize_and_classify": self.fst}) - logging.info(f"ClassifyFst grammars are saved to {far_file}.") \ No newline at end of file + logging.info(f"ClassifyFst grammars are saved to {far_file}.") diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/word.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/word.py index 0d6ccd5c5..0e4dbb93c 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/word.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/word.py @@ -27,6 +27,5 @@ class WordFst(GraphFst): def __init__(self): 
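# Aside (illustrative, not part of the patch): WordFst is the catch-all
# tagger. It wraps any run of non-space characters as a plain token, e.g.
#   "안녕" -> name: "안녕"
# so text the cardinal grammar does not claim still round-trips unchanged.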
super().__init__(name="word", kind="classify") - word = pynutil.insert( - "name: \"") + pynini.closure(NEMO_NOT_SPACE, 1) + pynutil.insert("\"") + word = pynutil.insert("name: \"") + pynini.closure(NEMO_NOT_SPACE, 1) + pynutil.insert("\"") self.fst = word.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/ko/utils.py b/nemo_text_processing/inverse_text_normalization/ko/utils.py index 0222cc0b8..d198c3835 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/utils.py +++ b/nemo_text_processing/inverse_text_normalization/ko/utils.py @@ -15,9 +15,6 @@ import os - def get_abs_path(rel_path): return os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path - - diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/__init__.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/__init__.py index da950f35e..f541211af 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/__init__.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/__init__.py @@ -14,4 +14,4 @@ from nemo_text_processing.inverse_text_normalization.ko.taggers.tokenize_and_classify import ClassifyFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize import VerbalizeFst -from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize_final import VerbalizeFinalFst \ No newline at end of file +from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize_final import VerbalizeFinalFst diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/cardinal.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/cardinal.py index 1800a6dc8..fb9a76d8e 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/cardinal.py @@ -15,11 +15,7 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.ko.graph_utils import ( - NEMO_NOT_QUOTE, - GraphFst, - delete_space, -) +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space class CardinalFst(GraphFst): @@ -34,21 +30,17 @@ def __init__(self): pynutil.delete("negative:") + delete_space + pynutil.delete("\"") - + pynini.accep("-") + + pynini.accep("-") + pynutil.delete("\"") ) optional_sign_output = pynini.closure(negative_sign + delete_space, 0, 1) - digits_from_tag = pynini.closure(NEMO_NOT_QUOTE, 1) + digits_from_tag = pynini.closure(NEMO_NOT_QUOTE, 1) integer_cardinal = ( - pynutil.delete("integer:") - + delete_space - + pynutil.delete("\"") - + digits_from_tag - + pynutil.delete("\"") + pynutil.delete("integer:") + delete_space + pynutil.delete("\"") + digits_from_tag + pynutil.delete("\"") ) graph = integer_cardinal final_graph = optional_sign_output + graph - self.fst = self.delete_tokens(final_graph).optimize() \ No newline at end of file + self.fst = self.delete_tokens(final_graph).optimize() diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py index 9d750d757..d8851e206 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py @@ -13,9 +13,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
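# Aside (illustrative, not part of the patch): VerbalizeFst below is just
# the union cardinal_graph | word_graph, so each tagged token is rendered
# by whichever verbalizer accepts it -- cardinals become digit strings and
# everything else falls through the word verbalizer untouched.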
+from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.cardinal import CardinalFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.word import WordFst -from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst class VerbalizeFst(GraphFst): @@ -30,7 +30,6 @@ def __init__(self): cardinal = CardinalFst() cardinal_graph = cardinal.fst word_graph = WordFst().fst - - graph = (cardinal_graph|word_graph) + + graph = cardinal_graph | word_graph self.fst = graph - diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py index 8554fc161..09b4cbc8b 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py @@ -18,9 +18,9 @@ import pynini from pynini.lib import pynutil +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst, delete_space, generator_main from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize import VerbalizeFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.word import WordFst -from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst, generator_main, delete_space class VerbalizeFinalFst(GraphFst): @@ -28,6 +28,7 @@ class VerbalizeFinalFst(GraphFst): Finite state transducer that verbalizes an entire sentence, e.g. tokens { name: "its" } tokens { time { hours: "12" minutes: "30" } } tokens { name: "now" } -> its 12:30 now """ + def __init__(self, deterministic: bool = True, cache_dir: str = None, overwrite_cache: bool = False): super().__init__(name="verbalize_final", kind="verbalize", deterministic=deterministic) far_file = None diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/word.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/word.py index d79957ca8..c134fe63a 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/word.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/word.py @@ -20,7 +20,6 @@ from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space - class WordFst(GraphFst): ''' tokens { name: "一" } -> 一 diff --git a/nemo_text_processing/inverse_text_normalization/run_evaluate.py b/nemo_text_processing/inverse_text_normalization/run_evaluate.py index 7bfdd3399..133474940 100644 --- a/nemo_text_processing/inverse_text_normalization/run_evaluate.py +++ b/nemo_text_processing/inverse_text_normalization/run_evaluate.py @@ -35,7 +35,7 @@ def parse_args(): parser.add_argument( "--lang", help="language", - choices=["ar", "de", "en", "es", "es_en", "fr", "hi", "hy", "mr", "pt", "ru", "sv", "vi", "zh", "ja","ko"], + choices=["ar", "de", "en", "es", "es_en", "fr", "hi", "hy", "mr", "pt", "ru", "sv", "vi", "zh", "ja", "ko"], default="en", type=str, ) diff --git a/tests/nemo_text_processing/ko/test_cardinal.py b/tests/nemo_text_processing/ko/test_cardinal.py index 9fd366ea6..526747668 100644 --- a/tests/nemo_text_processing/ko/test_cardinal.py +++ b/tests/nemo_text_processing/ko/test_cardinal.py @@ -33,7 +33,5 @@ def test_denorm(self, test_input, expected): assert pred == expected normalizer_with_audio_ko = ( - NormalizerWithAudio(lang='ko', cache_dir=CACHE_DIR, 
overwrite_cache=False)
-        if RUN_AUDIO_BASED_TESTS
-        else None
-    )
\ No newline at end of file
+        NormalizerWithAudio(lang='ko', cache_dir=CACHE_DIR, overwrite_cache=False) if RUN_AUDIO_BASED_TESTS else None
+    )
diff --git a/tools/text_processing_deployment/pynini_export.py b/tools/text_processing_deployment/pynini_export.py
index 0df099774..d1ba34a37 100644
--- a/tools/text_processing_deployment/pynini_export.py
+++ b/tools/text_processing_deployment/pynini_export.py
@@ -106,7 +106,7 @@ def parse_args():
         'mr',
         'ja',
         'rw',
-        'ko'
+        'ko',
     ],
     type=str,
     default='en',

From 9f7e876841b518a5b4d3d5e68df760cb7126729c Mon Sep 17 00:00:00 2001
From: hmlee245
Date: Fri, 16 May 2025 13:10:40 -0700
Subject: [PATCH 3/8] Fixing all review feedback

Signed-off-by: hmlee245
---
 .../ko/clean_eval_data.py                     | 361 ------------------
 .../ko/data/numbers/zero.tsv                  |   1 -
 .../ko/graph_utils.py                         |   2 +-
 .../ko/taggers/__init__.py                    |   3 -
 .../ko/taggers/cardinal.py                    |   6 +-
 .../ko/taggers/tokenize_and_classify.py       |   2 -
 .../ko/verbalizers/__init__.py                |   4 -
 .../ko/verbalizers/verbalize_final.py         |   1 -
 .../ko/verbalizers/word.py                    |   4 +-
 .../nemo_text_processing/ko/test_cardinal.py  |  12 +-
 10 files changed, 5 insertions(+), 391 deletions(-)
 delete mode 100644 nemo_text_processing/inverse_text_normalization/ko/clean_eval_data.py
 delete mode 100644 nemo_text_processing/inverse_text_normalization/ko/data/numbers/zero.tsv

diff --git a/nemo_text_processing/inverse_text_normalization/ko/clean_eval_data.py b/nemo_text_processing/inverse_text_normalization/ko/clean_eval_data.py
deleted file mode 100644
index 3c1193333..000000000
--- a/nemo_text_processing/inverse_text_normalization/ko/clean_eval_data.py
+++ /dev/null
@@ -1,361 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from argparse import ArgumentParser
-from typing import List
-
-import regex as re
-
-from nemo_text_processing.text_normalization.data_loader_utils import (
-    EOS_TYPE,
-    Instance,
-    load_files,
-    training_data_to_sentences,
-)
-
-"""
-This file is for evaluation purposes.
-filter_loaded_data() cleans data (list of instances) for inverse text normalization. Filters and cleaners can be specified for each semiotic class individually.
-For example, normalized text should only include characters and whitespace characters but no punctuation.
- Cardinal unnormalized instances should contain at least one integer and all other characters are removed.
-""" - - -class Filter: - """ - Filter class - - Args: - class_type: semiotic class used in dataset - process_func: function to transform text - filter_func: function to filter text - - """ - - def __init__(self, class_type: str, process_func: object, filter_func: object): - self.class_type = class_type - self.process_func = process_func - self.filter_func = filter_func - - def filter(self, instance: Instance) -> bool: - """ - filter function - - Args: - filters given instance with filter function - - Returns: True if given instance fulfills criteria or does not belong to class type - """ - if instance.token_type != self.class_type: - return True - return self.filter_func(instance) - - def process(self, instance: Instance) -> Instance: - """ - process function - - Args: - processes given instance with process function - - Returns: processed instance if instance belongs to expected class type or original instance - """ - if instance.token_type != self.class_type: - return instance - return self.process_func(instance) - - -def filter_cardinal_1(instance: Instance) -> bool: - ok = re.search(r"[0-9]", instance.un_normalized) - return ok - - -def process_cardinal_1(instance: Instance) -> Instance: - un_normalized = instance.un_normalized - normalized = instance.normalized - un_normalized = re.sub(r"[^0-9]", "", un_normalized) - normalized = re.sub(r"[^a-z ]", "", normalized) - return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) - - -def filter_ordinal_1(instance: Instance) -> bool: - ok = re.search(r"(st|nd|rd|th)\s*$", instance.un_normalized) - return ok - - -def process_ordinal_1(instance: Instance) -> Instance: - un_normalized = instance.un_normalized - normalized = instance.normalized - un_normalized = re.sub(r"[,\s]", "", un_normalized) - normalized = re.sub(r"[^a-z ]", "", normalized) - return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) - - -def filter_decimal_1(instance: Instance) -> bool: - ok = re.search(r"[0-9]", instance.un_normalized) - return ok - - -def process_decimal_1(instance: Instance) -> Instance: - un_normalized = instance.un_normalized - un_normalized = re.sub(r",", "", un_normalized) - normalized = instance.normalized - normalized = re.sub(r"[^a-z ]", "", normalized) - return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) - - -def filter_measure_1(instance: Instance) -> bool: - ok = True - return ok - - -def process_measure_1(instance: Instance) -> Instance: - un_normalized = instance.un_normalized - normalized = instance.normalized - un_normalized = re.sub(r",", "", un_normalized) - un_normalized = re.sub(r"m2", "m²", un_normalized) - un_normalized = re.sub(r"(\d)([^\d.\s])", r"\1 \2", un_normalized) - normalized = re.sub(r"[^a-z\s]", "", normalized) - normalized = re.sub(r"per ([a-z\s]*)s$", r"per \1", normalized) - normalized = re.sub(r"[^a-z ]", "", normalized) - return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) - - -def filter_money_1(instance: Instance) -> bool: - ok = re.search(r"[0-9]", instance.un_normalized) - return ok - - -def process_money_1(instance: Instance) -> Instance: - un_normalized = instance.un_normalized - normalized = instance.normalized - un_normalized = re.sub(r",", "", un_normalized) - un_normalized = re.sub(r"a\$", r"$", un_normalized) - un_normalized = re.sub(r"us\$", r"$", un_normalized) - un_normalized = re.sub(r"(\d)m\s*$", r"\1 million", 
un_normalized) - un_normalized = re.sub(r"(\d)bn?\s*$", r"\1 billion", un_normalized) - normalized = re.sub(r"[^a-z ]", "", normalized) - return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) - - -def filter_time_1(instance: Instance) -> bool: - ok = re.search(r"[0-9]", instance.un_normalized) - return ok - - -def process_time_1(instance: Instance) -> Instance: - un_normalized = instance.un_normalized - un_normalized = re.sub(r": ", ":", un_normalized) - un_normalized = re.sub(r"(\d)\s?a\s?m\s?", r"\1 a.m.", un_normalized) - un_normalized = re.sub(r"(\d)\s?p\s?m\s?", r"\1 p.m.", un_normalized) - normalized = instance.normalized - normalized = re.sub(r"[^a-z ]", "", normalized) - return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) - - -def filter_plain_1(instance: Instance) -> bool: - ok = True - return ok - - -def process_plain_1(instance: Instance) -> Instance: - un_normalized = instance.un_normalized - normalized = instance.normalized - return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) - - -def filter_punct_1(instance: Instance) -> bool: - ok = True - return ok - - -def process_punct_1(instance: Instance) -> Instance: - un_normalized = instance.un_normalized - normalized = instance.normalized - return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) - - -def filter_date_1(instance: Instance) -> bool: - ok = True - return ok - - -def process_date_1(instance: Instance) -> Instance: - un_normalized = instance.un_normalized - un_normalized = re.sub(r",", "", un_normalized) - normalized = instance.normalized - normalized = re.sub(r"[^a-z ]", "", normalized) - return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) - - -def filter_letters_1(instance: Instance) -> bool: - ok = True - return ok - - -def process_letters_1(instance: Instance) -> Instance: - un_normalized = instance.un_normalized - normalized = instance.normalized - normalized = re.sub(r"[^a-z ]", "", normalized) - return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) - - -def filter_verbatim_1(instance: Instance) -> bool: - ok = True - return ok - - -def process_verbatim_1(instance: Instance) -> Instance: - un_normalized = instance.un_normalized - normalized = instance.normalized - return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) - - -def filter_digit_1(instance: Instance) -> bool: - ok = re.search(r"[0-9]", instance.un_normalized) - return ok - - -def process_digit_1(instance: Instance) -> Instance: - un_normalized = instance.un_normalized - normalized = instance.normalized - normalized = re.sub(r"[^a-z ]", "", normalized) - return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) - - -def filter_telephone_1(instance: Instance) -> bool: - ok = re.search(r"[0-9]", instance.un_normalized) - return ok - - -def process_telephone_1(instance: Instance) -> Instance: - un_normalized = instance.un_normalized - normalized = instance.normalized - normalized = re.sub(r"[^a-z ]", "", normalized) - return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) - - -def filter_electronic_1(instance: Instance) -> bool: - ok = re.search(r"[0-9]", instance.un_normalized) - return ok - - -def process_electronic_1(instance: Instance) -> Instance: - 
un_normalized = instance.un_normalized - normalized = instance.normalized - normalized = re.sub(r"[^a-z ]", "", normalized) - return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) - - -def filter_fraction_1(instance: Instance) -> bool: - ok = re.search(r"[0-9]", instance.un_normalized) - return ok - - -def process_fraction_1(instance: Instance) -> Instance: - un_normalized = instance.un_normalized - normalized = instance.normalized - normalized = re.sub(r"[^a-z ]", "", normalized) - return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) - - -def filter_address_1(instance: Instance) -> bool: - ok = True - return ok - - -def process_address_1(instance: Instance) -> Instance: - un_normalized = instance.un_normalized - normalized = instance.normalized - normalized = re.sub(r"[^a-z ]", "", normalized) - return Instance(token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized) - - -filters = [] -filters.append(Filter(class_type="CARDINAL", - process_func=process_cardinal_1, filter_func=filter_cardinal_1)) -filters.append(Filter(class_type="ORDINAL", - process_func=process_ordinal_1, filter_func=filter_ordinal_1)) -filters.append(Filter(class_type="DECIMAL", - process_func=process_decimal_1, filter_func=filter_decimal_1)) -filters.append(Filter(class_type="MEASURE", - process_func=process_measure_1, filter_func=filter_measure_1)) -filters.append(Filter(class_type="MONEY", - process_func=process_money_1, filter_func=filter_money_1)) -filters.append(Filter(class_type="TIME", - process_func=process_time_1, filter_func=filter_time_1)) - -filters.append(Filter(class_type="DATE", - process_func=process_date_1, filter_func=filter_date_1)) -filters.append(Filter(class_type="PLAIN", - process_func=process_plain_1, filter_func=filter_plain_1)) -filters.append(Filter(class_type="PUNCT", - process_func=process_punct_1, filter_func=filter_punct_1)) -filters.append(Filter(class_type="LETTERS", - process_func=process_letters_1, filter_func=filter_letters_1)) -filters.append(Filter(class_type="VERBATIM", - process_func=process_verbatim_1, filter_func=filter_verbatim_1)) -filters.append(Filter(class_type="DIGIT", - process_func=process_digit_1, filter_func=filter_digit_1)) -filters.append(Filter(class_type="TELEPHONE", - process_func=process_telephone_1, filter_func=filter_telephone_1)) -filters.append(Filter(class_type="ELECTRONIC", - process_func=process_electronic_1, filter_func=filter_electronic_1)) -filters.append(Filter(class_type="FRACTION", - process_func=process_fraction_1, filter_func=filter_fraction_1)) -filters.append(Filter(class_type="ADDRESS", - process_func=process_address_1, filter_func=filter_address_1)) -filters.append(Filter(class_type=EOS_TYPE, - process_func=lambda x: x, filter_func=lambda x: True)) - - -def filter_loaded_data(data: List[Instance], verbose: bool = False) -> List[Instance]: - """ - Filters list of instances - - Args: - data: list of instances - - Returns: filtered and transformed list of instances - """ - updates_instances = [] - for instance in data: - updated_instance = False - for fil in filters: - if fil.class_type == instance.token_type and fil.filter(instance): - instance = fil.process(instance) - updated_instance = True - if updated_instance: - if verbose: - print(instance) - updates_instances.append(instance) - return updates_instances - - -def parse_args(): - parser = ArgumentParser() - parser.add_argument("--input", help="input file path", - type=str, 
default='./en_with_types/output-00001-of-00100') - parser.add_argument( - "--verbose", help="print filtered instances", action='store_true') - return parser.parse_args() - - -if __name__ == "__main__": - args = parse_args() - file_path = args.input - - print("Loading training data: " + file_path) - instance_list = load_files([file_path]) # List of instances - filtered_instance_list = filter_loaded_data(instance_list, args.verbose) - training_data_to_sentences(filtered_instance_list) diff --git a/nemo_text_processing/inverse_text_normalization/ko/data/numbers/zero.tsv b/nemo_text_processing/inverse_text_normalization/ko/data/numbers/zero.tsv deleted file mode 100644 index 43baac7c1..000000000 --- a/nemo_text_processing/inverse_text_normalization/ko/data/numbers/zero.tsv +++ /dev/null @@ -1 +0,0 @@ -영 0 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/ko/graph_utils.py b/nemo_text_processing/inverse_text_normalization/ko/graph_utils.py index 7a9fd8720..50f1eb3b9 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/graph_utils.py +++ b/nemo_text_processing/inverse_text_normalization/ko/graph_utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. # Copyright 2015 and onwards Google, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/__init__.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/__init__.py index f541211af..f6e3c3795 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/__init__.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/__init__.py @@ -12,6 +12,3 @@ # See the License for the specific language governing permissions and # limitations under the License. -from nemo_text_processing.inverse_text_normalization.ko.taggers.tokenize_and_classify import ClassifyFst -from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize import VerbalizeFst -from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize_final import VerbalizeFinalFst diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py index df5804fc0..7253019f0 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py @@ -31,13 +31,9 @@ class CardinalFst(GraphFst): def __init__(self): super().__init__(name="cardinal", kind="classify") - graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv")) - graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")) graph_zero = pynini.cross("영", "0") + graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")) - graph_negative = pynini.cross("마이너스", "-") - graph_negative += delete_space - ten = pynutil.delete("십") ten_alt = pynini.cross("십", "1") ### Responsible for second digit of two digit number. 
ex) 20's 2 diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py index 760ce6829..bb6b35d41 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py @@ -24,8 +24,6 @@ from nemo_text_processing.inverse_text_normalization.ko.graph_utils import ( INPUT_LOWER_CASED, GraphFst, - delete_extra_space, - delete_space, generator_main, ) diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/__init__.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/__init__.py index da950f35e..341a77c5b 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/__init__.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/__init__.py @@ -11,7 +11,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -from nemo_text_processing.inverse_text_normalization.ko.taggers.tokenize_and_classify import ClassifyFst -from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize import VerbalizeFst -from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize_final import VerbalizeFinalFst \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py index 8554fc161..8d40d2804 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py @@ -19,7 +19,6 @@ from pynini.lib import pynutil from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize import VerbalizeFst -from nemo_text_processing.inverse_text_normalization.ko.verbalizers.word import WordFst from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst, generator_main, delete_space diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/word.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/word.py index d79957ca8..a423d5d0c 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/word.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/word.py @@ -13,11 +13,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
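# Aside (illustrative, not part of the patch): this verbalizer is the
# inverse of the word tagger -- it unwraps tokens { name: "안녕" } back to
# 안녕, which is why only NEMO_NOT_QUOTE and GraphFst remain imported after
# this cleanup.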
- -import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_NOT_QUOTE, GraphFst diff --git a/tests/nemo_text_processing/ko/test_cardinal.py b/tests/nemo_text_processing/ko/test_cardinal.py index 9fd366ea6..872a5aa2a 100644 --- a/tests/nemo_text_processing/ko/test_cardinal.py +++ b/tests/nemo_text_processing/ko/test_cardinal.py @@ -16,10 +16,8 @@ from parameterized import parameterized from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer -from nemo_text_processing.text_normalization.normalize import Normalizer -from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio -from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file +from ..utils import CACHE_DIR, parse_test_case_file class TestCardinal: @@ -30,10 +28,4 @@ class TestCardinal: @pytest.mark.unit def test_denorm(self, test_input, expected): pred = self.inverse_normalizer_ko.inverse_normalize(test_input, verbose=False) - assert pred == expected - - normalizer_with_audio_ko = ( - NormalizerWithAudio(lang='ko', cache_dir=CACHE_DIR, overwrite_cache=False) - if RUN_AUDIO_BASED_TESTS - else None - ) \ No newline at end of file + assert pred == expected \ No newline at end of file From 4df2965feae682f7762f3c6f292613339869a89b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 16 May 2025 20:23:32 +0000 Subject: [PATCH 4/8] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../inverse_text_normalization/ko/taggers/__init__.py | 1 - .../ko/taggers/tokenize_and_classify.py | 6 +----- .../ko/verbalizers/verbalize_final.py | 5 ++++- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/__init__.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/__init__.py index f6e3c3795..341a77c5b 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/__init__.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/__init__.py @@ -11,4 +11,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py index 30e0f5df4..75e3f6f20 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py @@ -19,11 +19,7 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.ko.graph_utils import ( - INPUT_LOWER_CASED, - GraphFst, - generator_main, -) +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import INPUT_LOWER_CASED, GraphFst, generator_main from nemo_text_processing.inverse_text_normalization.ko.taggers.cardinal import CardinalFst from nemo_text_processing.inverse_text_normalization.ko.taggers.word import WordFst diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py index 648285758..09c917d00 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py @@ -20,10 +20,13 @@ from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst, delete_space, generator_main from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize import VerbalizeFst + <<<<<<< HEAD -from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst, generator_main, delete_space +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst, delete_space, generator_main + ======= from nemo_text_processing.inverse_text_normalization.ko.verbalizers.word import WordFst + >>>>>>> 77da79d12b1378502cc2b382cd6933b02e7c2545 From 41ac59d791511cd82c03b242e8ec671c91360c6e Mon Sep 17 00:00:00 2001 From: hmlee245 Date: Fri, 16 May 2025 13:36:00 -0700 Subject: [PATCH 5/8] This reverts commit f893d89bd8890e1b46df1e40054cc9176ac7ce7a, reversing changes made to 9f7e876841b518a5b4d3d5e68df760cb7126729c. 
Signed-off-by: hmlee245 --- .../inverse_normalize.py | 4 +- .../ko/taggers/cardinal.py | 42 ++++++------------- .../ko/taggers/tokenize_and_classify.py | 12 ++++-- .../ko/taggers/word.py | 3 +- .../inverse_text_normalization/ko/utils.py | 3 ++ .../ko/verbalizers/__init__.py | 7 ---- .../ko/verbalizers/cardinal.py | 18 +++++--- .../ko/verbalizers/verbalize.py | 7 ++-- .../ko/verbalizers/verbalize_final.py | 11 +---- .../ko/verbalizers/word.py | 1 + .../run_evaluate.py | 2 +- .../nemo_text_processing/ko/test_cardinal.py | 10 +---- .../pynini_export.py | 2 +- 13 files changed, 50 insertions(+), 72 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/inverse_normalize.py b/nemo_text_processing/inverse_text_normalization/inverse_normalize.py index acda8b7f9..e505a8ad0 100644 --- a/nemo_text_processing/inverse_text_normalization/inverse_normalize.py +++ b/nemo_text_processing/inverse_text_normalization/inverse_normalize.py @@ -135,7 +135,7 @@ def __init__( from nemo_text_processing.inverse_text_normalization.ko.taggers.tokenize_and_classify import ClassifyFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize_final import ( VerbalizeFinalFst, - ) + ) self.tagger = ClassifyFst( cache_dir=cache_dir, whitelist=whitelist, overwrite_cache=overwrite_cache, input_case=input_case @@ -180,7 +180,7 @@ def parse_args(): parser.add_argument( "--language", help="language", - choices=['en', 'de', 'es', 'pt', 'ru', 'fr', 'sv', 'vi', 'ar', 'es_en', 'zh', 'hi', 'hy', 'mr', 'ja', 'ko'], + choices=['en', 'de', 'es', 'pt', 'ru', 'fr', 'sv', 'vi', 'ar', 'es_en', 'zh', 'hi', 'hy', 'mr', 'ja','ko'], default="en", type=str, ) diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py index f3fa597e3..7253019f0 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py @@ -19,7 +19,6 @@ from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_DIGIT, GraphFst, delete_space from nemo_text_processing.inverse_text_normalization.ko.utils import get_abs_path - class CardinalFst(GraphFst): """ Finite state transducer for classifying cardinals @@ -35,19 +34,13 @@ def __init__(self): graph_zero = pynini.cross("영", "0") graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")) -<<<<<<< HEAD -======= - graph_negative = pynini.cross("마이너스", "-") - graph_negative += delete_space - ->>>>>>> 77da79d12b1378502cc2b382cd6933b02e7c2545 ten = pynutil.delete("십") ten_alt = pynini.cross("십", "1") ### Responsible for second digit of two digit number. ex) 20's 2 graph_ten_component = pynini.union((graph_digit + ten) | ten_alt, pynutil.insert("0")) ### Responsible for the first digit of number. 
ex) 1,2,3,4,5,,, graph_ten_component += graph_digit | pynutil.insert("0") - + hundred = pynutil.delete("백") hundred_alt = pynini.cross("백", "1") graph_hundred_component = pynini.union(((graph_digit + hundred) | hundred_alt), pynutil.insert("0")) @@ -62,36 +55,29 @@ def __init__(self): tenthousand_alt = pynini.cross("만", "1") ### "만" can express next four digits of numbers until the next unit "억", so insert "0000" to allocate four digit worth of space ### From "만", keep adding four digits and graph_thousand_component(0000-9999), because Korean units increase every four digits - graph_tenthousand_component = pynini.union( - ((graph_thousand_component + tenthousand) | tenthousand_alt), pynutil.insert("0000") - ) + graph_tenthousand_component = pynini.union(((graph_thousand_component + tenthousand) | tenthousand_alt), pynutil.insert("0000")) graph_tenthousand_component += graph_thousand_component hundredmillion = pynutil.delete("억") hundredmillion_alt = pynini.cross("억", "1") - graph_hundredmillion_component = pynini.union( - ((graph_thousand_component + hundredmillion) | hundredmillion_alt), pynutil.insert("0000") - ) - graph_hundredmillion_component += graph_tenthousand_component - + graph_hundredmillion_component = pynini.union(((graph_thousand_component + hundredmillion) | hundredmillion_alt), pynutil.insert("0000")) + graph_hundredmillion_component += graph_tenthousand_component + trillion = pynutil.delete("조") trillion_alt = pynini.cross("조", "1") - graph_trillion_component = pynini.union( - ((graph_thousand_component + trillion) | trillion_alt), pynutil.insert("0000") - ) + graph_trillion_component = pynini.union(((graph_thousand_component + trillion) | trillion_alt), pynutil.insert("0000")) graph_trillion_component += graph_hundredmillion_component tenquadrillion = pynutil.delete("경") tenquadrillion_alt = pynini.cross("경", "1") - graph_tenquadrillion_component = pynini.union( - ((graph_thousand_component + tenquadrillion) | tenquadrillion_alt), pynutil.insert("0000") - ) + graph_tenquadrillion_component = pynini.union(((graph_thousand_component + tenquadrillion) | tenquadrillion_alt), pynutil.insert("0000")) graph_tenquadrillion_component += graph_trillion_component + graph = pynini.union( ### From biggest unit to smallest, everything is included - graph_tenquadrillion_component - | graph_zero + graph_tenquadrillion_component| + graph_zero ) leading_zero = ( @@ -99,18 +85,16 @@ def __init__(self): ) graph_nonzero = graph @ leading_zero graph = pynini.union(graph_nonzero, graph_zero) - + graph = graph @ leading_zero | graph_zero self.just_cardinals = graph - optional_sign = pynini.closure( - (pynini.cross("마이너스", 'negative: "-"') | pynini.cross("-", 'negative: "-"')) + delete_space, 0, 1 - ) + optional_sign = pynini.closure((pynini.cross("마이너스", 'negative: "-"') | pynini.cross("-", 'negative: "-"')) + delete_space,0, 1) final_graph = ( optional_sign + pynutil.insert(" ") + pynutil.insert("integer: \"") + graph + pynutil.insert("\"") ) | (pynutil.insert("integer: \"") + graph + pynutil.insert("\"")) final_graph = self.add_tokens(final_graph) - self.fst = final_graph.optimize() + self.fst = final_graph.optimize() \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py index 75e3f6f20..bb6b35d41 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py +++ 
b/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py @@ -19,9 +19,13 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.ko.graph_utils import INPUT_LOWER_CASED, GraphFst, generator_main from nemo_text_processing.inverse_text_normalization.ko.taggers.cardinal import CardinalFst from nemo_text_processing.inverse_text_normalization.ko.taggers.word import WordFst +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import ( + INPUT_LOWER_CASED, + GraphFst, + generator_main, +) class ClassifyFst(GraphFst): @@ -58,8 +62,8 @@ def __init__( cardinal = CardinalFst() cardinal_graph = cardinal.fst word_graph = WordFst().fst - classify = pynutil.add_weight(cardinal_graph, 1.1) | pynutil.add_weight(word_graph, 100) - + classify = (pynutil.add_weight(cardinal_graph, 1.1)| pynutil.add_weight(word_graph, 100)) + token = pynutil.insert("tokens { ") + classify + pynutil.insert(" } ") tagger = pynini.closure(token, 1) @@ -67,4 +71,4 @@ def __init__( if far_file: generator_main(far_file, {"tokenize_and_classify": self.fst}) - logging.info(f"ClassifyFst grammars are saved to {far_file}.") + logging.info(f"ClassifyFst grammars are saved to {far_file}.") \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/word.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/word.py index 0e4dbb93c..0d6ccd5c5 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/word.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/word.py @@ -27,5 +27,6 @@ class WordFst(GraphFst): def __init__(self): super().__init__(name="word", kind="classify") - word = pynutil.insert("name: \"") + pynini.closure(NEMO_NOT_SPACE, 1) + pynutil.insert("\"") + word = pynutil.insert( + "name: \"") + pynini.closure(NEMO_NOT_SPACE, 1) + pynutil.insert("\"") self.fst = word.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/ko/utils.py b/nemo_text_processing/inverse_text_normalization/ko/utils.py index d198c3835..0222cc0b8 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/utils.py +++ b/nemo_text_processing/inverse_text_normalization/ko/utils.py @@ -15,6 +15,9 @@ import os + def get_abs_path(rel_path): return os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path + + diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/__init__.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/__init__.py index b8e634eef..341a77c5b 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/__init__.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/__init__.py @@ -11,10 +11,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
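# Aside (illustrative, not part of the patch): in the classify union
# restored above, pynutil.add_weight attaches tropical weights and
# shortest-path search keeps the cheapest parse, so the cardinal grammar
# (weight 1.1) is strongly preferred over the catch-all word path
# (weight 100) whenever both grammars accept the input.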
-<<<<<<< HEAD -======= - -from nemo_text_processing.inverse_text_normalization.ko.taggers.tokenize_and_classify import ClassifyFst -from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize import VerbalizeFst -from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize_final import VerbalizeFinalFst ->>>>>>> 77da79d12b1378502cc2b382cd6933b02e7c2545 diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/cardinal.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/cardinal.py index fb9a76d8e..1800a6dc8 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/cardinal.py @@ -15,7 +15,11 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import ( + NEMO_NOT_QUOTE, + GraphFst, + delete_space, +) class CardinalFst(GraphFst): @@ -30,17 +34,21 @@ def __init__(self): pynutil.delete("negative:") + delete_space + pynutil.delete("\"") - + pynini.accep("-") + + pynini.accep("-") + pynutil.delete("\"") ) optional_sign_output = pynini.closure(negative_sign + delete_space, 0, 1) - digits_from_tag = pynini.closure(NEMO_NOT_QUOTE, 1) + digits_from_tag = pynini.closure(NEMO_NOT_QUOTE, 1) integer_cardinal = ( - pynutil.delete("integer:") + delete_space + pynutil.delete("\"") + digits_from_tag + pynutil.delete("\"") + pynutil.delete("integer:") + + delete_space + + pynutil.delete("\"") + + digits_from_tag + + pynutil.delete("\"") ) graph = integer_cardinal final_graph = optional_sign_output + graph - self.fst = self.delete_tokens(final_graph).optimize() + self.fst = self.delete_tokens(final_graph).optimize() \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py index d8851e206..9d750d757 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py @@ -13,9 +13,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
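# Aside: a rough plain-Python equivalent of what the cardinal verbalizer
# above does -- strip the tagger's serialization back to a signed digit
# string. The regexes assume the exact tag layout the tagger emits
# (negative before integer); a sketch only, not part of the patch.
import re

def verbalize_cardinal(tag: str) -> str:
    # 'negative: "-" integer: "15000"' -> "-15000"
    sign = "-" if re.search(r'negative:\s*"-"', tag) else ""
    integer = re.search(r'integer:\s*"([^"]*)"', tag)
    return sign + (integer.group(1) if integer else "")

assert verbalize_cardinal('integer: "123"') == "123"
assert verbalize_cardinal('negative: "-" integer: "15000"') == "-15000"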
-from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.cardinal import CardinalFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.word import WordFst +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst class VerbalizeFst(GraphFst): @@ -30,6 +30,7 @@ def __init__(self): cardinal = CardinalFst() cardinal_graph = cardinal.fst word_graph = WordFst().fst - - graph = cardinal_graph | word_graph + + graph = (cardinal_graph|word_graph) self.fst = graph + diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py index 09c917d00..8d40d2804 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py @@ -18,16 +18,8 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst, delete_space, generator_main from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize import VerbalizeFst - -<<<<<<< HEAD -from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst, delete_space, generator_main - -======= -from nemo_text_processing.inverse_text_normalization.ko.verbalizers.word import WordFst - ->>>>>>> 77da79d12b1378502cc2b382cd6933b02e7c2545 +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst, generator_main, delete_space class VerbalizeFinalFst(GraphFst): @@ -35,7 +27,6 @@ class VerbalizeFinalFst(GraphFst): Finite state transducer that verbalizes an entire sentence, e.g. 
tokens { name: "its" } tokens { time { hours: "12" minutes: "30" } } tokens { name: "now" } -> its 12:30 now """ - def __init__(self, deterministic: bool = True, cache_dir: str = None, overwrite_cache: bool = False): super().__init__(name="verbalize_final", kind="verbalize", deterministic=deterministic) far_file = None diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/word.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/word.py index ecf62bfe3..a423d5d0c 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/word.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/word.py @@ -18,6 +18,7 @@ from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_NOT_QUOTE, GraphFst + class WordFst(GraphFst): ''' tokens { name: "一" } -> 一 diff --git a/nemo_text_processing/inverse_text_normalization/run_evaluate.py b/nemo_text_processing/inverse_text_normalization/run_evaluate.py index 133474940..7bfdd3399 100644 --- a/nemo_text_processing/inverse_text_normalization/run_evaluate.py +++ b/nemo_text_processing/inverse_text_normalization/run_evaluate.py @@ -35,7 +35,7 @@ def parse_args(): parser.add_argument( "--lang", help="language", - choices=["ar", "de", "en", "es", "es_en", "fr", "hi", "hy", "mr", "pt", "ru", "sv", "vi", "zh", "ja", "ko"], + choices=["ar", "de", "en", "es", "es_en", "fr", "hi", "hy", "mr", "pt", "ru", "sv", "vi", "zh", "ja","ko"], default="en", type=str, ) diff --git a/tests/nemo_text_processing/ko/test_cardinal.py b/tests/nemo_text_processing/ko/test_cardinal.py index ff5950f2a..872a5aa2a 100644 --- a/tests/nemo_text_processing/ko/test_cardinal.py +++ b/tests/nemo_text_processing/ko/test_cardinal.py @@ -28,12 +28,4 @@ class TestCardinal: @pytest.mark.unit def test_denorm(self, test_input, expected): pred = self.inverse_normalizer_ko.inverse_normalize(test_input, verbose=False) -<<<<<<< HEAD - assert pred == expected -======= - assert pred == expected - - normalizer_with_audio_ko = ( - NormalizerWithAudio(lang='ko', cache_dir=CACHE_DIR, overwrite_cache=False) if RUN_AUDIO_BASED_TESTS else None - ) ->>>>>>> 77da79d12b1378502cc2b382cd6933b02e7c2545 + assert pred == expected \ No newline at end of file diff --git a/tools/text_processing_deployment/pynini_export.py b/tools/text_processing_deployment/pynini_export.py index d1ba34a37..0df099774 100644 --- a/tools/text_processing_deployment/pynini_export.py +++ b/tools/text_processing_deployment/pynini_export.py @@ -106,7 +106,7 @@ def parse_args(): 'mr', 'ja', 'rw', - 'ko', + 'ko' ], type=str, default='en', From a5164dc157fdfd6af8aeca449eb7875c80ba6aae Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 16 May 2025 20:55:36 +0000 Subject: [PATCH 6/8] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../inverse_normalize.py | 4 +-- .../ko/taggers/cardinal.py | 36 ++++++++++++------- .../ko/taggers/tokenize_and_classify.py | 12 +++---- .../ko/taggers/word.py | 3 +- .../inverse_text_normalization/ko/utils.py | 3 -- .../ko/verbalizers/cardinal.py | 18 +++------- .../ko/verbalizers/verbalize.py | 7 ++-- .../ko/verbalizers/verbalize_final.py | 3 +- .../ko/verbalizers/word.py | 1 - .../run_evaluate.py | 2 +- .../nemo_text_processing/ko/test_cardinal.py | 2 +- .../pynini_export.py | 2 +- 12 files changed, 43 insertions(+), 50 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/inverse_normalize.py 
b/nemo_text_processing/inverse_text_normalization/inverse_normalize.py index e505a8ad0..acda8b7f9 100644 --- a/nemo_text_processing/inverse_text_normalization/inverse_normalize.py +++ b/nemo_text_processing/inverse_text_normalization/inverse_normalize.py @@ -135,7 +135,7 @@ def __init__( from nemo_text_processing.inverse_text_normalization.ko.taggers.tokenize_and_classify import ClassifyFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize_final import ( VerbalizeFinalFst, - ) + ) self.tagger = ClassifyFst( cache_dir=cache_dir, whitelist=whitelist, overwrite_cache=overwrite_cache, input_case=input_case @@ -180,7 +180,7 @@ def parse_args(): parser.add_argument( "--language", help="language", - choices=['en', 'de', 'es', 'pt', 'ru', 'fr', 'sv', 'vi', 'ar', 'es_en', 'zh', 'hi', 'hy', 'mr', 'ja','ko'], + choices=['en', 'de', 'es', 'pt', 'ru', 'fr', 'sv', 'vi', 'ar', 'es_en', 'zh', 'hi', 'hy', 'mr', 'ja', 'ko'], default="en", type=str, ) diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py index 7253019f0..14172b4e9 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py @@ -19,6 +19,7 @@ from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_DIGIT, GraphFst, delete_space from nemo_text_processing.inverse_text_normalization.ko.utils import get_abs_path + class CardinalFst(GraphFst): """ Finite state transducer for classifying cardinals @@ -40,7 +41,7 @@ def __init__(self): graph_ten_component = pynini.union((graph_digit + ten) | ten_alt, pynutil.insert("0")) ### Responsible for the first digit of number. 
ex) 1,2,3,4,5,,, graph_ten_component += graph_digit | pynutil.insert("0") - + hundred = pynutil.delete("백") hundred_alt = pynini.cross("백", "1") graph_hundred_component = pynini.union(((graph_digit + hundred) | hundred_alt), pynutil.insert("0")) @@ -55,29 +56,36 @@ def __init__(self): tenthousand_alt = pynini.cross("만", "1") ### "만" can express next four digits of numbers until the next unit "억", so insert "0000" to allocate four digit worth of space ### From "만", keep adding four digits and graph_thousand_component(0000-9999), because Korean units increase every four digits - graph_tenthousand_component = pynini.union(((graph_thousand_component + tenthousand) | tenthousand_alt), pynutil.insert("0000")) + graph_tenthousand_component = pynini.union( + ((graph_thousand_component + tenthousand) | tenthousand_alt), pynutil.insert("0000") + ) graph_tenthousand_component += graph_thousand_component hundredmillion = pynutil.delete("억") hundredmillion_alt = pynini.cross("억", "1") - graph_hundredmillion_component = pynini.union(((graph_thousand_component + hundredmillion) | hundredmillion_alt), pynutil.insert("0000")) - graph_hundredmillion_component += graph_tenthousand_component - + graph_hundredmillion_component = pynini.union( + ((graph_thousand_component + hundredmillion) | hundredmillion_alt), pynutil.insert("0000") + ) + graph_hundredmillion_component += graph_tenthousand_component + trillion = pynutil.delete("조") trillion_alt = pynini.cross("조", "1") - graph_trillion_component = pynini.union(((graph_thousand_component + trillion) | trillion_alt), pynutil.insert("0000")) + graph_trillion_component = pynini.union( + ((graph_thousand_component + trillion) | trillion_alt), pynutil.insert("0000") + ) graph_trillion_component += graph_hundredmillion_component tenquadrillion = pynutil.delete("경") tenquadrillion_alt = pynini.cross("경", "1") - graph_tenquadrillion_component = pynini.union(((graph_thousand_component + tenquadrillion) | tenquadrillion_alt), pynutil.insert("0000")) + graph_tenquadrillion_component = pynini.union( + ((graph_thousand_component + tenquadrillion) | tenquadrillion_alt), pynutil.insert("0000") + ) graph_tenquadrillion_component += graph_trillion_component - graph = pynini.union( ### From biggest unit to smallest, everything is included - graph_tenquadrillion_component| - graph_zero + graph_tenquadrillion_component + | graph_zero ) leading_zero = ( @@ -85,16 +93,18 @@ def __init__(self): ) graph_nonzero = graph @ leading_zero graph = pynini.union(graph_nonzero, graph_zero) - + graph = graph @ leading_zero | graph_zero self.just_cardinals = graph - optional_sign = pynini.closure((pynini.cross("마이너스", 'negative: "-"') | pynini.cross("-", 'negative: "-"')) + delete_space,0, 1) + optional_sign = pynini.closure( + (pynini.cross("마이너스", 'negative: "-"') | pynini.cross("-", 'negative: "-"')) + delete_space, 0, 1 + ) final_graph = ( optional_sign + pynutil.insert(" ") + pynutil.insert("integer: \"") + graph + pynutil.insert("\"") ) | (pynutil.insert("integer: \"") + graph + pynutil.insert("\"")) final_graph = self.add_tokens(final_graph) - self.fst = final_graph.optimize() \ No newline at end of file + self.fst = final_graph.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py index bb6b35d41..75e3f6f20 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py +++ 
b/nemo_text_processing/inverse_text_normalization/ko/taggers/tokenize_and_classify.py @@ -19,13 +19,9 @@ import pynini from pynini.lib import pynutil +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import INPUT_LOWER_CASED, GraphFst, generator_main from nemo_text_processing.inverse_text_normalization.ko.taggers.cardinal import CardinalFst from nemo_text_processing.inverse_text_normalization.ko.taggers.word import WordFst -from nemo_text_processing.inverse_text_normalization.ko.graph_utils import ( - INPUT_LOWER_CASED, - GraphFst, - generator_main, -) class ClassifyFst(GraphFst): @@ -62,8 +58,8 @@ def __init__( cardinal = CardinalFst() cardinal_graph = cardinal.fst word_graph = WordFst().fst - classify = (pynutil.add_weight(cardinal_graph, 1.1)| pynutil.add_weight(word_graph, 100)) - + classify = pynutil.add_weight(cardinal_graph, 1.1) | pynutil.add_weight(word_graph, 100) + token = pynutil.insert("tokens { ") + classify + pynutil.insert(" } ") tagger = pynini.closure(token, 1) @@ -71,4 +67,4 @@ def __init__( if far_file: generator_main(far_file, {"tokenize_and_classify": self.fst}) - logging.info(f"ClassifyFst grammars are saved to {far_file}.") \ No newline at end of file + logging.info(f"ClassifyFst grammars are saved to {far_file}.") diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/word.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/word.py index 0d6ccd5c5..0e4dbb93c 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/word.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/word.py @@ -27,6 +27,5 @@ class WordFst(GraphFst): def __init__(self): super().__init__(name="word", kind="classify") - word = pynutil.insert( - "name: \"") + pynini.closure(NEMO_NOT_SPACE, 1) + pynutil.insert("\"") + word = pynutil.insert("name: \"") + pynini.closure(NEMO_NOT_SPACE, 1) + pynutil.insert("\"") self.fst = word.optimize() diff --git a/nemo_text_processing/inverse_text_normalization/ko/utils.py b/nemo_text_processing/inverse_text_normalization/ko/utils.py index 0222cc0b8..d198c3835 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/utils.py +++ b/nemo_text_processing/inverse_text_normalization/ko/utils.py @@ -15,9 +15,6 @@ import os - def get_abs_path(rel_path): return os.path.dirname(os.path.abspath(__file__)) + '/' + rel_path - - diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/cardinal.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/cardinal.py index 1800a6dc8..fb9a76d8e 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/cardinal.py @@ -15,11 +15,7 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.inverse_text_normalization.ko.graph_utils import ( - NEMO_NOT_QUOTE, - GraphFst, - delete_space, -) +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space class CardinalFst(GraphFst): @@ -34,21 +30,17 @@ def __init__(self): pynutil.delete("negative:") + delete_space + pynutil.delete("\"") - + pynini.accep("-") + + pynini.accep("-") + pynutil.delete("\"") ) optional_sign_output = pynini.closure(negative_sign + delete_space, 0, 1) - digits_from_tag = pynini.closure(NEMO_NOT_QUOTE, 1) + digits_from_tag = pynini.closure(NEMO_NOT_QUOTE, 1) integer_cardinal = ( - pynutil.delete("integer:") - + delete_space - + pynutil.delete("\"") - + digits_from_tag - + 
pynutil.delete("\"") + pynutil.delete("integer:") + delete_space + pynutil.delete("\"") + digits_from_tag + pynutil.delete("\"") ) graph = integer_cardinal final_graph = optional_sign_output + graph - self.fst = self.delete_tokens(final_graph).optimize() \ No newline at end of file + self.fst = self.delete_tokens(final_graph).optimize() diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py index 9d750d757..d8851e206 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize.py @@ -13,9 +13,9 @@ # See the License for the specific language governing permissions and # limitations under the License. +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.cardinal import CardinalFst from nemo_text_processing.inverse_text_normalization.ko.verbalizers.word import WordFst -from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst class VerbalizeFst(GraphFst): @@ -30,7 +30,6 @@ def __init__(self): cardinal = CardinalFst() cardinal_graph = cardinal.fst word_graph = WordFst().fst - - graph = (cardinal_graph|word_graph) + + graph = cardinal_graph | word_graph self.fst = graph - diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py index 8d40d2804..17f547740 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/verbalize_final.py @@ -18,8 +18,8 @@ import pynini from pynini.lib import pynutil +from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst, delete_space, generator_main from nemo_text_processing.inverse_text_normalization.ko.verbalizers.verbalize import VerbalizeFst -from nemo_text_processing.inverse_text_normalization.ko.graph_utils import GraphFst, generator_main, delete_space class VerbalizeFinalFst(GraphFst): @@ -27,6 +27,7 @@ class VerbalizeFinalFst(GraphFst): Finite state transducer that verbalizes an entire sentence, e.g. 
tokens { name: "its" } tokens { time { hours: "12" minutes: "30" } } tokens { name: "now" } -> its 12:30 now """ + def __init__(self, deterministic: bool = True, cache_dir: str = None, overwrite_cache: bool = False): super().__init__(name="verbalize_final", kind="verbalize", deterministic=deterministic) far_file = None diff --git a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/word.py b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/word.py index a423d5d0c..ecf62bfe3 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/verbalizers/word.py +++ b/nemo_text_processing/inverse_text_normalization/ko/verbalizers/word.py @@ -18,7 +18,6 @@ from nemo_text_processing.inverse_text_normalization.ko.graph_utils import NEMO_NOT_QUOTE, GraphFst - class WordFst(GraphFst): ''' tokens { name: "一" } -> 一 diff --git a/nemo_text_processing/inverse_text_normalization/run_evaluate.py b/nemo_text_processing/inverse_text_normalization/run_evaluate.py index 7bfdd3399..133474940 100644 --- a/nemo_text_processing/inverse_text_normalization/run_evaluate.py +++ b/nemo_text_processing/inverse_text_normalization/run_evaluate.py @@ -35,7 +35,7 @@ def parse_args(): parser.add_argument( "--lang", help="language", - choices=["ar", "de", "en", "es", "es_en", "fr", "hi", "hy", "mr", "pt", "ru", "sv", "vi", "zh", "ja","ko"], + choices=["ar", "de", "en", "es", "es_en", "fr", "hi", "hy", "mr", "pt", "ru", "sv", "vi", "zh", "ja", "ko"], default="en", type=str, ) diff --git a/tests/nemo_text_processing/ko/test_cardinal.py b/tests/nemo_text_processing/ko/test_cardinal.py index 872a5aa2a..f95d74107 100644 --- a/tests/nemo_text_processing/ko/test_cardinal.py +++ b/tests/nemo_text_processing/ko/test_cardinal.py @@ -28,4 +28,4 @@ class TestCardinal: @pytest.mark.unit def test_denorm(self, test_input, expected): pred = self.inverse_normalizer_ko.inverse_normalize(test_input, verbose=False) - assert pred == expected \ No newline at end of file + assert pred == expected diff --git a/tools/text_processing_deployment/pynini_export.py b/tools/text_processing_deployment/pynini_export.py index 0df099774..d1ba34a37 100644 --- a/tools/text_processing_deployment/pynini_export.py +++ b/tools/text_processing_deployment/pynini_export.py @@ -106,7 +106,7 @@ def parse_args(): 'mr', 'ja', 'rw', - 'ko' + 'ko', ], type=str, default='en', From 7842d1324e32a40bd522b99eba726f962dafc742 Mon Sep 17 00:00:00 2001 From: hmlee245 Date: Fri, 23 May 2025 16:31:36 -0700 Subject: [PATCH 7/8] Third draft of Korean ITN work.
Mainly fixing minor issues and adding test cases Signed-off-by: hmlee245 --- Jenkinsfile | 22 +++++++++++++++++++ .../ko/data/numbers/thousands.tsv | 11 ---------- .../ko/data/numbers/zero.tsv | 1 + .../ko/taggers/cardinal.py | 8 +++---- .../test_cases_cardinal.txt | 12 +++++++++- 5 files changed, 37 insertions(+), 17 deletions(-) delete mode 100644 nemo_text_processing/inverse_text_normalization/ko/data/numbers/thousands.tsv create mode 100644 nemo_text_processing/inverse_text_normalization/ko/data/numbers/zero.tsv diff --git a/Jenkinsfile b/Jenkinsfile index c94c107c6..32375f28f 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -28,6 +28,7 @@ pipeline { MR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-1' JA_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-17-24-1' HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/04-03-25-1' + KO_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/05-21-25-0' DEFAULT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' } stages { @@ -318,6 +319,22 @@ pipeline { } } } + stage('L0: Create KO ITN Grammars') { + when { + anyOf { + branch 'main' + changeRequest target: 'main' + } + } + failFast true + parallel { + stage('L0: KO ITN grammars') { + steps { + sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --lang=ko --text="100" --cache_dir ${KO_TN_CACHE}' + } + } + } + } // L1 Tests starts here @@ -406,6 +423,11 @@ pipeline { sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/hy/ -m "not pleasefixme" --cpu --tn_cache_dir ${HY_TN_CACHE}' } } + stage('L1: Run all KO TN/ITN tests (restore grammars from cache)') { + steps { + sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/ko/ -m "not pleasefixme" --cpu --tn_cache_dir ${KO_TN_CACHE}' + } + } } } diff --git a/nemo_text_processing/inverse_text_normalization/ko/data/numbers/thousands.tsv b/nemo_text_processing/inverse_text_normalization/ko/data/numbers/thousands.tsv deleted file mode 100644 index 541752211..000000000 --- a/nemo_text_processing/inverse_text_normalization/ko/data/numbers/thousands.tsv +++ /dev/null @@ -1,11 +0,0 @@ -억 -조 -경 -해 -자 -양 -구 -간 -정 -재 -극 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/ko/data/numbers/zero.tsv b/nemo_text_processing/inverse_text_normalization/ko/data/numbers/zero.tsv new file mode 100644 index 000000000..cbf967001 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/ko/data/numbers/zero.tsv @@ -0,0 +1 @@ +영 0 \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py index 7253019f0..a1cf1012f 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py @@ -31,7 +31,7 @@ class CardinalFst(GraphFst): def __init__(self): super().__init__(name="cardinal", kind="classify") - graph_zero = pynini.cross("영", "0") + graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv")) graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")) ten = pynutil.delete("십") @@ -85,15 +85,13 @@ def __init__(self): ) graph_nonzero = graph @ leading_zero graph = pynini.union(graph_nonzero, graph_zero) - - graph = graph @ leading_zero | graph_zero self.just_cardinals = graph - optional_sign = pynini.closure((pynini.cross("마이너스", 'negative: "-"') | pynini.cross("-", 
'negative: "-"')) + delete_space,0, 1) + negative_sign = pynini.closure((pynini.cross("마이너스", 'negative: "-"') | pynini.cross("-", 'negative: "-"')) + delete_space,0, 1) final_graph = ( - optional_sign + pynutil.insert(" ") + pynutil.insert("integer: \"") + graph + pynutil.insert("\"") + negative_sign + pynutil.insert(" ") + pynutil.insert("integer: \"") + graph + pynutil.insert("\"") ) | (pynutil.insert("integer: \"") + graph + pynutil.insert("\"")) final_graph = self.add_tokens(final_graph) diff --git a/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_cardinal.txt b/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_cardinal.txt index 007273e5e..4f64116e5 100644 --- a/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_cardinal.txt +++ b/tests/nemo_text_processing/ko/data_inverse_text_normalization/test_cases_cardinal.txt @@ -24,4 +24,14 @@ 백오십억칠천만~15070000000 오천억~500000000000 일조~1000000000000 -이조오천억~2500000000000 \ No newline at end of file +이조오천억~2500000000000 +영영영~000 +영영백이십삼~00123 +만천~11000 +만천백십일~11111 +경~10000000000000000 +마이너스일~-1 +마이너스 일~-1 +- 일~-1 +마이너스일억사천이백칠십구만구천팔십이~-142799082 +마이너스 칠백삼십오~-735 \ No newline at end of file From ff52238330b0fe3f0974a3a883127c7eeba6624d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 27 May 2025 22:53:51 +0000 Subject: [PATCH 8/8] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../inverse_text_normalization/ko/taggers/cardinal.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py b/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py index 1c78f6000..13d6271df 100644 --- a/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py +++ b/nemo_text_processing/inverse_text_normalization/ko/taggers/cardinal.py @@ -96,7 +96,9 @@ def __init__(self): self.just_cardinals = graph - negative_sign = pynini.closure((pynini.cross("마이너스", 'negative: "-"') | pynini.cross("-", 'negative: "-"')) + delete_space,0, 1) + negative_sign = pynini.closure( + (pynini.cross("마이너스", 'negative: "-"') | pynini.cross("-", 'negative: "-"')) + delete_space, 0, 1 + ) final_graph = ( negative_sign + pynutil.insert(" ") + pynutil.insert("integer: \"") + graph + pynutil.insert("\"")