
Commit e7e78c6

Add Korean TN support for cardinal numbers and postprocessing (#285)
* Add Korean TN support for cardinal numbers and postprocessing
  Signed-off-by: Jinwoo Bae <bbae7050@gmail.com>
* [pre-commit.ci] auto fixes from pre-commit.com hooks
  for more information, see https://pre-commit.ci
* Refactor Korean TN cardinal and postprocessing logic based on review feedback
  Signed-off-by: Jinwoo Bae <bbae7050@gmail.com>
* [pre-commit.ci] auto fixes from pre-commit.com hooks
  for more information, see https://pre-commit.ci
* Add __init__.py to ko/data directory
  Signed-off-by: Jinwoo Bae <bbae7050@gmail.com>
* Update KO_TN_CACHE to trigger Korean CI run
  Signed-off-by: Jinwoo Bae <bbae7050@gmail.com>

---------

Signed-off-by: Jinwoo Bae <bbae7050@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent ac07488 commit e7e78c6

File tree

23 files changed: +1038 -1 lines changed

Jenkinsfile

Lines changed: 22 additions & 0 deletions
@@ -28,6 +28,7 @@ pipeline {
        MR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-1'
        JA_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-17-24-1'
        HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/04-22-25-0'
+       KO_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-03-25-0'
        DEFAULT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
    }
    stages {
@@ -318,6 +319,22 @@ pipeline {
                }
            }
        }
+       stage('L0: Create KO TN Grammars') {
+           when {
+               anyOf {
+                   branch 'main'
+                   changeRequest target: 'main'
+               }
+           }
+           failFast true
+           parallel {
+               stage('L0: KO TN grammars') {
+                   steps {
+                       sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --lang=ko --text="100" --cache_dir ${KO_TN_CACHE}'
+                   }
+               }
+           }
+       }


        // L1 Tests starts here
@@ -406,6 +423,11 @@ pipeline {
                        sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/hy/ -m "not pleasefixme" --cpu --tn_cache_dir ${HY_TN_CACHE}'
                    }
                }
+               stage('L1: Run all KO TN/ITN tests (restore grammars from cache)') {
+                   steps {
+                       sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/ko/ -m "not pleasefixme" --cpu --tn_cache_dir ${KO_TN_CACHE}'
+                   }
+               }
            }
        }

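The new L0 stage compiles the Korean grammar FAR cache by running normalize.py once on the input "100"; the L1 stage then restores that cache for the ko test suite. For reference, a minimal Python sketch of the same call through the library entry point; the cache path below is illustrative, not the Jenkins one:

# Sketch only: mirrors `normalize.py --lang=ko --text="100" --cache_dir ...` from the stage above.
from nemo_text_processing.text_normalization.normalize import Normalizer

# cache_dir is illustrative; grammars are compiled on first use and cached there as .far files.
normalizer = Normalizer(input_case="cased", lang="ko", cache_dir="/tmp/ko_tn_cache")
print(normalizer.normalize("100", verbose=False))  # expected Korean reading of 100, e.g. "백"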
Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from nemo_text_processing.text_normalization.en.taggers.tokenize_and_classify import ClassifyFst
+from nemo_text_processing.text_normalization.en.verbalizers.verbalize import VerbalizeFst
+from nemo_text_processing.text_normalization.en.verbalizers.verbalize_final import VerbalizeFinalFst
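The re-exported ClassifyFst/VerbalizeFst/VerbalizeFinalFst trio is the usual NeMo TN split: a tagger classifies tokens into semiotic-class strings and the verbalizers render them back to text. A rough sketch of how these classes are typically instantiated; the argument names follow the English constructors imported here and should be treated as assumptions for the Korean pipeline:

# Sketch only; not part of this commit.
from nemo_text_processing.text_normalization.en.taggers.tokenize_and_classify import ClassifyFst
from nemo_text_processing.text_normalization.en.verbalizers.verbalize_final import VerbalizeFinalFst

tagger = ClassifyFst(input_case="cased", deterministic=True, cache_dir=None, overwrite_cache=False)
verbalizer = VerbalizeFinalFst(deterministic=True)
# tagger.fst maps raw text to token strings like `cardinal { integer: "..." }`;
# verbalizer.fst maps those token strings back to the normalized surface form.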
Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,9 @@
+1
+2
+3
+4
+5
+6
+7
+8
+9
Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,9 @@
+1
+2 이십
+3 삼십
+4 사십
+5 오십
+6 육십
+7 칠십
+8 팔십
+9 구십
Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+0
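These small TSV files are the lookup data the Korean cardinal grammar draws on. A minimal sketch of how such files are typically turned into pynini transducers; the get_abs_path helper and the file names below are illustrative, since the diff view above does not show file paths:

# Sketch only: loads the digit/tens/zero data the way NeMo grammars usually do.
import os

import pynini


def get_abs_path(rel_path: str) -> str:
    # Hypothetical helper mirroring the utils other languages ship.
    return os.path.join(os.path.dirname(__file__), rel_path)


graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))  # single-column file: identity over 1-9
graph_ties = pynini.string_file(get_abs_path("data/numbers/ty.tsv"))      # e.g. "2" -> "이십", "3" -> "삼십"
graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))    # "0"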
Lines changed: 173 additions & 0 deletions
@@ -0,0 +1,173 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+# Copyright 2015 and onwards Google, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import string
+from pathlib import Path
+from typing import Dict
+
+import pynini
+from pynini import Far
+from pynini.export import export
+from pynini.lib import byte, pynutil, utf8
+
+from nemo_text_processing.text_normalization.en.utils import load_labels
+from nemo_text_processing.utils.logging import logger
+
+NEMO_CHAR = utf8.VALID_UTF8_CHAR
+
+NEMO_DIGIT = byte.DIGIT
+NEMO_ALPHA = pynini.union(*[chr(i) for i in range(ord('가'), ord('힣') + 1)]).optimize()
+NEMO_ALNUM = pynini.union(NEMO_DIGIT, NEMO_ALPHA).optimize()
+NEMO_HEX = pynini.union(*string.hexdigits).optimize()
+NEMO_SPACE = " "
+NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", "\u00a0").optimize()
+NEMO_NOT_SPACE = pynini.difference(NEMO_CHAR, NEMO_WHITE_SPACE).optimize()
+NEMO_NOT_QUOTE = pynini.difference(NEMO_CHAR, r'"').optimize()
+
+NEMO_PUNCT = pynini.union(*map(pynini.escape, string.punctuation)).optimize()
+NEMO_GRAPH = pynini.union(NEMO_ALNUM, NEMO_PUNCT).optimize()
+
+NEMO_SIGMA = pynini.closure(NEMO_CHAR)
+
+delete_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE))
+delete_zero_or_one_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE, 0, 1))
+insert_space = pynutil.insert(" ")
+delete_extra_space = pynini.cross(pynini.closure(NEMO_WHITE_SPACE, 1), " ")
+delete_preserve_order = pynini.closure(
+    pynutil.delete(" preserve_order: true")
+    | (pynutil.delete(' field_order: "') + NEMO_NOT_QUOTE + pynutil.delete('"'))
+)
+
+
+# Common string literals; expand as you see fit.
+username_string = "username"
+double_quotes = '"'
+domain_string = "domain"
+protocol_string = "protocol"
+slash = "/"
+double_slash = "//"
+triple_slash = "///"
+file = "file"
+period = "."
+at = "@"
+colon = ":"
+https = "https"
+http = "http"
+www = "www"
+
+
+def generator_main(file_name: str, graphs: Dict[str, "pynini.FstLike"]):
+    """
+    Exports graph as OpenFst finite state archive (FAR) file with given file name and rule name.
+
+    Args:
+        file_name: exported file name
+        graphs: Mapping of a rule name and Pynini WFST graph to be exported
+    """
+    exporter = export.Exporter(file_name)
+    for rule, graph in graphs.items():
+        exporter[rule] = graph.optimize()
+    exporter.close()
+    logger.info(f"Created {file_name}")
+
+
+def convert_space(fst) -> "pynini.FstLike":
+    """
+    Converts space to nonbreaking space.
+    Used only in tagger grammars for transducing token values within quotes, e.g. name: "hello kitty"
+    This is making transducer significantly slower, so only use when there could be potential spaces within quotes, otherwise leave it.
+
+    Args:
+        fst: input fst
+
+    Returns output fst where breaking spaces are converted to non breaking spaces
+    """
+    return fst @ pynini.cdrewrite(pynini.cross(NEMO_SPACE, "\u00a0"), "", "", NEMO_SIGMA)
+
+
+def string_map_cased(input_file: str, input_case: str = "lower_cased"):
+    labels = load_labels(input_file)
+    whitelist = pynini.string_map(labels).invert().optimize()
+    return whitelist
+
+
+class GraphFst:
+    """
+    Base class for all grammar fsts.
+
+    Args:
+        name: name of grammar class
+        kind: either 'classify' or 'verbalize'
+        deterministic: if True will provide a single transduction option,
+            for False multiple transduction are generated (used for audio-based normalization)
+    """
+
+    def __init__(self, name: str, kind: str, deterministic: bool = True):
+        self.name = name
+        self.kind = kind
+        self._fst = None
+        self.deterministic = deterministic
+
+        self.far_path = Path(os.path.dirname(__file__) + "/grammars/" + kind + "/" + name + ".far")
+        if self.far_exist():
+            self._fst = Far(self.far_path, mode="r", arc_type="standard", far_type="default").get_fst()
+
+    def far_exist(self) -> bool:
+        """
+        Returns true if FAR can be loaded
+        """
+        return self.far_path.exists()
+
+    @property
+    def fst(self) -> "pynini.FstLike":
+        return self._fst
+
+    @fst.setter
+    def fst(self, fst):
+        self._fst = fst
+
+    def add_tokens(self, fst) -> "pynini.FstLike":
+        """
+        Wraps class name around to given fst
+
+        Args:
+            fst: input fst
+
+        Returns:
+            Fst: fst
+        """
+        return pynutil.insert(f"{self.name} {{ ") + fst + pynutil.insert(" }")
+
+    def delete_tokens(self, fst) -> "pynini.FstLike":
+        """
+        Deletes class name wrap around output of given fst
+
+        Args:
+            fst: input fst
+
+        Returns:
+            Fst: fst
+        """
+        res = (
+            pynutil.delete(f"{self.name}")
+            + delete_space
+            + pynutil.delete("{")
+            + delete_space
+            + fst
+            + delete_space
+            + pynutil.delete("}")
+        )
+        return res @ pynini.cdrewrite(pynini.cross("\u00a0", " "), "", "", NEMO_SIGMA)
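This graph_utils module supplies the Korean-aware building blocks (NEMO_ALPHA covers the Hangul syllable range 가..힣) plus the GraphFst base class that taggers and verbalizers extend. A minimal sketch of the subclassing pattern, using a toy digit-accepting grammar rather than the commit's actual cardinal logic, and assuming the module is importable as nemo_text_processing.text_normalization.ko.graph_utils (the diff view above does not show file paths):

# Sketch only: shows the GraphFst subclassing pattern, not the real Korean cardinal grammar.
import pynini
from pynini.lib import pynutil

# Assumed import path; treat it as an illustration.
from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_DIGIT, GraphFst


class ToyCardinalFst(GraphFst):
    """Tags any digit string as a cardinal token, e.g. "100" -> cardinal { integer: "100" }."""

    def __init__(self, deterministic: bool = True):
        super().__init__(name="cardinal", kind="classify", deterministic=deterministic)
        digits = pynini.closure(NEMO_DIGIT, 1)
        graph = pynutil.insert("integer: \"") + digits + pynutil.insert("\"")
        # add_tokens wraps the graph as `cardinal { ... }` so a verbalizer can find and unwrap it.
        self.fst = self.add_tokens(graph).optimize()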
Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
