Skip to content

Commit e606249

Browse files
committed
Add test files and updates for Korean TN
Signed-off-by: Jinwoo Bae <34386414+bbae0312@users.noreply.github.com>
1 parent d9adadc commit e606249

File tree

28 files changed

+952
-6
lines changed

28 files changed

+952
-6
lines changed
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
from nemo_text_processing.text_normalization.en.taggers.tokenize_and_classify import ClassifyFst
16+
from nemo_text_processing.text_normalization.en.verbalizers.verbalize import VerbalizeFst
17+
from nemo_text_processing.text_normalization.en.verbalizers.verbalize_final import VerbalizeFinalFst
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
1
2+
2
3+
3
4+
4
5+
5
6+
6
7+
7
8+
8
9+
9
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
100
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
10
2+
11 십일
3+
12 십이
4+
13 십삼
5+
14 십사
6+
15 십오
7+
16 십육
8+
17 십칠
9+
18 십팔
10+
19 십구
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
10000 만
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
1000
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
20 이십
2+
21 이십일
3+
22 이십이
4+
23 이십삼
5+
24 이십사
6+
25 이십오
7+
26 이십육
8+
27 이십칠
9+
28 이십팔
10+
29 이십구
11+
30 삼십
12+
31 삼십일
13+
32 삼십이
14+
33 삼십삼
15+
34 삼십사
16+
35 삼십오
17+
36 삼십육
18+
37 삼십칠
19+
38 삼십팔
20+
39 삼십구
21+
40 사십
22+
41 사십일
23+
42 사십이
24+
43 사십삼
25+
44 사십사
26+
45 사십오
27+
46 사십육
28+
47 사십칠
29+
48 사십팔
30+
49 사십구
31+
50 오십
32+
51 오십일
33+
52 오십이
34+
53 오십삼
35+
54 오십사
36+
55 오십오
37+
56 오십육
38+
57 오십칠
39+
58 오십팔
40+
59 오십구
41+
60 육십
42+
61 육십일
43+
62 육십이
44+
63 육십삼
45+
64 육십사
46+
65 육십오
47+
66 육십육
48+
67 육십칠
49+
68 육십팔
50+
69 육십구
51+
70 칠십
52+
71 칠십일
53+
72 칠십이
54+
73 칠십삼
55+
74 칠십사
56+
75 칠십오
57+
76 칠십육
58+
77 칠십칠
59+
78 칠십팔
60+
79 칠십구
61+
80 팔십
62+
81 팔십일
63+
82 팔십이
64+
83 팔십삼
65+
84 팔십사
66+
85 팔십오
67+
86 팔십육
68+
87 팔십칠
69+
88 팔십팔
70+
89 팔십구
71+
90 구십
72+
91 구십일
73+
92 구십이
74+
93 구십삼
75+
94 구십사
76+
95 구십오
77+
96 구십육
78+
97 구십칠
79+
98 구십팔
80+
99 구십구
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
0
Lines changed: 173 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,173 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
# Copyright 2015 and onwards Google, Inc.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
import os
17+
import string
18+
from pathlib import Path
19+
from typing import Dict
20+
21+
import pynini
22+
from pynini import Far
23+
from pynini.export import export
24+
from pynini.lib import byte, pynutil, utf8
25+
26+
from nemo_text_processing.text_normalization.en.utils import get_abs_path, load_labels
27+
from nemo_text_processing.utils.logging import logger
28+
29+
# --- Character classes ---------------------------------------------------
# One valid UTF-8 character, as provided by pynini.lib.utf8.
NEMO_CHAR = utf8.VALID_UTF8_CHAR

NEMO_DIGIT = byte.DIGIT
# Every code point from '가' through '힣', inclusive.
NEMO_ALPHA = pynini.union(*map(chr, range(ord('가'), ord('힣') + 1))).optimize()
NEMO_ALNUM = (NEMO_DIGIT | NEMO_ALPHA).optimize()
NEMO_HEX = pynini.union(*string.hexdigits).optimize()
NEMO_SPACE = " "
# Space, tab, newline, carriage return and the non-breaking space (U+00A0).
NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", "\u00a0").optimize()
NEMO_NOT_SPACE = pynini.difference(NEMO_CHAR, NEMO_WHITE_SPACE).optimize()
NEMO_NOT_QUOTE = pynini.difference(NEMO_CHAR, r'"').optimize()

# string.punctuation characters are escaped before being compiled into the union.
NEMO_PUNCT = pynini.union(*[pynini.escape(mark) for mark in string.punctuation]).optimize()
NEMO_GRAPH = (NEMO_ALNUM | NEMO_PUNCT).optimize()

# Zero or more characters of any kind.
NEMO_SIGMA = pynini.closure(NEMO_CHAR)

# --- Whitespace helpers --------------------------------------------------
# Delete any run of whitespace (possibly empty).
delete_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE))
# Delete at most one whitespace character.
delete_zero_or_one_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE, 0, 1))
insert_space = pynutil.insert(" ")
# Map a run of one or more whitespace characters onto a single space.
delete_extra_space = pynini.cross(pynini.closure(NEMO_WHITE_SPACE, 1), " ")
# Delete any sequence of ` preserve_order: true` / ` field_order: "<char>"` fields.
delete_preserve_order = pynini.closure(
    pynini.union(
        pynutil.delete(" preserve_order: true"),
        pynutil.delete(' field_order: "') + NEMO_NOT_QUOTE + pynutil.delete('"'),
    )
)


# Common string literals; expand as you see fit.
username_string, domain_string, protocol_string = "username", "domain", "protocol"
double_quotes = '"'
slash, double_slash, triple_slash = "/", "//", "///"
file = "file"
period, at, colon = ".", "@", ":"
https, http, www = "https", "http", "www"
70+
71+
72+
def generator_main(file_name: str, graphs: Dict[str, "pynini.FstLike"]):
    """
    Exports graphs as an OpenFst finite state archive (FAR) file with the given file name.

    Each graph is optimized and stored in the archive under its rule name.

    Args:
        file_name: exported file name
        graphs: Mapping of a rule name and Pynini WFST graph to be exported
    """
    exporter = export.Exporter(file_name)
    try:
        for rule, graph in graphs.items():
            exporter[rule] = graph.optimize()
    finally:
        # Always release the archive writer, even when optimizing or writing a
        # graph fails; the exception still propagates to the caller.
        exporter.close()
    # Only reached when every graph was written successfully.
    logger.info(f"Created {file_name}")
85+
86+
87+
def convert_space(fst) -> "pynini.FstLike":
    """
    Replaces every breaking space in the output of ``fst`` with a non-breaking space (U+00A0).

    Intended for tagger grammars that transduce token values inside quotes,
    e.g. name: "hello kitty". The extra composition makes the transducer
    significantly slower, so apply it only where spaces can actually occur
    within quotes.

    Args:
        fst: input fst

    Returns output fst where breaking spaces are converted to non breaking spaces
    """
    space_to_nbsp = pynini.cross(NEMO_SPACE, "\u00a0")
    # Empty left/right contexts: the rewrite applies at every position.
    rewrite_all_spaces = pynini.cdrewrite(space_to_nbsp, "", "", NEMO_SIGMA)
    return fst @ rewrite_all_spaces
99+
100+
101+
def string_map_cased(input_file: str, input_case: str = "lower_cased"):
    """
    Builds an inverted, optimized string-map FST from a label file.

    Args:
        input_file: path of the label file, read with ``load_labels``
        input_case: accepted for interface compatibility; this implementation does not read it

    Returns:
        The string map of the loaded labels, inverted and optimized
    """
    label_map = pynini.string_map(load_labels(input_file))
    return label_map.invert().optimize()
105+
106+
107+
class GraphFst:
    """
    Base class for all grammar fsts.

    Args:
        name: name of grammar class
        kind: either 'classify' or 'verbalize'
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
    """

    def __init__(self, name: str, kind: str, deterministic: bool = True):
        self.name = name
        self.kind = kind
        self._fst = None
        self.deterministic = deterministic

        # Pre-compiled grammar location: <this module's directory>/grammars/<kind>/<name>.far.
        # Built with pathlib so the path stays relative to this module even when
        # the module's directory component is empty.
        self.far_path = Path(__file__).parent / "grammars" / kind / f"{name}.far"
        if self.far_exist():
            # Reuse the archived grammar instead of leaving the fst unset.
            self._fst = Far(self.far_path, mode="r", arc_type="standard", far_type="default").get_fst()

    def far_exist(self) -> bool:
        """
        Returns true if the FAR file for this grammar exists on disk
        """
        return self.far_path.exists()

    @property
    def fst(self) -> "pynini.FstLike":
        """The grammar fst; None until loaded from a FAR file or assigned."""
        return self._fst

    @fst.setter
    def fst(self, fst):
        self._fst = fst

    def add_tokens(self, fst) -> "pynini.FstLike":
        """
        Wraps class name around to given fst, producing ``<name> { ... }``

        Args:
            fst: input fst

        Returns:
            Fst: fst
        """
        return pynutil.insert(f"{self.name} {{ ") + fst + pynutil.insert(" }")

    def delete_tokens(self, fst) -> "pynini.FstLike":
        """
        Deletes class name wrap around output of given fst

        The class name, the surrounding braces and any whitespace around them
        are removed; afterwards non-breaking spaces (U+00A0) in the output are
        turned back into regular spaces.

        Args:
            fst: input fst

        Returns:
            Fst: fst
        """
        res = (
            pynutil.delete(self.name)
            + delete_space
            + pynutil.delete("{")
            + delete_space
            + fst
            + delete_space
            + pynutil.delete("}")
        )
        return res @ pynini.cdrewrite(pynini.cross("\u00a0", " "), "", "", NEMO_SIGMA)
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.

0 commit comments

Comments
 (0)