Skip to content

Commit 061a73a

Browse files
committed
Refactor Korean TN cardinal and postprocessing logic based on review feedback
1 parent: eb6a8c0 · commit: 061a73a

File tree

10 files changed: 143 additions and 188 deletions.

nemo_text_processing/text_normalization/ko/data/number/teen.tsv

Lines changed: 0 additions & 10 deletions
This file was deleted.

nemo_text_processing/text_normalization/ko/data/number/ty.tsv

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,4 @@
1+
1
12
2 이십
23
3 삼십
34
4 사십

nemo_text_processing/text_normalization/ko/graph_utils.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -23,7 +23,7 @@
2323
from pynini.export import export
2424
from pynini.lib import byte, pynutil, utf8
2525

26-
from nemo_text_processing.text_normalization.en.utils import get_abs_path, load_labels
26+
from nemo_text_processing.text_normalization.en.utils import load_labels
2727
from nemo_text_processing.utils.logging import logger
2828

2929
NEMO_CHAR = utf8.VALID_UTF8_CHAR

nemo_text_processing/text_normalization/ko/taggers/cardinal.py

Lines changed: 105 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -28,26 +28,31 @@ def __init__(self, deterministic: bool = True):
2828
graph_digit = pynini.string_file(get_abs_path("data/number/digit.tsv"))
2929

3030
digit_except_one = pynini.difference(NEMO_DIGIT, "1")
31-
digit_except_zero_one = pynini.difference(digit_except_one, "0")
31+
digit_except_zero_one = pynini.difference(digit_except_one, "0") <<<<<<< HEAD
3232

3333
graph_digit_alt = digit_except_zero_one @ graph_digit
3434
graph_ty = pynini.string_file(get_abs_path("data/number/ty.tsv"))
3535
graph_teen = pynini.string_file(get_abs_path("data/number/teen.tsv"))
36+
=======
37+
38+
graph_digit_no_zero_one = digit_except_zero_one @ graph_digit
39+
graph_ty = pynini.string_file(get_abs_path("data/number/ty.tsv"))
40+
>>>>>>> 68b18fa8 (Refactor Korean TN cardinal and postprocessing logic based on review feedback)
3641

3742
# Compose all basic number forms
38-
graph_all = (graph_ty + (graph_digit | pynutil.delete('0'))) | graph_teen | graph_digit
43+
graph_1_to_99 = (graph_ty + (graph_digit | pynutil.delete('0'))) | graph_digit
3944

4045
hundreds = NEMO_DIGIT**3
41-
graph_hundred_component = (pynini.cross('1', '백') | (graph_digit_alt + pynutil.insert('백'))) + pynini.union(
42-
pynini.closure(pynutil.delete('0')), (pynini.closure(pynutil.delete('0')) + graph_all)
46+
graph_hundred_component = (pynini.cross('1', '백') | (graph_digit_no_zero_one + pynutil.insert('백'))) + pynini.union(
47+
pynini.closure(pynutil.delete('0')), (pynini.closure(pynutil.delete('0')) + graph_1_to_99)
4348
)
4449
graph_hundred = hundreds @ graph_hundred_component
4550

4651
thousands = NEMO_DIGIT**4
47-
graph_thousand_component = (pynini.cross('1', '천') | (graph_digit_alt + pynutil.insert('천'))) + pynini.union(
52+
graph_thousand_component = (pynini.cross('1', '천') | (graph_digit_no_zero_one + pynutil.insert('천'))) + pynini.union(
4853
pynini.closure(pynutil.delete('0')),
4954
graph_hundred_component,
50-
(pynini.closure(pynutil.delete('0')) + graph_all),
55+
(pynini.closure(pynutil.delete('0')) + graph_1_to_99),
5156
)
5257
graph_thousand = thousands @ graph_thousand_component
5358

@@ -56,36 +61,44 @@ def __init__(self, deterministic: bool = True):
5661
pynini.closure(pynutil.delete('0')),
5762
graph_thousand_component,
5863
(pynutil.delete('0') + graph_hundred_component),
59-
(pynini.closure(pynutil.delete('0')) + graph_all),
64+
(pynini.closure(pynutil.delete('0')) + graph_1_to_99),
6065
)
6166
graph_ten_thousand = ten_thousands @ graph_ten_thousand_component
6267

6368
hundred_thousands = NEMO_DIGIT**6
69+
<<<<<<< HEAD
6470
graph_hundred_thousand_component = ((NEMO_DIGIT**2 @ graph_all) + pynutil.insert('만')) + pynini.union(
71+
=======
72+
graph_hundred_thousand_component = ((NEMO_DIGIT ** 2 @ graph_1_to_99) + pynutil.insert('만')) + pynini.union(
73+
>>>>>>> 68b18fa8 (Refactor Korean TN cardinal and postprocessing logic based on review feedback)
6574
pynini.closure(pynutil.delete('0')),
6675
graph_thousand_component,
6776
(pynutil.delete('0') + graph_hundred_component),
68-
(pynini.closure(pynutil.delete('0')) + graph_all),
77+
(pynini.closure(pynutil.delete('0')) + graph_1_to_99),
6978
)
7079
graph_hundred_thousand = hundred_thousands @ graph_hundred_thousand_component
7180

7281
millions = NEMO_DIGIT**7
73-
graph_million_component = ((NEMO_DIGIT**3 @ graph_hundred_component) + pynutil.insert('만')) + pynini.union(
82+
graph_million_component = ((graph_hundred) + pynutil.insert('만')) + pynini.union(
7483
pynini.closure(pynutil.delete('0')),
7584
graph_thousand_component,
7685
(pynutil.delete('0') + graph_hundred_component),
77-
(pynini.closure(pynutil.delete('0')) + graph_all),
86+
(pynini.closure(pynutil.delete('0')) + graph_1_to_99),
7887
)
7988
graph_million = millions @ graph_million_component
8089

8190
ten_millions = NEMO_DIGIT**8
91+
<<<<<<< HEAD
8292
graph_ten_million_component = (
8393
(NEMO_DIGIT**4 @ graph_thousand_component) + pynutil.insert('만')
8494
) + pynini.union(
95+
=======
96+
graph_ten_million_component = ((graph_thousand) + pynutil.insert('만')) + pynini.union(
97+
>>>>>>> 68b18fa8 (Refactor Korean TN cardinal and postprocessing logic based on review feedback)
8598
pynini.closure(pynutil.delete('0')),
8699
graph_thousand_component,
87100
(pynutil.delete('0') + graph_hundred_component),
88-
(pynini.closure(pynutil.delete('0')) + graph_all),
101+
(pynini.closure(pynutil.delete('0')) + graph_1_to_99),
89102
)
90103
graph_ten_million = ten_millions @ graph_ten_million_component
91104

@@ -98,48 +111,52 @@ def __init__(self, deterministic: bool = True):
98111
(pynutil.delete('000') + graph_ten_thousand_component),
99112
(pynutil.delete('0000') + graph_thousand_component),
100113
((pynutil.delete('00000') + graph_hundred_component)),
101-
(pynini.closure(pynutil.delete('0')) + graph_all),
114+
(pynini.closure(pynutil.delete('0')) + graph_1_to_99),
102115
)
103116
graph_hundred_million = hundred_millions @ graph_hundred_million_component
104117

105118
thousand_millions = NEMO_DIGIT**10
106-
graph_thousand_million_component = ((NEMO_DIGIT**2 @ graph_all) + pynutil.insert('억')) + pynini.union(
119+
graph_thousand_million_component = ((NEMO_DIGIT**2 @ graph_1_to_99) + pynutil.insert('억')) + pynini.union(
107120
pynini.closure(pynutil.delete('0')),
108121
graph_ten_million_component,
109122
(pynutil.delete('0') + graph_million_component),
110123
(pynutil.delete('00') + graph_hundred_thousand_component),
111124
(pynutil.delete('000') + graph_ten_thousand_component),
112125
(pynutil.delete('0000') + graph_thousand_component),
113126
((pynutil.delete('00000') + graph_hundred_component)),
114-
(pynini.closure(pynutil.delete('0')) + graph_all),
127+
(pynini.closure(pynutil.delete('0')) + graph_1_to_99),
115128
)
116129
graph_thousand_million = thousand_millions @ graph_thousand_million_component
117130

118131
billions = NEMO_DIGIT**11
119-
graph_billions_component = ((NEMO_DIGIT**3 @ graph_hundred_component) + pynutil.insert('억')) + pynini.union(
132+
graph_billions_component = ((graph_hundred) + pynutil.insert('억')) + pynini.union(
120133
pynini.closure(pynutil.delete('0')),
121134
graph_ten_million_component,
122135
(pynutil.delete('0') + graph_million_component),
123136
(pynutil.delete('00') + graph_hundred_thousand_component),
124137
(pynutil.delete('000') + graph_ten_thousand_component),
125138
(pynutil.delete('0000') + graph_thousand_component),
126139
((pynutil.delete('00000') + graph_hundred_component)),
127-
(pynini.closure(pynutil.delete('0')) + graph_all),
140+
(pynini.closure(pynutil.delete('0')) + graph_1_to_99),
128141
)
129142
graph_billions = billions @ graph_billions_component
130143

131144
ten_billions = NEMO_DIGIT**12
145+
<<<<<<< HEAD
132146
graph_ten_billions_component = (
133147
(NEMO_DIGIT**4 @ graph_thousand_component) + pynutil.insert('억')
134148
) + pynini.union(
149+
=======
150+
graph_ten_billions_component = ((graph_thousand) + pynutil.insert('억')) + pynini.union(
151+
>>>>>>> 68b18fa8 (Refactor Korean TN cardinal and postprocessing logic based on review feedback)
135152
pynini.closure(pynutil.delete('0')),
136153
graph_ten_million_component,
137154
(pynutil.delete('0') + graph_million_component),
138155
(pynutil.delete('00') + graph_hundred_thousand_component),
139156
(pynutil.delete('000') + graph_ten_thousand_component),
140157
(pynutil.delete('0000') + graph_thousand_component),
141158
((pynutil.delete('00000') + graph_hundred_component)),
142-
(pynini.closure(pynutil.delete('0')) + graph_all),
159+
(pynini.closure(pynutil.delete('0')) + graph_1_to_99),
143160
)
144161
graph_ten_billions = ten_billions @ graph_ten_billions_component
145162

@@ -156,11 +173,12 @@ def __init__(self, deterministic: bool = True):
156173
pynutil.delete('0000000') + graph_ten_thousand_component,
157174
pynutil.delete('00000000') + graph_thousand_component,
158175
pynutil.delete('000000000') + graph_hundred_component,
159-
(pynini.closure(pynutil.delete('0')) + graph_all),
176+
(pynini.closure(pynutil.delete('0')) + graph_1_to_99),
160177
)
161178
graph_hundred_billions = hundred_billions @ graph_hundred_billions_component
162179

163180
trillion = NEMO_DIGIT**14
181+
<<<<<<< HEAD
164182
graph_trillion_component = (
165183
(NEMO_DIGIT**2 @ graph_all)
166184
+ pynutil.insert('조')
@@ -177,11 +195,27 @@ def __init__(self, deterministic: bool = True):
177195
pynutil.delete('00000000') + graph_thousand_component,
178196
pynutil.delete('000000000') + graph_hundred_component,
179197
(pynini.closure(pynutil.delete('0')) + graph_all),
198+
=======
199+
graph_trillion_component = ((NEMO_DIGIT**2 @ graph_1_to_99) + pynutil.insert('조') + pynini.union(
200+
pynini.closure(pynutil.delete('0')),
201+
graph_ten_billions_component,
202+
pynutil.delete('0') + graph_billions_component,
203+
pynutil.delete('00') + graph_thousand_million_component,
204+
pynutil.delete('000') + graph_hundred_million_component,
205+
pynutil.delete('0000') + graph_ten_million_component,
206+
pynutil.delete('00000') + graph_million_component,
207+
pynutil.delete('000000') + graph_hundred_thousand_component,
208+
pynutil.delete('0000000') + graph_ten_thousand_component,
209+
pynutil.delete('00000000') + graph_thousand_component,
210+
pynutil.delete('000000000') + graph_hundred_component,
211+
(pynini.closure(pynutil.delete('0')) + graph_1_to_99)
212+
>>>>>>> 68b18fa8 (Refactor Korean TN cardinal and postprocessing logic based on review feedback)
180213
)
181214
)
182215
graph_trillions = trillion @ graph_trillion_component
183216

184217
ten_trillions = NEMO_DIGIT**15
218+
<<<<<<< HEAD
185219
graph_ten_trillions_component = (
186220
(NEMO_DIGIT**3 @ graph_hundred_component)
187221
+ pynutil.insert('조')
@@ -199,10 +233,27 @@ def __init__(self, deterministic: bool = True):
199233
pynutil.delete('000000000') + graph_hundred_component,
200234
(pynini.closure(pynutil.delete('0')) + graph_all),
201235
)
236+
=======
237+
graph_ten_trillions_component = ((graph_hundred) + pynutil.insert('조') + pynini.union(
238+
pynini.closure(pynutil.delete('0')),
239+
graph_ten_billions_component,
240+
pynutil.delete('0') + graph_billions_component,
241+
pynutil.delete('00') + graph_thousand_million_component,
242+
pynutil.delete('000') + graph_hundred_million_component,
243+
pynutil.delete('0000') + graph_ten_million_component,
244+
pynutil.delete('00000') + graph_million_component,
245+
pynutil.delete('000000') + graph_hundred_thousand_component,
246+
pynutil.delete('0000000') + graph_ten_thousand_component,
247+
pynutil.delete('00000000') + graph_thousand_component,
248+
pynutil.delete('000000000') + graph_hundred_component,
249+
(pynini.closure(pynutil.delete('0')) + graph_1_to_99)
250+
)
251+
>>>>>>> 68b18fa8 (Refactor Korean TN cardinal and postprocessing logic based on review feedback)
202252
)
203253
graph_ten_trillions = ten_trillions @ graph_ten_trillions_component
204254

205255
hundred_trillions = NEMO_DIGIT**16
256+
<<<<<<< HEAD
206257
graph_hundred_trillions_component = (
207258
(NEMO_DIGIT**4 @ graph_thousand_component)
208259
+ pynutil.insert('조')
@@ -219,11 +270,27 @@ def __init__(self, deterministic: bool = True):
219270
pynutil.delete('00000000') + graph_thousand_component,
220271
pynutil.delete('000000000') + graph_hundred_component,
221272
(pynini.closure(pynutil.delete('0')) + graph_all),
273+
=======
274+
graph_hundred_trillions_component = ((graph_thousand) + pynutil.insert('조') + pynini.union(
275+
pynini.closure(pynutil.delete('0')),
276+
graph_ten_billions_component,
277+
pynutil.delete('0') + graph_billions_component,
278+
pynutil.delete('00') + graph_thousand_million_component,
279+
pynutil.delete('000') + graph_hundred_million_component,
280+
pynutil.delete('0000') + graph_ten_million_component,
281+
pynutil.delete('00000') + graph_million_component,
282+
pynutil.delete('000000') + graph_hundred_thousand_component,
283+
pynutil.delete('0000000') + graph_ten_thousand_component,
284+
pynutil.delete('00000000') + graph_thousand_component,
285+
pynutil.delete('000000000') + graph_hundred_component,
286+
(pynini.closure(pynutil.delete('0')) + graph_1_to_99)
287+
>>>>>>> 68b18fa8 (Refactor Korean TN cardinal and postprocessing logic based on review feedback)
222288
)
223289
)
224290
graph_hundred_trillions = hundred_trillions @ graph_hundred_trillions_component
225291

226292
thousand_trillions = NEMO_DIGIT**17
293+
<<<<<<< HEAD
227294
graph_thousand_trillions_component = (
228295
graph_digit
229296
+ pynutil.insert('경')
@@ -244,6 +311,25 @@ def __init__(self, deterministic: bool = True):
244311
pynutil.delete('000000000000') + graph_thousand_component,
245312
pynutil.delete('0000000000000') + graph_hundred_component,
246313
(pynini.closure(pynutil.delete('0')) + graph_all),
314+
=======
315+
graph_thousand_trillions_component = (graph_digit + pynutil.insert('경') + pynini.union(
316+
pynini.closure(pynutil.delete('0')),
317+
graph_hundred_trillions_component,
318+
pynutil.delete('0') + graph_ten_trillions_component,
319+
pynutil.delete('00') + graph_trillion_component,
320+
pynutil.delete('000') + graph_hundred_billions_component,
321+
pynutil.delete('0000') + graph_ten_billions_component,
322+
pynutil.delete('00000') + graph_billions_component,
323+
pynutil.delete('000000') + graph_thousand_million_component,
324+
pynutil.delete('0000000') + graph_hundred_million_component,
325+
pynutil.delete('00000000') + graph_ten_million_component,
326+
pynutil.delete('000000000') + graph_million_component,
327+
pynutil.delete('0000000000') + graph_hundred_thousand_component,
328+
pynutil.delete('00000000000') + graph_ten_thousand_component,
329+
pynutil.delete('000000000000') + graph_thousand_component,
330+
pynutil.delete('0000000000000') + graph_hundred_component,
331+
(pynini.closure(pynutil.delete('0')) + graph_1_to_99)
332+
>>>>>>> 68b18fa8 (Refactor Korean TN cardinal and postprocessing logic based on review feedback)
247333
)
248334
)
249335
graph_thousand_trillions = thousand_trillions @ graph_thousand_trillions_component
@@ -265,7 +351,7 @@ def __init__(self, deterministic: bool = True):
265351
graph_ten_thousand,
266352
graph_thousand,
267353
graph_hundred,
268-
graph_all,
354+
graph_1_to_99,
269355
graph_zero,
270356
).optimize()
271357

nemo_text_processing/text_normalization/ko/taggers/tokenize_and_classify.py

Lines changed: 0 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -18,10 +18,7 @@
1818
from pynini.lib import pynutil
1919

2020
from nemo_text_processing.text_normalization.ko.graph_utils import (
21-
NEMO_WHITE_SPACE,
2221
GraphFst,
23-
delete_extra_space,
24-
delete_space,
2522
generator_main,
2623
)
2724

0 commit comments

Comments (0)