Skip to content

Commit 4c104f0

Browse files
committed
Refactor Korean TN cardinal and postprocessing logic based on review feedback
Signed-off-by: Jinwoo Bae <bbae7050@gmail.com>
1 parent eb6a8c0 commit 4c104f0

File tree

10 files changed

+120
-267
lines changed

10 files changed

+120
-267
lines changed

nemo_text_processing/text_normalization/ko/data/number/teen.tsv

Lines changed: 0 additions & 10 deletions
This file was deleted.

nemo_text_processing/text_normalization/ko/data/number/ty.tsv

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
1
12
2 이십
23
3 삼십
34
4 사십

nemo_text_processing/text_normalization/ko/graph_utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
from pynini.export import export
2424
from pynini.lib import byte, pynutil, utf8
2525

26-
from nemo_text_processing.text_normalization.en.utils import get_abs_path, load_labels
26+
from nemo_text_processing.text_normalization.en.utils import load_labels
2727
from nemo_text_processing.utils.logging import logger
2828

2929
NEMO_CHAR = utf8.VALID_UTF8_CHAR

nemo_text_processing/text_normalization/ko/taggers/cardinal.py

Lines changed: 82 additions & 98 deletions
Large diffs are not rendered by default.

nemo_text_processing/text_normalization/ko/taggers/tokenize_and_classify.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,7 @@
1818
from pynini.lib import pynutil
1919

2020
from nemo_text_processing.text_normalization.ko.graph_utils import (
21-
NEMO_WHITE_SPACE,
2221
GraphFst,
23-
delete_extra_space,
24-
delete_space,
2522
generator_main,
2623
)
2724

nemo_text_processing/text_normalization/ko/verbalizers/post_processing.py

Lines changed: 5 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,7 @@
1818
import pynini
1919

2020
from nemo_text_processing.text_normalization.en.graph_utils import (
21-
NEMO_NOT_SPACE,
2221
NEMO_SIGMA,
23-
delete_space,
2422
generator_main,
2523
)
2624
from nemo_text_processing.utils.logging import logger
@@ -41,73 +39,15 @@ def __init__(self, cache_dir: str = None, overwrite_cache: bool = False):
4139
far_file = None
4240
if cache_dir is not None and cache_dir != "None":
4341
os.makedirs(cache_dir, exist_ok=True)
44-
far_file = os.path.join(cache_dir, "zh_tn_post_processing.far")
42+
far_file = os.path.join(cache_dir, "ko_tn_post_processing.far")
4543
if not overwrite_cache and far_file and os.path.exists(far_file):
4644
self.fst = pynini.Far(far_file, mode="r")["post_process_graph"]
4745
logger.info(f'Post processing graph was restored from {far_file}.')
4846
else:
49-
self.set_punct_dict()
50-
self.fst = self.get_punct_postprocess_graph()
47+
self.fst = self.get_postprocess_graph()
5148

5249
if far_file:
5350
generator_main(far_file, {"post_process_graph": self.fst})
54-
55-
def set_punct_dict(self):
56-
self.punct_marks = {
57-
"'": [
58-
"'",
59-
'´',
60-
'ʹ',
61-
'ʻ',
62-
'ʼ',
63-
'ʽ',
64-
'ʾ',
65-
'ˈ',
66-
'ˊ',
67-
'ˋ',
68-
'˴',
69-
'ʹ',
70-
'΄',
71-
'՚',
72-
'՝',
73-
'י',
74-
'׳',
75-
'ߴ',
76-
'ߵ',
77-
'ᑊ',
78-
'ᛌ',
79-
'᾽',
80-
'᾿',
81-
'`',
82-
'´',
83-
'῾',
84-
'‘',
85-
'’',
86-
'‛',
87-
'′',
88-
'‵',
89-
'ꞌ',
90-
'＇',
91-
'`',
92-
'𖽑',
93-
'𖽒',
94-
],
95-
}
96-
97-
def get_punct_postprocess_graph(self):
98-
"""
99-
Returns graph to post process punctuation marks.
100-
101-
{``} quotes are converted to {"}. Note, if there are spaces around single quote {'}, they will be kept.
102-
By default, a space is added after a punctuation mark, and spaces are removed before punctuation marks.
103-
"""
104-
105-
remove_space_around_single_quote = pynini.cdrewrite(
106-
delete_space, NEMO_NOT_SPACE, NEMO_NOT_SPACE, pynini.closure(NEMO_SIGMA)
107-
)
108-
# this works if spaces in between (good)
109-
# delete space between 2 NEMO_NOT_SPACE(left and right to the space) that are with in a content of NEMO_SIGMA
110-
111-
graph = remove_space_around_single_quote.optimize()
112-
113-
return graph
51+
52+
def get_postprocess_graph(self):
53+
return pynini.cdrewrite(pynini.cross("", ""), "", "", pynini.closure(NEMO_SIGMA)).optimize()

nemo_text_processing/text_normalization/ko/verbalizers/verbalize.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,6 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15-
import pynini
16-
1715
from nemo_text_processing.text_normalization.ko.graph_utils import GraphFst
1816
from nemo_text_processing.text_normalization.ko.verbalizers.cardinal import CardinalFst
1917

tests/nemo_text_processing/ko/data_text_normalization/test_cases_cardinal.txt

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,4 +16,33 @@
1616
9999999999999~구조구천구백구십구억구천구백구십구만구천구백구십구
1717
99999999999999~구십구조구천구백구십구억구천구백구십구만구천구백구십구
1818
999999999999999~구백구십구조구천구백구십구억구천구백구십구만구천구백구십구
19-
9999999999999999~구천구백구십구조구천구백구십구억구천구백구십구만구천구백구십구
19+
9999999999999999~구천구백구십구조구천구백구십구억구천구백구십구만구천구백구십구
20+
19~십구
21+
76~칠십육
22+
379~삼백칠십구
23+
850~팔백오십
24+
1004~천사
25+
8326~팔천삼백이십육
26+
10383~만삼백팔십삼
27+
34892~삼만사천팔백구십이
28+
573234~오십칠만삼천이백삼십사
29+
982010~구십팔만이천십
30+
2349023~이백삼십사만구천이십삼
31+
4303189~사백삼십만삼천백팔십구
32+
60321589~육천삼십이만천오백팔십구
33+
88234568~팔천팔백이십삼만사천오백육십팔
34+
792133923~칠억구천이백십삼만삼천구백이십삼
35+
187624689~일억팔천칠백육십이만사천육백팔십구
36+
2304050708~이십삼억사백오만칠백팔
37+
6436789729~육십사억삼천육백칠십팔만구천칠백이십구
38+
78234580257~칠백팔십이억삼천사백오십팔만이백오십칠
39+
987654321345~구천팔백칠십육억오천사백삼십이만천삼백사십오
40+
2345678901234~이조삼천사백오십육억칠천팔백구십만천이백삼십사
41+
35791357913579~삼십오조칠천구백십삼억오천칠백구십일만삼천오백칠십구
42+
470369258147036~사백칠십조삼천육백구십이억오천팔백십사만칠천삼십육
43+
5048258149517395~오천사십팔조이천오백팔십일억사천구백오십일만칠천삼백구십오
44+
67890123045607890~육경칠천팔백구십조천이백삼십억사천오백육십만칠천팔백구십
45+
-2~마이너스 이
46+
-93~마이너스 구십삼
47+
-90325~마이너스 구만삼백이십오
48+
-3234567~마이너스 삼백이십삼만사천오백육십칠

tests/nemo_text_processing/ko/test_cardinal.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,9 @@
1515
import pytest
1616
from parameterized import parameterized
1717

18-
from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer
1918
from nemo_text_processing.text_normalization.normalize import Normalizer
2019

21-
from ..utils import CACHE_DIR, parse_test_case_file
20+
from ..utils import parse_test_case_file
2221

2322

2423
class TestCardinal:

tests/nemo_text_processing/ko/test_sparrowhawk_normalization.sh

Lines changed: 0 additions & 85 deletions
Original file line numberDiff line numberDiff line change
@@ -31,91 +31,6 @@ testTNCardinal() {
3131
runtest $input
3232
}
3333

34-
#testTNSpecialText() {
35-
# input=$TEST_DIR/data_text_normalization/test_cases_special_text.txt
36-
# runtest $input
37-
#}
38-
39-
#testTNDate() {
40-
# input=$TEST_DIR/data_text_normalization/test_cases_date.txt
41-
# runtest $input
42-
#}
43-
44-
#testTNDecimal() {
45-
# input=$TEST_DIR/data_text_normalization/test_cases_decimal.txt
46-
# runtest $input
47-
#}
48-
49-
#testTNRange() {
50-
# input=$TEST_DIR/data_text_normalization/test_cases_range.txt
51-
# runtest $input
52-
#}
53-
54-
#testTNSerial() {
55-
# input=$TEST_DIR/data_text_normalization/test_cases_serial.txt
56-
# runtest $input
57-
#}
58-
59-
#testTNRoman() {
60-
# input=$TEST_DIR/data_text_normalization/test_cases_roman.txt
61-
# runtest $input
62-
#}
63-
64-
#testTNElectronic() {
65-
# input=$TEST_DIR/data_text_normalization/test_cases_electronic.txt
66-
# runtest $input
67-
#}
68-
69-
#testTNFraction() {
70-
# input=$TEST_DIR/data_text_normalization/test_cases_fraction.txt
71-
# runtest $input
72-
#}
73-
74-
#testTNMoney() {
75-
# input=$TEST_DIR/data_text_normalization/test_cases_money.txt
76-
# runtest $input
77-
#}
78-
79-
#testTNOrdinal() {
80-
# input=$TEST_DIR/data_text_normalization/test_cases_ordinal.txt
81-
# runtest $input
82-
#}
83-
84-
#testTNTelephone() {
85-
# input=$TEST_DIR/data_text_normalization/test_cases_telephone.txt
86-
# runtest $input
87-
#}
88-
89-
#testTNTime() {
90-
# input=$TEST_DIR/data_text_normalization/test_cases_time.txt
91-
# runtest $input
92-
#}
93-
94-
#testTNMeasure() {
95-
# input=$TEST_DIR/data_text_normalization/test_cases_measure.txt
96-
# runtest $input
97-
#}
98-
99-
#testTNWhitelist() {
100-
# input=$TEST_DIR/data_text_normalization/test_cases_whitelist.txt
101-
# runtest $input
102-
#}
103-
104-
#testTNWord() {
105-
# input=$TEST_DIR/data_text_normalization/test_cases_word.txt
106-
# runtest $input
107-
#}
108-
109-
#testTNAddress() {
110-
# input=$TEST_DIR/data_text_normalization/test_cases_address.txt
111-
# runtest $input
112-
#}
113-
114-
#testTNMath() {
115-
# input=$TEST_DIR/data_text_normalization/test_cases_math.txt
116-
# runtest $input
117-
#}
118-
11934
# Remove all command-line arguments
12035
shift $#
12136

0 commit comments

Comments
 (0)