Skip to content

Commit eb6a8c0

Browse files
[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
1 parent 034d05b commit eb6a8c0

File tree

3 files changed

+92
-86
lines changed

3 files changed

+92
-86
lines changed

nemo_text_processing/text_normalization/ko/taggers/cardinal.py

Lines changed: 90 additions & 81 deletions
Original file line number | Diff line number | Diff line change
@@ -24,15 +24,15 @@ class CardinalFst(GraphFst):
2424
def __init__(self, deterministic: bool = True):
2525
super().__init__(name="cardinal", kind="classify", deterministic=deterministic)
2626
# Load base .tsv files
27-
graph_zero = pynini.string_file(get_abs_path("data/number/zero.tsv"))
28-
graph_digit = pynini.string_file(get_abs_path("data/number/digit.tsv"))
29-
27+
graph_zero = pynini.string_file(get_abs_path("data/number/zero.tsv"))
28+
graph_digit = pynini.string_file(get_abs_path("data/number/digit.tsv"))
29+
3030
digit_except_one = pynini.difference(NEMO_DIGIT, "1")
3131
digit_except_zero_one = pynini.difference(digit_except_one, "0")
32-
32+
3333
graph_digit_alt = digit_except_zero_one @ graph_digit
3434
graph_ty = pynini.string_file(get_abs_path("data/number/ty.tsv"))
35-
graph_teen = pynini.string_file(get_abs_path("data/number/teen.tsv"))
35+
graph_teen = pynini.string_file(get_abs_path("data/number/teen.tsv"))
3636

3737
# Compose all basic number forms
3838
graph_all = (graph_ty + (graph_digit | pynutil.delete('0'))) | graph_teen | graph_digit
@@ -50,7 +50,7 @@ def __init__(self, deterministic: bool = True):
5050
(pynini.closure(pynutil.delete('0')) + graph_all),
5151
)
5252
graph_thousand = thousands @ graph_thousand_component
53-
53+
5454
ten_thousands = NEMO_DIGIT**5
5555
graph_ten_thousand_component = (pynini.cross('1', '만') | (graph_digit + pynutil.insert('만'))) + pynini.union(
5656
pynini.closure(pynutil.delete('0')),
@@ -59,16 +59,16 @@ def __init__(self, deterministic: bool = True):
5959
(pynini.closure(pynutil.delete('0')) + graph_all),
6060
)
6161
graph_ten_thousand = ten_thousands @ graph_ten_thousand_component
62-
62+
6363
hundred_thousands = NEMO_DIGIT**6
64-
graph_hundred_thousand_component = ((NEMO_DIGIT ** 2 @ graph_all) + pynutil.insert('만')) + pynini.union(
64+
graph_hundred_thousand_component = ((NEMO_DIGIT**2 @ graph_all) + pynutil.insert('만')) + pynini.union(
6565
pynini.closure(pynutil.delete('0')),
6666
graph_thousand_component,
6767
(pynutil.delete('0') + graph_hundred_component),
6868
(pynini.closure(pynutil.delete('0')) + graph_all),
6969
)
7070
graph_hundred_thousand = hundred_thousands @ graph_hundred_thousand_component
71-
71+
7272
millions = NEMO_DIGIT**7
7373
graph_million_component = ((NEMO_DIGIT**3 @ graph_hundred_component) + pynutil.insert('만')) + pynini.union(
7474
pynini.closure(pynutil.delete('0')),
@@ -79,15 +79,17 @@ def __init__(self, deterministic: bool = True):
7979
graph_million = millions @ graph_million_component
8080

8181
ten_millions = NEMO_DIGIT**8
82-
graph_ten_million_component = ((NEMO_DIGIT**4 @ graph_thousand_component) + pynutil.insert('만')) + pynini.union(
82+
graph_ten_million_component = (
83+
(NEMO_DIGIT**4 @ graph_thousand_component) + pynutil.insert('만')
84+
) + pynini.union(
8385
pynini.closure(pynutil.delete('0')),
8486
graph_thousand_component,
8587
(pynutil.delete('0') + graph_hundred_component),
8688
(pynini.closure(pynutil.delete('0')) + graph_all),
8789
)
8890
graph_ten_million = ten_millions @ graph_ten_million_component
89-
90-
hundred_millions = NEMO_DIGIT ** 9
91+
92+
hundred_millions = NEMO_DIGIT**9
9193
graph_hundred_million_component = (graph_digit + pynutil.insert('억')) + pynini.union(
9294
pynini.closure(pynutil.delete('0')),
9395
graph_ten_million_component,
@@ -127,7 +129,9 @@ def __init__(self, deterministic: bool = True):
127129
graph_billions = billions @ graph_billions_component
128130

129131
ten_billions = NEMO_DIGIT**12
130-
graph_ten_billions_component = ((NEMO_DIGIT**4 @ graph_thousand_component) + pynutil.insert('억')) + pynini.union(
132+
graph_ten_billions_component = (
133+
(NEMO_DIGIT**4 @ graph_thousand_component) + pynutil.insert('억')
134+
) + pynini.union(
131135
pynini.closure(pynutil.delete('0')),
132136
graph_ten_million_component,
133137
(pynutil.delete('0') + graph_million_component),
@@ -138,7 +142,7 @@ def __init__(self, deterministic: bool = True):
138142
(pynini.closure(pynutil.delete('0')) + graph_all),
139143
)
140144
graph_ten_billions = ten_billions @ graph_ten_billions_component
141-
145+
142146
hundred_billions = NEMO_DIGIT**13
143147
graph_hundred_billions_component = (graph_digit + pynutil.insert('조')) + pynini.union(
144148
pynini.closure(pynutil.delete('0')),
@@ -155,79 +159,91 @@ def __init__(self, deterministic: bool = True):
155159
(pynini.closure(pynutil.delete('0')) + graph_all),
156160
)
157161
graph_hundred_billions = hundred_billions @ graph_hundred_billions_component
158-
162+
159163
trillion = NEMO_DIGIT**14
160-
graph_trillion_component = ((NEMO_DIGIT**2 @ graph_all) + pynutil.insert('조') + pynini.union(
161-
pynini.closure(pynutil.delete('0')),
162-
graph_ten_billions_component,
163-
pynutil.delete('0') + graph_billions_component,
164-
pynutil.delete('00') + graph_thousand_million_component,
165-
pynutil.delete('000') + graph_hundred_million_component,
166-
pynutil.delete('0000') + graph_ten_million_component,
167-
pynutil.delete('00000') + graph_million_component,
168-
pynutil.delete('000000') + graph_hundred_thousand_component,
169-
pynutil.delete('0000000') + graph_ten_thousand_component,
170-
pynutil.delete('00000000') + graph_thousand_component,
171-
pynutil.delete('000000000') + graph_hundred_component,
172-
(pynini.closure(pynutil.delete('0')) + graph_all)
164+
graph_trillion_component = (
165+
(NEMO_DIGIT**2 @ graph_all)
166+
+ pynutil.insert('조')
167+
+ pynini.union(
168+
pynini.closure(pynutil.delete('0')),
169+
graph_ten_billions_component,
170+
pynutil.delete('0') + graph_billions_component,
171+
pynutil.delete('00') + graph_thousand_million_component,
172+
pynutil.delete('000') + graph_hundred_million_component,
173+
pynutil.delete('0000') + graph_ten_million_component,
174+
pynutil.delete('00000') + graph_million_component,
175+
pynutil.delete('000000') + graph_hundred_thousand_component,
176+
pynutil.delete('0000000') + graph_ten_thousand_component,
177+
pynutil.delete('00000000') + graph_thousand_component,
178+
pynutil.delete('000000000') + graph_hundred_component,
179+
(pynini.closure(pynutil.delete('0')) + graph_all),
173180
)
174181
)
175182
graph_trillions = trillion @ graph_trillion_component
176183

177184
ten_trillions = NEMO_DIGIT**15
178-
graph_ten_trillions_component = ((NEMO_DIGIT**3 @ graph_hundred_component) + pynutil.insert('조') + pynini.union(
179-
pynini.closure(pynutil.delete('0')),
180-
graph_ten_billions_component,
181-
pynutil.delete('0') + graph_billions_component,
182-
pynutil.delete('00') + graph_thousand_million_component,
183-
pynutil.delete('000') + graph_hundred_million_component,
184-
pynutil.delete('0000') + graph_ten_million_component,
185-
pynutil.delete('00000') + graph_million_component,
186-
pynutil.delete('000000') + graph_hundred_thousand_component,
187-
pynutil.delete('0000000') + graph_ten_thousand_component,
188-
pynutil.delete('00000000') + graph_thousand_component,
189-
pynutil.delete('000000000') + graph_hundred_component,
190-
(pynini.closure(pynutil.delete('0')) + graph_all)
191-
)
185+
graph_ten_trillions_component = (
186+
(NEMO_DIGIT**3 @ graph_hundred_component)
187+
+ pynutil.insert('조')
188+
+ pynini.union(
189+
pynini.closure(pynutil.delete('0')),
190+
graph_ten_billions_component,
191+
pynutil.delete('0') + graph_billions_component,
192+
pynutil.delete('00') + graph_thousand_million_component,
193+
pynutil.delete('000') + graph_hundred_million_component,
194+
pynutil.delete('0000') + graph_ten_million_component,
195+
pynutil.delete('00000') + graph_million_component,
196+
pynutil.delete('000000') + graph_hundred_thousand_component,
197+
pynutil.delete('0000000') + graph_ten_thousand_component,
198+
pynutil.delete('00000000') + graph_thousand_component,
199+
pynutil.delete('000000000') + graph_hundred_component,
200+
(pynini.closure(pynutil.delete('0')) + graph_all),
201+
)
192202
)
193203
graph_ten_trillions = ten_trillions @ graph_ten_trillions_component
194204

195205
hundred_trillions = NEMO_DIGIT**16
196-
graph_hundred_trillions_component = ((NEMO_DIGIT**4 @ graph_thousand_component) + pynutil.insert('조') + pynini.union(
197-
pynini.closure(pynutil.delete('0')),
198-
graph_ten_billions_component,
199-
pynutil.delete('0') + graph_billions_component,
200-
pynutil.delete('00') + graph_thousand_million_component,
201-
pynutil.delete('000') + graph_hundred_million_component,
202-
pynutil.delete('0000') + graph_ten_million_component,
203-
pynutil.delete('00000') + graph_million_component,
204-
pynutil.delete('000000') + graph_hundred_thousand_component,
205-
pynutil.delete('0000000') + graph_ten_thousand_component,
206-
pynutil.delete('00000000') + graph_thousand_component,
207-
pynutil.delete('000000000') + graph_hundred_component,
208-
(pynini.closure(pynutil.delete('0')) + graph_all)
206+
graph_hundred_trillions_component = (
207+
(NEMO_DIGIT**4 @ graph_thousand_component)
208+
+ pynutil.insert('조')
209+
+ pynini.union(
210+
pynini.closure(pynutil.delete('0')),
211+
graph_ten_billions_component,
212+
pynutil.delete('0') + graph_billions_component,
213+
pynutil.delete('00') + graph_thousand_million_component,
214+
pynutil.delete('000') + graph_hundred_million_component,
215+
pynutil.delete('0000') + graph_ten_million_component,
216+
pynutil.delete('00000') + graph_million_component,
217+
pynutil.delete('000000') + graph_hundred_thousand_component,
218+
pynutil.delete('0000000') + graph_ten_thousand_component,
219+
pynutil.delete('00000000') + graph_thousand_component,
220+
pynutil.delete('000000000') + graph_hundred_component,
221+
(pynini.closure(pynutil.delete('0')) + graph_all),
209222
)
210223
)
211224
graph_hundred_trillions = hundred_trillions @ graph_hundred_trillions_component
212225

213226
thousand_trillions = NEMO_DIGIT**17
214-
graph_thousand_trillions_component = (graph_digit + pynutil.insert('경') + pynini.union(
215-
pynini.closure(pynutil.delete('0')),
216-
graph_hundred_trillions_component,
217-
pynutil.delete('0') + graph_ten_trillions_component,
218-
pynutil.delete('00') + graph_trillion_component,
219-
pynutil.delete('000') + graph_hundred_billions_component,
220-
pynutil.delete('0000') + graph_ten_billions_component,
221-
pynutil.delete('00000') + graph_billions_component,
222-
pynutil.delete('000000') + graph_thousand_million_component,
223-
pynutil.delete('0000000') + graph_hundred_million_component,
224-
pynutil.delete('00000000') + graph_ten_million_component,
225-
pynutil.delete('000000000') + graph_million_component,
226-
pynutil.delete('0000000000') + graph_hundred_thousand_component,
227-
pynutil.delete('00000000000') + graph_ten_thousand_component,
228-
pynutil.delete('000000000000') + graph_thousand_component,
229-
pynutil.delete('0000000000000') + graph_hundred_component,
230-
(pynini.closure(pynutil.delete('0')) + graph_all)
227+
graph_thousand_trillions_component = (
228+
graph_digit
229+
+ pynutil.insert('경')
230+
+ pynini.union(
231+
pynini.closure(pynutil.delete('0')),
232+
graph_hundred_trillions_component,
233+
pynutil.delete('0') + graph_ten_trillions_component,
234+
pynutil.delete('00') + graph_trillion_component,
235+
pynutil.delete('000') + graph_hundred_billions_component,
236+
pynutil.delete('0000') + graph_ten_billions_component,
237+
pynutil.delete('00000') + graph_billions_component,
238+
pynutil.delete('000000') + graph_thousand_million_component,
239+
pynutil.delete('0000000') + graph_hundred_million_component,
240+
pynutil.delete('00000000') + graph_ten_million_component,
241+
pynutil.delete('000000000') + graph_million_component,
242+
pynutil.delete('0000000000') + graph_hundred_thousand_component,
243+
pynutil.delete('00000000000') + graph_ten_thousand_component,
244+
pynutil.delete('000000000000') + graph_thousand_component,
245+
pynutil.delete('0000000000000') + graph_hundred_component,
246+
(pynini.closure(pynutil.delete('0')) + graph_all),
231247
)
232248
)
233249
graph_thousand_trillions = thousand_trillions @ graph_thousand_trillions_component
@@ -254,14 +270,7 @@ def __init__(self, deterministic: bool = True):
254270
).optimize()
255271

256272
# Sign and final formatting
257-
optional_sign = pynini.closure(
258-
pynutil.insert('negative: "true" ') + pynini.cross("-", ""), 0, 1
259-
)
260-
final_graph = (
261-
optional_sign
262-
+ pynutil.insert('integer: "')
263-
+ graph_num
264-
+ pynutil.insert('"')
265-
)
273+
optional_sign = pynini.closure(pynutil.insert('negative: "true" ') + pynini.cross("-", ""), 0, 1)
274+
final_graph = optional_sign + pynutil.insert('integer: "') + graph_num + pynutil.insert('"')
266275
final_graph = self.add_tokens(final_graph)
267276
self.fst = final_graph.optimize()

nemo_text_processing/text_normalization/normalize.py

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -176,7 +176,7 @@ def __init__(
176176
from nemo_text_processing.text_normalization.ja.verbalizers.verbalize_final import VerbalizeFinalFst
177177
elif lang == 'ko':
178178
from nemo_text_processing.text_normalization.ko.taggers.tokenize_and_classify import ClassifyFst
179-
from nemo_text_processing.text_normalization.ko.verbalizers.verbalize_final import VerbalizeFinalFst
179+
from nemo_text_processing.text_normalization.ko.verbalizers.verbalize_final import VerbalizeFinalFst
180180
else:
181181
raise NotImplementedError(f"Language {lang} has not been supported yet.")
182182

@@ -768,7 +768,6 @@ def parse_args():
768768
parser.add_argument("--n_jobs", default=-2, type=int, help="The maximum number of concurrently running jobs")
769769
parser.add_argument("--batch_size", default=200, type=int, help="Number of examples for each process")
770770
parser.add_argument(
771-
772771
"--max_number_of_permutations_per_split",
773772
default=729,
774773
type=int,

tools/text_processing_deployment/pynini_export.py

Lines changed: 1 addition & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -317,12 +317,10 @@ def parse_args():
317317
from nemo_text_processing.text_normalization.ko.taggers.tokenize_and_classify import (
318318
ClassifyFst as TNClassifyFst,
319319
)
320-
from nemo_text_processing.text_normalization.ko.verbalizers.verbalize import (
321-
VerbalizeFst as TNVerbalizeFst,
322-
)
323320
from nemo_text_processing.text_normalization.ko.verbalizers.post_processing import (
324321
PostProcessingFst as TNPostProcessingFst,
325322
)
323+
from nemo_text_processing.text_normalization.ko.verbalizers.verbalize import VerbalizeFst as TNVerbalizeFst
326324
output_dir = os.path.join(args.output_dir, f"{args.language}_{args.grammars}_{args.input_case}")
327325
export_grammars(
328326
output_dir=output_dir,

0 commit comments

Comments
 (0)