Skip to content

Commit 171de10

Browse files
add base coverage for fr tn date (#267) (#269)
* add base coverage for fr tn date * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Mariana Graterol Fuenmayor <marianag@nvidia.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent 57d47b7 commit 171de10

File tree

11 files changed

+219
-2
lines changed

11 files changed

+219
-2
lines changed

Jenkinsfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ pipeline {
1616
EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/09-04-24-0'
1717
ES_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/09-25-24-0'
1818
ES_EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/08-30-24-0'
19-
FR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-04-24-0'
19+
FR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/04-07-25-0'
2020
HU_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-16-24-0'
2121
PT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
2222
RU_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
20s twenties
2+
30s thirties
3+
40s forties
4+
50s fifties
5+
60s sixties
6+
70s seventies
7+
80s eighties
8+
90s nineties
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
1 janvier
2+
2 février
3+
3 mars
4+
4 avril
5+
5 mai
6+
6 juin
7+
7 juillet
8+
8 août
9+
9 septembre
10+
10 octobre
11+
11 novembre
12+
12 décembre
Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
import pynini
2+
from pynini.lib import pynutil
3+
4+
from nemo_text_processing.text_normalization.en.graph_utils import NEMO_DIGIT, GraphFst
5+
from nemo_text_processing.text_normalization.fr.utils import get_abs_path
6+
7+
# TODO: add articles? 'le...'
8+
9+
# Month lookup loaded from data/dates/months.tsv; DateFst composes it with
# the digit strings "1".."12" and projects its output side to get month names.
month_numbers = pynini.string_file(get_abs_path("data/dates/months.tsv"))

# Decade shorthands loaded from data/dates/eras.tsv ("70s", "80s", ...).
eras = pynini.string_file(get_abs_path("data/dates/eras.tsv"))

# Two-character numbers: a leading "0" is deleted, any other leading digit is
# kept, and the second character (any NEMO_DIGIT) is passed through.
delete_leading_zero = pynini.union(pynutil.delete("0"), NEMO_DIGIT - "0") + NEMO_DIGIT
14+
15+
16+
class DateFst(GraphFst):
    """
    Finite state transducer for classifying French dates, e.g.:
        '02.03.2003' -> date { day: "deux" month: "mars" year: "deux mille trois" preserve_order: true }
        '17 janvier' -> date { day: "dix-sept" month: "janvier" preserve_order: true }
        '80s' -> date { year: "eighties" preserve_order: true }
        '17-18-19 juin' -> date { day: "dix-sept dix-huit dix-neuf" month: "juin" preserve_order: true }

    An optional leading determiner ("le " / "les ") is kept inside the day field.

    Args:
        cardinal: cardinal tagger; its `all_nums_no_tokens` graph spells out day and year numbers
        deterministic: forwarded to GraphFst
    """

    def __init__(self, cardinal: GraphFst, deterministic: bool = True):
        # NOTE(review): the tagger is registered as "dates" while the emitted token and the
        # verbalizer use "date"; the name is left unchanged here — confirm whether it matters.
        super().__init__(name="dates", kind="classify", deterministic=deterministic)

        cardinal_graph = cardinal.all_nums_no_tokens

        # 'le' -> 'le', 'les' -> 'les'
        le_determiner = pynini.accep("le ") | pynini.accep("les ")
        self.optional_le = pynini.closure(le_determiner, 0, 1)

        # '01' -> 'un'
        optional_leading_zero = delete_leading_zero | NEMO_DIGIT
        valid_day_number = pynini.union(*[str(x) for x in range(1, 32)])
        premier = pynini.string_map([("1", "premier")])
        # NOTE(review): if cardinal_graph also accepts "1", both readings are produced
        # without an explicit weight — confirm which one should win.
        day_number_to_word = premier | cardinal_graph

        # `@` binds tighter than `+`: the determiner is concatenated in front of the
        # composed "digits -> valid day -> words" graph.
        digit_to_day = self.optional_le + optional_leading_zero @ valid_day_number @ day_number_to_word
        self.day_graph = pynutil.insert("day: \"") + digit_to_day + pynutil.insert("\"")

        # '03' -> 'mars'
        normalize_month_number = optional_leading_zero @ pynini.union(*[str(x) for x in range(1, 13)])
        number_to_month = month_numbers.optimize()
        month_graph = normalize_month_number @ number_to_month
        self.month_graph = pynutil.insert("month: \"") + month_graph + pynutil.insert("\"")

        # 2025 -> deux mille vingt cinq (two to four digits, no leading zero)
        accept_year_digits = (NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT, 1, 3)
        digits_to_year = accept_year_digits @ cardinal_graph
        self.year_graph = pynutil.insert("year: \"") + digits_to_year + pynutil.insert("\"")

        # Every accepted date shape is collected here and unioned at the end, so the
        # tagger never accepts the empty string (which would emit no `date { ... }` token).
        date_graphs = []

        # Numeric dates: day<sep>month[<sep>year], same separator throughout.
        for separator in ["/", ".", "-"]:
            separator_to_space = pynutil.delete(separator) + pynutil.insert(" ")
            date_graphs.append(
                pynutil.insert("date { ")
                + self.day_graph
                + separator_to_space
                + self.month_graph
                + pynini.closure(separator_to_space + self.year_graph, 0, 1)
                + pynutil.insert(" preserve_order: true }")
            )

        # Accepts "janvier", "février", etc.
        # pynini.project() returns a new FST; the method form month_numbers.project()
        # would rewrite the shared module-level graph in place and break the numeric
        # month lookup for any DateFst built later in the same process.
        month_name_graph = (
            pynutil.insert("month: \"") + pynini.project(month_numbers, "output") + pynutil.insert("\"")
        )

        date_graphs.append(
            pynutil.insert("date { ")
            + self.day_graph
            + pynini.accep(" ")
            + month_name_graph
            + pynini.closure(pynini.accep(" ") + self.year_graph, 0, 1)
            + pynutil.insert(" preserve_order: true }")
        )

        # Accepts "70s", "80s", etc
        date_graphs.append(
            pynutil.insert("date { year: \"") + eras + pynutil.insert("\" preserve_order: true }")
        )

        # Accepts day ranges; all days go into a single day field:
        # "17-18-19 juin" -> date { day: "dix-sept dix-huit dix-neuf" month: "juin" preserve_order: true }
        for separator in ["-", "/"]:
            day_range_graph = (
                pynutil.insert("day: \"")
                + pynini.closure(digit_to_day + pynutil.delete(separator) + pynutil.insert(" "), 1)
                + digit_to_day
                + pynutil.insert("\"")
            )

            date_graphs.append(
                pynutil.insert("date { ")
                + day_range_graph
                + pynini.accep(" ")
                + month_name_graph
                + pynini.closure(pynini.accep(" ") + self.year_graph, 0, 1)
                + pynutil.insert(" preserve_order: true }")
            )

        self.fst = pynini.union(*date_graphs).optimize()

nemo_text_processing/text_normalization/fr/taggers/tokenize_and_classify.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
)
2727
from nemo_text_processing.text_normalization.en.taggers.punctuation import PunctuationFst
2828
from nemo_text_processing.text_normalization.fr.taggers.cardinal import CardinalFst
29+
from nemo_text_processing.text_normalization.fr.taggers.date import DateFst
2930
from nemo_text_processing.text_normalization.fr.taggers.decimals import DecimalFst
3031
from nemo_text_processing.text_normalization.fr.taggers.fraction import FractionFst
3132
from nemo_text_processing.text_normalization.fr.taggers.ordinal import OrdinalFst
@@ -86,8 +87,12 @@ def __init__(
8687
whitelist_graph = self.whitelist.fst
8788
punct_graph = PunctuationFst(deterministic=deterministic).fst
8889

90+
self.date = DateFst(self.cardinal, deterministic=deterministic)
91+
date_graph = self.date.fst
92+
8993
classify = (
9094
pynutil.add_weight(whitelist_graph, 1.01)
95+
| pynutil.add_weight(date_graph, 1.1)
9196
| pynutil.add_weight(cardinal_graph, 1.1)
9297
| pynutil.add_weight(fraction_graph, 1.09)
9398
| pynutil.add_weight(ordinal_graph, 1.1)
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import pynini
16+
from pynini.lib import pynutil
17+
18+
from nemo_text_processing.text_normalization.en.graph_utils import (
19+
NEMO_NOT_QUOTE,
20+
NEMO_SPACE,
21+
GraphFst,
22+
delete_preserve_order,
23+
)
24+
25+
26+
class DateFst(GraphFst):
    """
    Finite state transducer for verbalizing dates, e.g.
        date { day: "deux" month: "mars" year: "deux mille trois" preserve_order: true } -> deux mars deux mille trois

    Accepted field layouts: day + month with an optional year, month + year,
    and a lone year field (used for decades).

    Args:
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
    """

    def __init__(self, deterministic: bool = True):
        super().__init__(name="date", kind="verbalize", deterministic=deterministic)

        def quoted_value(opening):
            # Strip the `<field>: "` prefix and the closing quote, keeping the
            # non-empty, quote-free value in between.
            return pynutil.delete(opening) + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")

        day = quoted_value("day: \"")
        month = quoted_value("month: \"")
        year = quoted_value("year: \"")

        day_month_year = day + NEMO_SPACE + month + pynini.closure(NEMO_SPACE + year, 0, 1) + delete_preserve_order
        month_year = month + NEMO_SPACE + year + delete_preserve_order
        # A decade is tagged in the year field, so it is parsed exactly like a year.
        decade_only = year + delete_preserve_order

        self.graph = day_month_year | month_year | decade_only

        self.fst = self.delete_tokens(self.graph).optimize()

nemo_text_processing/text_normalization/fr/verbalizers/verbalize.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
from nemo_text_processing.text_normalization.en.graph_utils import GraphFst
1515
from nemo_text_processing.text_normalization.en.verbalizers.whitelist import WhiteListFst
1616
from nemo_text_processing.text_normalization.fr.verbalizers.cardinal import CardinalFst
17+
from nemo_text_processing.text_normalization.fr.verbalizers.date import DateFst
1718
from nemo_text_processing.text_normalization.fr.verbalizers.decimals import DecimalFst
1819
from nemo_text_processing.text_normalization.fr.verbalizers.fraction import FractionFst
1920
from nemo_text_processing.text_normalization.fr.verbalizers.ordinal import OrdinalFst
@@ -40,6 +41,8 @@ def __init__(self, deterministic: bool = True):
4041
fraction = FractionFst(ordinal=ordinal, deterministic=deterministic)
4142
fraction_graph = fraction.fst
4243
whitelist_graph = WhiteListFst(deterministic=deterministic).fst
44+
date = DateFst(deterministic=deterministic)
45+
date_graph = date.fst
4346

44-
graph = cardinal_graph | decimal_graph | ordinal_graph | fraction_graph | whitelist_graph
47+
graph = cardinal_graph | decimal_graph | ordinal_graph | fraction_graph | whitelist_graph | date_graph
4548
self.fst = graph
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
02.03.2003~deux mars deux mille trois
2+
02/03/2003~deux mars deux mille trois
3+
02-03-2003~deux mars deux mille trois
4+
le 02.03.2003~le deux mars deux mille trois
5+
17.06~dix-sept juin
6+
17 janvier~dix-sept janvier
7+
10 mars 2023~dix mars deux mille vingt-trois
8+
le 10 mars 2023~le dix mars deux mille vingt-trois
9+
les 80s~les eighties
10+
les 17/18 juin~les dix-sept dix-huit juin
11+
les 17/18/19 mars~les dix-sept dix-huit dix-neuf mars
12+
les 17-18-19 juin~les dix-sept dix-huit dix-neuf juin
13+
les 17-18-19 juin 2025~les dix-sept dix-huit dix-neuf juin deux mille vingt-cinq

tests/nemo_text_processing/fr/test_date.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
from parameterized import parameterized
1717

1818
from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer
19+
from nemo_text_processing.text_normalization.normalize import Normalizer
1920

2021
from ..utils import CACHE_DIR, parse_test_case_file
2122

@@ -29,3 +30,12 @@ class TestDate:
2930
def test_denorm(self, test_input, expected):
3031
pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False)
3132
assert pred == expected
33+
34+
normalizer = Normalizer(input_case='cased', lang='fr', cache_dir=CACHE_DIR, overwrite_cache=False)
35+
36+
@parameterized.expand(parse_test_case_file('fr/data_text_normalization/test_cases_date.txt'))
@pytest.mark.run_only_on('CPU')
@pytest.mark.unit
def test_norm(self, test_input, expected):
    """Normalize one written date from the French test-case file and compare it with the expected spoken form."""
    assert self.normalizer.normalize(test_input, verbose=False) == expected

tests/nemo_text_processing/fr/test_sparrowhawk_normalization.sh

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,11 @@ testTNCardinal() {
2727
runtest $input
2828
}
2929

30+
# Runs the French text-normalization date test cases through `runtest`.
testTNDate() {
  # Quoted so a PROJECT_DIR containing whitespace is passed as a single argument.
  input="$PROJECT_DIR/fr/data_text_normalization/test_cases_date.txt"
  runtest "$input"
}
34+
3035
testTNDecimal() {
3136
input=$PROJECT_DIR/fr/data_text_normalization/test_cases_decimal.txt
3237
runtest $input

0 commit comments

Comments
 (0)