add base coverage for fr tn date (#267) (#269)

mgrafu · pre-commit-ci[bot] · web-flow · commit 171de106f89d · 2025-04-07T15:49:56.000-04:00
* add base coverage for fr tn date * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Mariana Graterol Fuenmayor <marianag@nvidia.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
diff --git a/Jenkinsfile b/Jenkinsfile
@@ -16,7 +16,7 @@ pipeline {
     EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/09-04-24-0'
     ES_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/09-25-24-0'
     ES_EN_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/08-30-24-0'
-    FR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-04-24-0'
+    FR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/04-07-25-0'
     HU_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/07-16-24-0'
     PT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
     RU_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
diff --git a/nemo_text_processing/text_normalization/fr/data/dates/__init__.py b/nemo_text_processing/text_normalization/fr/data/dates/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/nemo_text_processing/text_normalization/fr/data/dates/eras.tsv b/nemo_text_processing/text_normalization/fr/data/dates/eras.tsv
@@ -0,0 +1,8 @@
+20s	twenties
+30s	thirties
+40s	forties
+50s	fifties
+60s	sixties
+70s	seventies
+80s	eighties
+90s	nineties
diff --git a/nemo_text_processing/text_normalization/fr/data/dates/months.tsv b/nemo_text_processing/text_normalization/fr/data/dates/months.tsv
@@ -0,0 +1,12 @@
+1	janvier
+2	février
+3	mars
+4	avril
+5	mai
+6	juin
+7	juillet
+8	août
+9	septembre
+10	octobre
+11	novembre
+12	décembre
diff --git a/nemo_text_processing/text_normalization/fr/taggers/date.py b/nemo_text_processing/text_normalization/fr/taggers/date.py
@@ -0,0 +1,97 @@
+import pynini
+from pynini.lib import pynutil
+
+from nemo_text_processing.text_normalization.en.graph_utils import NEMO_DIGIT, GraphFst
+from nemo_text_processing.text_normalization.fr.utils import get_abs_path
+
+# TODO: add articles? 'le...'
+# Module-level grammars loaded once from the tab-separated data files (month number -> name, decade -> word).
+month_numbers = pynini.string_file(get_abs_path("data/dates/months.tsv"))
+eras = pynini.string_file(get_abs_path("data/dates/eras.tsv"))
+delete_leading_zero = (
+    pynutil.delete("0") | (NEMO_DIGIT - "0")
+) + NEMO_DIGIT  # exactly two digits: a leading '0' is deleted ('03' -> '3'), any other first digit is kept
+
+
+class DateFst(GraphFst):
+    '''Finite state transducer for classifying dates; number words come from the `cardinal` grammar, e.g.:
+        '02.03.2003' -> date { day: "deux" month: "mars" year: "deux mille trois" preserve_order: true }
+    '''
+
+    def __init__(self, cardinal: GraphFst, deterministic: bool = True):
+        super().__init__(name="date", kind="classify", deterministic=deterministic)
+
+        cardinal_graph = cardinal.all_nums_no_tokens  # digit string -> number words (used for days and years)
+
+        # Optional article, kept verbatim: 'le ' -> 'le ', 'les ' -> 'les '
+        le_determiner = pynini.accep("le ") | pynini.accep("les ")
+        self.optional_le = pynini.closure(le_determiner, 0, 1)
+
+        # '02' -> 'deux', '17' -> 'dix-sept'; only day numbers 1-31 are accepted
+        optional_leading_zero = delete_leading_zero | NEMO_DIGIT
+        valid_day_number = pynini.union(*[str(x) for x in range(1, 32)])
+        premier = pynini.string_map([("1", "premier")])
+        day_number_to_word = premier | cardinal_graph  # NOTE(review): '1' may read 'premier' or the cardinal form
+        # '@' binds tighter than '+': the article is prepended to the composed day graph, inside day: "..."
+        digit_to_day = self.optional_le + optional_leading_zero @ valid_day_number @ day_number_to_word
+        self.day_graph = pynutil.insert("day: \"") + digit_to_day + pynutil.insert("\"")
+
+        # '03' -> 'mars'
+        normalize_month_number = optional_leading_zero @ pynini.union(*[str(x) for x in range(1, 13)])
+        number_to_month = month_numbers.optimize()
+        month_graph = normalize_month_number @ number_to_month
+        self.month_graph = pynutil.insert("month: \"") + month_graph + pynutil.insert("\"")
+
+        # '2025' -> 'deux mille vingt-cinq'; two to four digits, no leading zero
+        accept_year_digits = (NEMO_DIGIT - "0") + pynini.closure(NEMO_DIGIT, 1, 3)
+        digits_to_year = accept_year_digits @ cardinal_graph
+        self.year_graph = pynutil.insert("year: \"") + digits_to_year + pynutil.insert("\"")
+
+        # Patterns are collected and unioned once at the end, so the empty string is never accepted as a date
+        date_graphs = []
+
+        for separator in ["/", ".", "-"]:  # numeric dates: day<sep>month[<sep>year]
+            date_graphs.append(
+                pynutil.insert("date { ")
+                + self.day_graph
+                + pynutil.delete(separator)
+                + pynutil.insert(" ")
+                + self.month_graph
+                + pynini.closure(pynutil.delete(separator) + pynutil.insert(" ") + self.year_graph, 0, 1)
+                + pynutil.insert(" preserve_order: true }")
+            )
+
+        # Accepts "janvier", "février", etc; pynini.project copies, so the shared month_numbers FST is not mutated
+        month_name_graph = pynutil.insert("month: \"") + pynini.project(month_numbers, "output") + pynutil.insert("\"")
+
+        date_graphs.append(
+            pynutil.insert("date { ")
+            + self.day_graph
+            + pynini.accep(" ")
+            + month_name_graph
+            + pynini.closure(pynini.accep(" ") + self.year_graph, 0, 1)
+            + pynutil.insert(" preserve_order: true }")
+        )
+
+        # Accepts "70s", "80s", etc
+        date_graphs.append(pynutil.insert("date { year: \"") + eras + pynutil.insert("\" preserve_order: true }"))
+
+        # Accepts day ranges: "17-18-19 juin" -> date { day: "dix-sept dix-huit dix-neuf" month: "juin" ... }
+        for separator in ["-", "/"]:
+            day_range_graph = (
+                pynutil.insert("day: \"")
+                + pynini.closure(digit_to_day + pynutil.delete(separator) + pynutil.insert(" "), 1)
+                + digit_to_day
+                + pynutil.insert("\"")
+            )
+
+            date_graphs.append(
+                pynutil.insert("date { ")
+                + day_range_graph
+                + pynini.accep(" ")
+                + month_name_graph
+                + pynini.closure(pynini.accep(" ") + self.year_graph, 0, 1)
+                + pynutil.insert(" preserve_order: true }")
+            )
+
+        self.fst = pynini.union(*date_graphs).optimize()
diff --git a/nemo_text_processing/text_normalization/fr/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/fr/taggers/tokenize_and_classify.py
@@ -26,6 +26,7 @@
 )
 from nemo_text_processing.text_normalization.en.taggers.punctuation import PunctuationFst
 from nemo_text_processing.text_normalization.fr.taggers.cardinal import CardinalFst
+from nemo_text_processing.text_normalization.fr.taggers.date import DateFst
 from nemo_text_processing.text_normalization.fr.taggers.decimals import DecimalFst
 from nemo_text_processing.text_normalization.fr.taggers.fraction import FractionFst
 from nemo_text_processing.text_normalization.fr.taggers.ordinal import OrdinalFst
@@ -86,8 +87,12 @@ def __init__(
             whitelist_graph = self.whitelist.fst
             punct_graph = PunctuationFst(deterministic=deterministic).fst
 
+            self.date = DateFst(self.cardinal, deterministic=deterministic)
+            date_graph = self.date.fst
+
             classify = (
                 pynutil.add_weight(whitelist_graph, 1.01)
+                | pynutil.add_weight(date_graph, 1.1)
                 | pynutil.add_weight(cardinal_graph, 1.1)
                 | pynutil.add_weight(fraction_graph, 1.09)
                 | pynutil.add_weight(ordinal_graph, 1.1)
diff --git a/nemo_text_processing/text_normalization/fr/verbalizers/date.py b/nemo_text_processing/text_normalization/fr/verbalizers/date.py
@@ -0,0 +1,51 @@
+# Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pynini
+from pynini.lib import pynutil
+
+from nemo_text_processing.text_normalization.en.graph_utils import (
+    NEMO_NOT_QUOTE,
+    NEMO_SPACE,
+    GraphFst,
+    delete_preserve_order,
+)
+
+
+class DateFst(GraphFst):
+    """
+    Finite state transducer for verbalizing date, e.g.
+        date {day: "deux" month: "mars" year: "deux mille trois" preserve_order: true} -> deux mars deux mille trois
+    Field values are passed through unchanged; only the field names and surrounding quotes are deleted.
+    Args:
+        deterministic: if True will provide a single transduction option,
+            for False multiple transduction are generated (used for audio-based normalization)
+    """
+
+    def __init__(self, deterministic: bool = True):
+        super().__init__(name="date", kind="verbalize", deterministic=deterministic)
+        # Each field: delete the key and both quotes, keep the non-empty quoted value as-is
+        day = pynutil.delete("day: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")
+        month = pynutil.delete("month: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")
+        year = pynutil.delete("year: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")
+        decade = pynutil.delete("year: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")
+        # Accepted layouts: day month [year] | month year | a lone year field (decade, same pattern as `year`)
+        graph_dmy = day + NEMO_SPACE + month + pynini.closure(NEMO_SPACE + year, 0, 1) + delete_preserve_order
+        graph_my = month + NEMO_SPACE + year + delete_preserve_order
+        graph_decade = decade + delete_preserve_order
+
+        self.graph = graph_dmy | graph_my | graph_decade
+
+        delete_tokens = self.delete_tokens(self.graph)  # presumably strips the `date { ... }` wrapper — confirm
+        self.fst = delete_tokens.optimize()
diff --git a/nemo_text_processing/text_normalization/fr/verbalizers/verbalize.py b/nemo_text_processing/text_normalization/fr/verbalizers/verbalize.py
@@ -14,6 +14,7 @@
 from nemo_text_processing.text_normalization.en.graph_utils import GraphFst
 from nemo_text_processing.text_normalization.en.verbalizers.whitelist import WhiteListFst
 from nemo_text_processing.text_normalization.fr.verbalizers.cardinal import CardinalFst
+from nemo_text_processing.text_normalization.fr.verbalizers.date import DateFst
 from nemo_text_processing.text_normalization.fr.verbalizers.decimals import DecimalFst
 from nemo_text_processing.text_normalization.fr.verbalizers.fraction import FractionFst
 from nemo_text_processing.text_normalization.fr.verbalizers.ordinal import OrdinalFst
@@ -40,6 +41,8 @@ def __init__(self, deterministic: bool = True):
         fraction = FractionFst(ordinal=ordinal, deterministic=deterministic)
         fraction_graph = fraction.fst
         whitelist_graph = WhiteListFst(deterministic=deterministic).fst
+        date = DateFst(deterministic=deterministic)
+        date_graph = date.fst
 
-        graph = cardinal_graph | decimal_graph | ordinal_graph | fraction_graph | whitelist_graph
+        graph = cardinal_graph | decimal_graph | ordinal_graph | fraction_graph | whitelist_graph | date_graph
         self.fst = graph
diff --git a/tests/nemo_text_processing/fr/data_text_normalization/test_cases_date.txt b/tests/nemo_text_processing/fr/data_text_normalization/test_cases_date.txt
@@ -0,0 +1,13 @@
+02.03.2003~deux mars deux mille trois
+02/03/2003~deux mars deux mille trois
+02-03-2003~deux mars deux mille trois
+le 02.03.2003~le deux mars deux mille trois
+17.06~dix-sept juin
+17 janvier~dix-sept janvier
+10 mars 2023~dix mars deux mille vingt-trois
+le 10 mars 2023~le dix mars deux mille vingt-trois
+les 80s~les eighties
+les 17/18 juin~les dix-sept dix-huit juin
+les 17/18/19 mars~les dix-sept dix-huit dix-neuf mars
+les 17-18-19 juin~les dix-sept dix-huit dix-neuf juin
+les 17-18-19 juin 2025~les dix-sept dix-huit dix-neuf juin deux mille vingt-cinq
diff --git a/tests/nemo_text_processing/fr/test_date.py b/tests/nemo_text_processing/fr/test_date.py
@@ -16,6 +16,7 @@
 from parameterized import parameterized
 
 from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer
+from nemo_text_processing.text_normalization.normalize import Normalizer
 
 from ..utils import CACHE_DIR, parse_test_case_file
 
@@ -29,3 +30,12 @@ class TestDate:
     def test_denorm(self, test_input, expected):
         pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False)
         assert pred == expected
+
+    normalizer = Normalizer(input_case='cased', lang='fr', cache_dir=CACHE_DIR, overwrite_cache=False)
+    # Class-level French normalizer reused by every case; cases come from test_cases_date.txt ("input~expected")
+    @parameterized.expand(parse_test_case_file('fr/data_text_normalization/test_cases_date.txt'))
+    @pytest.mark.run_only_on('CPU')
+    @pytest.mark.unit
+    def test_norm(self, test_input, expected):
+        pred = self.normalizer.normalize(test_input, verbose=False)  # written form -> spoken form
+        assert pred == expected
diff --git a/tests/nemo_text_processing/fr/test_sparrowhawk_normalization.sh b/tests/nemo_text_processing/fr/test_sparrowhawk_normalization.sh
@@ -27,6 +27,11 @@ testTNCardinal() {
   runtest $input
 }
 
+testTNDate() {
+  input=$PROJECT_DIR/fr/data_text_normalization/test_cases_date.txt  # FR text-normalization date cases
+  runtest $input  # same pattern as the neighbouring testTN* functions
+}
+
 testTNDecimal() {
   input=$PROJECT_DIR/fr/data_text_normalization/test_cases_decimal.txt
   runtest $input