18
18
# Pre-compiled pattern matching a run of one or more whitespace characters.
_whitespace_re = re .compile (r"\s+" )
19
19
20
20
21
def expand_abbreviations(text: str, lang: str = "en") -> str:
    """Expand abbreviations in ``text`` using the table for ``lang``.

    Args:
        text: Input text.
        lang: Language code; only ``"en"`` and ``"fr"`` are handled.

    Returns:
        The text after applying every ``(regex, replacement)`` pair of the
        selected table, in table order.

    Raises:
        ValueError: If ``lang`` is neither ``"en"`` nor ``"fr"``.
    """
    # Reject unsupported languages up front so the happy path stays flat.
    if lang not in ("en", "fr"):
        msg = f"Language {lang} not supported in expand_abbreviations"
        raise ValueError(msg)

    table = abbreviations_en if lang == "en" else abbreviations_fr
    for pattern, substitute in table:
        text = re.sub(pattern, substitute, text)
    return text
29
32
30
33
31
def lowercase(text: str) -> str:
    """Return ``text`` converted to lowercase via ``str.lower``."""
    lowered = text.lower()
    return lowered
33
36
34
37
35
def collapse_whitespace(text: str) -> str:
    """Replace every whitespace run with a single space and trim both ends."""
    collapsed = _whitespace_re.sub(" ", text)
    return collapsed.strip()
37
40
38
41
39
def convert_to_ascii(text: str) -> str:
    """Pass ``text`` through ``anyascii`` and return the result."""
    transliterated = anyascii(text)
    return transliterated
41
44
42
45
43
def remove_aux_symbols(text: str) -> str:
    """Delete angle brackets, parentheses, square brackets and double quotes from ``text``."""
    return re.sub(r"[\<\>\(\)\[\]\"]+", "", text)
46
49
47
50
48
- def replace_symbols (text , lang : Optional [str ] = "en" ):
51
+ def replace_symbols (text : str , lang : Optional [str ] = "en" ) -> str :
49
52
"""Replace symbols based on the language tag.
50
53
51
54
Args:
@@ -78,38 +81,38 @@ def replace_symbols(text, lang: Optional[str] = "en"):
78
81
return text
79
82
80
83
81
def basic_cleaners(text: str) -> str:
    """Basic pipeline: lowercase the text, then collapse whitespace. No transliteration."""
    return collapse_whitespace(lowercase(text))
86
89
87
90
88
def transliteration_cleaners(text: str) -> str:
    """Pipeline for non-English text.

    The ASCII transliteration step (``convert_to_ascii``) is currently
    disabled, so this only lowercases the text and collapses whitespace.
    """
    lowered = lowercase(text)
    return collapse_whitespace(lowered)
94
97
95
98
96
def basic_german_cleaners(text: str) -> str:
    """Pipeline for German text: lowercase, then collapse whitespace."""
    lowered = lowercase(text)
    return collapse_whitespace(lowered)
101
104
102
105
103
106
# TODO: elaborate it
104
def basic_turkish_cleaners(text: str) -> str:
    """Pipeline for Turkish text.

    Maps the Turkish uppercase letters "I" and "İ" to their lowercase
    counterparts, then lowercases the rest and collapses whitespace.

    Args:
        text: Input text.

    Returns:
        The cleaned text.
    """
    # Turkish casing pairs are I/ı and İ/i. Plain ``str.lower()`` would turn
    # "I" into "i" and "İ" (U+0130) into "i" followed by U+0307 (combining dot
    # above), so both letters are mapped explicitly before lowercasing.
    text = text.replace("I", "ı").replace("İ", "i")
    text = lowercase(text)
    text = collapse_whitespace(text)
    return text
110
113
111
114
112
- def english_cleaners (text ) :
115
+ def english_cleaners (text : str ) -> str :
113
116
"""Pipeline for English text, including number and abbreviation expansion."""
114
117
# text = convert_to_ascii(text)
115
118
text = lowercase (text )
@@ -122,7 +125,7 @@ def english_cleaners(text):
122
125
return text
123
126
124
127
125
- def phoneme_cleaners (text ) :
128
+ def phoneme_cleaners (text : str ) -> str :
126
129
"""Pipeline for phonemes mode, including number and abbreviation expansion.
127
130
128
131
NB: This cleaner converts numbers into English words, for other languages
@@ -136,15 +139,15 @@ def phoneme_cleaners(text):
136
139
return text
137
140
138
141
139
def multilingual_phoneme_cleaners(text: str) -> str:
    """Pipeline for phonemes mode on multilingual text.

    Runs ``replace_symbols`` with ``lang=None``, removes auxiliary symbols
    and collapses whitespace.
    """
    symbols_replaced = replace_symbols(text, lang=None)
    aux_removed = remove_aux_symbols(symbols_replaced)
    return collapse_whitespace(aux_removed)
145
148
146
149
147
- def french_cleaners (text ) :
150
+ def french_cleaners (text : str ) -> str :
148
151
"""Pipeline for French text. There is no need to expand numbers, phonemizer already does that"""
149
152
text = expand_abbreviations (text , lang = "fr" )
150
153
text = lowercase (text )
@@ -154,7 +157,7 @@ def french_cleaners(text):
154
157
return text
155
158
156
159
157
- def portuguese_cleaners (text ) :
160
+ def portuguese_cleaners (text : str ) -> str :
158
161
"""Basic pipeline for Portuguese text. There is no need to expand abbreviation and
159
162
numbers, phonemizer already does that"""
160
163
text = lowercase (text )
@@ -170,7 +173,7 @@ def chinese_mandarin_cleaners(text: str) -> str:
170
173
return text
171
174
172
175
173
- def multilingual_cleaners (text ) :
176
+ def multilingual_cleaners (text : str ) -> str :
174
177
"""Pipeline for multilingual text"""
175
178
text = lowercase (text )
176
179
text = replace_symbols (text , lang = None )
@@ -179,7 +182,7 @@ def multilingual_cleaners(text):
179
182
return text
180
183
181
184
182
def no_cleaners(text: str) -> str:
    """Return ``text`` unchanged except that newline characters are removed.

    Args:
        text: Input text.

    Returns:
        The text with every ``"\\n"`` removed.
    """
    # Match the bare newline: the previous literal also required a trailing
    # space, so ordinary line breaks were left in the output.
    return text.replace("\n", "")
0 commit comments