2
2
3
3
Abbreviations module for tokenization of Icelandic text
4
4
5
- Copyright (C) 2022 Miðeind ehf.
5
+ Copyright (C) 2016-2024 Miðeind ehf.
6
6
Original author: Vilhjálmur Þorsteinsson
7
7
8
8
This software is licensed under the MIT License:
33
33
34
34
"""
35
35
36
- from typing import Generic , Iterator , Optional , Set , List , Dict , TypeVar
36
+ from typing import Generic , Iterator , Optional , TypeVar
37
37
38
38
from threading import Lock
39
39
from collections import defaultdict , OrderedDict
40
- from importlib .resources import open_text
40
+ import importlib .resources as importlib_resources
41
41
42
42
from .definitions import BIN_Tuple
43
43
44
44
45
45
class ConfigError(Exception):
    """Raised when an entry in the abbreviations configuration is malformed,
    for instance a not_abbreviations entry that is not enclosed
    in double quotes."""
48
47
49
48
50
49
# Type variable for the element type held by an OrderedSet
_T = TypeVar ("_T" )
51
50
52
51
53
52
class OrderedSet (Generic [_T ]):
54
-
55
- """ Shim class to provide an ordered set API on top
56
- of an OrderedDict. This is necessary to make abbreviation
57
- lookups predictable and repeatable, which they would not be
58
- if a standard Python set() was used. """
53
+ """Shim class to provide an ordered set API on top
54
+ of an OrderedDict. This is necessary to make abbreviation
55
+ lookups predictable and repeatable, which they would not be
56
+ if a standard Python set() was used."""
59
57
60
58
def __init__(self) -> None:
    """Create an empty ordered set"""
    # The members are the keys of an OrderedDict, which keeps them in
    # insertion order; the values are unused and always None
    self._dict: dict[_T, None] = OrderedDict()
62
60
63
61
def add(self, item: _T) -> None:
    """Add an item at the end of the ordered set.
    An item that is already present is left in its original position."""
    if item not in self._dict:
        # Only the key matters; the value is a placeholder
        self._dict[item] = None
67
65
@@ -73,42 +71,41 @@ def __iter__(self) -> Iterator[_T]:
73
71
74
72
75
73
class Abbreviations :
76
-
77
- """ Wrapper around dictionary of abbreviations,
78
- initialized from the config file """
74
+ """Wrapper around dictionary of abbreviations,
75
+ initialized from the config file"""
79
76
80
77
# Dictionary of abbreviations and their meanings
DICT: dict[str, OrderedSet[BIN_Tuple]] = defaultdict(OrderedSet)
# Wrong versions of abbreviations
WRONGDICT: dict[str, OrderedSet[BIN_Tuple]] = defaultdict(OrderedSet)
# All abbreviation meanings
MEANINGS: set[str] = set()
# Single-word abbreviations, i.e. those with only one dot at the end
SINGLES: set[str] = set()
# Set of abbreviations without periods, e.g. "td", "osfrv"
WRONGSINGLES: set[str] = set()
# Potential sentence finishers, i.e. those with a dot at the end,
# marked with an asterisk in the config file
FINISHERS: set[str] = set()
# Abbreviations that should not be seen as such at the end of sentences,
# marked with an exclamation mark in the config file
NOT_FINISHERS: set[str] = set()
# Abbreviations that should not be seen as such at the end of sentences, but
# are allowed in front of person names; marked with a hat ^ in the config file
NAME_FINISHERS: set[str] = set()
# Wrong versions of abbreviations with possible corrections
# wrong version : [correction1, correction2, ...]
WRONGDOTS: dict[str, list[str]] = defaultdict(list)
# Word forms that should never be interpreted as abbreviations
NOT_ABBREVIATIONS: set[str] = set()

# Ensure that only one thread initializes the abbreviations
_lock = Lock()
107
104
108
105
@staticmethod
109
106
def add (abbrev : str , meaning : str , gender : str , fl : Optional [str ] = None ) -> None :
110
- """ Add an abbreviation to the dictionary.
111
- Called from the config file handler. """
107
+ """Add an abbreviation to the dictionary.
108
+ Called from the config file handler."""
112
109
# Check for sentence finishers
113
110
finisher = False
114
111
not_finisher = False
@@ -152,7 +149,14 @@ def add(abbrev: str, meaning: str, gender: str, fl: Optional[str] = None) -> Non
152
149
# Append the abbreviation and its meaning in tuple form
153
150
# Multiple meanings are supported for each abbreviation
154
151
Abbreviations .DICT [abbrev ].add (
155
- BIN_Tuple (meaning , 0 , gender , "skst" if fl is None else fl , abbrev , "-" ,)
152
+ BIN_Tuple (
153
+ meaning ,
154
+ 0 ,
155
+ gender ,
156
+ "skst" if fl is None else fl ,
157
+ abbrev ,
158
+ "-" ,
159
+ )
156
160
)
157
161
Abbreviations .MEANINGS .add (meaning )
158
162
# Adding wrong versions of abbreviations
@@ -169,7 +173,14 @@ def add(abbrev: str, meaning: str, gender: str, fl: Optional[str] = None) -> Non
169
173
# as abbreviations, even though they are listed as such
170
174
# in the form 'Í.' and 'Á.' for use within person names
171
175
Abbreviations .WRONGDICT [wabbrev ].add (
172
- BIN_Tuple (meaning , 0 , gender , "skst" if fl is None else fl , wabbrev , "-" ,)
176
+ BIN_Tuple (
177
+ meaning ,
178
+ 0 ,
179
+ gender ,
180
+ "skst" if fl is None else fl ,
181
+ wabbrev ,
182
+ "-" ,
183
+ )
173
184
)
174
185
175
186
elif "." in abbrev :
@@ -182,15 +193,22 @@ def add(abbrev: str, meaning: str, gender: str, fl: Optional[str] = None) -> Non
182
193
wabbrev = abbrev [:i ] + abbrev [i + 1 :]
183
194
Abbreviations .WRONGDOTS [wabbrev ].append (abbrev )
184
195
Abbreviations .WRONGDICT [wabbrev ].add (
185
- BIN_Tuple (meaning , 0 , gender , "skst" if fl is None else fl , wabbrev , "-" ,)
196
+ BIN_Tuple (
197
+ meaning ,
198
+ 0 ,
199
+ gender ,
200
+ "skst" if fl is None else fl ,
201
+ wabbrev ,
202
+ "-" ,
203
+ )
186
204
)
187
205
if len (indices ) > 2 :
188
206
# 3 or 4 dots currently in vocabulary
189
207
# Not all cases with 4 dots are handled.
190
208
i1 = indices [0 ]
191
209
i2 = indices [1 ]
192
210
i3 = indices [2 ]
193
- wabbrevs : List [str ] = []
211
+ wabbrevs : list [str ] = []
194
212
# 1 and 2 removed
195
213
wabbrevs .append (abbrev [:i1 ] + abbrev [i1 + 1 : i2 ] + abbrev [i2 + 1 :])
196
214
# 1 and 3 removed
@@ -214,7 +232,14 @@ def add(abbrev: str, meaning: str, gender: str, fl: Optional[str] = None) -> Non
214
232
Abbreviations .WRONGSINGLES .add (wabbrev )
215
233
Abbreviations .WRONGDOTS [wabbrev ].append (abbrev )
216
234
Abbreviations .WRONGDICT [wabbrev ].add (
217
- BIN_Tuple (meaning , 0 , gender , "skst" if fl is None else fl , wabbrev , "-" ,)
235
+ BIN_Tuple (
236
+ meaning ,
237
+ 0 ,
238
+ gender ,
239
+ "skst" if fl is None else fl ,
240
+ wabbrev ,
241
+ "-" ,
242
+ )
218
243
)
219
244
if finisher :
220
245
Abbreviations .FINISHERS .add (abbrev )
@@ -232,16 +257,16 @@ def has_abbreviation(meaning: str) -> bool:
232
257
return meaning in Abbreviations .MEANINGS
233
258
234
259
@staticmethod
235
def get_meaning(abbrev: str) -> Optional[list[BIN_Tuple]]:
    """Look up meaning(s) of abbreviation, if available.

    The dictionary of abbreviations is consulted first; if that yields
    nothing, the dictionary of wrong versions is tried. Returns a list
    of the meanings found, or None if there are none."""
    # dict.get() does not trigger the defaultdict factory, so a failed
    # lookup leaves the dictionary unchanged
    m = Abbreviations.DICT.get(abbrev)
    if not m:
        m = Abbreviations.WRONGDICT.get(abbrev)
    return list(m) if m else None
241
266
242
267
@staticmethod
243
268
def _handle_abbreviations (s : str ) -> None :
244
- """ Handle abbreviations in the settings section """
269
+ """Handle abbreviations in the settings section"""
245
270
# Format: abbrev[*] = "meaning" gender (kk|kvk|hk)
246
271
# An asterisk after an abbreviation ending with a period
247
272
# indicates that the abbreviation may finish a sentence
@@ -272,22 +297,25 @@ def _handle_abbreviations(s: str) -> None:
272
297
273
298
@staticmethod
274
299
def _handle_not_abbreviations(s: str) -> None:
    """Handle not_abbreviations in the settings section.

    s is a single entry, which must be a non-empty word form enclosed
    in double quotes. The text between the quotes is added to
    Abbreviations.NOT_ABBREVIATIONS.

    Raises ConfigError if the entry is empty or not properly quoted."""
    # The shortest valid entry is three characters long: "x"
    if len(s) < 3 or s[0] != '"' or s[-1] != '"':
        raise ConfigError("not_abbreviations should be enclosed in double quotes")
    Abbreviations.NOT_ABBREVIATIONS.add(s[1:-1])
279
304
280
305
@staticmethod
281
306
def initialize ():
282
- """ Read the abbreviations config file """
307
+ """Read the abbreviations config file"""
283
308
with Abbreviations ._lock :
284
309
if len (Abbreviations .DICT ):
285
310
# Already initialized
286
311
return
287
312
288
313
section = None
289
- config = open_text (package = "tokenizer" , resource = "Abbrev.conf" , encoding = "utf-8" )
290
- for s in config :
314
+
315
+ p = importlib_resources .files ("tokenizer" ).joinpath ("Abbrev.conf" )
316
+ config = p .read_text (encoding = "utf-8" )
317
+
318
+ for s in config .split ("\n " ):
291
319
# Ignore comments
292
320
ix = s .find ("#" )
293
321
if ix >= 0 :
0 commit comments