Skip to content

Commit 340ecb7

Browse files
authored
Merge pull request #49 from mideind/modernize
Modernization
2 parents 8750e9c + 7f5c92b commit 340ecb7

14 files changed

+365
-176
lines changed

.github/workflows/python-package.yml

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,11 @@ jobs:
1515
strategy:
1616
matrix:
1717
os: [ubuntu-latest]
18-
python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12", "pypy-3.9", "pypy-3.10"]
18+
python-version: ["3.9", "3.10", "3.11", "3.12", "3.13.0-rc.1", "pypy-3.9", "pypy-3.10"]
1919

2020
steps:
2121
- uses: actions/checkout@v4
22+
2223
- name: Set up Python ${{ matrix.python-version }}
2324
uses: actions/setup-python@v5
2425
with:
@@ -29,10 +30,10 @@ jobs:
2930
python -m pip install --upgrade pip wheel setuptools
3031
python -m pip install -e ".[dev]"
3132
32-
- name: Type check with mypy (only on Python 3.8)
33+
- name: Type check with mypy (only on oldest supported Python version)
3334
run: |
34-
if [ "${{ matrix.python-version }}" == "3.8" ]; then python -m pip install mypy; fi
35-
if [ "${{ matrix.python-version }}" == "3.8" ]; then mypy --python-version=3.8 src/tokenizer; fi
35+
if [ "${{ matrix.python-version }}" == "3.9" ]; then python -m pip install mypy; fi
36+
if [ "${{ matrix.python-version }}" == "3.9" ]; then mypy --python-version=3.9 src/tokenizer; fi
3637
3738
- name: Test with pytest
3839
run: |

LICENSE.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
MIT License
22

3-
Copyright (C) 2023 Miðeind ehf.
3+
Copyright (C) 2016-2024 Miðeind ehf.
44
Original author: Vilhjálmur Þorsteinsson
55

66
Permission is hereby granted, free of charge, to any person obtaining a copy

MANIFEST.in

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
11
graft src
22
prune src/tokenizer/__pycache__
33
prune src/tokenizer/.mypy_cache
4+
prune src/tokenizer/.DS_Store

README.rst

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ Tokenization is a necessary first step in many natural language processing
1212
tasks, such as word counting, parsing, spell checking, corpus generation, and
1313
statistical analysis of text.
1414

15-
**Tokenizer** is a compact pure-Python (>= 3.8) executable
15+
**Tokenizer** is a compact pure-Python (>=3.9) executable
1616
program and module for tokenizing Icelandic text. It converts input text to
1717
streams of *tokens*, where each token is a separate word, punctuation sign,
1818
number/amount, date, e-mail, URL/URI, etc. It also segments the token stream
@@ -809,6 +809,7 @@ can be found in the file ``test/toktest_normal_gold_expected.txt``.
809809
Changelog
810810
---------
811811

812+
* Version 3.4.5: Compatibility with Python 3.13. Now requires Python 3.9 or later.
812813
* Version 3.4.4: Better handling of abbreviations
813814
* Version 3.4.3: Various minor fixes. Now requires Python 3.8 or later.
814815
* Version 3.4.2: Abbreviations and phrases added, ``META_BEGIN`` token added.

pyproject.toml

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,33 +1,33 @@
11
[project]
22
name = "tokenizer"
3-
version = "3.4.4"
3+
version = "3.4.5"
44
description = "A tokenizer for Icelandic text"
55
authors = [{ name = "Miðeind ehf.", email = "mideind@mideind.is" }]
66
readme = { file = "README.rst", content-type = "text/x-rst" }
7-
license = { file = "LICENSE.txt" }
8-
# For classifier list see: https://pypi.org/pypi?%3Aaction=list_classifiers
7+
license = { text = "MIT" }
98
classifiers = [
109
"Development Status :: 5 - Production/Stable",
1110
"Intended Audience :: Developers",
1211
"License :: OSI Approved :: MIT License",
1312
"Operating System :: Unix",
1413
"Operating System :: POSIX",
14+
"Operating System :: MacOS",
1515
"Operating System :: Microsoft :: Windows",
1616
"Natural Language :: Icelandic",
1717
"Programming Language :: Python",
1818
"Programming Language :: Python :: 3",
19-
"Programming Language :: Python :: 3.8",
2019
"Programming Language :: Python :: 3.9",
2120
"Programming Language :: Python :: 3.10",
2221
"Programming Language :: Python :: 3.11",
2322
"Programming Language :: Python :: 3.12",
23+
"Programming Language :: Python :: 3.13",
2424
"Programming Language :: Python :: Implementation :: CPython",
2525
"Programming Language :: Python :: Implementation :: PyPy",
2626
"Topic :: Software Development :: Libraries :: Python Modules",
2727
"Topic :: Utilities",
2828
"Topic :: Text Processing :: Linguistic",
2929
]
30-
requires-python = ">=3.8"
30+
requires-python = ">=3.9"
3131

3232
[project.urls]
3333
Repository = "https://github.com/mideind/Tokenizer"
@@ -51,17 +51,17 @@ where = ["src"]
5151
[tool.pytest.ini_options]
5252
filterwarnings = [
5353
# Ignore deprecation warnings in libraries, their problem not ours
54-
"ignore::DeprecationWarning",
54+
# "ignore::DeprecationWarning",
5555
]
5656

5757
[tool.ruff]
58-
line-length = 120
58+
line-length = 88
5959

6060
[tool.black]
61-
line-length = 120
61+
line-length = 88
6262

6363
[tool.isort]
6464
# This forces these imports to be placed at the top
6565
known_future_library = ["__future__", "typing", "typing_extensions"]
6666
profile = "black"
67-
line_length = 120
67+
line_length = 88

src/tokenizer/__init__.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
"""
22
3-
Copyright(C) 2022 Miðeind ehf.
3+
Copyright(C) 2016-2024 Miðeind ehf.
44
Original author: Vilhjálmur Þorsteinsson
55
66
This software is licensed under the MIT License:
@@ -63,9 +63,8 @@
6363
from .abbrev import Abbreviations, ConfigError
6464

6565
__author__ = "Miðeind ehf."
66-
__copyright__ = "(C) 2023 Miðeind ehf."
67-
__version__ = importlib.metadata.version("tokenizer")
68-
66+
__copyright__ = "(C) 2016-2024 Miðeind ehf."
67+
__version__ = importlib.metadata.version(__name__)
6968

7069
__all__ = (
7170
"__author__",

src/tokenizer/abbrev.py

Lines changed: 66 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
33
Abbreviations module for tokenization of Icelandic text
44
5-
Copyright (C) 2022 Miðeind ehf.
5+
Copyright (C) 2016-2024 Miðeind ehf.
66
Original author: Vilhjálmur Þorsteinsson
77
88
This software is licensed under the MIT License:
@@ -33,35 +33,33 @@
3333
3434
"""
3535

36-
from typing import Generic, Iterator, Optional, Set, List, Dict, TypeVar
36+
from typing import Generic, Iterator, Optional, TypeVar
3737

3838
from threading import Lock
3939
from collections import defaultdict, OrderedDict
40-
from importlib.resources import open_text
40+
import importlib.resources as importlib_resources
4141

4242
from .definitions import BIN_Tuple
4343

4444

4545
class ConfigError(Exception):
46-
4746
pass
4847

4948

5049
_T = TypeVar("_T")
5150

5251

5352
class OrderedSet(Generic[_T]):
54-
55-
""" Shim class to provide an ordered set API on top
56-
of an OrderedDict. This is necessary to make abbreviation
57-
lookups predictable and repeatable, which they would not be
58-
if a standard Python set() was used. """
53+
"""Shim class to provide an ordered set API on top
54+
of an OrderedDict. This is necessary to make abbreviation
55+
lookups predictable and repeatable, which they would not be
56+
if a standard Python set() was used."""
5957

6058
def __init__(self) -> None:
61-
self._dict: Dict[_T, None] = OrderedDict()
59+
self._dict: dict[_T, None] = OrderedDict()
6260

6361
def add(self, item: _T) -> None:
64-
""" Add an item at the end of the ordered set """
62+
"""Add an item at the end of the ordered set"""
6563
if item not in self._dict:
6664
self._dict[item] = None
6765

@@ -73,42 +71,41 @@ def __iter__(self) -> Iterator[_T]:
7371

7472

7573
class Abbreviations:
76-
77-
""" Wrapper around dictionary of abbreviations,
78-
initialized from the config file """
74+
"""Wrapper around dictionary of abbreviations,
75+
initialized from the config file"""
7976

8077
# Dictionary of abbreviations and their meanings
81-
DICT: Dict[str, OrderedSet[BIN_Tuple]] = defaultdict(OrderedSet)
78+
DICT: dict[str, OrderedSet[BIN_Tuple]] = defaultdict(OrderedSet)
8279
# Wrong versions of abbreviations
83-
WRONGDICT: Dict[str, OrderedSet[BIN_Tuple]] = defaultdict(OrderedSet)
80+
WRONGDICT: dict[str, OrderedSet[BIN_Tuple]] = defaultdict(OrderedSet)
8481
# All abbreviation meanings
85-
MEANINGS: Set[str] = set()
82+
MEANINGS: set[str] = set()
8683
# Single-word abbreviations, i.e. those with only one dot at the end
87-
SINGLES: Set[str] = set()
84+
SINGLES: set[str] = set()
8885
# Set of abbreviations without periods, e.g. "td", "osfrv"
89-
WRONGSINGLES: Set[str] = set()
86+
WRONGSINGLES: set[str] = set()
9087
# Potential sentence finishers, i.e. those with a dot at the end,
9188
# marked with an asterisk in the config file
92-
FINISHERS: Set[str] = set()
89+
FINISHERS: set[str] = set()
9390
# Abbreviations that should not be seen as such at the end of sentences,
9491
# marked with an exclamation mark in the config file
95-
NOT_FINISHERS: Set[str] = set()
92+
NOT_FINISHERS: set[str] = set()
9693
# Abbreviations that should not be seen as such at the end of sentences, but
9794
# are allowed in front of person names; marked with a hat ^ in the config file
98-
NAME_FINISHERS: Set[str] = set()
95+
NAME_FINISHERS: set[str] = set()
9996
# Wrong versions of abbreviations with possible corrections
10097
# wrong version : [correction1, correction2, ...]
101-
WRONGDOTS: Dict[str, List[str]] = defaultdict(list)
98+
WRONGDOTS: dict[str, list[str]] = defaultdict(list)
10299
# Word forms that should never be interpreted as abbreviations
103-
NOT_ABBREVIATIONS: Set[str] = set()
100+
NOT_ABBREVIATIONS: set[str] = set()
104101

105102
# Ensure that only one thread initializes the abbreviations
106103
_lock = Lock()
107104

108105
@staticmethod
109106
def add(abbrev: str, meaning: str, gender: str, fl: Optional[str] = None) -> None:
110-
""" Add an abbreviation to the dictionary.
111-
Called from the config file handler. """
107+
"""Add an abbreviation to the dictionary.
108+
Called from the config file handler."""
112109
# Check for sentence finishers
113110
finisher = False
114111
not_finisher = False
@@ -152,7 +149,14 @@ def add(abbrev: str, meaning: str, gender: str, fl: Optional[str] = None) -> Non
152149
# Append the abbreviation and its meaning in tuple form
153150
# Multiple meanings are supported for each abbreviation
154151
Abbreviations.DICT[abbrev].add(
155-
BIN_Tuple(meaning, 0, gender, "skst" if fl is None else fl, abbrev, "-",)
152+
BIN_Tuple(
153+
meaning,
154+
0,
155+
gender,
156+
"skst" if fl is None else fl,
157+
abbrev,
158+
"-",
159+
)
156160
)
157161
Abbreviations.MEANINGS.add(meaning)
158162
# Adding wrong versions of abbreviations
@@ -169,7 +173,14 @@ def add(abbrev: str, meaning: str, gender: str, fl: Optional[str] = None) -> Non
169173
# as abbreviations, even though they are listed as such
170174
# in the form 'Í.' and 'Á.' for use within person names
171175
Abbreviations.WRONGDICT[wabbrev].add(
172-
BIN_Tuple(meaning, 0, gender, "skst" if fl is None else fl, wabbrev, "-",)
176+
BIN_Tuple(
177+
meaning,
178+
0,
179+
gender,
180+
"skst" if fl is None else fl,
181+
wabbrev,
182+
"-",
183+
)
173184
)
174185

175186
elif "." in abbrev:
@@ -182,15 +193,22 @@ def add(abbrev: str, meaning: str, gender: str, fl: Optional[str] = None) -> Non
182193
wabbrev = abbrev[:i] + abbrev[i + 1 :]
183194
Abbreviations.WRONGDOTS[wabbrev].append(abbrev)
184195
Abbreviations.WRONGDICT[wabbrev].add(
185-
BIN_Tuple(meaning, 0, gender, "skst" if fl is None else fl, wabbrev, "-",)
196+
BIN_Tuple(
197+
meaning,
198+
0,
199+
gender,
200+
"skst" if fl is None else fl,
201+
wabbrev,
202+
"-",
203+
)
186204
)
187205
if len(indices) > 2:
188206
# 3 or 4 dots currently in vocabulary
189207
# Not all cases with 4 dots are handled.
190208
i1 = indices[0]
191209
i2 = indices[1]
192210
i3 = indices[2]
193-
wabbrevs: List[str] = []
211+
wabbrevs: list[str] = []
194212
# 1 and 2 removed
195213
wabbrevs.append(abbrev[:i1] + abbrev[i1 + 1 : i2] + abbrev[i2 + 1 :])
196214
# 1 and 3 removed
@@ -214,7 +232,14 @@ def add(abbrev: str, meaning: str, gender: str, fl: Optional[str] = None) -> Non
214232
Abbreviations.WRONGSINGLES.add(wabbrev)
215233
Abbreviations.WRONGDOTS[wabbrev].append(abbrev)
216234
Abbreviations.WRONGDICT[wabbrev].add(
217-
BIN_Tuple(meaning, 0, gender, "skst" if fl is None else fl, wabbrev, "-",)
235+
BIN_Tuple(
236+
meaning,
237+
0,
238+
gender,
239+
"skst" if fl is None else fl,
240+
wabbrev,
241+
"-",
242+
)
218243
)
219244
if finisher:
220245
Abbreviations.FINISHERS.add(abbrev)
@@ -232,16 +257,16 @@ def has_abbreviation(meaning: str) -> bool:
232257
return meaning in Abbreviations.MEANINGS
233258

234259
@staticmethod
235-
def get_meaning(abbrev: str) -> Optional[List[BIN_Tuple]]:
236-
""" Lookup meaning(s) of abbreviation, if available. """
260+
def get_meaning(abbrev: str) -> Optional[list[BIN_Tuple]]:
261+
"""Look up meaning(s) of abbreviation, if available."""
237262
m = Abbreviations.DICT.get(abbrev)
238263
if not m:
239264
m = Abbreviations.WRONGDICT.get(abbrev)
240265
return list(m) if m else None
241266

242267
@staticmethod
243268
def _handle_abbreviations(s: str) -> None:
244-
""" Handle abbreviations in the settings section """
269+
"""Handle abbreviations in the settings section"""
245270
# Format: abbrev[*] = "meaning" gender (kk|kvk|hk)
246271
# An asterisk after an abbreviation ending with a period
247272
# indicates that the abbreviation may finish a sentence
@@ -272,22 +297,25 @@ def _handle_abbreviations(s: str) -> None:
272297

273298
@staticmethod
274299
def _handle_not_abbreviations(s: str) -> None:
275-
""" Handle not_abbreviations in the settings section """
300+
"""Handle not_abbreviations in the settings section"""
276301
if len(s) < 3 or s[0] != '"' or s[-1] != '"':
277302
raise ConfigError("not_abbreviations should be enclosed in double quotes")
278303
Abbreviations.NOT_ABBREVIATIONS.add(s[1:-1])
279304

280305
@staticmethod
281306
def initialize():
282-
""" Read the abbreviations config file """
307+
"""Read the abbreviations config file"""
283308
with Abbreviations._lock:
284309
if len(Abbreviations.DICT):
285310
# Already initialized
286311
return
287312

288313
section = None
289-
config = open_text(package="tokenizer", resource="Abbrev.conf", encoding="utf-8")
290-
for s in config:
314+
315+
p = importlib_resources.files("tokenizer").joinpath("Abbrev.conf")
316+
config = p.read_text(encoding="utf-8")
317+
318+
for s in config.split("\n"):
291319
# Ignore comments
292320
ix = s.find("#")
293321
if ix >= 0:

0 commit comments

Comments
 (0)