@@ -12,7 +12,7 @@ Tokenization is a necessary first step in many natural language processing
tasks, such as word counting, parsing, spell checking, corpus generation, and
statistical analysis of text.
- **Tokenizer** is a compact pure-Python (2 and 3) executable
+ **Tokenizer** is a compact pure-Python (>= 3.6) executable
program and module for tokenizing Icelandic text. It converts input text to
streams of *tokens*, where each token is a separate word, punctuation sign,
number/amount, date, e-mail, URL/URI, etc. It also segments the token stream
@@ -194,10 +194,6 @@ An example of shallow tokenization from Python code goes something like this:
.. code-block:: python
- from __future__ import print_function
- # The following import is optional but convenient under Python 2.7
- from __future__ import unicode_literals
-
from tokenizer import split_into_sentences
# A string to be tokenized, containing two sentences
@@ -213,12 +209,12 @@ An example of shallow tokenization from Python code goes something like this:
tokens = sentence.split()
- # Print the tokens, comma-separated
- print(", ".join(tokens))
+ # Print the tokens, separated by vertical bars
+ print("| ".join(tokens))
The program outputs::
- 3., janúar, sl., keypti, ég, 64kWst, rafbíl, .
- Hann, kostaði, €30.000, .
+ 3.| janúar| sl.| keypti| ég| 64kWst| rafbíl| .
+ Hann| kostaði| €30.000| .
Deep tokenization example
=========================
@@ -227,8 +223,6 @@ To do deep tokenization from within Python code:
.. code-block:: python
- # The following import is optional but convenient under Python 2.7
- from __future__ import unicode_literals
from tokenizer import tokenize, TOK
text = ("Málinu var vísað til stjórnskipunar- og eftirlitsnefndar "
@@ -312,11 +306,6 @@ Alternatively, create a token list from the returned generator::
token_list = list(tokenizer.tokenize(mystring))
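
Since ``tokenize()`` returns a lazy generator, tokens can also be consumed
incrementally, without materializing the whole list. A minimal sketch using
only the standard library (the sample string is illustrative):

.. code-block:: python

    from itertools import islice
    from tokenizer import tokenize

    mystring = "Hér er stuttur texti."

    # Consume at most the first ten tokens of the stream
    for token in islice(tokenize(mystring), 10):
        # Skip tokens that have no source text (e.g. sentence delimiters)
        if token.txt:
            print(token.txt)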
- In Python 2.7, you can pass either ``unicode`` strings or ``str``
- byte strings to ``tokenizer.tokenize()``. In the latter case, the
- byte string is assumed to be encoded in UTF-8.
-
-
The ``split_into_sentences()`` function
---------------------------------------
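
As shown in the shallow tokenization example above, this function yields each
sentence of the input as a plain string of space-separated token texts. A
minimal usage sketch (the sample string is illustrative):

.. code-block:: python

    from tokenizer import split_into_sentences

    # Print each detected sentence on its own line
    for sentence in split_into_sentences("Þetta er fyrsta setningin. Þetta er önnur."):
        print(sentence)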
@@ -504,14 +493,14 @@ functions:
The token object
----------------
- Each token is represented by a ``namedtuple`` with three fields:
- ``(kind, txt, val)``.
+ Each token is an instance of the class ``Tok`` that has three main properties:
+ ``kind``, ``txt`` and ``val``.

- The ``kind`` field
- ==================
+ The ``kind`` property
+ =====================

- The ``kind`` field contains one of the following integer constants,
+ The ``kind`` property contains one of the following integer constants,
defined within the ``TOK`` class:

+---------------+---------+---------------------+---------------------------+
@@ -627,14 +616,14 @@ To obtain a descriptive text for a token kind, use
``TOK.descr[token.kind]`` (see example above).
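
For instance, a minimal sketch that prints each token's kind description
alongside its text (the sample string is illustrative):

.. code-block:: python

    from tokenizer import tokenize, TOK

    for token in tokenize("Fundurinn hefst kl. 15:30."):
        # Map the integer kind to its descriptive name, e.g. WORD or TIME
        print(TOK.descr[token.kind], token.txt)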
- The ``txt`` field
- ==================
+ The ``txt`` property
+ ====================

- The ``txt`` field contains the original source text for the token,
+ The ``txt`` property contains the original source text for the token,
with the following exceptions:
* All contiguous whitespace (spaces, tabs, newlines) is coalesced
- into single spaces (``" "``) within the ``txt`` field. A date
+ into single spaces (``" "``) within the ``txt`` string. A date
token that is parsed from a source text of ``"29. \n janúar"``
thus has a ``txt`` of ``"29. janúar"``.
@@ -655,10 +644,10 @@ with the following exceptions:
being escaped (``á``).

- The ``val`` field
- ==================
+ The ``val`` property
+ ====================

- The ``val`` field contains auxiliary information, corresponding to
+ The ``val`` property contains auxiliary information, corresponding to
the token kind, as follows:
- For ``TOK.PUNCTUATION``, the ``val`` field contains a tuple with
@@ -676,40 +665,52 @@ the token kind, as follows:
quotes are represented as Icelandic ones (i.e. „these“ or ‚these‘) in
normalized form, and an ellipsis ("...") is represented as the single
character "…".
+
- For ``TOK.TIME``, the ``val`` field contains an
``(hour, minute, second)`` tuple.
+
- For ``TOK.DATEABS``, the ``val`` field contains a
``(year, month, day)`` tuple (all 1-based).
+
- For ``TOK.DATEREL``, the ``val`` field contains a
``(year, month, day)`` tuple (all 1-based),
except that at least one of the tuple fields is missing and set to 0.
Example: *3. júní* becomes ``TOK.DATEREL`` with the fields ``(0, 6, 3)``
as the year is missing.
+
- For ``TOK.YEAR``, the ``val`` field contains the year as an integer.
A negative number indicates that the year is BCE (*fyrir Krist*),
specified with the suffix *f.Kr.* (e.g. *árið 33 f.Kr.*).
+
- For ``TOK.NUMBER``, the ``val`` field contains a tuple
``(number, None, None)``.
(The two empty fields are included for compatibility with Greynir.)
+
- For ``TOK.WORD``, the ``val`` field contains the full expansion
of an abbreviation, as a list containing a single tuple, or ``None``
if the word is not abbreviated.
+
- For ``TOK.PERCENT``, the ``val`` field contains a tuple
of ``(percentage, None, None)``.
+
- For ``TOK.ORDINAL``, the ``val`` field contains the ordinal value
as an integer. The original ordinal may be a decimal number
or a Roman numeral.
+
- For ``TOK.TIMESTAMP``, the ``val`` field contains
a ``(year, month, day, hour, minute, second)`` tuple.
+
- For ``TOK.AMOUNT``, the ``val`` field contains
an ``(amount, currency, None, None)`` tuple. The amount is a float, and
the currency is an ISO currency code, e.g. *USD* for dollars ($ sign),
*EUR* for euros (€ sign) or *ISK* for Icelandic króna
(*kr.* abbreviation). (The two empty fields are included for
compatibility with Greynir.)
+
- For ``TOK.MEASUREMENT``, the ``val`` field contains a ``(unit, value)``
tuple, where ``unit`` is a base SI unit (such as ``g``, ``m``,
``m²``, ``s``, ``W``, ``Hz``, ``K`` for temperature in Kelvin).
+
- For ``TOK.TELNO``, the ``val`` field contains a tuple: ``(number, cc)``
where the first item is the phone number
in a normalized ``NNN-NNNN`` format, i.e. always including a hyphen,
@@ -733,8 +734,8 @@ An example is *o.s.frv.*, which results in a ``val`` field equal to
``[('og svo framvegis', 0, 'ao', 'frasi', 'o.s.frv.', '-')]``.
The tuple format is designed to be compatible with the
- *Database of Modern Icelandic Inflection* (*DMII*),
- *Beygingarlýsing íslensks nútímamáls*.
+ *Database of Icelandic Morphology* (*DIM*),
+ *Beygingarlýsing íslensks nútímamáls*, i.e. the so-called *Sigrúnarsnið*.
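
As an illustration, a short sketch that retrieves such an expansion from the
``val`` field (the sample string is illustrative; the exact tuple contents
depend on the abbreviation dictionary in use):

.. code-block:: python

    from tokenizer import tokenize, TOK

    for token in tokenize("Þetta gerist árlega, mánaðarlega o.s.frv."):
        if token.kind == TOK.WORD and token.val:
            # val is a list holding a single meaning tuple; its first
            # element is the expansion, e.g. 'og svo framvegis'
            print(token.txt, "->", token.val[0][0])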
Development installation
@@ -804,6 +805,8 @@ can be found in the file ``test/toktest_normal_gold_expected.txt``.
Changelog
---------
+ * Version 3.0.0: Added tracking of character offsets for tokens within the
+ original source text. Added full type annotations. Dropped Python 2.7 support.
* Version 2.5.0: Added arguments for all tokenizer options to the
command-line tool. Type annotations enhanced.
* Version 2.4.0: Fixed bug where certain well-known word forms (*fá*, *fær*, *mín*, *sá*...)