
Commit 18dc777

Merge branch 'master' of https://github.com/mideind/Tokenizer
2 parents 6a3c748 + 4cbbe96

6 files changed: 14 additions, 8 deletions


README.rst

Lines changed: 5 additions & 0 deletions
@@ -560,6 +560,8 @@ defined within the ``TOK`` class:
 | SERIALNUMBER  | 29      | Serial number       | 394-5388                  |
 |               |         |                     | 12-345-6789               |
 +---------------+---------+---------------------+---------------------------+
+| COMPANY *     | 30      | Company name        | [Unused]                  |
++---------------+---------+---------------------+---------------------------+
 | S_BEGIN       | 11001   | Start of sentence   |                           |
 +---------------+---------+---------------------+---------------------------+
 | S_END         | 11002   | End of sentence     |                           |
@@ -752,6 +754,9 @@ can be found in the file ``test/toktest_normal_gold_expected.txt``.
 Changelog
 ---------
 
+* Version 2.0.7: Added ``TOK.COMPANY`` token type; fixed a few abbreviations;
+  renamed parameter ``text`` to ``text_or_gen`` in functions that accept a string
+  or a string iterator
 * Version 2.0.6: Fixed handling of abbreviations such as *m.v.* (*miðað við*)
   that should not start a new sentence even if the following word is capitalized
 * Version 2.0.5: Fixed bug where single uppercase letters were erroneously
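As the changelog entry notes, the parameter is now text_or_gen because these functions accept either a plain string or an iterator of strings. A minimal sketch of both call styles, assuming the public tokenize() entry point; the sample sentence and file name are illustrative, not from the commit:

    # Minimal sketch of both call styles for tokenize(); the sample
    # sentence and file name below are illustrative only.
    from tokenizer import tokenize

    # 1) Passing a plain string
    for tok in tokenize("Hér er stutt setning."):
        print(tok.kind, tok.txt)

    # 2) Passing a generator of strings, e.g. lines read lazily from a file
    def read_lines(path):
        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                yield line

    for tok in tokenize(read_lines("example.txt")):
        print(tok.kind, tok.txt)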

setup.py

Lines changed: 1 addition & 1 deletion
@@ -57,7 +57,7 @@ def read(*names, **kwargs):
 
 setup(
     name="tokenizer",
-    version="2.0.6", # Also update src/tokenizer/__init__.py
+    version="2.0.7", # Also update src/tokenizer/__init__.py
     license="MIT",
     description="A tokenizer for Icelandic text",
     long_description=u"{0}\n{1}".format(

src/tokenizer/Abbrev.conf

Lines changed: 1 addition & 0 deletions
@@ -1118,6 +1118,7 @@ vlf.* = "verkalýðsfélag" hk
 vmf.* = "verkamannafélag" hk
 ohf.* = "opinbert hlutafélag" hk
 bs.* = "byggðasamlag" hk
+hses.* = "húsnæðissjálfseignarstofnun" kvk
 
 AG = "Aktiengesellschaft" hk erl
 AS = "Aktieselskab" hk erl
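For context, a hedged sketch of how an Abbrev.conf entry such as the added hses.* surfaces to callers: recognized abbreviations come through as word tokens, with the configured expansion carried in the token's val field. The exact layout of val is an assumption here, so the example simply prints it for inspection:

    # Hedged sketch (not part of the commit): tokens for abbreviations
    # listed in Abbrev.conf carry their configured expansion in tok.val.
    # The exact structure of val is an assumption; print it to inspect.
    from tokenizer import tokenize

    for tok in tokenize("Félagið er skráð sem hses. í fyrra."):
        print(tok.kind, repr(tok.txt), tok.val)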

src/tokenizer/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -42,4 +42,4 @@
 from .abbrev import Abbreviations, ConfigError
 
 __author__ = u"Miðeind ehf"
-__version__ = u"2.0.6" # Also update setup.py
+__version__ = u"2.0.7" # Also update setup.py

src/tokenizer/tokenizer.py

Lines changed: 4 additions & 4 deletions
@@ -2076,7 +2076,7 @@ def convert_to_num(token):
         yield token
 
 
-def tokenize(text, **options):
+def tokenize(text_or_gen, **options):
     """ Tokenize text in several phases, returning a generator
         (iterable sequence) of tokens that processes tokens on-demand. """
 
@@ -2087,7 +2087,7 @@ def tokenize(text, **options):
     with_annotation = options.pop("with_annotation", True)
     coalesce_percent = options.pop("coalesce_percent", False)
 
-    token_stream = parse_tokens(text, **options)
+    token_stream = parse_tokens(text_or_gen, **options)
     token_stream = parse_particles(token_stream, **options)
     token_stream = parse_sentences(token_stream)
     token_stream = parse_phrases_1(token_stream)
@@ -2100,10 +2100,10 @@ def tokenize(text, **options):
     return (t for t in token_stream if t.kind != TOK.X_END)
 
 
-def tokenize_without_annotation(text, **options):
+def tokenize_without_annotation(text_or_gen, **options):
     """ Tokenize without the last pass which can be done more thoroughly if BÍN
         annotation is available, for instance in ReynirPackage. """
-    return tokenize(text, with_annotation=False, **options)
+    return tokenize(text_or_gen, with_annotation=False, **options)
 
 
 def split_into_sentences(text_or_gen, **options):
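The docstring above describes tokenize() as a chain of generator phases that produces tokens on demand: nothing is consumed until the caller iterates, so the input need not fit in memory. A small illustration of that laziness (the sample text is invented):

    # Illustration of the on-demand pipeline: tokenize() returns a
    # generator, so tokens are produced only as the caller iterates.
    from itertools import islice
    from tokenizer import tokenize

    stream = tokenize("Fyrsta setningin. Önnur setningin.")
    # No tokenization work has happened yet; pull five tokens on demand.
    for tok in islice(stream, 5):
        print(tok.kind, tok.txt)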

src/tokenizer/tokenizer.pyi

Lines changed: 2 additions & 2 deletions
@@ -174,8 +174,8 @@ def month_for_token(token: Tok, after_ordinal: bool = ...) -> Optional[int]: ...
 def parse_phrases_1(token_stream: Iterator[Tok]) -> Iterator[Tok]: ...
 def parse_date_and_time(token_stream: Iterator[Tok]) -> Iterator[Tok]: ...
 def parse_phrases_2(token_stream: Iterator[Tok], coalesce_percent: bool = ...) -> Iterator[Tok]: ...
-def tokenize(text: StringIterable, **options: Options) -> Iterator[Tok]: ...
-def tokenize_without_annotation(text: StringIterable, **options: Options) -> Iterator[Tok]: ...
+def tokenize(text_or_gen: StringIterable, **options: Options) -> Iterator[Tok]: ...
+def tokenize_without_annotation(text_or_gen: StringIterable, **options: Options) -> Iterator[Tok]: ...
 def split_into_sentences(text_or_gen: StringIterable, **options: Options) -> Iterator[str]: ...
 def mark_paragraphs(txt: str) -> str: ...
 def paragraphs(tokens: Iterable[Tok]) -> Iterator[List[SentenceTuple]]: ...
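The stubs show that split_into_sentences() shares the StringIterable parameter type and yields sentences as plain strings. A brief usage sketch based on that signature (the sample text is invented):

    # Sketch based on the stub signature above: split_into_sentences()
    # also accepts a string or string iterator and yields sentence strings.
    from tokenizer import split_into_sentences

    for sentence in split_into_sentences("Þetta er fyrri setningin. Þetta er sú seinni."):
        print(sentence)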
