
Commit 18dc777

Merge branch 'master' of https://github.com/mideind/Tokenizer
2 parents 6a3c748 + 4cbbe96

6 files changed: 14 additions, 8 deletions


README.rst

Lines changed: 5 additions & 0 deletions
@@ -560,6 +560,8 @@ defined within the ``TOK`` class:
 | SERIALNUMBER  | 29      | Serial number       | 394-5388                  |
 |               |         |                     | 12-345-6789               |
 +---------------+---------+---------------------+---------------------------+
+| COMPANY *     | 30      | Company name        | [Unused]                  |
++---------------+---------+---------------------+---------------------------+
 | S_BEGIN       | 11001   | Start of sentence   |                           |
 +---------------+---------+---------------------+---------------------------+
 | S_END         | 11002   | End of sentence     |                           |
@@ -752,6 +754,9 @@ can be found in the file ``test/toktest_normal_gold_expected.txt``.
 Changelog
 ---------
 
+* Version 2.0.7: Added ``TOK.COMPANY`` token type; fixed a few abbreviations;
+  renamed parameter ``text`` to ``text_or_gen`` in functions that accept a string
+  or a string iterator
 * Version 2.0.6: Fixed handling of abbreviations such as *m.v.* (*miðað við*)
   that should not start a new sentence even if the following word is capitalized
 * Version 2.0.5: Fixed bug where single uppercase letters were erroneously
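As the changelog entry notes, the parameter is now text_or_gen because these functions accept either a plain string or an iterator of strings. A minimal sketch of both call styles, assuming the public tokenize() entry point; the sample sentence and file name are illustrative, not from the commit:

    # Minimal sketch of both call styles for tokenize(); the sample
    # sentence and file name below are illustrative only.
    from tokenizer import tokenize

    # 1) Passing a plain string
    for tok in tokenize("Hér er stutt setning."):
        print(tok.kind, tok.txt)

    # 2) Passing a generator of strings, e.g. lines read lazily from a file
    def read_lines(path):
        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                yield line

    for tok in tokenize(read_lines("example.txt")):
        print(tok.kind, tok.txt)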

setup.py

Lines changed: 1 addition & 1 deletion
@@ -57,7 +57,7 @@ def read(*names, **kwargs):
 
 setup(
     name="tokenizer",
-    version="2.0.6", # Also update src/tokenizer/__init__.py
+    version="2.0.7", # Also update src/tokenizer/__init__.py
     license="MIT",
     description="A tokenizer for Icelandic text",
     long_description=u"{0}\n{1}".format(

src/tokenizer/Abbrev.conf

Lines changed: 1 addition & 0 deletions
@@ -1118,6 +1118,7 @@ vlf.* = "verkalýðsfélag" hk
 vmf.* = "verkamannafélag" hk
 ohf.* = "opinbert hlutafélag" hk
 bs.* = "byggðasamlag" hk
+hses.* = "húsnæðissjálfseignarstofnun" kvk
 
 AG = "Aktiengesellschaft" hk erl
 AS = "Aktieselskab" hk erl
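For context, a hedged sketch of how an Abbrev.conf entry such as the added hses.* surfaces to callers: recognized abbreviations come through as word tokens, with the configured expansion carried in the token's val field. The exact layout of val is an assumption here, so the example simply prints it for inspection:

    # Hedged sketch (not part of the commit): tokens for abbreviations
    # listed in Abbrev.conf carry their configured expansion in tok.val.
    # The exact structure of val is an assumption; print it to inspect.
    from tokenizer import tokenize

    for tok in tokenize("Félagið er skráð sem hses. í fyrra."):
        print(tok.kind, repr(tok.txt), tok.val)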

src/tokenizer/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -42,4 +42,4 @@
 from .abbrev import Abbreviations, ConfigError
 
 __author__ = u"Miðeind ehf"
-__version__ = u"2.0.6" # Also update setup.py
+__version__ = u"2.0.7" # Also update setup.py

src/tokenizer/tokenizer.py

Lines changed: 4 additions & 4 deletions
@@ -2076,7 +2076,7 @@ def convert_to_num(token):
         yield token
 
 
-def tokenize(text, **options):
+def tokenize(text_or_gen, **options):
     """ Tokenize text in several phases, returning a generator
         (iterable sequence) of tokens that processes tokens on-demand. """
 
@@ -2087,7 +2087,7 @@ def tokenize(text, **options):
     with_annotation = options.pop("with_annotation", True)
     coalesce_percent = options.pop("coalesce_percent", False)
 
-    token_stream = parse_tokens(text, **options)
+    token_stream = parse_tokens(text_or_gen, **options)
     token_stream = parse_particles(token_stream, **options)
     token_stream = parse_sentences(token_stream)
     token_stream = parse_phrases_1(token_stream)
@@ -2100,10 +2100,10 @@ def tokenize(text, **options):
     return (t for t in token_stream if t.kind != TOK.X_END)
 
 
-def tokenize_without_annotation(text, **options):
+def tokenize_without_annotation(text_or_gen, **options):
     """ Tokenize without the last pass which can be done more thoroughly if BÍN
         annotation is available, for instance in ReynirPackage. """
-    return tokenize(text, with_annotation=False, **options)
+    return tokenize(text_or_gen, with_annotation=False, **options)
 
 
 def split_into_sentences(text_or_gen, **options):
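The docstring above describes tokenize() as a chain of generator phases that produces tokens on demand: nothing is consumed until the caller iterates, so the input need not fit in memory. A small illustration of that laziness (the sample text is invented):

    # Illustration of the on-demand pipeline: tokenize() returns a
    # generator, so tokens are produced only as the caller iterates.
    from itertools import islice
    from tokenizer import tokenize

    stream = tokenize("Fyrsta setningin. Önnur setningin.")
    # No tokenization work has happened yet; pull five tokens on demand.
    for tok in islice(stream, 5):
        print(tok.kind, tok.txt)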

src/tokenizer/tokenizer.pyi

Lines changed: 2 additions & 2 deletions
@@ -174,8 +174,8 @@ def month_for_token(token: Tok, after_ordinal: bool = ...) -> Optional[int]: ...
 def parse_phrases_1(token_stream: Iterator[Tok]) -> Iterator[Tok]: ...
 def parse_date_and_time(token_stream: Iterator[Tok]) -> Iterator[Tok]: ...
 def parse_phrases_2(token_stream: Iterator[Tok], coalesce_percent: bool = ...) -> Iterator[Tok]: ...
-def tokenize(text: StringIterable, **options: Options) -> Iterator[Tok]: ...
-def tokenize_without_annotation(text: StringIterable, **options: Options) -> Iterator[Tok]: ...
+def tokenize(text_or_gen: StringIterable, **options: Options) -> Iterator[Tok]: ...
+def tokenize_without_annotation(text_or_gen: StringIterable, **options: Options) -> Iterator[Tok]: ...
 def split_into_sentences(text_or_gen: StringIterable, **options: Options) -> Iterator[str]: ...
 def mark_paragraphs(txt: str) -> str: ...
 def paragraphs(tokens: Iterable[Tok]) -> Iterator[List[SentenceTuple]]: ...
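The stubs show that split_into_sentences() shares the StringIterable parameter type and yields sentences as plain strings. A brief usage sketch based on that signature (the sample text is invented):

    # Sketch based on the stub signature above: split_into_sentences()
    # also accepts a string or string iterator and yields sentence strings.
    from tokenizer import split_into_sentences

    for sentence in split_into_sentences("Þetta er fyrri setningin. Þetta er sú seinni."):
        print(sentence)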
