Skip to content

Commit 491a765

Browse files
committed
Refactoring + explicit Python 3.13 support in metadata (tbd)
1 parent 12a2db2 commit 491a765

File tree

4 files changed

+12
-5
lines changed

4 files changed

+12
-5
lines changed

pyproject.toml

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -11,6 +11,7 @@ classifiers = [
1111
"License :: OSI Approved :: MIT License",
1212
"Operating System :: Unix",
1313
"Operating System :: POSIX",
14+
"Operating System :: MacOS",
1415
"Operating System :: Microsoft :: Windows",
1516
"Natural Language :: Icelandic",
1617
"Programming Language :: Python",
@@ -19,6 +20,7 @@ classifiers = [
1920
"Programming Language :: Python :: 3.10",
2021
"Programming Language :: Python :: 3.11",
2122
"Programming Language :: Python :: 3.12",
23+
"Programming Language :: Python :: 3.13",
2224
"Programming Language :: Python :: Implementation :: CPython",
2325
"Programming Language :: Python :: Implementation :: PyPy",
2426
"Topic :: Software Development :: Libraries :: Python Modules",

src/tokenizer/definitions.py

Lines changed: 8 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -717,9 +717,8 @@ def roman_to_int(s: str) -> int:
717717
# Króna amount strings allowed before a number, e.g. "kr. 9.900"
718718
ISK_AMOUNT_PRECEDING = frozenset(("kr.", "kr", "krónur"))
719719

720-
# URL prefixes. Note that this list should not contain www since
721-
# www.something.com is a domain token, not a URL token.
722-
URL_PREFIXES = (
720+
# URI scheme prefixes
721+
URI_PREFIXES = (
723722
"http://",
724723
"https://",
725724
"file://",
@@ -735,6 +734,12 @@ def roman_to_int(s: str) -> int:
735734
"telnet://",
736735
"udp://",
737736
"vnc://",
737+
"irc://",
738+
"nntp://",
739+
"wss://",
740+
"ws://",
741+
"xmpp://",
742+
"mtqp://",
738743
)
739744

740745
TOP_LEVEL_DOMAINS = frozenset(

src/tokenizer/tokenizer.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1848,7 +1848,7 @@ def parse_mixed(
18481848
ate = True
18491849

18501850
rtxt = rt.txt
1851-
if rtxt and rtxt.startswith(URL_PREFIXES):
1851+
if rtxt and rtxt.startswith(URI_PREFIXES):
18521852
# Handle URL: cut RIGHT_PUNCTUATION characters off its end,
18531853
# even though many of them are actually allowed according to
18541854
# the IETF RFC

test/test_tokenizer.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -46,7 +46,7 @@ def strip_originals(tokens: list[Tok]) -> list[Tok]:
4646
"""Remove origin tracking info from a list of tokens.
4747
This is useful for simplifying tests where we don't care about tracking
4848
origins.
49-
XXX: This could be removed if we get a feature to disable origin
49+
TODO: This could be removed if we get a feature to disable origin
5050
tracking during tokenization.
5151
"""
5252

0 commit comments

Comments (0)