Skip to content

Commit 491a765

Browse files
committed
Refactoring + explicit Python 3.13 support in metadata (tbd)
1 parent 12a2db2 commit 491a765

File tree

4 files changed

+12
-5
lines changed

4 files changed

+12
-5
lines changed

pyproject.toml

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -11,6 +11,7 @@ classifiers = [
1111
"License :: OSI Approved :: MIT License",
1212
"Operating System :: Unix",
1313
"Operating System :: POSIX",
14+
"Operating System :: MacOS",
1415
"Operating System :: Microsoft :: Windows",
1516
"Natural Language :: Icelandic",
1617
"Programming Language :: Python",
@@ -19,6 +20,7 @@ classifiers = [
1920
"Programming Language :: Python :: 3.10",
2021
"Programming Language :: Python :: 3.11",
2122
"Programming Language :: Python :: 3.12",
23+
"Programming Language :: Python :: 3.13",
2224
"Programming Language :: Python :: Implementation :: CPython",
2325
"Programming Language :: Python :: Implementation :: PyPy",
2426
"Topic :: Software Development :: Libraries :: Python Modules",

src/tokenizer/definitions.py

Lines changed: 8 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -717,9 +717,8 @@ def roman_to_int(s: str) -> int:
717717
# Króna amount strings allowed before a number, e.g. "kr. 9.900"
718718
ISK_AMOUNT_PRECEDING = frozenset(("kr.", "kr", "krónur"))
719719

720-
# URL prefixes. Note that this list should not contain www since
721-
# www.something.com is a domain token, not a URL token.
722-
URL_PREFIXES = (
720+
# URI scheme prefixes
721+
URI_PREFIXES = (
723722
"http://",
724723
"https://",
725724
"file://",
@@ -735,6 +734,12 @@ def roman_to_int(s: str) -> int:
735734
"telnet://",
736735
"udp://",
737736
"vnc://",
737+
"irc://",
738+
"nntp://",
739+
"wss://",
740+
"ws://",
741+
"xmpp://",
742+
"mtqp://",
738743
)
739744

740745
TOP_LEVEL_DOMAINS = frozenset(

src/tokenizer/tokenizer.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1848,7 +1848,7 @@ def parse_mixed(
18481848
ate = True
18491849

18501850
rtxt = rt.txt
1851-
if rtxt and rtxt.startswith(URL_PREFIXES):
1851+
if rtxt and rtxt.startswith(URI_PREFIXES):
18521852
# Handle URL: cut RIGHT_PUNCTUATION characters off its end,
18531853
# even though many of them are actually allowed according to
18541854
# the IETF RFC

test/test_tokenizer.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -46,7 +46,7 @@ def strip_originals(tokens: list[Tok]) -> list[Tok]:
4646
"""Remove origin tracking info from a list of tokens.
4747
This is useful for simplifying tests where we don't care about tracking
4848
origins.
49-
XXX: This could be removed if we get a feature to disable origin
49+
TODO: This could be removed if we get a feature to disable origin
5050
tracking during tokenization.
5151
"""
5252

0 commit comments

Comments (0)