Skip to content

Commit d020b1d

Browse files
Fixed bug with nested paragraph markers
1 parent 0e0844a commit d020b1d

File tree

1 file changed

+6
-5
lines changed

1 file changed

+6
-5
lines changed

src/tokenizer/tokenizer.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1406,18 +1406,18 @@ def generate_rough_tokens(
14061406
for text in splits:
14071407
if is_text:
14081408
# 'text' is text to be tokenized
1409-
insert_paragraph_end = False
1409+
paragraph_end = 0
14101410
if not one_sent_per_line:
14111411
# Convert paragraph separators to TOK.P_BEGIN and TOK.P_END tokens
1412-
if text.startswith("[["):
1412+
while text.startswith("[["):
14131413
# Begin paragraph
14141414
text = text[2:]
14151415
yield TOK.Begin_Paragraph()
1416-
if text.endswith("]]"):
1416+
while text.endswith("]]"):
14171417
# End paragraph
14181418
text = text[:-2]
14191419
# Postpone the yield until after the rough token loop
1420-
insert_paragraph_end = True
1420+
paragraph_end += 1
14211421
tok_big = Tok(TOK.RAW, text, None, text, list(range(len(text))))
14221422
if replace_composite_glyphs:
14231423
# Replace composite glyphs with single code points
@@ -1443,9 +1443,10 @@ def generate_rough_tokens(
14431443
else:
14441444
yield tok
14451445

1446-
if insert_paragraph_end:
1446+
while paragraph_end:
14471447
# Yield the postponed TOK.P_END token
14481448
yield TOK.End_Paragraph()
1449+
paragraph_end -= 1
14491450
elif text == "]][[":
14501451
# Paragraph split: Yield TOK.P_BEGIN and TOK.P_END tokens
14511452
yield TOK.End_Paragraph()

0 commit comments

Comments (0)