Skip to content

Commit 664f054

Browse files
Fixed function for sentence splitting
1 parent 697e407 commit 664f054

File tree

1 file changed

+90
-3
lines changed

1 file changed

+90
-3
lines changed

lib/functions.py

+90-3
Original file line numberDiff line numberDiff line change
@@ -609,6 +609,7 @@ def filter_pattern(doc_identifier):
609609
return 'numbers'
610610
return None
611611

612+
r"""
612613
def get_sentences(phoneme_list, max_tokens):
613614
sentences = []
614615
current_sentence = ""
@@ -642,10 +643,96 @@ def get_sentences(phoneme_list, max_tokens):
642643
if current_sentence:
643644
sentences.append(current_sentence.strip())
644645
return sentences
646+
"""
647+
648+
def get_sentences(phoneme_list, max_tokens):
    """
    Group phoneme strings into sentence chunks of roughly max_tokens words.

    Phonemes are accumulated into a running sentence until adding the next
    one would push the whitespace-separated word count past ``max_tokens``.
    The running sentence is then flushed, preferably at a punctuation mark
    taken from the module-level ``punctuation_split``; text without any
    punctuation is force-split into groups of ``max_tokens`` words.

    Args:
        phoneme_list: Iterable of strings; each is counted in
            whitespace-separated words.
        max_tokens: Maximum number of words per chunk. Must be >= 1.

    Returns:
        A list of non-empty sentence strings in input order.

    Raises:
        ValueError: If ``max_tokens`` is smaller than 1.

    Notes:
        * ``punctuation_split`` is defined elsewhere in this module.
          NOTE(review): the ``index + 1`` slicing below assumes every entry
          is a single character -- confirm against its definition.
        * Sentences cut at punctuation are not re-checked against
          ``max_tokens`` unless they are longer than ``max_tokens * 10``
          characters, so such chunks can still exceed the word limit.
    """
    if max_tokens < 1:
        # A zero step crashes range() and a negative one silently drops text.
        raise ValueError("max_tokens must be a positive integer")

    # Snapshot as a tuple so it can be passed straight to str.endswith().
    marks = tuple(punctuation_split)

    def split_sentence_by_tokens(sentence, max_tokens):
        """Split a sentence into chunks of at most max_tokens words."""
        words = sentence.split()
        if not words:
            # Never emit an empty sentence for empty/whitespace-only input.
            return []
        if len(words) <= max_tokens:
            return [sentence]
        return [
            " ".join(words[start:start + max_tokens])
            for start in range(0, len(words), max_tokens)
        ]

    def split_long_sentence(sentence):
        """Halve an over-long sentence at the first punctuation past its middle."""
        if not any(mark in sentence for mark in marks):
            # No punctuation found: force split by word count.
            return split_sentence_by_tokens(sentence, max_tokens)

        # Character budget heuristic: roughly ten characters per token.
        max_chars = max_tokens * 10
        if len(sentence) <= max_chars:
            return [sentence]

        middle_index = len(sentence) // 2
        next_punc_index = min(
            (
                idx
                for idx in (sentence.find(mark, middle_index) for mark in marks)
                if idx != -1
            ),
            default=-1,
        )
        if next_punc_index == -1:
            # All punctuation sits in the first half; fall back to word count.
            return split_sentence_by_tokens(sentence, max_tokens)

        split_index = next_punc_index + 1
        first_part = sentence[:split_index].strip()
        second_part = sentence[split_index:].strip()
        # Each half is further split by word count if needed; an empty
        # second half (punctuation was the last character) contributes nothing.
        return (
            split_sentence_by_tokens(first_part, max_tokens)
            + split_sentence_by_tokens(second_part, max_tokens)
        )

    sentences = []
    current_sentence = ""
    current_token_count = 0

    for phoneme in phoneme_list:
        token_count = len(phoneme.split())

        if current_token_count + token_count <= max_tokens:
            # Still within budget: keep accumulating.
            if current_sentence:
                current_sentence = (current_sentence + " " + phoneme).strip()
            else:
                current_sentence = phoneme
            current_token_count += token_count
            continue

        if current_sentence.endswith(marks):
            # The buffer already ends on a sentence boundary: flush all of it.
            sentences.extend(split_long_sentence(current_sentence.strip()))
            current_sentence = phoneme
            current_token_count = token_count
            continue

        # Look for the last punctuation mark inside the buffer.
        last_punc_index = max(
            (current_sentence.rfind(mark) for mark in marks),
            default=-1,
        )
        if last_punc_index == -1:
            # No punctuation at all (or the buffer is still empty because the
            # very first phoneme is over budget): force a split by word count.
            sentences.extend(
                split_sentence_by_tokens(current_sentence.strip(), max_tokens)
            )
            current_sentence = phoneme
            current_token_count = token_count
            continue

        first_part = current_sentence[:last_punc_index + 1].strip()
        second_part = current_sentence[last_punc_index + 1:].strip()
        if first_part:
            sentences.append(first_part)
        # Carry the text after the punctuation over and join it with the
        # phoneme that triggered the flush.
        if second_part:
            current_sentence = (second_part + " " + phoneme).strip()
        else:
            current_sentence = phoneme
        current_token_count = len(current_sentence.split())

    if current_sentence:
        sentences.extend(split_long_sentence(current_sentence.strip()))

    return sentences
645735

646-
import platform
647-
import subprocess
648-
import os
649736

650737
def get_vram():
651738
os_name = platform.system()

0 commit comments

Comments
 (0)