@@ -609,6 +609,7 @@ def filter_pattern(doc_identifier):
609
609
return 'numbers'
610
610
return None
611
611
612
+ r"""
612
613
def get_sentences(phoneme_list, max_tokens):
613
614
sentences = []
614
615
current_sentence = ""
@@ -642,10 +643,96 @@ def get_sentences(phoneme_list, max_tokens):
642
643
if current_sentence:
643
644
sentences.append(current_sentence.strip())
644
645
return sentences
646
+ """
647
+
648
def get_sentences(phoneme_list, max_tokens):
    """
    Group phoneme strings into sentences of at most ``max_tokens`` words.

    Phonemes are accumulated, in order, into a buffer. When adding the next
    phoneme would push the buffer past ``max_tokens`` whitespace-separated
    words, the buffer is flushed: at a punctuation mark when one is
    available, otherwise by cutting it into fixed-size word chunks.

    Punctuation marks come from the file-level name ``punctuation_split``,
    which is defined outside this function.

    Args:
        phoneme_list: Iterable of strings. Each one is measured by its
            number of whitespace-separated words.
        max_tokens: Maximum number of words allowed in one returned
            sentence. Expected to be a positive integer.

    Returns:
        A list of sentence strings in input order. For ``max_tokens >= 1``
        no entry is empty and no entry holds more than ``max_tokens`` words.
    """
    sentences = []
    current_sentence = ""
    current_token_count = 0

    def chunk_by_words(text):
        """Cut ``text`` into pieces of at most ``max_tokens`` words each."""
        words = text.split()
        if len(words) <= max_tokens:
            # Short enough: keep the text verbatim. Blank text yields nothing,
            # so callers never emit an empty sentence.
            return [text] if words else []
        return [
            " ".join(words[start:start + max_tokens])
            for start in range(0, len(words), max_tokens)
        ]

    def split_near_middle(text):
        """Split over-long text after the first mark found past its midpoint."""
        # The character budget is ten times max_tokens; text within the
        # budget is only checked against the word limit.
        if len(text) > max_tokens * 10:
            midpoint = len(text) // 2
            hits = [text.find(mark, midpoint) for mark in punctuation_split]
            hits = [pos for pos in hits if pos != -1]
            if hits:
                # Cut one character past the match, so the mark stays with
                # the first half (assumes one-character marks - TODO confirm).
                cut = min(hits) + 1
                return (chunk_by_words(text[:cut].strip())
                        + chunk_by_words(text[cut:].strip()))
        # Either no usable mark or the text is short: enforce the word limit.
        return chunk_by_words(text)

    for phoneme in phoneme_list:
        token_count = len(phoneme.split())

        if current_token_count + token_count <= max_tokens:
            # Still within the limit: keep accumulating.
            if current_sentence:
                current_sentence = (current_sentence + " " + phoneme).strip()
            else:
                current_sentence = phoneme
            current_token_count += token_count
            continue

        # The buffer would overflow, so flush it before taking this phoneme.
        if current_sentence.endswith(tuple(punctuation_split)):
            # The buffer already ends on a mark: flush all of it.
            sentences.extend(split_near_middle(current_sentence.strip()))
            current_sentence = phoneme
            current_token_count = token_count
            continue

        # Otherwise look for the right-most mark inside the buffer.
        last_mark = max(
            (current_sentence.rfind(mark) for mark in punctuation_split),
            default=-1,
        )
        if last_mark == -1:
            # No punctuation at all: fall back to fixed-size word chunks.
            sentences.extend(chunk_by_words(current_sentence.strip()))
            current_sentence = phoneme
            current_token_count = token_count
            continue

        # Emit everything up to and including the mark; the remainder is
        # carried over and joined with the phoneme that caused the overflow.
        head = current_sentence[:last_mark + 1].strip()
        tail = current_sentence[last_mark + 1:].strip()
        sentences.extend(chunk_by_words(head))
        current_sentence = (tail + " " + phoneme).strip() if tail else phoneme
        current_token_count = len(current_sentence.split())

    if current_sentence:
        # Flush whatever is left once the input is exhausted.
        sentences.extend(split_near_middle(current_sentence.strip()))

    return sentences
645
735
646
- import platform
647
- import subprocess
648
- import os
649
736
650
737
def get_vram ():
651
738
os_name = platform .system ()
0 commit comments