Fixed function for sentence splitting

DrewThomasson · web-flow · commit 664f0547eb40 · 2025-03-11T11:41:32.000-04:00
diff --git a/lib/functions.py b/lib/functions.py
@@ -609,6 +609,7 @@ def filter_pattern(doc_identifier):
             return 'numbers'
     return None
 
+r"""
 def get_sentences(phoneme_list, max_tokens):
     sentences = []
     current_sentence = ""
@@ -642,10 +643,96 @@ def get_sentences(phoneme_list, max_tokens):
     if current_sentence:
         sentences.append(current_sentence.strip())
     return sentences
+"""
+
+def get_sentences(phoneme_list, max_tokens):
+    """
+    Split a list of phoneme strings into sentences that do not exceed max_tokens.
+    If a sentence (or a long concatenation of phonemes) has no punctuation,
+    it is split by grouping words into chunks of max_tokens words.
+    """
+    sentences = []
+    current_sentence = ""
+    current_token_count = 0
+
+    # Helper: split a sentence into chunks of at most max_tokens words.
+    def split_sentence_by_tokens(sentence, max_tokens):
+        words = sentence.split()
+        if len(words) <= max_tokens:
+            return [sentence]
+        chunks = []
+        for i in range(0, len(words), max_tokens):
+            chunk = " ".join(words[i:i + max_tokens])
+            chunks.append(chunk)
+        return chunks
+
+    # Helper: attempt to split a long sentence.
+    def split_long_sentence(sentence):
+        # If any punctuation is present, try to split near the middle
+        if any(p in sentence for p in punctuation_split):
+            max_chars = max_tokens * 10  # as before
+            if len(sentence) <= max_chars:
+                return [sentence]
+            middle_index = len(sentence) // 2
+            next_punc_index = -1
+            for p in punctuation_split:
+                idx = sentence.find(p, middle_index)
+                if idx != -1:
+                    if next_punc_index == -1 or idx < next_punc_index:
+                        next_punc_index = idx
+            if next_punc_index != -1:
+                split_index = next_punc_index + 1
+                first_part = sentence[:split_index].strip()
+                second_part = sentence[split_index:].strip()
+                # Further split each part by tokens if needed.
+                return split_sentence_by_tokens(first_part, max_tokens) + split_sentence_by_tokens(second_part, max_tokens)
+            else:
+                return split_sentence_by_tokens(sentence, max_tokens)
+        else:
+            # No punctuation found: force split by token count.
+            return split_sentence_by_tokens(sentence, max_tokens)
+
+    for phoneme in phoneme_list:
+        tokens = phoneme.split()
+        token_count = len(tokens)
+        if current_token_count + token_count > max_tokens:
+            # If current sentence ends with punctuation, try the splitting helper.
+            if any(current_sentence.endswith(p) for p in punctuation_split):
+                splits = split_long_sentence(current_sentence.strip())
+                sentences.extend(splits)
+                current_sentence = phoneme
+                current_token_count = token_count
+            else:
+                # Look for the last punctuation inside current_sentence.
+                last_punc_index = -1
+                for p in punctuation_split:
+                    idx = current_sentence.rfind(p)
+                    if idx > last_punc_index:
+                        last_punc_index = idx
+                if last_punc_index != -1:
+                    first_part = current_sentence[:last_punc_index + 1].strip()
+                    second_part = current_sentence[last_punc_index + 1:].strip()
+                    if first_part:
+                        sentences.append(first_part)
+                    # Combine the remainder with the current phoneme.
+                    current_sentence = (second_part + " " + phoneme).strip() if second_part else phoneme
+                    current_token_count = len(current_sentence.split())
+                else:
+                    # No punctuation found at all, force a split.
+                    splits = split_sentence_by_tokens(current_sentence.strip(), max_tokens)
+                    sentences.extend(splits)
+                    current_sentence = phoneme
+                    current_token_count = token_count
+        else:
+            current_sentence = (current_sentence + " " + phoneme).strip() if current_sentence else phoneme
+            current_token_count += token_count
+
+    if current_sentence:
+        splits = split_long_sentence(current_sentence.strip())
+        sentences.extend(splits)
+
+    return sentences
 
-import platform
-import subprocess
-import os
 
 def get_vram():
     os_name = platform.system()