Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

another form of the sentence splitting function (Testing...Does NOT work) #473

Closed
wants to merge 1 commit into from
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
92 changes: 31 additions & 61 deletions lib/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -648,92 +648,62 @@ def get_sentences(phoneme_list, max_tokens):
def get_sentences(phoneme_list, max_tokens):
"""
Split a list of phoneme strings into sentences that do not exceed max_tokens.
If a sentence (or a long concatenation of phonemes) has no punctuation,
it is split by grouping words into chunks of max_tokens words.
If a sentence has no punctuation and exceeds max_tokens, it is split into chunks.
In normal cases, the sentence remains unchanged.
"""
sentences = []
current_sentence = ""
current_token_count = 0

# Helper: split a sentence into chunks of at most max_tokens words.
# Basic splitting by token count.
def split_sentence_by_tokens(sentence, max_tokens):
words = sentence.split()
if len(words) <= max_tokens:
return [sentence]
chunks = []
for i in range(0, len(words), max_tokens):
chunk = " ".join(words[i:i + max_tokens])
chunks.append(chunk)
return chunks

# Helper: attempt to split a long sentence.
def split_long_sentence(sentence):
# If any punctuation is present, try to split near the middle
return [" ".join(words[i:i+max_tokens]) for i in range(0, len(words), max_tokens)]

# Advanced splitting: only invoked if the sentence exceeds max_tokens.
def advanced_split(sentence):
words = sentence.split()
if len(words) <= max_tokens:
return [sentence]
# If punctuation exists, attempt to split at punctuation.
if any(p in sentence for p in punctuation_split):
max_chars = max_tokens * 10 # as before
if len(sentence) <= max_chars:
return [sentence]
middle_index = len(sentence) // 2
next_punc_index = -1
for p in punctuation_split:
idx = sentence.find(p, middle_index)
if idx != -1:
if next_punc_index == -1 or idx < next_punc_index:
next_punc_index = idx
if next_punc_index != -1:
split_index = next_punc_index + 1
first_part = sentence[:split_index].strip()
second_part = sentence[split_index:].strip()
# Further split each part by tokens if needed.
return split_sentence_by_tokens(first_part, max_tokens) + split_sentence_by_tokens(second_part, max_tokens)
else:
return split_sentence_by_tokens(sentence, max_tokens)
else:
# No punctuation found: force split by token count.
return split_sentence_by_tokens(sentence, max_tokens)
# Find the last punctuation in the sentence.
last_punc_index = max(sentence.rfind(p) for p in punctuation_split if p in sentence)
if last_punc_index != -1:
part1 = sentence[:last_punc_index+1].strip()
part2 = sentence[last_punc_index+1:].strip()
# Recursively split each part if necessary.
return advanced_split(part1) + advanced_split(part2)
# Fallback: split by tokens.
return split_sentence_by_tokens(sentence, max_tokens)

for phoneme in phoneme_list:
tokens = phoneme.split()
token_count = len(tokens)
if current_token_count + token_count > max_tokens:
# If current sentence ends with punctuation, try the splitting helper.
if any(current_sentence.endswith(p) for p in punctuation_split):
splits = split_long_sentence(current_sentence.strip())
sentences.extend(splits)
current_sentence = phoneme
current_token_count = token_count
# Only apply advanced splitting if current_sentence exceeds the token limit.
if len(current_sentence.split()) > max_tokens:
sentences.extend(advanced_split(current_sentence.strip()))
else:
# Look for the last punctuation inside current_sentence.
last_punc_index = -1
for p in punctuation_split:
idx = current_sentence.rfind(p)
if idx > last_punc_index:
last_punc_index = idx
if last_punc_index != -1:
first_part = current_sentence[:last_punc_index + 1].strip()
second_part = current_sentence[last_punc_index + 1:].strip()
if first_part:
sentences.append(first_part)
# Combine the remainder with the current phoneme.
current_sentence = (second_part + " " + phoneme).strip() if second_part else phoneme
current_token_count = len(current_sentence.split())
else:
# No punctuation found at all, force a split.
splits = split_sentence_by_tokens(current_sentence.strip(), max_tokens)
sentences.extend(splits)
current_sentence = phoneme
current_token_count = token_count
sentences.append(current_sentence.strip())
current_sentence = phoneme
current_token_count = token_count
else:
current_sentence = (current_sentence + " " + phoneme).strip() if current_sentence else phoneme
current_token_count += token_count

if current_sentence:
splits = split_long_sentence(current_sentence.strip())
sentences.extend(splits)
if len(current_sentence.split()) > max_tokens:
sentences.extend(advanced_split(current_sentence.strip()))
else:
sentences.append(current_sentence.strip())

return sentences



def get_vram():
os_name = platform.system()
# NVIDIA (Cross-Platform: Windows, Linux, macOS)
Expand Down
Loading