Skip to content

Commit e642d72

Browse files
committed
MMS TTS: handle the case when the model returns no synthesized voice
1 parent 3fa6c37 commit e642d72

File tree

1 file changed

+21
-10
lines changed

1 file changed

+21
-10
lines changed

open_dubbing/text_to_speech_mms.py

+21-10
Original file line numberDiff line numberDiff line change
@@ -55,18 +55,29 @@ def _convert_text_to_speech(
5555
)
5656
inputs = tokenizer(text, return_tensors="pt").to(self.device)
5757

58-
# Generate waveform
59-
with torch.no_grad():
60-
output = model(**inputs).waveform
58+
# For some texts the tokenizer yields no input tokens, so there is nothing to synthesize
59+
if inputs["input_ids"].shape[1] == 0:
60+
sampling_rate = 16000
61+
duration_seconds = 1
62+
# If we fill the array with (np.zeros) the ffmpeg process later fails
63+
output_np = np.ones(sampling_rate * duration_seconds, dtype=np.int16)
64+
logger().warning(
65+
f"TextToSpeechMMS._convert_text_to_speech. Model returns input tokens for text '{text}', generating an empty WAV file."
66+
)
67+
else:
68+
with torch.no_grad():
69+
output = model(**inputs).waveform
6170

62-
# Convert waveform to NumPy array and scale to 16-bit PCM
63-
# Assuming `output` is a 2D tensor with shape (batch_size, samples)
64-
output_np = output.squeeze().cpu().numpy() # Remove batch dimension if present
65-
output_np = np.clip(output_np, -1, 1) # Clip values to be between -1 and 1
66-
output_np = (output_np * 32767).astype(np.int16) # Scale to 16-bit PCM
71+
# Convert waveform to NumPy array and scale to 16-bit PCM
72+
# Assuming `output` is a 2D tensor with shape (batch_size, samples)
73+
output_np = (
74+
output.squeeze().cpu().numpy()
75+
) # Remove batch dimension if present
76+
output_np = np.clip(output_np, -1, 1) # Clip values to be between -1 and 1
77+
output_np = (output_np * 32767).astype(np.int16) # Scale to 16-bit PCM
6778

68-
# Get the sampling rate
69-
sampling_rate = model.config.sampling_rate
79+
# Get the sampling rate
80+
sampling_rate = model.config.sampling_rate
7081

7182
# Write to WAV file
7283
wav_file = output_filename.replace(".mp3", ".wav")

0 commit comments

Comments
 (0)