@@ -55,18 +55,29 @@ def _convert_text_to_speech(
55
55
)
56
56
inputs = tokenizer (text , return_tensors = "pt" ).to (self .device )
57
57
58
- # Generate waveform
59
- with torch .no_grad ():
60
- output = model (** inputs ).waveform
58
+ # For some input texts the tokenizer produces no tokens, so there is nothing for the model to synthesize
59
+ if inputs ["input_ids" ].shape [1 ] == 0 :
60
+ sampling_rate = 16000
61
+ duration_seconds = 1
62
+ # Fill the array with ones rather than zeros (np.zeros): with an all-zero array the later ffmpeg process fails
63
+ output_np = np .ones (sampling_rate * duration_seconds , dtype = np .int16 )
64
+ logger ().warning (
65
+ f"TextToSpeechMMS._convert_text_to_speech. Model returns input tokens for text '{ text } ', generating an empty WAV file."
66
+ )
67
+ else :
68
+ with torch .no_grad ():
69
+ output = model (** inputs ).waveform
61
70
62
- # Convert waveform to NumPy array and scale to 16-bit PCM
63
- # Assuming `output` is a 2D tensor with shape (batch_size, samples)
64
- output_np = output .squeeze ().cpu ().numpy () # Remove batch dimension if present
65
- output_np = np .clip (output_np , - 1 , 1 ) # Clip values to be between -1 and 1
66
- output_np = (output_np * 32767 ).astype (np .int16 ) # Scale to 16-bit PCM
71
+ # Convert waveform to NumPy array and scale to 16-bit PCM
72
+ # Assuming `output` is a 2D tensor with shape (batch_size, samples)
73
+ output_np = (
74
+ output .squeeze ().cpu ().numpy ()
75
+ ) # Remove batch dimension if present
76
+ output_np = np .clip (output_np , - 1 , 1 ) # Clip values to be between -1 and 1
77
+ output_np = (output_np * 32767 ).astype (np .int16 ) # Scale to 16-bit PCM
67
78
68
- # Get the sampling rate
69
- sampling_rate = model .config .sampling_rate
79
+ # Get the sampling rate
80
+ sampling_rate = model .config .sampling_rate
70
81
71
82
# Write to WAV file
72
83
wav_file = output_filename .replace (".mp3" , ".wav" )
0 commit comments