Skip to content

Commit af5833e

Browse files
authored
whisper : remove speed_up and phase_vocoder* functions (#2198)
* whisper : fix cast warning
* whisper : remove phase_vocoder functions, ref #2195
* whisper : remove speed_up from whisper_full_params, closes #2195
1 parent b87494b commit af5833e

File tree

20 files changed

+14
-161
lines changed

20 files changed

+14
-161
lines changed

bindings/go/examples/go-whisper/flags.go

-9
Original file line numberDiff line numberDiff line change
@@ -68,10 +68,6 @@ func (flags *Flags) GetOut() string {
6868
return strings.ToLower(flags.Lookup("out").Value.String())
6969
}
7070

71-
func (flags *Flags) IsSpeedup() bool {
72-
return flags.Lookup("speedup").Value.String() == "true"
73-
}
74-
7571
func (flags *Flags) IsTokens() bool {
7672
return flags.Lookup("tokens").Value.String() == "true"
7773
}
@@ -111,10 +107,6 @@ func (flags *Flags) SetParams(context whisper.Context) error {
111107
fmt.Fprintf(flags.Output(), "Setting duration to %v\n", duration)
112108
context.SetDuration(duration)
113109
}
114-
if flags.IsSpeedup() {
115-
fmt.Fprintf(flags.Output(), "Setting speedup to true\n")
116-
context.SetSpeedup(true)
117-
}
118110
if threads := flags.GetThreads(); threads != 0 {
119111
fmt.Fprintf(flags.Output(), "Setting threads to %d\n", threads)
120112
context.SetThreads(threads)
@@ -146,7 +138,6 @@ func registerFlags(flag *Flags) {
146138
flag.Duration("offset", 0, "Time offset")
147139
flag.Duration("duration", 0, "Duration of audio to process")
148140
flag.Uint("threads", 0, "Number of threads to use")
149-
flag.Bool("speedup", false, "Enable speedup")
150141
flag.Uint("max-len", 0, "Maximum segment length in characters")
151142
flag.Uint("max-tokens", 0, "Maximum tokens per segment")
152143
flag.Float64("word-thold", 0, "Maximum segment score")

bindings/go/params.go

-7
Original file line numberDiff line numberDiff line change
@@ -47,10 +47,6 @@ func (p *Params) SetPrintTimestamps(v bool) {
4747
p.print_timestamps = toBool(v)
4848
}
4949

50-
func (p *Params) SetSpeedup(v bool) {
51-
p.speed_up = toBool(v)
52-
}
53-
5450
// Set language id
5551
func (p *Params) SetLanguage(lang int) error {
5652
if lang == -1 {
@@ -177,9 +173,6 @@ func (p *Params) String() string {
177173
if p.token_timestamps {
178174
str += " token_timestamps"
179175
}
180-
if p.speed_up {
181-
str += " speed_up"
182-
}
183176

184177
return str + ">"
185178
}

bindings/go/pkg/whisper/context.go

-5
Original file line numberDiff line numberDiff line change
@@ -76,11 +76,6 @@ func (context *context) SetTranslate(v bool) {
7676
context.params.SetTranslate(v)
7777
}
7878

79-
// Set speedup flag
80-
func (context *context) SetSpeedup(v bool) {
81-
context.params.SetSpeedup(v)
82-
}
83-
8479
func (context *context) SetSplitOnWord(v bool) {
8580
context.params.SetSplitOnWord(v)
8681
}

bindings/go/pkg/whisper/interface.go

-1
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,6 @@ type Context interface {
4141
SetOffset(time.Duration) // Set offset
4242
SetDuration(time.Duration) // Set duration
4343
SetThreads(uint) // Set number of threads to use
44-
SetSpeedup(bool) // Set speedup flag
4544
SetSplitOnWord(bool) // Set split on word flag
4645
SetTokenThreshold(float32) // Set timestamp token probability threshold
4746
SetTokenSumThreshold(float32) // Set timestamp token sum probability threshold

bindings/java/src/main/java/io/github/ggerganov/whispercpp/WhisperCppJnaLibrary.java

+1-9
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ public interface WhisperCppJnaLibrary extends Library {
2020
* @return Whisper context on success, null on failure
2121
*/
2222
Pointer whisper_init_from_file(String path_model);
23-
23+
2424
/**
2525
* Provides default params which can be used with `whisper_init_from_file_with_params()` etc.
2626
* Because this function allocates memory for the params, the caller must call either:
@@ -304,14 +304,6 @@ public interface WhisperCppJnaLibrary extends Library {
304304
/** Language id associated with the provided state */
305305
int whisper_full_lang_id_from_state(Pointer state);
306306

307-
/**
308-
* Convert RAW PCM audio to log mel spectrogram but applies a Phase Vocoder to speed up the audio x2.
309-
* The resulting spectrogram is stored inside the default state of the provided whisper context.
310-
* @return 0 on success
311-
*/
312-
int whisper_pcm_to_mel_phase_vocoder(Pointer ctx, final float[] samples, int n_samples, int n_threads);
313-
314-
int whisper_pcm_to_mel_phase_vocoder_with_state(Pointer ctx, Pointer state, final float[] samples, int n_samples, int n_threads);
315307

316308
/** Get the start time of the specified segment. */
317309
long whisper_full_get_segment_t0(Pointer ctx, int i_segment);

bindings/java/src/main/java/io/github/ggerganov/whispercpp/params/WhisperFullParams.java

+1-9
Original file line numberDiff line numberDiff line change
@@ -129,14 +129,6 @@ public void splitOnWord(boolean enable) {
129129
/** Maximum tokens per segment (0, default = no limit) */
130130
public int max_tokens;
131131

132-
/** Flag to speed up the audio by 2x using Phase Vocoder. (default = false) */
133-
public CBool speed_up;
134-
135-
/** Flag to speed up the audio by 2x using Phase Vocoder. (default = false) */
136-
public void speedUp(boolean enable) {
137-
speed_up = enable ? CBool.TRUE : CBool.FALSE;
138-
}
139-
140132
/** Overwrite the audio context size (0 = use default). */
141133
public int audio_ctx;
142134

@@ -321,7 +313,7 @@ protected List<String> getFieldOrder() {
321313
return Arrays.asList("strategy", "n_threads", "n_max_text_ctx", "offset_ms", "duration_ms", "translate",
322314
"no_context", "single_segment", "no_timestamps",
323315
"print_special", "print_progress", "print_realtime", "print_timestamps", "token_timestamps",
324-
"thold_pt", "thold_ptsum", "max_len", "split_on_word", "max_tokens", "speed_up", "audio_ctx",
316+
"thold_pt", "thold_ptsum", "max_len", "split_on_word", "max_tokens", "audio_ctx",
325317
"tdrz_enable", "suppress_regex", "initial_prompt", "prompt_tokens", "prompt_n_tokens", "language", "detect_language",
326318
"suppress_blank", "suppress_non_speech_tokens", "temperature", "max_initial_ts", "length_penalty",
327319
"temperature_inc", "entropy_thold", "logprob_thold", "no_speech_thold", "greedy", "beam_search",

bindings/ruby/ext/ruby_whisper.cpp

-8
Original file line numberDiff line numberDiff line change
@@ -311,12 +311,6 @@ static VALUE ruby_whisper_params_get_split_on_word(VALUE self) {
311311
static VALUE ruby_whisper_params_set_split_on_word(VALUE self, VALUE value) {
312312
BOOL_PARAMS_SETTER(self, split_on_word, value)
313313
}
314-
static VALUE ruby_whisper_params_get_speed_up(VALUE self) {
315-
BOOL_PARAMS_GETTER(self, speed_up)
316-
}
317-
static VALUE ruby_whisper_params_set_speed_up(VALUE self, VALUE value) {
318-
BOOL_PARAMS_SETTER(self, speed_up, value)
319-
}
320314
static VALUE ruby_whisper_params_get_diarize(VALUE self) {
321315
ruby_whisper_params *rwp;
322316
Data_Get_Struct(self, ruby_whisper_params, rwp);
@@ -408,8 +402,6 @@ void Init_whisper() {
408402
rb_define_method(cParams, "token_timestamps=", ruby_whisper_params_set_token_timestamps, 1);
409403
rb_define_method(cParams, "split_on_word", ruby_whisper_params_get_split_on_word, 0);
410404
rb_define_method(cParams, "split_on_word=", ruby_whisper_params_set_split_on_word, 1);
411-
rb_define_method(cParams, "speed_up", ruby_whisper_params_get_speed_up, 0);
412-
rb_define_method(cParams, "speed_up=", ruby_whisper_params_set_speed_up, 1);
413405
rb_define_method(cParams, "diarize", ruby_whisper_params_get_diarize, 0);
414406
rb_define_method(cParams, "diarize=", ruby_whisper_params_set_diarize, 1);
415407

bindings/ruby/tests/test_whisper.rb

-7
Original file line numberDiff line numberDiff line change
@@ -117,13 +117,6 @@ def test_split_on_word
117117
assert !@params.split_on_word
118118
end
119119

120-
def test_speed_up
121-
@params.speed_up = true
122-
assert @params.speed_up
123-
@params.speed_up = false
124-
assert !@params.speed_up
125-
end
126-
127120
def test_whisper
128121
@whisper = Whisper::Context.new(File.join(TOPDIR, '..', '..', 'models', 'ggml-base.en.bin'))
129122
params = Whisper::Params.new

examples/addon.node/addon.cpp

-3
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,6 @@ struct whisper_params {
2525
float entropy_thold = 2.4f;
2626
float logprob_thold = -1.0f;
2727

28-
bool speed_up = false;
2928
bool translate = false;
3029
bool diarize = false;
3130
bool output_txt = false;
@@ -232,8 +231,6 @@ int run(whisper_params &params, std::vector<std::vector<std::string>> &result) {
232231
wparams.max_len = params.output_wts && params.max_len == 0 ? 60 : params.max_len;
233232
wparams.audio_ctx = params.audio_ctx;
234233

235-
wparams.speed_up = params.speed_up;
236-
237234
wparams.greedy.best_of = params.best_of;
238235
wparams.beam_search.beam_size = params.beam_size;
239236

examples/command/command.cpp

-5
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,6 @@ struct whisper_params {
3838

3939
grammar_parser::parse_state grammar_parsed;
4040

41-
bool speed_up = false;
4241
bool translate = false;
4342
bool print_special = false;
4443
bool print_energy = false;
@@ -76,7 +75,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
7675
else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); }
7776
else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); }
7877
else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); }
79-
else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
8078
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
8179
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
8280
else if (arg == "-pe" || arg == "--print-energy") { params.print_energy = true; }
@@ -115,7 +113,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
115113
fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx);
116114
fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold);
117115
fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold);
118-
fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
119116
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
120117
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
121118
fprintf(stderr, " -pe, --print-energy [%-7s] print sound energy (for debugging)\n", params.print_energy ? "true" : "false");
@@ -165,7 +162,6 @@ std::string transcribe(
165162
wparams.n_threads = params.n_threads;
166163

167164
wparams.audio_ctx = params.audio_ctx;
168-
wparams.speed_up = params.speed_up;
169165

170166
wparams.temperature = 0.4f;
171167
wparams.temperature_inc = 1.0f;
@@ -371,7 +367,6 @@ int process_command_list(struct whisper_context * ctx, audio_async &audio, const
371367
wparams.n_threads = params.n_threads;
372368

373369
wparams.audio_ctx = params.audio_ctx;
374-
wparams.speed_up = params.speed_up;
375370

376371
wparams.prompt_tokens = k_tokens.data();
377372
wparams.prompt_n_tokens = k_tokens.size();

examples/common.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -185,7 +185,7 @@ class wav_writer {
185185
// It is assumed that PCM data is normalized to a range from -1 to 1
186186
bool write_audio(const float * data, size_t length) {
187187
for (size_t i = 0; i < length; ++i) {
188-
const int16_t intSample = data[i] * 32767;
188+
const int16_t intSample = int16_t(data[i] * 32767);
189189
file.write(reinterpret_cast<const char *>(&intSample), sizeof(int16_t));
190190
dataSize += sizeof(int16_t);
191191
}

examples/lsp/lsp.cpp

-5
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,6 @@ struct whisper_params {
2626
float vad_thold = 0.6f;
2727
float freq_thold = 100.0f;
2828

29-
bool speed_up = false;
3029
bool translate = false;
3130
bool print_special = false;
3231
bool print_energy = false;
@@ -70,7 +69,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
7069
else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); }
7170
else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); }
7271
else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); }
73-
else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
7472
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
7573
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
7674
else if (arg == "-pe" || arg == "--print-energy") { params.print_energy = true; }
@@ -102,7 +100,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
102100
fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx);
103101
fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold);
104102
fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold);
105-
fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
106103
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
107104
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
108105
fprintf(stderr, " -pe, --print-energy [%-7s] print sound energy (for debugging)\n", params.print_energy ? "true" : "false");
@@ -184,7 +181,6 @@ json unguided_transcription(struct whisper_context * ctx, audio_async &audio, js
184181
wparams.n_threads = params.n_threads;
185182

186183
wparams.audio_ctx = params.audio_ctx;
187-
wparams.speed_up = params.speed_up;
188184
wparams.suppress_non_speech_tokens = true;
189185
// run the transformer and a single decoding pass
190186
if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
@@ -223,7 +219,6 @@ json guided_transcription(struct whisper_context * ctx, audio_async &audio, cons
223219
wparams.n_threads = params.n_threads;
224220

225221
wparams.audio_ctx = params.audio_ctx;
226-
wparams.speed_up = params.speed_up;
227222

228223
// TODO: Do some time testing. Does an overly long prompt slow down processing?
229224
// Set up command sets/precompute prompts

examples/main/main.cpp

-4
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,6 @@ struct whisper_params {
4747
float temperature = 0.0f;
4848
float temperature_inc = 0.2f;
4949

50-
bool speed_up = false;
5150
bool debug_mode = false;
5251
bool translate = false;
5352
bool detect_language = false;
@@ -138,7 +137,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
138137
else if (arg == "-lpt" || arg == "--logprob-thold") { params.logprob_thold = std::stof(argv[++i]); }
139138
else if (arg == "-tp" || arg == "--temperature") { params.temperature = std::stof(argv[++i]); }
140139
else if (arg == "-tpi" || arg == "--temperature-inc") { params.temperature_inc = std::stof(argv[++i]); }
141-
// else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
142140
else if (arg == "-debug"|| arg == "--debug-mode") { params.debug_mode = true; }
143141
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
144142
else if (arg == "-di" || arg == "--diarize") { params.diarize = true; }
@@ -206,7 +204,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
206204
fprintf(stderr, " -lpt N, --logprob-thold N [%-7.2f] log probability threshold for decoder fail\n", params.logprob_thold);
207205
fprintf(stderr, " -tp, --temperature N [%-7.2f] The sampling temperature, between 0 and 1\n", params.temperature);
208206
fprintf(stderr, " -tpi, --temperature-inc N [%-7.2f] The increment of temperature, between 0 and 1\n",params.temperature_inc);
209-
// fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
210207
fprintf(stderr, " -debug, --debug-mode [%-7s] enable debug mode (eg. dump log_mel)\n", params.debug_mode ? "true" : "false");
211208
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
212209
fprintf(stderr, " -di, --diarize [%-7s] stereo audio diarization\n", params.diarize ? "true" : "false");
@@ -1106,7 +1103,6 @@ int main(int argc, char ** argv) {
11061103
wparams.split_on_word = params.split_on_word;
11071104
wparams.audio_ctx = params.audio_ctx;
11081105

1109-
wparams.speed_up = params.speed_up;
11101106
wparams.debug_mode = params.debug_mode;
11111107

11121108
wparams.tdrz_enable = params.tinydiarize; // [TDRZ]

examples/server/server.cpp

-4
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,6 @@ struct whisper_params {
6161
float temperature = 0.00f;
6262
float temperature_inc = 0.20f;
6363

64-
bool speed_up = false;
6564
bool debug_mode = false;
6665
bool translate = false;
6766
bool detect_language = false;
@@ -112,7 +111,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
112111
fprintf(stderr, " -wt N, --word-thold N [%-7.2f] word timestamp probability threshold\n", params.word_thold);
113112
fprintf(stderr, " -et N, --entropy-thold N [%-7.2f] entropy threshold for decoder fail\n", params.entropy_thold);
114113
fprintf(stderr, " -lpt N, --logprob-thold N [%-7.2f] log probability threshold for decoder fail\n", params.logprob_thold);
115-
// fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
116114
fprintf(stderr, " -debug, --debug-mode [%-7s] enable debug mode (eg. dump log_mel)\n", params.debug_mode ? "true" : "false");
117115
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
118116
fprintf(stderr, " -di, --diarize [%-7s] stereo audio diarization\n", params.diarize ? "true" : "false");
@@ -159,7 +157,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params, serve
159157
else if (arg == "-wt" || arg == "--word-thold") { params.word_thold = std::stof(argv[++i]); }
160158
else if (arg == "-et" || arg == "--entropy-thold") { params.entropy_thold = std::stof(argv[++i]); }
161159
else if (arg == "-lpt" || arg == "--logprob-thold") { params.logprob_thold = std::stof(argv[++i]); }
162-
// else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
163160
else if (arg == "-debug"|| arg == "--debug-mode") { params.debug_mode = true; }
164161
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
165162
else if (arg == "-di" || arg == "--diarize") { params.diarize = true; }
@@ -768,7 +765,6 @@ int main(int argc, char ** argv) {
768765
wparams.split_on_word = params.split_on_word;
769766
wparams.audio_ctx = params.audio_ctx;
770767

771-
wparams.speed_up = params.speed_up;
772768
wparams.debug_mode = params.debug_mode;
773769

774770
wparams.tdrz_enable = params.tinydiarize; // [TDRZ]

0 commit comments

Comments
 (0)