whisper : remove speed_up and phase_vocoder* functions (#2198)

iboB · web-flow · commit af5833e29819 · 2024-05-31T11:37:29.000+03:00
* whisper : fix cast warning * whisper : remove phase_vocoder functions, ref #2195 * whisper : remove speed_up from whisper_full_params, closes #2195
diff --git a/bindings/go/examples/go-whisper/flags.go b/bindings/go/examples/go-whisper/flags.go
@@ -68,10 +68,6 @@ func (flags *Flags) GetOut() string {
 	return strings.ToLower(flags.Lookup("out").Value.String())
 }
 
-func (flags *Flags) IsSpeedup() bool {
-	return flags.Lookup("speedup").Value.String() == "true"
-}
-
 func (flags *Flags) IsTokens() bool {
 	return flags.Lookup("tokens").Value.String() == "true"
 }
@@ -111,10 +107,6 @@ func (flags *Flags) SetParams(context whisper.Context) error {
 		fmt.Fprintf(flags.Output(), "Setting duration to %v\n", duration)
 		context.SetDuration(duration)
 	}
-	if flags.IsSpeedup() {
-		fmt.Fprintf(flags.Output(), "Setting speedup to true\n")
-		context.SetSpeedup(true)
-	}
 	if threads := flags.GetThreads(); threads != 0 {
 		fmt.Fprintf(flags.Output(), "Setting threads to %d\n", threads)
 		context.SetThreads(threads)
@@ -146,7 +138,6 @@ func registerFlags(flag *Flags) {
 	flag.Duration("offset", 0, "Time offset")
 	flag.Duration("duration", 0, "Duration of audio to process")
 	flag.Uint("threads", 0, "Number of threads to use")
-	flag.Bool("speedup", false, "Enable speedup")
 	flag.Uint("max-len", 0, "Maximum segment length in characters")
 	flag.Uint("max-tokens", 0, "Maximum tokens per segment")
 	flag.Float64("word-thold", 0, "Maximum segment score")
diff --git a/bindings/go/params.go b/bindings/go/params.go
@@ -47,10 +47,6 @@ func (p *Params) SetPrintTimestamps(v bool) {
 	p.print_timestamps = toBool(v)
 }
 
-func (p *Params) SetSpeedup(v bool) {
-	p.speed_up = toBool(v)
-}
-
 // Set language id
 func (p *Params) SetLanguage(lang int) error {
 	if lang == -1 {
@@ -177,9 +173,6 @@ func (p *Params) String() string {
 	if p.token_timestamps {
 		str += " token_timestamps"
 	}
-	if p.speed_up {
-		str += " speed_up"
-	}
 
 	return str + ">"
 }
diff --git a/bindings/go/pkg/whisper/context.go b/bindings/go/pkg/whisper/context.go
@@ -76,11 +76,6 @@ func (context *context) SetTranslate(v bool) {
 	context.params.SetTranslate(v)
 }
 
-// Set speedup flag
-func (context *context) SetSpeedup(v bool) {
-	context.params.SetSpeedup(v)
-}
-
 func (context *context) SetSplitOnWord(v bool) {
 	context.params.SetSplitOnWord(v)
 }
diff --git a/bindings/go/pkg/whisper/interface.go b/bindings/go/pkg/whisper/interface.go
@@ -41,7 +41,6 @@ type Context interface {
 	SetOffset(time.Duration)        // Set offset
 	SetDuration(time.Duration)      // Set duration
 	SetThreads(uint)                // Set number of threads to use
-	SetSpeedup(bool)                // Set speedup flag
 	SetSplitOnWord(bool)            // Set split on word flag
 	SetTokenThreshold(float32)      // Set timestamp token probability threshold
 	SetTokenSumThreshold(float32)   // Set timestamp token sum probability threshold
diff --git a/bindings/java/src/main/java/io/github/ggerganov/whispercpp/WhisperCppJnaLibrary.java b/bindings/java/src/main/java/io/github/ggerganov/whispercpp/WhisperCppJnaLibrary.java
@@ -20,7 +20,7 @@ public interface WhisperCppJnaLibrary extends Library {
      * @return Whisper context on success, null on failure
      */
     Pointer whisper_init_from_file(String path_model);
-    
+
     /**
      * Provides default params which can be used with `whisper_init_from_file_with_params()` etc.
      * Because this function allocates memory for the params, the caller must call either:
@@ -304,14 +304,6 @@ public interface WhisperCppJnaLibrary extends Library {
     /** Language id associated with the provided state */
     int whisper_full_lang_id_from_state(Pointer state);
 
-    /**
-     * Convert RAW PCM audio to log mel spectrogram but applies a Phase Vocoder to speed up the audio x2.
-     * The resulting spectrogram is stored inside the default state of the provided whisper context.
-     * @return 0 on success
-     */
-    int whisper_pcm_to_mel_phase_vocoder(Pointer ctx, final float[] samples, int n_samples, int n_threads);
-
-    int whisper_pcm_to_mel_phase_vocoder_with_state(Pointer ctx, Pointer state, final float[] samples, int n_samples, int n_threads);
 
     /** Get the start time of the specified segment. */
     long whisper_full_get_segment_t0(Pointer ctx, int i_segment);
diff --git a/bindings/java/src/main/java/io/github/ggerganov/whispercpp/params/WhisperFullParams.java b/bindings/java/src/main/java/io/github/ggerganov/whispercpp/params/WhisperFullParams.java
@@ -129,14 +129,6 @@ public void splitOnWord(boolean enable) {
     /** Maximum tokens per segment (0, default = no limit) */
     public int max_tokens;
 
-    /** Flag to speed up the audio by 2x using Phase Vocoder. (default = false) */
-    public CBool speed_up;
-
-    /** Flag to speed up the audio by 2x using Phase Vocoder. (default = false) */
-    public void speedUp(boolean enable) {
-        speed_up = enable ? CBool.TRUE : CBool.FALSE;
-    }
-
     /** Overwrite the audio context size (0 = use default). */
     public int audio_ctx;
 
@@ -321,7 +313,7 @@ protected List<String> getFieldOrder() {
         return Arrays.asList("strategy", "n_threads", "n_max_text_ctx", "offset_ms", "duration_ms", "translate",
                 "no_context", "single_segment", "no_timestamps",
                 "print_special", "print_progress", "print_realtime", "print_timestamps",  "token_timestamps",
-                "thold_pt", "thold_ptsum", "max_len", "split_on_word", "max_tokens", "speed_up", "audio_ctx",
+                "thold_pt", "thold_ptsum", "max_len", "split_on_word", "max_tokens", "audio_ctx",
                 "tdrz_enable", "suppress_regex", "initial_prompt", "prompt_tokens", "prompt_n_tokens", "language", "detect_language",
                 "suppress_blank", "suppress_non_speech_tokens", "temperature", "max_initial_ts", "length_penalty",
                 "temperature_inc", "entropy_thold", "logprob_thold", "no_speech_thold", "greedy", "beam_search",
diff --git a/bindings/ruby/ext/ruby_whisper.cpp b/bindings/ruby/ext/ruby_whisper.cpp
@@ -311,12 +311,6 @@ static VALUE ruby_whisper_params_get_split_on_word(VALUE self) {
 static VALUE ruby_whisper_params_set_split_on_word(VALUE self, VALUE value) {
   BOOL_PARAMS_SETTER(self, split_on_word, value)
 }
-static VALUE ruby_whisper_params_get_speed_up(VALUE self) {
-  BOOL_PARAMS_GETTER(self, speed_up)
-}
-static VALUE ruby_whisper_params_set_speed_up(VALUE self, VALUE value) {
-  BOOL_PARAMS_SETTER(self, speed_up, value)
-}
 static VALUE ruby_whisper_params_get_diarize(VALUE self) {
   ruby_whisper_params *rwp;
   Data_Get_Struct(self, ruby_whisper_params, rwp);
@@ -408,8 +402,6 @@ void Init_whisper() {
   rb_define_method(cParams, "token_timestamps=", ruby_whisper_params_set_token_timestamps, 1);
   rb_define_method(cParams, "split_on_word", ruby_whisper_params_get_split_on_word, 0);
   rb_define_method(cParams, "split_on_word=", ruby_whisper_params_set_split_on_word, 1);
-  rb_define_method(cParams, "speed_up", ruby_whisper_params_get_speed_up, 0);
-  rb_define_method(cParams, "speed_up=", ruby_whisper_params_set_speed_up, 1);
   rb_define_method(cParams, "diarize", ruby_whisper_params_get_diarize, 0);
   rb_define_method(cParams, "diarize=", ruby_whisper_params_set_diarize, 1);
 
diff --git a/bindings/ruby/tests/test_whisper.rb b/bindings/ruby/tests/test_whisper.rb
@@ -117,13 +117,6 @@ def test_split_on_word
     assert !@params.split_on_word
   end
 
-  def test_speed_up
-    @params.speed_up = true
-    assert @params.speed_up
-    @params.speed_up = false
-    assert !@params.speed_up
-  end
-
   def test_whisper
     @whisper = Whisper::Context.new(File.join(TOPDIR, '..', '..', 'models', 'ggml-base.en.bin'))
     params  = Whisper::Params.new
diff --git a/examples/addon.node/addon.cpp b/examples/addon.node/addon.cpp
@@ -25,7 +25,6 @@ struct whisper_params {
     float entropy_thold = 2.4f;
     float logprob_thold = -1.0f;
 
-    bool speed_up       = false;
     bool translate      = false;
     bool diarize        = false;
     bool output_txt     = false;
@@ -232,8 +231,6 @@ int run(whisper_params &params, std::vector<std::vector<std::string>> &result) {
             wparams.max_len          = params.output_wts && params.max_len == 0 ? 60 : params.max_len;
             wparams.audio_ctx        = params.audio_ctx;
 
-            wparams.speed_up         = params.speed_up;
-
             wparams.greedy.best_of        = params.best_of;
             wparams.beam_search.beam_size = params.beam_size;
 
diff --git a/examples/command/command.cpp b/examples/command/command.cpp
@@ -38,7 +38,6 @@ struct whisper_params {
 
     grammar_parser::parse_state grammar_parsed;
 
-    bool speed_up      = false;
     bool translate     = false;
     bool print_special = false;
     bool print_energy  = false;
@@ -76,7 +75,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
         else if (arg == "-ac"  || arg == "--audio-ctx")     { params.audio_ctx     = std::stoi(argv[++i]); }
         else if (arg == "-vth" || arg == "--vad-thold")     { params.vad_thold     = std::stof(argv[++i]); }
         else if (arg == "-fth" || arg == "--freq-thold")    { params.freq_thold    = std::stof(argv[++i]); }
-        else if (arg == "-su"  || arg == "--speed-up")      { params.speed_up      = true; }
         else if (arg == "-tr"  || arg == "--translate")     { params.translate     = true; }
         else if (arg == "-ps"  || arg == "--print-special") { params.print_special = true; }
         else if (arg == "-pe"  || arg == "--print-energy")  { params.print_energy  = true; }
@@ -115,7 +113,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
     fprintf(stderr, "  -ac N,      --audio-ctx N    [%-7d] audio context size (0 - all)\n",                params.audio_ctx);
     fprintf(stderr, "  -vth N,     --vad-thold N    [%-7.2f] voice activity detection threshold\n",        params.vad_thold);
     fprintf(stderr, "  -fth N,     --freq-thold N   [%-7.2f] high-pass frequency cutoff\n",                params.freq_thold);
-    fprintf(stderr, "  -su,        --speed-up       [%-7s] speed up audio by x2 (reduced accuracy)\n",     params.speed_up ? "true" : "false");
     fprintf(stderr, "  -tr,        --translate      [%-7s] translate from source language to english\n",   params.translate ? "true" : "false");
     fprintf(stderr, "  -ps,        --print-special  [%-7s] print special tokens\n",                        params.print_special ? "true" : "false");
     fprintf(stderr, "  -pe,        --print-energy   [%-7s] print sound energy (for debugging)\n",          params.print_energy ? "true" : "false");
@@ -165,7 +162,6 @@ std::string transcribe(
     wparams.n_threads        = params.n_threads;
 
     wparams.audio_ctx = params.audio_ctx;
-    wparams.speed_up  = params.speed_up;
 
     wparams.temperature     = 0.4f;
     wparams.temperature_inc = 1.0f;
@@ -371,7 +367,6 @@ int process_command_list(struct whisper_context * ctx, audio_async &audio, const
             wparams.n_threads        = params.n_threads;
 
             wparams.audio_ctx        = params.audio_ctx;
-            wparams.speed_up         = params.speed_up;
 
             wparams.prompt_tokens    = k_tokens.data();
             wparams.prompt_n_tokens  = k_tokens.size();
diff --git a/examples/common.h b/examples/common.h
@@ -185,7 +185,7 @@ class wav_writer {
     // It is assumed that PCM data is normalized to a range from -1 to 1
     bool write_audio(const float * data, size_t length) {
         for (size_t i = 0; i < length; ++i) {
-            const int16_t intSample = data[i] * 32767;
+            const int16_t intSample = int16_t(data[i] * 32767);
             file.write(reinterpret_cast<const char *>(&intSample), sizeof(int16_t));
             dataSize += sizeof(int16_t);
         }
diff --git a/examples/lsp/lsp.cpp b/examples/lsp/lsp.cpp
@@ -26,7 +26,6 @@ struct whisper_params {
     float vad_thold    = 0.6f;
     float freq_thold   = 100.0f;
 
-    bool speed_up      = false;
     bool translate     = false;
     bool print_special = false;
     bool print_energy  = false;
@@ -70,7 +69,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
         else if (arg == "-ac"  || arg == "--audio-ctx")     { params.audio_ctx     = std::stoi(argv[++i]); }
         else if (arg == "-vth" || arg == "--vad-thold")     { params.vad_thold     = std::stof(argv[++i]); }
         else if (arg == "-fth" || arg == "--freq-thold")    { params.freq_thold    = std::stof(argv[++i]); }
-        else if (arg == "-su"  || arg == "--speed-up")      { params.speed_up      = true; }
         else if (arg == "-tr"  || arg == "--translate")     { params.translate     = true; }
         else if (arg == "-ps"  || arg == "--print-special") { params.print_special = true; }
         else if (arg == "-pe"  || arg == "--print-energy")  { params.print_energy  = true; }
@@ -102,7 +100,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
     fprintf(stderr, "  -ac N,      --audio-ctx N    [%-7d] audio context size (0 - all)\n",                params.audio_ctx);
     fprintf(stderr, "  -vth N,     --vad-thold N    [%-7.2f] voice activity detection threshold\n",        params.vad_thold);
     fprintf(stderr, "  -fth N,     --freq-thold N   [%-7.2f] high-pass frequency cutoff\n",                params.freq_thold);
-    fprintf(stderr, "  -su,        --speed-up       [%-7s] speed up audio by x2 (reduced accuracy)\n",     params.speed_up ? "true" : "false");
     fprintf(stderr, "  -tr,        --translate      [%-7s] translate from source language to english\n",   params.translate ? "true" : "false");
     fprintf(stderr, "  -ps,        --print-special  [%-7s] print special tokens\n",                        params.print_special ? "true" : "false");
     fprintf(stderr, "  -pe,        --print-energy   [%-7s] print sound energy (for debugging)\n",          params.print_energy ? "true" : "false");
@@ -184,7 +181,6 @@ json unguided_transcription(struct whisper_context * ctx, audio_async &audio, js
     wparams.n_threads        = params.n_threads;
 
     wparams.audio_ctx        = params.audio_ctx;
-    wparams.speed_up         = params.speed_up;
     wparams.suppress_non_speech_tokens = true;
     // run the transformer and a single decoding pass
     if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
@@ -223,7 +219,6 @@ json guided_transcription(struct whisper_context * ctx, audio_async &audio, cons
     wparams.n_threads        = params.n_threads;
 
     wparams.audio_ctx        = params.audio_ctx;
-    wparams.speed_up         = params.speed_up;
 
     // TODO: Do some time testing. Does an overly long prompt slow down processing?
     // Set up command sets/precompute prompts
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
@@ -47,7 +47,6 @@ struct whisper_params {
     float temperature     = 0.0f;
     float temperature_inc = 0.2f;
 
-    bool speed_up        = false;
     bool debug_mode      = false;
     bool translate       = false;
     bool detect_language = false;
@@ -138,7 +137,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
         else if (arg == "-lpt"  || arg == "--logprob-thold")   { params.logprob_thold   = std::stof(argv[++i]); }
         else if (arg == "-tp"   || arg == "--temperature")     { params.temperature     = std::stof(argv[++i]); }
         else if (arg == "-tpi"  || arg == "--temperature-inc") { params.temperature_inc = std::stof(argv[++i]); }
-        // else if (arg == "-su"   || arg == "--speed-up")        { params.speed_up        = true; }
         else if (arg == "-debug"|| arg == "--debug-mode")      { params.debug_mode      = true; }
         else if (arg == "-tr"   || arg == "--translate")       { params.translate       = true; }
         else if (arg == "-di"   || arg == "--diarize")         { params.diarize         = true; }
@@ -206,7 +204,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
     fprintf(stderr, "  -lpt N,    --logprob-thold N   [%-7.2f] log probability threshold for decoder fail\n",   params.logprob_thold);
     fprintf(stderr, "  -tp,       --temperature N     [%-7.2f] The sampling temperature, between 0 and 1\n",    params.temperature);
     fprintf(stderr, "  -tpi,      --temperature-inc N [%-7.2f] The increment of temperature, between 0 and 1\n",params.temperature_inc);
-    // fprintf(stderr, "  -su,       --speed-up          [%-7s] speed up audio by x2 (reduced accuracy)\n",        params.speed_up ? "true" : "false");
     fprintf(stderr, "  -debug,    --debug-mode        [%-7s] enable debug mode (eg. dump log_mel)\n",           params.debug_mode ? "true" : "false");
     fprintf(stderr, "  -tr,       --translate         [%-7s] translate from source language to english\n",      params.translate ? "true" : "false");
     fprintf(stderr, "  -di,       --diarize           [%-7s] stereo audio diarization\n",                       params.diarize ? "true" : "false");
@@ -1106,7 +1103,6 @@ int main(int argc, char ** argv) {
             wparams.split_on_word    = params.split_on_word;
             wparams.audio_ctx        = params.audio_ctx;
 
-            wparams.speed_up         = params.speed_up;
             wparams.debug_mode       = params.debug_mode;
 
             wparams.tdrz_enable      = params.tinydiarize; // [TDRZ]
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
@@ -61,7 +61,6 @@ struct whisper_params {
     float temperature     =  0.00f;
     float temperature_inc =  0.20f;
 
-    bool speed_up        = false;
     bool debug_mode      = false;
     bool translate       = false;
     bool detect_language = false;
@@ -112,7 +111,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
     fprintf(stderr, "  -wt N,     --word-thold N      [%-7.2f] word timestamp probability threshold\n",         params.word_thold);
     fprintf(stderr, "  -et N,     --entropy-thold N   [%-7.2f] entropy threshold for decoder fail\n",           params.entropy_thold);
     fprintf(stderr, "  -lpt N,    --logprob-thold N   [%-7.2f] log probability threshold for decoder fail\n",   params.logprob_thold);
-    // fprintf(stderr, "  -su,       --speed-up          [%-7s] speed up audio by x2 (reduced accuracy)\n",        params.speed_up ? "true" : "false");
     fprintf(stderr, "  -debug,    --debug-mode        [%-7s] enable debug mode (eg. dump log_mel)\n",           params.debug_mode ? "true" : "false");
     fprintf(stderr, "  -tr,       --translate         [%-7s] translate from source language to english\n",      params.translate ? "true" : "false");
     fprintf(stderr, "  -di,       --diarize           [%-7s] stereo audio diarization\n",                       params.diarize ? "true" : "false");
@@ -159,7 +157,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params, serve
         else if (arg == "-wt"   || arg == "--word-thold")      { params.word_thold      = std::stof(argv[++i]); }
         else if (arg == "-et"   || arg == "--entropy-thold")   { params.entropy_thold   = std::stof(argv[++i]); }
         else if (arg == "-lpt"  || arg == "--logprob-thold")   { params.logprob_thold   = std::stof(argv[++i]); }
-        // else if (arg == "-su"   || arg == "--speed-up")        { params.speed_up        = true; }
         else if (arg == "-debug"|| arg == "--debug-mode")      { params.debug_mode      = true; }
         else if (arg == "-tr"   || arg == "--translate")       { params.translate       = true; }
         else if (arg == "-di"   || arg == "--diarize")         { params.diarize         = true; }
@@ -768,7 +765,6 @@ int main(int argc, char ** argv) {
             wparams.split_on_word    = params.split_on_word;
             wparams.audio_ctx        = params.audio_ctx;
 
-            wparams.speed_up         = params.speed_up;
             wparams.debug_mode       = params.debug_mode;
 
             wparams.tdrz_enable      = params.tinydiarize; // [TDRZ]
diff --git a/examples/stream/stream.cpp b/examples/stream/stream.cpp
diff --git a/examples/talk-llama/talk-llama.cpp b/examples/talk-llama/talk-llama.cpp
diff --git a/examples/talk/talk.cpp b/examples/talk/talk.cpp
diff --git a/examples/wchess/wchess.cmd/wchess.cmd.cpp b/examples/wchess/wchess.cmd/wchess.cmd.cpp
diff --git a/whisper.cpp b/whisper.cpp
diff --git a/whisper.h b/whisper.h

Original file line number	Diff line number	Diff line change
`@@ -47,10 +47,6 @@ func (p *Params) SetPrintTimestamps(v bool) {`
`47`	`47`	`p.print_timestamps = toBool(v)`
`48`	`48`	`}`
`49`	`49`
`50`		`-func (p *Params) SetSpeedup(v bool) {`
`51`		`- p.speed_up = toBool(v)`
`52`		`-}`
`53`		`-`
`54`	`50`	`// Set language id`
`55`	`51`	`func (p *Params) SetLanguage(lang int) error {`
`56`	`52`	`if lang == -1 {`
`@@ -177,9 +173,6 @@ func (p *Params) String() string {`
`177`	`173`	`if p.token_timestamps {`
`178`	`174`	`str += " token_timestamps"`
`179`	`175`	`}`
`180`		`- if p.speed_up {`
`181`		`- str += " speed_up"`
`182`		`- }`
`183`	`176`
`184`	`177`	`return str + ">"`
`185`	`178`	`}`
Original file line number	Diff line number	Diff line change
`@@ -76,11 +76,6 @@ func (context *context) SetTranslate(v bool) {`
`76`	`76`	`context.params.SetTranslate(v)`
`77`	`77`	`}`
`78`	`78`
`79`		`-// Set speedup flag`
`80`		`-func (context *context) SetSpeedup(v bool) {`
`81`		`- context.params.SetSpeedup(v)`
`82`		`-}`
`83`		`-`
`84`	`79`	`func (context *context) SetSplitOnWord(v bool) {`
`85`	`80`	`context.params.SetSplitOnWord(v)`
`86`	`81`	`}`
Original file line number	Diff line number	Diff line change
`@@ -185,7 +185,7 @@ class wav_writer {`
`185`	`185`	`// It is assumed that PCM data is normalized to a range from -1 to 1`
`186`	`186`	`bool write_audio(const float * data, size_t length) {`
`187`	`187`	`for (size_t i = 0; i < length; ++i) {`
`188`		`- const int16_t intSample = data[i] * 32767;`
	`188`	`+ const int16_t intSample = int16_t(data[i] * 32767);`
`189`	`189`	`file.write(reinterpret_cast<const char *>(&intSample), sizeof(int16_t));`
`190`	`190`	`dataSize += sizeof(int16_t);`
`191`	`191`	`}`