diff --git a/examples/stream/README.md b/examples/stream/README.md
index f07cfb8915c..e60fd8bd6da 100644
--- a/examples/stream/README.md
+++ b/examples/stream/README.md
@@ -1,51 +1,41 @@
-# whisper.cpp/examples/stream
-
-This is a naive example of performing real-time inference on audio from your microphone.
-The `whisper-stream` tool samples the audio every half a second and runs the transcription continously.
-More info is available in [issue #10](https://github.com/ggerganov/whisper.cpp/issues/10).
-
-```bash
-./build/bin/whisper-stream -m ./models/ggml-base.en.bin -t 8 --step 500 --length 5000
-```
-
-https://user-images.githubusercontent.com/1991296/194935793-76afede7-cfa8-48d8-a80f-28ba83be7d09.mp4
-
-## Sliding window mode with VAD
-
-Setting the `--step` argument to `0` enables the sliding window mode:
-
-```bash
- ./build/bin/whisper-stream -m ./models/ggml-base.en.bin -t 6 --step 0 --length 30000 -vth 0.6
-```
-
-In this mode, the tool will transcribe only after some speech activity is detected. A very
-basic VAD detector is used, but in theory a more sophisticated approach can be added. The
-`-vth` argument determines the VAD threshold - higher values will make it detect silence more often.
-It's best to tune it to the specific use case, but a value around `0.6` should be OK in general.
-When silence is detected, it will transcribe the last `--length` milliseconds of audio and output
-a transcription block that is suitable for parsing.
-
-## Building
-
-The `whisper-stream` tool depends on SDL2 library to capture audio from the microphone. You can build it like this:
-
-```bash
-# Install SDL2
-# On Debian based linux distributions:
-sudo apt-get install libsdl2-dev
-
-# On Fedora Linux:
-sudo dnf install SDL2 SDL2-devel
-
-# Install SDL2 on Mac OS
-brew install sdl2
-
-cmake -B build -DWHISPER_SDL2=ON
-cmake --build build --config Release
-
-./build/bin/whisper-stream
-```
-
-## Web version
-
-This tool can also run in the browser: [examples/stream.wasm](/examples/stream.wasm)
+# whisper.cpp/examples/stream
+
+This is a naive example of performing real-time inference on audio from your microphone.
+The `whisper-stream` tool samples the audio every half a second and runs the transcription continuously.
+More info is available in [issue #10](https://github.com/ggerganov/whisper.cpp/issues/10).
+
+```bash
+./build/bin/whisper-stream -m ./models/ggml-base.en.bin -t 8 --step 500 --length 5000
+```
+
+https://user-images.githubusercontent.com/1991296/194935793-76afede7-cfa8-48d8-a80f-28ba83be7d09.mp4
+
+## VAD support
+
+VAD support can be enabled by passing the `--vad` flag, and optionally a `--vad-model` path (by default
+`models/for-tests-silero-v5.1.2-ggml.bin` will be used).
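+
+For example, the following transcribes with VAD enabled using the default model
+(the threshold value here is illustrative; tune it for your input):
+
+```bash
+./build/bin/whisper-stream -m ./models/ggml-base.en.bin -t 6 --length 10000 --vad --vad-threshold 0.5
+```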
+
+## Building
+
+The `whisper-stream` tool depends on the SDL2 library to capture audio from the microphone. You can build it like this:
+
+```bash
+# Install SDL2
+# On Debian based linux distributions:
+sudo apt-get install libsdl2-dev
+
+# On Fedora Linux:
+sudo dnf install SDL2 SDL2-devel
+
+# Install SDL2 on Mac OS
+brew install sdl2
+
+cmake -B build -DWHISPER_SDL2=ON
+cmake --build build --config Release
+
+./build/bin/whisper-stream
+```
+
+## Web version
+
+This tool can also run in the browser: [examples/stream.wasm](/examples/stream.wasm)
diff --git a/examples/stream/stream.cpp b/examples/stream/stream.cpp
index bc6f13fb267..f741e40442c 100644
--- a/examples/stream/stream.cpp
+++ b/examples/stream/stream.cpp
@@ -37,10 +37,21 @@ struct whisper_params {
     bool save_audio    = false; // save audio to wav file
     bool use_gpu       = true;
     bool flash_attn    = false;
+    bool no_prints     = false;
 
     std::string language = "en";
     std::string model    = "models/ggml-base.en.bin";
     std::string fname_out;
+
+    // Voice Activity Detection (VAD) parameters
+    bool        vad           = false;
+    std::string vad_model     = "models/for-tests-silero-v5.1.2-ggml.bin";
+    float       vad_threshold = 0.5f;
+    int         vad_min_speech_duration_ms  = 250;
+    int         vad_min_silence_duration_ms = 100;
+    float       vad_max_speech_duration_s   = FLT_MAX;
+    int         vad_speech_pad_ms           = 30;
+    float       vad_samples_overlap         = 0.1f;
 };
 
 void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
@@ -61,8 +72,6 @@ static bool whisper_params_parse(int argc, char ** argv, whisper_params & params
     else if (arg == "-mt"   || arg == "--max-tokens")    { params.max_tokens    = std::stoi(argv[++i]); }
     else if (arg == "-ac"   || arg == "--audio-ctx")     { params.audio_ctx     = std::stoi(argv[++i]); }
     else if (arg == "-bs"   || arg == "--beam-size")     { params.beam_size     = std::stoi(argv[++i]); }
-    else if (arg == "-vth"  || arg == "--vad-thold")     { params.vad_thold     = std::stof(argv[++i]); }
-    else if (arg == "-fth"  || arg == "--freq-thold")    { params.freq_thold    = std::stof(argv[++i]); }
     else if (arg == "-tr"   || arg == "--translate")     { params.translate     = true; }
     else if (arg == "-nf"   || arg == "--no-fallback")   { params.no_fallback   = true; }
     else if (arg == "-ps"   || arg == "--print-special") { params.print_special = true; }
@@ -74,7 +83,16 @@ static bool whisper_params_parse(int argc, char ** argv, whisper_params & params
     else if (arg == "-sa"   || arg == "--save-audio")    { params.save_audio    = true; }
     else if (arg == "-ng"   || arg == "--no-gpu")        { params.use_gpu       = false; }
     else if (arg == "-fa"   || arg == "--flash-attn")    { params.flash_attn    = true; }
-
+    else if (arg == "-np"   || arg == "--no-prints")     { params.no_prints     = true; }
+    // Voice Activity Detection (VAD)
+    else if (                  arg == "--vad")                          { params.vad                         = true; }
+    else if (arg == "-vm"   || arg == "--vad-model")                    { params.vad_model                   = argv[++i]; }
+    else if (arg == "-vt"   || arg == "--vad-threshold")                { params.vad_threshold               = std::stof(argv[++i]); }
+    else if (arg == "-vspd" || arg == "--vad-min-speech-duration-ms")   { params.vad_min_speech_duration_ms  = std::stoi(argv[++i]); }
+    else if (arg == "-vsd"  || arg == "--vad-min-silence-duration-ms")  { params.vad_min_silence_duration_ms = std::stoi(argv[++i]); }
+    else if (arg == "-vmsd" || arg == "--vad-max-speech-duration-s")    { params.vad_max_speech_duration_s   = std::stof(argv[++i]); }
+    else if (arg == "-vp"   || arg == "--vad-speech-pad-ms")            { params.vad_speech_pad_ms           = std::stoi(argv[++i]); }
+    else if (arg == "-vo"   || arg == "--vad-samples-overlap")          { params.vad_samples_overlap         = std::stof(argv[++i]); }
     else {
         fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
         whisper_print_usage(argc, argv, params);
@@ -99,8 +117,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
     fprintf(stderr, "  -mt N,    --max-tokens N  [%-7d] maximum number of tokens per audio chunk\n",       params.max_tokens);
     fprintf(stderr, "  -ac N,    --audio-ctx N   [%-7d] audio context size (0 - all)\n",                   params.audio_ctx);
     fprintf(stderr, "  -bs N,    --beam-size N   [%-7d] beam size for beam search\n",                      params.beam_size);
-    fprintf(stderr, "  -vth N,   --vad-thold N   [%-7.2f] voice activity detection threshold\n",           params.vad_thold);
-    fprintf(stderr, "  -fth N,   --freq-thold N  [%-7.2f] high-pass frequency cutoff\n",                   params.freq_thold);
     fprintf(stderr, "  -tr,      --translate     [%-7s] translate from source language to english\n",      params.translate ? "true" : "false");
     fprintf(stderr, "  -nf,      --no-fallback   [%-7s] do not use temperature fallback while decoding\n", params.no_fallback ? "true" : "false");
     fprintf(stderr, "  -ps,      --print-special [%-7s] print special tokens\n",                           params.print_special ? "true" : "false");
@@ -112,6 +128,19 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
     fprintf(stderr, "  -sa,      --save-audio    [%-7s] save the recorded audio to a file\n",              params.save_audio ? "true" : "false");
     fprintf(stderr, "  -ng,      --no-gpu        [%-7s] disable GPU inference\n",                          params.use_gpu ? "false" : "true");
     fprintf(stderr, "  -fa,      --flash-attn    [%-7s] flash attention during inference\n",               params.flash_attn ? "true" : "false");
+    fprintf(stderr, "  -np,      --no-prints     [%-7s] do not print anything other than the results\n",   params.no_prints ? "true" : "false");
+    // Voice Activity Detection (VAD) parameters
+    fprintf(stderr, "\nVoice Activity Detection (VAD) options:\n");
+    fprintf(stderr, "             --vad                           [%-7s] enable Voice Activity Detection (VAD)\n",          params.vad ? "true" : "false");
+    fprintf(stderr, "  -vm FNAME, --vad-model FNAME               [%-7s] VAD model path\n",                                 params.vad_model.c_str());
+    fprintf(stderr, "  -vt N,     --vad-threshold N               [%-7.2f] VAD threshold for speech detection (0.0-1.0)\n", params.vad_threshold);
+    fprintf(stderr, "  -vspd N,   --vad-min-speech-duration-ms N  [%-7d] VAD min speech duration in ms\n",                  params.vad_min_speech_duration_ms);
+    fprintf(stderr, "  -vsd N,    --vad-min-silence-duration-ms N [%-7d] VAD min silence duration (to split segments)\n",   params.vad_min_silence_duration_ms);
+    fprintf(stderr, "  -vmsd N,   --vad-max-speech-duration-s N   [%-7s] VAD max speech duration (auto-split longer)\n",    params.vad_max_speech_duration_s == FLT_MAX ?
+ std::string("FLT_MAX").c_str() : + std::to_string(params.vad_max_speech_duration_s).c_str()); + fprintf(stderr, " -vp N, --vad-speech-pad-ms N [%-7d] VAD speech padding (extend segments)\n", params.vad_speech_pad_ms); + fprintf(stderr, " -vo N, --vad-samples-overlap N [%-7.2f] VAD samples overlap (seconds between segments)\n", params.vad_samples_overlap); fprintf(stderr, "\n"); } @@ -124,6 +153,17 @@ int main(int argc, char ** argv) { return 1; } + if (params.no_prints) { + whisper_log_set([](enum ggml_log_level, const char*, void*) { }, NULL); + } + + if (params.vad) { + // For VAD, ensure at least 500 of context + params.keep_ms = std::max(params.keep_ms, 500); + } else { + params.keep_ms = std::min(params.keep_ms, params.step_ms); + } + params.keep_ms = std::min(params.keep_ms, params.step_ms); params.length_ms = std::max(params.length_ms, params.step_ms); @@ -132,7 +172,7 @@ int main(int argc, char ** argv) { const int n_samples_keep = (1e-3*params.keep_ms )*WHISPER_SAMPLE_RATE; const int n_samples_30s = (1e-3*30000.0 )*WHISPER_SAMPLE_RATE; - const bool use_vad = n_samples_step <= 0; // sliding window mode uses VAD + const bool use_vad = params.vad; const int n_new_line = !use_vad ? std::max(1, params.length_ms / params.step_ms - 1) : 1; // number of steps to print new line @@ -242,6 +282,30 @@ int main(int argc, char ** argv) { break; } + whisper_full_params wparams = whisper_full_default_params(params.beam_size > 1 ? WHISPER_SAMPLING_BEAM_SEARCH : WHISPER_SAMPLING_GREEDY); + + wparams.print_progress = false; + wparams.print_special = params.print_special; + wparams.print_realtime = false; + wparams.print_timestamps = !params.no_timestamps; + wparams.translate = params.translate; + wparams.single_segment = !use_vad; + wparams.max_tokens = params.max_tokens; + wparams.language = params.language.c_str(); + wparams.n_threads = params.n_threads; + wparams.beam_search.beam_size = params.beam_size; + + wparams.audio_ctx = params.audio_ctx; + + wparams.tdrz_enable = params.tinydiarize; // [TDRZ] + + // disable temperature fallback + //wparams.temperature_inc = -1.0f; + wparams.temperature_inc = params.no_fallback ? 0.0f : wparams.temperature_inc; + + wparams.prompt_tokens = params.no_context ? nullptr : prompt_tokens.data(); + wparams.prompt_n_tokens = params.no_context ? 
@@ -132,7 +172,7 @@ int main(int argc, char ** argv) {
     const int n_samples_keep = (1e-3*params.keep_ms  )*WHISPER_SAMPLE_RATE;
     const int n_samples_30s  = (1e-3*30000.0         )*WHISPER_SAMPLE_RATE;
 
-    const bool use_vad = n_samples_step <= 0; // sliding window mode uses VAD
+    const bool use_vad = params.vad;
 
     const int n_new_line = !use_vad ? std::max(1, params.length_ms / params.step_ms - 1) : 1; // number of steps to print new line
 
@@ -242,6 +282,30 @@ int main(int argc, char ** argv) {
             break;
         }
 
+        whisper_full_params wparams = whisper_full_default_params(params.beam_size > 1 ? WHISPER_SAMPLING_BEAM_SEARCH : WHISPER_SAMPLING_GREEDY);
+
+        wparams.print_progress   = false;
+        wparams.print_special    = params.print_special;
+        wparams.print_realtime   = false;
+        wparams.print_timestamps = !params.no_timestamps;
+        wparams.translate        = params.translate;
+        wparams.single_segment   = !use_vad;
+        wparams.max_tokens       = params.max_tokens;
+        wparams.language         = params.language.c_str();
+        wparams.n_threads        = params.n_threads;
+        wparams.beam_search.beam_size = params.beam_size;
+
+        wparams.audio_ctx        = params.audio_ctx;
+
+        wparams.tdrz_enable      = params.tinydiarize; // [TDRZ]
+
+        // disable temperature fallback
+        //wparams.temperature_inc  = -1.0f;
+        wparams.temperature_inc  = params.no_fallback ? 0.0f : wparams.temperature_inc;
+
+        wparams.prompt_tokens    = params.no_context ? nullptr : prompt_tokens.data();
+        wparams.prompt_n_tokens  = params.no_context ? 0 : prompt_tokens.size();
+
         // process new audio
 
         if (!use_vad) {
@@ -295,8 +359,26 @@ int main(int argc, char ** argv) {
 
             audio.get(2000, pcmf32_new);
 
-            if (::vad_simple(pcmf32_new, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, false)) {
-                audio.get(params.length_ms, pcmf32);
+            whisper_full_params wvparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
+            wvparams.vad                                = params.vad;
+            wvparams.vad_model_path                     = params.vad_model.c_str();
+            wvparams.vad_params.threshold               = params.vad_threshold;
+            wvparams.vad_params.min_speech_duration_ms  = params.vad_min_speech_duration_ms;
+            wvparams.vad_params.min_silence_duration_ms = params.vad_min_silence_duration_ms;
+            wvparams.vad_params.max_speech_duration_s   = params.vad_max_speech_duration_s;
+            wvparams.vad_params.speech_pad_ms           = params.vad_speech_pad_ms;
+            wvparams.vad_params.samples_overlap         = params.vad_samples_overlap;
+
+            float * vad_samples;
+            int n_vad_samples;
+            if (whisper_vad(ctx, wvparams, pcmf32_new.data(), pcmf32_new.size(), &vad_samples, &n_vad_samples)) {
+                if (n_vad_samples == 0) {
+                    std::this_thread::sleep_for(std::chrono::milliseconds(100));
+                    continue;
+                }
+
+                pcmf32.assign(vad_samples, vad_samples + n_vad_samples);
+                free(vad_samples);
             } else {
                 std::this_thread::sleep_for(std::chrono::milliseconds(100));
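The polling loop above replaces the old energy-based `vad_simple` gate: each iteration inspects the last 2000 ms of captured audio and only forwards model-detected speech to the decoder. A condensed view of that control flow, with names from this patch and the loop plumbing trimmed, mainly to highlight the ownership of the returned buffer:

```cpp
float * vad_samples   = nullptr;
int     n_vad_samples = 0;

if (!whisper_vad(ctx, wvparams, pcmf32_new.data(), pcmf32_new.size(), &vad_samples, &n_vad_samples)) {
    // VAD itself failed: back off for 100 ms and poll again
} else if (n_vad_samples == 0) {
    // success, but no speech in this window: back off and poll again
} else {
    // speech detected: transcribe only the filtered samples
    pcmf32.assign(vad_samples, vad_samples + n_vad_samples);
    free(vad_samples); // buffer is malloc'd inside whisper_vad(); the caller owns it
}
```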
@@ -308,30 +390,6 @@ int main(int argc, char ** argv) {
 
         // run the inference
         {
-            whisper_full_params wparams = whisper_full_default_params(params.beam_size > 1 ? WHISPER_SAMPLING_BEAM_SEARCH : WHISPER_SAMPLING_GREEDY);
-
-            wparams.print_progress   = false;
-            wparams.print_special    = params.print_special;
-            wparams.print_realtime   = false;
-            wparams.print_timestamps = !params.no_timestamps;
-            wparams.translate        = params.translate;
-            wparams.single_segment   = !use_vad;
-            wparams.max_tokens       = params.max_tokens;
-            wparams.language         = params.language.c_str();
-            wparams.n_threads        = params.n_threads;
-            wparams.beam_search.beam_size = params.beam_size;
-
-            wparams.audio_ctx        = params.audio_ctx;
-
-            wparams.tdrz_enable      = params.tinydiarize; // [TDRZ]
-
-            // disable temperature fallback
-            //wparams.temperature_inc  = -1.0f;
-            wparams.temperature_inc  = params.no_fallback ? 0.0f : wparams.temperature_inc;
-
-            wparams.prompt_tokens    = params.no_context ? nullptr : prompt_tokens.data();
-            wparams.prompt_n_tokens  = params.no_context ? 0 : prompt_tokens.size();
-
             if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
                 fprintf(stderr, "%s: failed to process audio\n", argv[0]);
                 return 6;
diff --git a/include/whisper.h b/include/whisper.h
index 4aeda98f334..e2b78b3165c 100644
--- a/include/whisper.h
+++ b/include/whisper.h
@@ -707,6 +707,14 @@ extern "C" {
                            const float * samples,
                            int   n_samples);
 
+    WHISPER_API bool whisper_vad(
+            struct whisper_context * ctx,
+            struct whisper_full_params params,
+            const float * samples,
+            int n_samples,
+            float ** vad_samples,
+            int * n_vad_samples);
+
     WHISPER_API int whisper_vad_segments_n_segments(struct whisper_vad_segments * segments);
 
     WHISPER_API float whisper_vad_segments_get_segment_t0(struct whisper_vad_segments * segments, int i_segment);
diff --git a/src/whisper.cpp b/src/whisper.cpp
index a2f28d7db54..ea1bff4e9af 100644
--- a/src/whisper.cpp
+++ b/src/whisper.cpp
@@ -6792,6 +6792,38 @@ static bool whisper_vad(
     return true;
 }
 
+bool whisper_vad(
+        struct whisper_context * ctx,
+        struct whisper_full_params params,
+        const float * samples,
+        int n_samples,
+        float ** vad_samples,
+        int * n_vad_samples) {
+
+    std::vector<float> filtered_samples;
+
+    if (!whisper_vad(ctx, ctx->state, params, samples, n_samples, filtered_samples)) {
+        WHISPER_LOG_ERROR("%s: VAD processing failed\n", __func__);
+        return false;
+    }
+
+    *n_vad_samples = filtered_samples.size();
+
+    if (filtered_samples.size() == 0) {
+        *vad_samples = nullptr; // no speech detected
+        return true;
+    }
+
+    *vad_samples = (float *) malloc(filtered_samples.size() * sizeof(float));
+    if (!*vad_samples) {
+        WHISPER_LOG_ERROR("%s: failed to allocate memory for filtered samples\n", __func__);
+        return false;
+    }
+
+    std::memcpy(*vad_samples, filtered_samples.data(), filtered_samples.size() * sizeof(float));
+
+    return true;
+}
+
 int whisper_full_with_state(
         struct whisper_context * ctx,
         struct whisper_state * state,
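Taken together, the new `whisper_vad()` entry point can also be used on its own, outside the stream example. A minimal sketch, assuming a `whisper_context` loaded elsewhere; the `filter_speech()` helper is hypothetical, the model path is the default used by this patch, and the threshold value is illustrative:

```cpp
#include "whisper.h"

#include <cstdlib>
#include <vector>

// Fills `speech` with only the speech portions of `pcm` (16 kHz mono samples).
static bool filter_speech(whisper_context * ctx, const std::vector<float> & pcm, std::vector<float> & speech) {
    whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);

    params.vad                  = true;
    params.vad_model_path       = "models/for-tests-silero-v5.1.2-ggml.bin"; // default used by this patch
    params.vad_params.threshold = 0.5f;                                      // illustrative value

    float * vad_samples   = nullptr;
    int     n_vad_samples = 0;

    if (!whisper_vad(ctx, params, pcm.data(), (int) pcm.size(), &vad_samples, &n_vad_samples)) {
        return false; // VAD failed (e.g. the VAD model could not be loaded)
    }

    speech.clear();
    if (n_vad_samples > 0) {
        speech.assign(vad_samples, vad_samples + n_vad_samples);
        free(vad_samples); // malloc'd by whisper_vad(); the caller owns it
    }

    return true; // an empty `speech` means no speech was detected
}
```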