Skip to content

Commit 1f5fdbe

Browse files
ruby : add VAD support, migration to Ruby's newer API (#3197)
* Add VAD models * Extract function to normalize model path from ruby_whisper_initialize() * Define ruby_whisper_vad_params struct * Add VAD-related features to Whisper::Params * Add tests for VAD-related features * Define Whisper::VADParams * Add Whisper::VAD::Params attributes * Add test suite for VAD::Params * Make older test to follow namespace change * Add test for transcription with VAD * Add assertion for test_vad_params * Add signatures for VAD-related methods * Define VAD::Params#== * Add test for VAD::Params#== * Fix Params#vad_params * Add test for Params#vad_params * Fix signature of Params#vad_params * Use macro to define VAD::Params params * Define VAD::Params#initialize * Add tests for VAD::Params#initialize * Add signature for VAD::Params.new * Add documentation on VAD in README * Wrap register_callbask in prepare_transcription for clear meanings * Set whisper_params.vad_params just before transcription * Don't touch NULL * Define ruby_whisper_params_type * Use TypedData_XXX for ruby_whisper_params instead of Data_XXX * Remove unused functions * Define rb_whisper_model_data_type * Use TypedData_XXX for ruby_whisper_model instead of Data_XXX * Define ruby_whisper_segment_type * Use TypedData_XXX for ruby_whisper_segment instead of Data_XXX * Define ruby_whisper_type * Use TypedData_XXX for ruby_whisper instead of Data_XXX * Qualify with const
1 parent 5720426 commit 1f5fdbe

14 files changed

+924
-172
lines changed

bindings/ruby/README.md

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,41 @@ See [models][] page for details.
111111

112112
Currently, whisper.cpp accepts only 16-bit WAV files.
113113

114+
### Voice Activity Detection (VAD) ###
115+
116+
Support for Voice Activity Detection (VAD) can be enabled by setting `Whisper::Params`'s `vad` argument to `true` and specifying a VAD model:
117+
118+
```ruby
119+
Whisper::Params.new(
120+
vad: true,
121+
vad_model_path: "silero-v5.1.2",
122+
# other arguments...
123+
)
124+
```
125+
126+
When you pass a model name (`"silero-v5.1.2"`) or a URI (`https://huggingface.co/ggml-org/whisper-vad/resolve/main/ggml-silero-v5.1.2.bin`), the model will be downloaded automatically.
127+
Currently, "silero-v5.1.2" is registered as a pre-converted model, like the ASR models. You can also specify the file path or URI of a model.
128+
129+
If you need to configure VAD behavior, pass params for that:
130+
131+
```ruby
132+
Whisper::Params.new(
133+
vad: true,
134+
vad_model_path: "silero-v5.1.2",
135+
vad_params: Whisper::VAD::Params.new(
136+
threshold: 1.0, # defaults to 0.5
137+
min_speech_duration_ms: 500, # defaults to 250
138+
min_silence_duration_ms: 200, # defaults to 100
139+
max_speech_duration_s: 30000, # defaults to FLT_MAX
140+
speech_pad_ms: 50, # defaults to 30
141+
samples_overlap: 0.5 # defaults to 0.1
142+
),
143+
# other arguments...
144+
)
145+
```
146+
147+
For details on VAD, see [whisper.cpp's README](https://github.com/ggml-org/whisper.cpp?tab=readme-ov-file#voice-activity-detection-vad).
148+
114149
API
115150
---
116151

bindings/ruby/ext/ruby_whisper.c

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,10 @@
33
#include "ruby_whisper.h"
44

55
VALUE mWhisper;
6+
VALUE mVAD;
67
VALUE cContext;
78
VALUE cParams;
9+
VALUE cVADParams;
810
VALUE eError;
911

1012
VALUE cSegment;
@@ -31,6 +33,7 @@ extern void init_ruby_whisper_params(VALUE *mWhisper);
3133
extern void init_ruby_whisper_error(VALUE *mWhisper);
3234
extern void init_ruby_whisper_segment(VALUE *mWhisper, VALUE *cSegment);
3335
extern void init_ruby_whisper_model(VALUE *mWhisper);
36+
extern void init_ruby_whisper_vad_params(VALUE *mVAD);
3437
extern void register_callbacks(ruby_whisper_params *rwp, VALUE *context);
3538

3639
/*
@@ -116,16 +119,6 @@ static VALUE ruby_whisper_s_log_set(VALUE self, VALUE log_callback, VALUE user_d
116119
return Qnil;
117120
}
118121

119-
static void rb_whisper_model_mark(ruby_whisper_model *rwm) {
120-
rb_gc_mark(rwm->context);
121-
}
122-
123-
static VALUE ruby_whisper_model_allocate(VALUE klass) {
124-
ruby_whisper_model *rwm;
125-
rwm = ALLOC(ruby_whisper_model);
126-
return Data_Wrap_Struct(klass, rb_whisper_model_mark, RUBY_DEFAULT_FREE, rwm);
127-
}
128-
129122
void Init_whisper() {
130123
id_to_s = rb_intern("to_s");
131124
id_call = rb_intern("call");
@@ -139,6 +132,7 @@ void Init_whisper() {
139132
id_pre_converted_models = rb_intern("pre_converted_models");
140133

141134
mWhisper = rb_define_module("Whisper");
135+
mVAD = rb_define_module_under(mWhisper, "VAD");
142136

143137
rb_define_const(mWhisper, "LOG_LEVEL_NONE", INT2NUM(GGML_LOG_LEVEL_NONE));
144138
rb_define_const(mWhisper, "LOG_LEVEL_INFO", INT2NUM(GGML_LOG_LEVEL_INFO));
@@ -159,6 +153,7 @@ void Init_whisper() {
159153
init_ruby_whisper_error(&mWhisper);
160154
init_ruby_whisper_segment(&mWhisper, &cContext);
161155
init_ruby_whisper_model(&mWhisper);
156+
init_ruby_whisper_vad_params(&mVAD);
162157

163158
rb_require("whisper/model/uri");
164159
}

bindings/ruby/ext/ruby_whisper.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,13 @@ typedef struct {
2121
ruby_whisper_callback_container *progress_callback_container;
2222
ruby_whisper_callback_container *encoder_begin_callback_container;
2323
ruby_whisper_callback_container *abort_callback_container;
24+
VALUE vad_params;
2425
} ruby_whisper_params;
2526

27+
typedef struct {
28+
struct whisper_vad_params params;
29+
} ruby_whisper_vad_params;
30+
2631
typedef struct {
2732
VALUE context;
2833
int index;

0 commit comments

Comments
 (0)