forked from thewh1teagle/sherpa-rs
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathvad_whisper.rs
126 lines (111 loc) · 4.96 KB
/
vad_whisper.rs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
/*
Detect speech in an audio file, transcribe each speech segment, and label it with a speaker name
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.tar.bz2
wget https://github.com/snakers4/silero-vad/raw/master/files/silero_vad.onnx
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/nemo_en_speakerverification_speakernet.onnx
tar xvf sherpa-onnx-whisper-tiny.tar.bz2
wget https://github.com/thewh1teagle/sherpa-rs/releases/download/v0.1.0/sam_altman.wav -O sam_altman.wav
cargo run --example vad_whisper sam_altman.wav
*/
use sherpa_rs::{
embedding_manager, read_audio_file, speaker_id,
vad::{Vad, VadConfig},
whisper::{WhisperConfig, WhisperRecognizer},
};
/// Runs voice-activity detection over an audio file, then transcribes every detected
/// speech segment with Whisper and labels it with a speaker name.
///
/// The audio path is read from the first command-line argument. Model files are loaded
/// from fixed relative paths (see the download commands at the top of this file).
///
/// For each segment one line is printed: `(<speaker>) <text> | <start>s - <end>s`.
///
/// # Panics
///
/// Panics if the path argument is missing, the audio file cannot be read, the sample
/// rate is not 16000, a model fails to load, or a speaker embedding cannot be computed
/// or registered.
fn main() {
    // Read audio data from the file
    let path = std::env::args().nth(1).expect("Missing file path argument");
    let (mut samples, sample_rate) = read_audio_file(&path).expect("failed to read audio file");
    assert_eq!(sample_rate, 16000, "The sample rate must be 16000.");

    // Pad with 3 seconds of silence so vad will be able to detect stop
    let padded_len = samples.len() + 3 * sample_rate as usize;
    samples.resize(padded_len, 0.0);

    let extractor_config = speaker_id::ExtractorConfig {
        model: "nemo_en_speakerverification_speakernet.onnx".into(),
        ..Default::default()
    };
    let mut extractor = speaker_id::EmbeddingExtractor::new(extractor_config)
        .expect("failed to create speaker embedding extractor");
    // Size the manager from the extractor so both agree on the embedding dimension.
    let mut embedding_manager = embedding_manager::EmbeddingManager::new(
        extractor
            .embedding_size
            .try_into()
            .expect("embedding size does not fit the manager's dimension type"),
    );

    let config = WhisperConfig {
        decoder: "sherpa-onnx-whisper-tiny/tiny-decoder.onnx".into(),
        encoder: "sherpa-onnx-whisper-tiny/tiny-encoder.onnx".into(),
        tokens: "sherpa-onnx-whisper-tiny/tiny-tokens.txt".into(),
        language: "en".into(),
        ..Default::default() // fill in any missing fields with defaults
    };
    let mut recognizer =
        WhisperRecognizer::new(config).expect("failed to create whisper recognizer");

    // Number of speakers registered so far; also used to name the next new speaker.
    let mut speaker_counter = 0;

    let window_size: usize = 512;
    let vad_config = VadConfig {
        model: "silero_vad.onnx".into(),
        window_size: window_size as i32,
        ..Default::default()
    };
    // NOTE(review): the second argument is presumably the detector's buffer length in
    // seconds (10 minutes here) — confirm against `Vad::new`.
    let mut vad = Vad::new(vad_config, 60.0 * 10.0).expect("failed to create vad");

    // Feed the audio window by window. `chunks` also yields the final, shorter
    // remainder, so no separate tail handling is needed.
    for window in samples.chunks(window_size) {
        vad.accept_waveform(window.to_vec()); // the detector takes an owned Vec

        // Drain after every window, whether or not speech is currently active, so that
        // segments queued during the trailing silence (or when the sample count is an
        // exact multiple of the window size) are still reported.
        while !vad.is_empty() {
            let segment = vad.front();
            let start_sec = segment.start as f32 / sample_rate as f32;
            let duration_sec = segment.samples.len() as f32 / sample_rate as f32;
            let transcript = recognizer.transcribe(sample_rate, &segment.samples);

            // Compute the speaker embedding
            let mut embedding = extractor
                .compute_speaker_embedding(segment.samples, sample_rate)
                .expect("failed to compute speaker embedding");

            // Reuse a known speaker when `search` finds a match (0.4 is presumably the
            // similarity threshold); otherwise register a new one.
            let name = match embedding_manager.search(&embedding, 0.4) {
                Some(speaker_name) => speaker_name,
                None => {
                    let name = format!("speaker {}", speaker_counter);
                    embedding_manager
                        .add(name.clone(), &mut embedding)
                        .expect("failed to register speaker embedding");
                    speaker_counter += 1;
                    name
                }
            };

            println!(
                "({}) {} | {}s - {}s",
                name,
                transcript.text,
                start_sec,
                start_sec + duration_sec
            );
            vad.pop();
        }
    }
}