Skip to content

Commit 91b6e29

Browse files
committed
added multilingual support for whisper
1 parent 0cb5995 commit 91b6e29

File tree

6 files changed

+49
-8
lines changed

6 files changed

+49
-8
lines changed

expose.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -192,6 +192,7 @@ struct whisper_generation_inputs
192192
const char * prompt = nullptr;
193193
const char * audio_data = nullptr;
194194
const bool suppress_non_speech = false;
195+
const char * langcode = nullptr;
195196
const bool quiet = false;
196197
};
197198
struct whisper_generation_outputs

kcpp_docs.embd

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1344,13 +1344,27 @@
13441344
"application/json": {
13451345
"example": {
13461346
"prompt": "",
1347+
"suppress_non_speech" : false,
1348+
"langcode": "en",
13471349
"audio_data": "base64_wav_data",
13481350
},
13491351
"schema": {
13501352
"properties": {
13511353
"audio_data": {
13521354
"type": "string",
13531355
"description": "Base64 representation of a 16-bit 16kHz wave file to be transcribed to text."
1356+
},
1357+
"prompt": {
1358+
"type": "string",
1359+
"description": "Prompt to steer the transcription."
1360+
},
1361+
"langcode": {
1362+
"type": "string",
1363+
"description": "Two letter language code, or use auto to autodetect."
1364+
},
1365+
"suppress_non_speech": {
1366+
"type": "boolean",
1367+
"description": "Prevent noise tokens, always generate words for speech."
13541368
}
13551369
},
13561370
"type": "object"

klite.embd

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ Current version indicated by LITEVER below.
1212
-->
1313

1414
<script>
15-
const LITEVER = 201;
15+
const LITEVER = 202;
1616
const urlParams = new URLSearchParams(window.location.search);
1717
var localflag = true;
1818
const STORAGE_PREFIX = (localflag?"e_":"")+"kaihordewebui_";
@@ -3044,6 +3044,7 @@ Current version indicated by LITEVER below.
30443044
narrate_only_dialog: false,
30453045
voice_end_delay: 300,
30463046
voice_suppress_nonspeech: false,
3047+
voice_langcode: "auto",
30473048
tts_speed: 1.0,
30483049
image_styles: "",
30493050
image_negprompt: "",
@@ -10330,6 +10331,7 @@ initializeInstructUIFunctionality();
1033010331
document.getElementById("tts_speed").value = localsettings.tts_speed;
1033110332
document.getElementById("voice_end_delay").value = localsettings.voice_end_delay;
1033210333
document.getElementById("voice_suppress_nonspeech").checked = localsettings.voice_suppress_nonspeech;
10334+
document.getElementById("voice_langcode").value = localsettings.voice_langcode;
1033310335
toggle_opmode();
1033410336

1033510337
//sd models display
@@ -10618,6 +10620,7 @@ initializeInstructUIFunctionality();
1061810620
localsettings.tts_speed = document.getElementById("tts_speed").value;
1061910621
localsettings.voice_end_delay = document.getElementById("voice_end_delay").value;
1062010622
localsettings.voice_suppress_nonspeech = (document.getElementById("voice_suppress_nonspeech").checked?true:false);
10623+
localsettings.voice_langcode = document.getElementById("voice_langcode").value;
1062110624
localsettings.auto_ctxlen = (document.getElementById("auto_ctxlen").checked ? true : false);
1062210625
localsettings.auto_genamt = (document.getElementById("auto_genamt").checked ? true : false);
1062310626

@@ -15299,7 +15302,8 @@ initializeInstructUIFunctionality();
1529915302
let payload = {
1530015303
"audio_data": dataurl,
1530115304
"prompt": "",
15302-
"suppress_non_speech": localsettings.voice_suppress_nonspeech
15305+
"suppress_non_speech": localsettings.voice_suppress_nonspeech,
15306+
"langcode": localsettings.voice_langcode,
1530315307
};
1530415308
fetch(apply_proxy_url(custom_kobold_endpoint + koboldcpp_transcribe_endpoint), {
1530515309
method: 'POST',
@@ -15349,7 +15353,8 @@ initializeInstructUIFunctionality();
1534915353
let payload = {
1535015354
"audio_data": dataurl,
1535115355
"prompt": "",
15352-
"suppress_non_speech": (document.getElementById("voice_suppress_nonspeech").checked?true:false)
15356+
"suppress_non_speech": (document.getElementById("voice_suppress_nonspeech").checked?true:false),
15357+
"langcode": document.getElementById("voice_langcode").value
1535315358
};
1535415359
fetch(apply_proxy_url(custom_kobold_endpoint + koboldcpp_transcribe_endpoint), {
1535515360
method: 'POST',
@@ -20229,6 +20234,12 @@ initializeInstructUIFunctionality();
2022920234
<div class="justifyleft" style="padding:2px" title="Suppress non-speech (e.g. music and sounds) from transcription">Suppress Non-Speech </div>
2023020235
<input title="Suppress Non-Speech" type="checkbox" id="voice_suppress_nonspeech" style="margin:0px 0px 0px auto;">
2023120236
</div>
20237+
<div class="inlinelabel" style="font-size: 11px;">
20238+
<div class="justifyleft" style="padding:2px" title="Language Code">Language </div>
20239+
<input class="settinglabel miniinput" type="text" placeholder="en" value="auto" id="voice_langcode" style="height:18px; width: 36px; padding: 2px;">
20240+
20241+
</div>
20242+
2023220243
<div class="inlinelabel" style="font-size: 11px;">
2023320244
<div class="justifyleft" style="padding:3px">Voice Delay: </div>
2023420245
<input title="Voice Delay Milliseconds" type="text" inputmode="decimal" value="300" id="voice_end_delay" style="width:40px">

koboldcpp.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -273,6 +273,7 @@ class whisper_generation_inputs(ctypes.Structure):
273273
_fields_ = [("prompt", ctypes.c_char_p),
274274
("audio_data", ctypes.c_char_p),
275275
("suppress_non_speech", ctypes.c_bool),
276+
("langcode", ctypes.c_char_p),
276277
("quiet", ctypes.c_bool)]
277278

278279
class whisper_generation_outputs(ctypes.Structure):
@@ -1252,6 +1253,9 @@ def whisper_generate(genparams):
12521253
inputs.prompt = prompt.encode("UTF-8")
12531254
inputs.audio_data = audio_data.encode("UTF-8")
12541255
inputs.quiet = is_quiet
1256+
lc = genparams.get("langcode", "auto")
1257+
lc = lc.strip().lower() if (lc and lc.strip().lower()!="") else "auto"
1258+
inputs.langcode = lc.encode("UTF-8")
12551259
inputs.suppress_non_speech = genparams.get("suppress_non_speech", False)
12561260
ret = handle.whisper_generate(inputs)
12571261
outstr = ""

otherarch/whispercpp/whisper.cpp

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5355,13 +5355,19 @@ int whisper_full_with_state(
53555355

53565356
const auto lang_id = whisper_lang_auto_detect_with_state(ctx, state, 0, params.n_threads, probs.data());
53575357
if (lang_id < 0) {
5358-
WHISPER_LOG_ERROR("%s: failed to auto-detect language\n", __func__);
5358+
if(params.debug_mode)
5359+
{
5360+
printf("\n%s: failed to auto-detect language\n", __func__);
5361+
}
53595362
return -3;
53605363
}
53615364
state->lang_id = lang_id;
53625365
params.language = whisper_lang_str(lang_id);
53635366

5364-
WHISPER_LOG_INFO("%s: auto-detected language: %s (p = %f)\n", __func__, params.language, probs[whisper_lang_id(params.language)]);
5367+
if(params.debug_mode)
5368+
{
5369+
printf("\n%s: auto-detected language: %s (p = %f)\n", __func__, params.language, probs[whisper_lang_id(params.language)]);
5370+
}
53655371
if (params.detect_language) {
53665372
return 0;
53675373
}
@@ -5477,7 +5483,11 @@ int whisper_full_with_state(
54775483
std::vector<whisper_token> prompt_init = { whisper_token_sot(ctx), };
54785484

54795485
if (whisper_is_multilingual(ctx)) {
5480-
const int lang_id = whisper_lang_id(params.language);
5486+
int lang_id = whisper_lang_id(params.language);
5487+
if(lang_id<0)
5488+
{
5489+
lang_id = 0; //default to english
5490+
}
54815491
state->lang_id = lang_id;
54825492
prompt_init.push_back(whisper_token_lang(ctx, lang_id));
54835493
if (params.translate) {

otherarch/whispercpp/whisper_adapter.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -217,6 +217,7 @@ whisper_generation_outputs whispertype_generate(const whisper_generation_inputs
217217

218218
const std::string b64data = std::string(inputs.audio_data);
219219
const std::string initprompt = std::string(inputs.prompt);
220+
const std::string langcode = std::string(inputs.langcode);
220221

221222
std::vector<float> pcmf32; // mono-channel F32 PCM
222223
std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
@@ -236,7 +237,7 @@ whisper_generation_outputs whispertype_generate(const whisper_generation_inputs
236237
wparams.print_timestamps = false;
237238
wparams.print_special = false;
238239
wparams.translate = false;
239-
wparams.language = "auto";
240+
wparams.language = langcode.c_str();
240241
wparams.detect_language = false;
241242
wparams.n_threads = 4;
242243
wparams.n_max_text_ctx = wparams.n_max_text_ctx;
@@ -248,7 +249,7 @@ whisper_generation_outputs whispertype_generate(const whisper_generation_inputs
248249
wparams.split_on_word = false;
249250
wparams.audio_ctx = 0;
250251
wparams.speed_up = false;
251-
wparams.debug_mode = false;
252+
wparams.debug_mode = (whisperdebugmode==1);
252253
wparams.tdrz_enable = false;
253254
wparams.suppress_regex = nullptr;
254255
wparams.suppress_non_speech_tokens = inputs.suppress_non_speech;

0 commit comments

Comments
 (0)