Skip to content

Commit 961cc89

Browse files
committed
Merge branch 'concedo_experimental' into croco_nex_0
2 parents bdf6019 + 91b6e29 commit 961cc89

File tree

6 files changed

+51
-10
lines changed

6 files changed

+51
-10
lines changed

expose.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
#include <cstdint>
33

44
const int tensor_split_max = 16;
5-
const int images_max = 4;
5+
const int images_max = 8;
66
const int logprobs_max = 5;
77

88
// match kobold's sampler list and order
@@ -194,6 +194,7 @@ struct whisper_generation_inputs
194194
const char * prompt = nullptr;
195195
const char * audio_data = nullptr;
196196
const bool suppress_non_speech = false;
197+
const char * langcode = nullptr;
197198
const bool quiet = false;
198199
};
199200
struct whisper_generation_outputs

kcpp_docs.embd

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1344,13 +1344,27 @@
13441344
"application/json": {
13451345
"example": {
13461346
"prompt": "",
1347+
"suppress_non_speech" : false,
1348+
"langcode": "en",
13471349
"audio_data": "base64_wav_data",
13481350
},
13491351
"schema": {
13501352
"properties": {
13511353
"audio_data": {
13521354
"type": "string",
13531355
"description": "Base64 representation of a 16-bit 16kHz wave file to be transcribed to text."
1356+
},
1357+
"prompt": {
1358+
"type": "string",
1359+
"description": "Prompt to steer the transcription."
1360+
},
1361+
"langcode": {
1362+
"type": "string",
1363+
"description": "Two letter language code, or use auto to autodetect."
1364+
},
1365+
"suppress_non_speech": {
1366+
"type": "boolean",
1367+
"description": "Prevent noise tokens, always generate words for speech."
13541368
}
13551369
},
13561370
"type": "object"

klite.embd

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ Current version indicated by LITEVER below.
1212
-->
1313

1414
<script>
15-
const LITEVER = 201;
15+
const LITEVER = 202;
1616
const urlParams = new URLSearchParams(window.location.search);
1717
var localflag = true;
1818
const STORAGE_PREFIX = (localflag?"e_":"")+"kaihordewebui_";
@@ -3045,6 +3045,7 @@ Current version indicated by LITEVER below.
30453045
narrate_only_dialog: false,
30463046
voice_end_delay: 300,
30473047
voice_suppress_nonspeech: false,
3048+
voice_langcode: "auto",
30483049
tts_speed: 1.0,
30493050
image_styles: "",
30503051
image_negprompt: "",
@@ -10331,6 +10332,7 @@ initializeInstructUIFunctionality();
1033110332
document.getElementById("tts_speed").value = localsettings.tts_speed;
1033210333
document.getElementById("voice_end_delay").value = localsettings.voice_end_delay;
1033310334
document.getElementById("voice_suppress_nonspeech").checked = localsettings.voice_suppress_nonspeech;
10335+
document.getElementById("voice_langcode").value = localsettings.voice_langcode;
1033410336
toggle_opmode();
1033510337

1033610338
//sd models display
@@ -10619,6 +10621,7 @@ initializeInstructUIFunctionality();
1061910621
localsettings.tts_speed = document.getElementById("tts_speed").value;
1062010622
localsettings.voice_end_delay = document.getElementById("voice_end_delay").value;
1062110623
localsettings.voice_suppress_nonspeech = (document.getElementById("voice_suppress_nonspeech").checked?true:false);
10624+
localsettings.voice_langcode = document.getElementById("voice_langcode").value;
1062210625
localsettings.auto_ctxlen = (document.getElementById("auto_ctxlen").checked ? true : false);
1062310626
localsettings.auto_genamt = (document.getElementById("auto_genamt").checked ? true : false);
1062410627

@@ -15300,7 +15303,8 @@ initializeInstructUIFunctionality();
1530015303
let payload = {
1530115304
"audio_data": dataurl,
1530215305
"prompt": "",
15303-
"suppress_non_speech": localsettings.voice_suppress_nonspeech
15306+
"suppress_non_speech": localsettings.voice_suppress_nonspeech,
15307+
"langcode": localsettings.voice_langcode,
1530415308
};
1530515309
fetch(apply_proxy_url(custom_kobold_endpoint + koboldcpp_transcribe_endpoint), {
1530615310
method: 'POST',
@@ -15350,7 +15354,8 @@ initializeInstructUIFunctionality();
1535015354
let payload = {
1535115355
"audio_data": dataurl,
1535215356
"prompt": "",
15353-
"suppress_non_speech": (document.getElementById("voice_suppress_nonspeech").checked?true:false)
15357+
"suppress_non_speech": (document.getElementById("voice_suppress_nonspeech").checked?true:false),
15358+
"langcode": document.getElementById("voice_langcode").value
1535415359
};
1535515360
fetch(apply_proxy_url(custom_kobold_endpoint + koboldcpp_transcribe_endpoint), {
1535615361
method: 'POST',
@@ -20230,6 +20235,12 @@ initializeInstructUIFunctionality();
2023020235
<div class="justifyleft" style="padding:2px" title="Suppress non-speech (e.g. music and sounds) from transcription">Suppress Non-Speech </div>
2023120236
<input title="Suppress Non-Speech" type="checkbox" id="voice_suppress_nonspeech" style="margin:0px 0px 0px auto;">
2023220237
</div>
20238+
<div class="inlinelabel" style="font-size: 11px;">
20239+
<div class="justifyleft" style="padding:2px" title="Language Code">Language </div>
20240+
<input class="settinglabel miniinput" type="text" placeholder="en" value="auto" id="voice_langcode" style="height:18px; width: 36px; padding: 2px;">
20241+
20242+
</div>
20243+
2023320244
<div class="inlinelabel" style="font-size: 11px;">
2023420245
<div class="justifyleft" style="padding:3px">Voice Delay: </div>
2023520246
<input title="Voice Delay Milliseconds" type="text" inputmode="decimal" value="300" id="voice_end_delay" style="width:40px">

koboldcpp.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131
# constants
3232
sampler_order_max = 7
3333
tensor_split_max = 16
34-
images_max = 4
34+
images_max = 8
3535
bias_min_value = -100.0
3636
bias_max_value = 100.0
3737
logprobs_max = 5
@@ -285,6 +285,7 @@ class whisper_generation_inputs(ctypes.Structure):
285285
_fields_ = [("prompt", ctypes.c_char_p),
286286
("audio_data", ctypes.c_char_p),
287287
("suppress_non_speech", ctypes.c_bool),
288+
("langcode", ctypes.c_char_p),
288289
("quiet", ctypes.c_bool)]
289290

290291
class whisper_generation_outputs(ctypes.Structure):
@@ -1791,6 +1792,9 @@ def whisper_generate(genparams):
17911792
inputs.prompt = prompt.encode("UTF-8")
17921793
inputs.audio_data = audio_data.encode("UTF-8")
17931794
inputs.quiet = is_quiet
1795+
lc = genparams.get("langcode", "auto")
1796+
lc = lc.strip().lower() if (lc and lc.strip().lower()!="") else "auto"
1797+
inputs.langcode = lc.encode("UTF-8")
17941798
inputs.suppress_non_speech = genparams.get("suppress_non_speech", False)
17951799
ret = handle.whisper_generate(inputs)
17961800
outstr = ""

otherarch/whispercpp/whisper.cpp

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5355,13 +5355,19 @@ int whisper_full_with_state(
53555355

53565356
const auto lang_id = whisper_lang_auto_detect_with_state(ctx, state, 0, params.n_threads, probs.data());
53575357
if (lang_id < 0) {
5358-
WHISPER_LOG_ERROR("%s: failed to auto-detect language\n", __func__);
5358+
if(params.debug_mode)
5359+
{
5360+
printf("\n%s: failed to auto-detect language\n", __func__);
5361+
}
53595362
return -3;
53605363
}
53615364
state->lang_id = lang_id;
53625365
params.language = whisper_lang_str(lang_id);
53635366

5364-
WHISPER_LOG_INFO("%s: auto-detected language: %s (p = %f)\n", __func__, params.language, probs[whisper_lang_id(params.language)]);
5367+
if(params.debug_mode)
5368+
{
5369+
printf("\n%s: auto-detected language: %s (p = %f)\n", __func__, params.language, probs[whisper_lang_id(params.language)]);
5370+
}
53655371
if (params.detect_language) {
53665372
return 0;
53675373
}
@@ -5477,7 +5483,11 @@ int whisper_full_with_state(
54775483
std::vector<whisper_token> prompt_init = { whisper_token_sot(ctx), };
54785484

54795485
if (whisper_is_multilingual(ctx)) {
5480-
const int lang_id = whisper_lang_id(params.language);
5486+
int lang_id = whisper_lang_id(params.language);
5487+
if(lang_id<0)
5488+
{
5489+
lang_id = 0; //default to english
5490+
}
54815491
state->lang_id = lang_id;
54825492
prompt_init.push_back(whisper_token_lang(ctx, lang_id));
54835493
if (params.translate) {

otherarch/whispercpp/whisper_adapter.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -217,6 +217,7 @@ whisper_generation_outputs whispertype_generate(const whisper_generation_inputs
217217

218218
const std::string b64data = std::string(inputs.audio_data);
219219
const std::string initprompt = std::string(inputs.prompt);
220+
const std::string langcode = std::string(inputs.langcode);
220221

221222
std::vector<float> pcmf32; // mono-channel F32 PCM
222223
std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
@@ -236,7 +237,7 @@ whisper_generation_outputs whispertype_generate(const whisper_generation_inputs
236237
wparams.print_timestamps = false;
237238
wparams.print_special = false;
238239
wparams.translate = false;
239-
wparams.language = "auto";
240+
wparams.language = langcode.c_str();
240241
wparams.detect_language = false;
241242
wparams.n_threads = 4;
242243
wparams.n_max_text_ctx = wparams.n_max_text_ctx;
@@ -248,7 +249,7 @@ whisper_generation_outputs whispertype_generate(const whisper_generation_inputs
248249
wparams.split_on_word = false;
249250
wparams.audio_ctx = 0;
250251
wparams.speed_up = false;
251-
wparams.debug_mode = false;
252+
wparams.debug_mode = (whisperdebugmode==1);
252253
wparams.tdrz_enable = false;
253254
wparams.suppress_regex = nullptr;
254255
wparams.suppress_non_speech_tokens = inputs.suppress_non_speech;

0 commit comments

Comments
 (0)