Skip to content

Commit 93b24bb

Browse files
committed Feb 26, 2025
Merge branch 'master' of https://github.com/ggerganov/whisper.cpp into miniaudio
2 parents 4508972 + dfc6ca6 commit 93b24bb

File tree

4 files changed

+23
-13
lines changed

4 files changed

+23
-13
lines changed
 

.github/workflows/docker.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@ jobs:
2828

2929
- name: Set up QEMU
3030
uses: docker/setup-qemu-action@v3
31+
with:
32+
image: tonistiigi/binfmt:qemu-v7.0.0-28
3133

3234
- name: Set up Docker Buildx
3335
uses: docker/setup-buildx-action@v3

examples/stream/stream.cpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ struct whisper_params {
2323
int32_t capture_id = -1;
2424
int32_t max_tokens = 32;
2525
int32_t audio_ctx = 0;
26+
int32_t beam_size = -1;
2627

2728
float vad_thold = 0.6f;
2829
float freq_thold = 100.0f;
@@ -59,6 +60,7 @@ static bool whisper_params_parse(int argc, char ** argv, whisper_params & params
5960
else if (arg == "-c" || arg == "--capture") { params.capture_id = std::stoi(argv[++i]); }
6061
else if (arg == "-mt" || arg == "--max-tokens") { params.max_tokens = std::stoi(argv[++i]); }
6162
else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); }
63+
else if (arg == "-bs" || arg == "--beam-size") { params.beam_size = std::stoi(argv[++i]); }
6264
else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); }
6365
else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); }
6466
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
@@ -96,6 +98,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
9698
fprintf(stderr, " -c ID, --capture ID [%-7d] capture device ID\n", params.capture_id);
9799
fprintf(stderr, " -mt N, --max-tokens N [%-7d] maximum number of tokens per audio chunk\n", params.max_tokens);
98100
fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx);
101+
fprintf(stderr, " -bs N, --beam-size N [%-7d] beam size for beam search\n", params.beam_size);
99102
fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold);
100103
fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold);
101104
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
@@ -298,7 +301,7 @@ int main(int argc, char ** argv) {
298301

299302
// run the inference
300303
{
301-
whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
304+
whisper_full_params wparams = whisper_full_default_params(params.beam_size > 1 ? WHISPER_SAMPLING_BEAM_SEARCH : WHISPER_SAMPLING_GREEDY);
302305

303306
wparams.print_progress = false;
304307
wparams.print_special = params.print_special;
@@ -309,6 +312,7 @@ int main(int argc, char ** argv) {
309312
wparams.max_tokens = params.max_tokens;
310313
wparams.language = params.language.c_str();
311314
wparams.n_threads = params.n_threads;
315+
wparams.beam_search.beam_size = params.beam_size;
312316

313317
wparams.audio_ctx = params.audio_ctx;
314318

src/CMakeLists.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,10 @@ set_target_properties(whisper PROPERTIES
9494
target_include_directories(whisper PUBLIC . ../include)
9595
target_compile_features (whisper PUBLIC cxx_std_11) # don't bump
9696

97+
if (CMAKE_CXX_BYTE_ORDER STREQUAL "BIG_ENDIAN")
98+
set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DWHISPER_BIG_ENDIAN)
99+
endif()
100+
97101
if (WHISPER_EXTRA_FLAGS)
98102
target_compile_options(whisper PRIVATE ${WHISPER_EXTRA_FLAGS})
99103
endif()

src/whisper.cpp

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -39,17 +39,17 @@
3939
#pragma warning(disable: 4244 4267) // possible loss of data
4040
#endif
4141

42-
#if defined(GGML_BIG_ENDIAN)
43-
#include <bit>
44-
42+
#if defined(WHISPER_BIG_ENDIAN)
4543
template<typename T>
4644
static T byteswap(T value) {
47-
return std::byteswap(value);
48-
}
49-
50-
template<>
51-
float byteswap(float value) {
52-
return std::bit_cast<float>(byteswap(std::bit_cast<std::uint32_t>(value)));
45+
T value_swapped;
46+
char * source = reinterpret_cast<char *>(&value);
47+
char * target = reinterpret_cast<char *>(&value_swapped);
48+
int size = sizeof(T);
49+
for (int i = 0; i < size; i++) {
50+
target[size - 1 - i] = source[i];
51+
}
52+
return value_swapped;
5353
}
5454

5555
template<typename T>
@@ -85,14 +85,14 @@ static void byteswap_tensor(ggml_tensor * tensor) {
8585
}
8686

8787
#define BYTESWAP_VALUE(d) d = byteswap(d)
88-
#define BYTESWAP_FILTERS(f) \
88+
#define BYTESWAP_FILTERS(f) \
8989
do { \
9090
for (auto & datum : f.data) { \
9191
datum = byteswap(datum); \
9292
} \
9393
} while (0)
94-
#define BYTESWAP_TENSOR(t) \
95-
do { \
94+
#define BYTESWAP_TENSOR(t) \
95+
do { \
9696
byteswap_tensor(t); \
9797
} while (0)
9898
#else

0 commit comments

Comments
 (0)