
Commit bbb1ca9

Merge pull request #316 from ngxson/xsn/llama_batch_remove_compat
Xsn/llama batch remove compat
2 parents 7eee341 + 4be7ecf commit bbb1ca9

File tree

36 files changed: +980 -845 lines changed

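The recurring change across the common code and examples below is the simplified `llama_batch_get_one` call: the explicit starting position and sequence id arguments are gone, and callers pass only the token pointer and count. A minimal sketch of what a migrated call site looks like (the helper name and surrounding code are illustrative, not part of this commit):

```cpp
#include "llama.h"

#include <vector>

// Illustrative helper: decode one chunk of tokens with the two-argument
// llama_batch_get_one; positions are tracked by the context rather than
// being passed as an explicit n_past and sequence id by the caller.
static bool decode_chunk(llama_context * ctx, std::vector<llama_token> & tokens) {
    // before this commit: llama_batch_get_one(tokens.data(), tokens.size(), n_past, 0)
    return llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size())) == 0;
}
```
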
README.md

+2 -2

@@ -31,7 +31,7 @@ variety of hardware - locally and in the cloud.
 - Apple silicon is a first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks
 - AVX, AVX2 and AVX512 support for x86 architectures
 - 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use
-- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP)
+- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP and Moore Threads MTT GPUs via MUSA)
 - Vulkan and SYCL backend support
 - CPU+GPU hybrid inference to partially accelerate models larger than the total VRAM capacity

@@ -413,7 +413,7 @@ Please refer to [Build llama.cpp locally](./docs/build.md)
 | [BLAS](./docs/build.md#blas-build) | All |
 | [BLIS](./docs/backend/BLIS.md) | All |
 | [SYCL](./docs/backend/SYCL.md) | Intel and Nvidia GPU |
-| [MUSA](./docs/build.md#musa) | Moore Threads GPU |
+| [MUSA](./docs/build.md#musa) | Moore Threads MTT GPU |
 | [CUDA](./docs/build.md#cuda) | Nvidia GPU |
 | [hipBLAS](./docs/build.md#hipblas) | AMD GPU |
 | [Vulkan](./docs/build.md#vulkan) | GPU |

common/arg.cpp

+112-155
Large diffs are not rendered by default.

common/common.cpp

+19 -3

@@ -12,6 +12,7 @@

 #include <algorithm>
 #include <cinttypes>
+#include <climits>
 #include <cmath>
 #include <codecvt>
 #include <cstdarg>
@@ -23,10 +24,10 @@
 #include <regex>
 #include <sstream>
 #include <string>
+#include <thread>
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>
-#include <thread>

 #if defined(__APPLE__) && defined(__MACH__)
 #include <sys/types.h>
@@ -400,6 +401,21 @@ std::string common_params_get_system_info(const common_params & params) {
 // String utils
 //

+std::string string_format(const char * fmt, ...) {
+    va_list ap;
+    va_list ap2;
+    va_start(ap, fmt);
+    va_copy(ap2, ap);
+    int size = vsnprintf(NULL, 0, fmt, ap);
+    GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
+    std::vector<char> buf(size + 1);
+    int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
+    GGML_ASSERT(size2 == size);
+    va_end(ap2);
+    va_end(ap);
+    return std::string(buf.data(), size);
+}
+
 std::vector<std::string> string_split(std::string input, char separator) {
     std::vector<std::string> parts;
     size_t separator_pos = input.find(separator);
@@ -939,7 +955,7 @@ struct common_init_result common_init_from_params(common_params & params) {
        }

        if (llama_model_has_encoder(model)) {
-           llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size(), 0, 0));
+           llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size()));
            llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
            if (decoder_start_token_id == -1) {
                decoder_start_token_id = bos;
@@ -948,7 +964,7 @@ struct common_init_result common_init_from_params(common_params & params) {
            tmp.push_back(decoder_start_token_id);
        }
        if (llama_model_has_decoder(model)) {
-           llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
+           llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch)));
        }
        llama_kv_cache_clear(lctx);
        llama_synchronize(lctx);

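For reference, a minimal usage sketch of the `string_format` helper added above (assuming `common.h` is on the include path): it runs `vsnprintf` once to measure the output and once to fill an exactly sized buffer, and the `LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)` annotation on its declaration lets GCC/Clang type-check the format arguments at compile time.

```cpp
#include "common.h"

#include <cstdio>

int main() {
    // printf-style formatting that returns a std::string of exactly the needed length
    std::string msg = string_format("decoded %d tokens in %.2f s", 512, 1.37);
    // a mismatched call such as string_format("%s tokens", 512) would raise a -Wformat warning
    printf("%s\n", msg.c_str());
    return 0;
}
```
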
common/common.h

+16 -4

@@ -282,7 +282,6 @@ struct common_params {
     std::string hostname = "127.0.0.1";
     std::string public_path = ""; // NOLINT
     std::string chat_template = ""; // NOLINT
-    std::string system_prompt = ""; // NOLINT
     bool enable_chat_template = true;

     std::vector<std::string> api_keys;
@@ -352,15 +351,28 @@ void common_init();

 std::string common_params_get_system_info(const common_params & params);

-bool parse_cpu_range(const std::string& range, bool(&boolmask)[GGML_MAX_N_THREADS]);
-bool parse_cpu_mask(const std::string& mask, bool(&boolmask)[GGML_MAX_N_THREADS]);
-void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model = nullptr);
+bool parse_cpu_range(const std::string & range, bool(&boolmask)[GGML_MAX_N_THREADS]);
+bool parse_cpu_mask(const std::string & mask, bool(&boolmask)[GGML_MAX_N_THREADS]);
+void postprocess_cpu_params(cpu_params & cpuparams, const cpu_params * role_model = nullptr);
 bool set_process_priority(enum ggml_sched_priority prio);

 //
 // String utils
 //

+#ifdef __GNUC__
+#ifdef __MINGW32__
+#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
+#else
+#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
+#endif
+#else
+#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
+#endif
+
+LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
+std::string string_format(const char * fmt, ...);
+
 std::vector<std::string> string_split(std::string input, char separator);

 std::string string_strip(const std::string & str);

docs/build.md

+8

@@ -198,6 +198,8 @@ The following compilation options are also available to tweak performance:

 ### MUSA

+This provides GPU acceleration using the MUSA cores of your Moore Threads MTT GPU. Make sure to have the MUSA SDK installed. You can download it from here: [MUSA SDK](https://developer.mthreads.com/sdk/download/musa).
+
 - Using `make`:
   ```bash
   make GGML_MUSA=1
@@ -209,6 +211,12 @@ The following compilation options are also available to tweak performance:
   cmake --build build --config Release
   ```

+The environment variable [`MUSA_VISIBLE_DEVICES`](https://docs.mthreads.com/musa-sdk/musa-sdk-doc-online/programming_guide/Z%E9%99%84%E5%BD%95/) can be used to specify which GPU(s) will be used.
+
+The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enable unified memory in Linux. This allows swapping to system RAM instead of crashing when the GPU VRAM is exhausted.
+
+Most of the compilation options available for CUDA should also be available for MUSA, though they haven't been thoroughly tested yet.
+
 ### hipBLAS

 This provides BLAS acceleration on HIP-supported AMD GPUs.

examples/batched-bench/batched-bench.cpp

-1

@@ -74,7 +74,6 @@ int main(int argc, char ** argv) {
                 batch.n_seq_id + i,
                 batch.seq_id   + i,
                 batch.logits   + i,
-                0, 0, 0, // unused
             };

             const int ret = llama_decode(ctx, batch_view);

examples/cvector-generator/cvector-generator.cpp

+1 -1

@@ -339,7 +339,7 @@ static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {

 static bool get_hidden_layers(llama_context * ctx, std::vector<llama_token> & tokens) {
     llama_kv_cache_clear(ctx);
-    if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) {
+    if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) {
         fprintf(stderr, "%s : failed to eval\n", __func__);
         return false;
     }

examples/eval-callback/eval-callback.cpp

+1 -1

@@ -131,7 +131,7 @@ static bool run(llama_context * ctx, const common_params & params) {

     std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, add_bos);

-    if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) {
+    if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) {
         LOG_ERR("%s : failed to eval\n", __func__);
         return false;
     }

examples/imatrix/imatrix.cpp

+11 -2

@@ -496,6 +496,8 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) {
         // clear the KV cache
         llama_kv_cache_clear(ctx);

+        llama_batch batch = llama_batch_init(n_batch, 0, 1);
+
         for (int j = 0; j < num_batches; ++j) {
             const int batch_start = start + j * n_batch;
             const int batch_size = std::min(end - batch_start, n_batch);
@@ -508,9 +510,14 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) {
                 tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
             }

-            // TODO: use batch.logits to save computations instead of relying on logits_all == true
-            if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
+            common_batch_clear(batch);
+            for (int i = 0; i < batch_size; i++) {
+                common_batch_add(batch, tokens[batch_start + i], j*n_batch + i, {0}, true);
+            }
+
+            if (llama_decode(ctx, batch)) {
                 LOG_ERR("%s : failed to eval\n", __func__);
+                llama_batch_free(batch);
                 return false;
             }

@@ -523,6 +530,8 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) {
             }
         }

+        llama_batch_free(batch);
+
         const auto t_end = std::chrono::high_resolution_clock::now();

         if (i == 0) {

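The imatrix change above replaces the old positional `llama_batch_get_one` call with an explicitly filled batch. A condensed sketch of that pattern, with illustrative names for the caller-provided pieces (`ctx`, `tokens`, `first_pos`):

```cpp
#include "common.h"
#include "llama.h"

#include <vector>

// Illustrative helper: decode `tokens` as sequence 0 starting at position `first_pos`,
// requesting logits for every token, mirroring the loop added to compute_imatrix().
static bool decode_with_batch(llama_context * ctx, const std::vector<llama_token> & tokens, llama_pos first_pos) {
    llama_batch batch = llama_batch_init((int32_t) tokens.size(), 0, 1);

    common_batch_clear(batch);
    for (size_t i = 0; i < tokens.size(); i++) {
        common_batch_add(batch, tokens[i], first_pos + (llama_pos) i, { 0 }, true);
    }

    const bool ok = llama_decode(ctx, batch) == 0;
    llama_batch_free(batch);
    return ok;
}
```
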
examples/infill/infill.cpp

+9 -9

@@ -205,11 +205,11 @@ int main(int argc, char ** argv) {
         std::vector<llama_token> inp_pfx = common_tokenize(ctx, params.input_prefix, false);
         std::vector<llama_token> inp_sfx = common_tokenize(ctx, params.input_suffix, false);

-        GGML_ASSERT(llama_token_prefix(model) >= 0);
-        GGML_ASSERT(llama_token_suffix(model) >= 0);
+        GGML_ASSERT(llama_token_fim_pre(model) >= 0);
+        GGML_ASSERT(llama_token_fim_suf(model) >= 0);

-        inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
-        inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
+        inp_pfx.insert(inp_pfx.begin(), llama_token_fim_pre(model));
+        inp_sfx.insert(inp_sfx.begin(), llama_token_fim_suf(model));

         embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
         embd_end = params.spm_infill ? inp_pfx : inp_sfx;
@@ -218,7 +218,7 @@ int main(int argc, char ** argv) {
         }
         embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());

-        const llama_token middle_token = llama_token_middle(model);
+        const llama_token middle_token = llama_token_fim_mid(model);
         if (middle_token >= 0) {
             embd_inp.push_back(middle_token);
         }
@@ -376,7 +376,7 @@ int main(int argc, char ** argv) {
                     n_past, n_left, n_ctx, params.n_keep, n_discard);

                 llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1            , params.n_keep + n_discard + 1);
-                llama_kv_cache_seq_add(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
+                llama_kv_cache_seq_add(ctx, 0, params.n_keep + 1 + n_discard, n_past + 1, -n_discard);

                 n_past -= n_discard;

@@ -396,7 +396,7 @@ int main(int argc, char ** argv) {

             LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str());

-            if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) {
+            if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval))) {
                 LOG_ERR("%s : failed to eval\n", __func__);
                 return 1;
             }
@@ -508,8 +508,8 @@ int main(int argc, char ** argv) {
             std::vector<llama_token> inp_pfx = common_tokenize(ctx, params.input_prefix, false);
             std::vector<llama_token> inp_sfx = common_tokenize(ctx, params.input_suffix, false);

-            inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
-            inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
+            inp_pfx.insert(inp_pfx.begin(), llama_token_fim_pre(model));
+            inp_sfx.insert(inp_sfx.begin(), llama_token_fim_suf(model));

             embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
             embd_end = params.spm_infill ? inp_pfx : inp_sfx;

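The infill changes above are a rename of the FIM token accessors (`llama_token_prefix/suffix/middle` become `llama_token_fim_pre/fim_suf/fim_mid`); the prompt layout itself is unchanged. A sketch of the prefix-suffix-middle ordering with the new names (the helper is illustrative, and the `spm_infill` variant is omitted):

```cpp
#include "llama.h"

#include <vector>

// Illustrative helper: assemble a fill-in-the-middle prompt using the renamed accessors.
static std::vector<llama_token> build_fim_prompt(const llama_model * model,
                                                 const std::vector<llama_token> & prefix,
                                                 const std::vector<llama_token> & suffix) {
    std::vector<llama_token> out;
    out.push_back(llama_token_fim_pre(model));
    out.insert(out.end(), prefix.begin(), prefix.end());
    out.push_back(llama_token_fim_suf(model));
    out.insert(out.end(), suffix.begin(), suffix.end());

    const llama_token middle_token = llama_token_fim_mid(model);
    if (middle_token >= 0) { // some vocabularies define no middle token
        out.push_back(middle_token);
    }
    return out;
}
```
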
examples/llama-bench/llama-bench.cpp

+8 -8

@@ -1428,7 +1428,7 @@ struct sql_printer : public printer {
     }
 };

-static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_batch, int n_threads) {
+static void test_prompt(llama_context * ctx, int n_prompt, int n_batch, int n_threads) {
     llama_set_n_threads(ctx, n_threads, n_threads);

     const llama_model * model = llama_get_model(ctx);
@@ -1444,14 +1444,14 @@ static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_bat
         for (int i = 1; i < n_tokens; i++) {
             tokens[i] = std::rand() % n_vocab;
         }
-        llama_decode(ctx, llama_batch_get_one(tokens.data(), n_tokens, n_past + n_processed, 0));
+        llama_decode(ctx, llama_batch_get_one(tokens.data(), n_tokens));
         n_processed += n_tokens;
     }

     llama_synchronize(ctx);
 }

-static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads) {
+static void test_gen(llama_context * ctx, int n_gen, int n_threads) {
     llama_set_n_threads(ctx, n_threads, n_threads);

     const llama_model * model = llama_get_model(ctx);
@@ -1460,7 +1460,7 @@ static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads)
     llama_token token = llama_add_bos_token(model) ? llama_token_bos(model) : std::rand() % n_vocab;

     for (int i = 0; i < n_gen; i++) {
-        llama_decode(ctx, llama_batch_get_one(&token, 1, n_past + i, 0));
+        llama_decode(ctx, llama_batch_get_one(&token, 1));
         llama_synchronize(ctx);
         token = std::rand() % n_vocab;
     }
@@ -1596,13 +1596,13 @@ int main(int argc, char ** argv) {
                 fprintf(stderr, "llama-bench: benchmark %d/%ld: warmup prompt run\n", params_idx, params_count);
             }
             //test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads);
-            test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
+            test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
         }
         if (t.n_gen > 0) {
             if (params.progress) {
                 fprintf(stderr, "llama-bench: benchmark %d/%ld: warmup generation run\n", params_idx, params_count);
             }
-            test_gen(ctx, 1, 0, t.n_threads);
+            test_gen(ctx, 1, t.n_threads);
         }

         for (int i = 0; i < params.reps; i++) {
@@ -1614,13 +1614,13 @@ int main(int argc, char ** argv) {
                 if (params.progress) {
                     fprintf(stderr, "llama-bench: benchmark %d/%ld: prompt run %d/%d\n", params_idx, params_count, i + 1, params.reps);
                 }
-                test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
+                test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
             }
             if (t.n_gen > 0) {
                 if (params.progress) {
                     fprintf(stderr, "llama-bench: benchmark %d/%ld: generation run %d/%d\n", params_idx, params_count, i + 1, params.reps);
                 }
-                test_gen(ctx, t.n_gen, t.n_prompt, t.n_threads);
+                test_gen(ctx, t.n_gen, t.n_threads);
             }

             uint64_t t_ns = get_time_ns() - t_start;

examples/llama.android/llama/src/main/cpp/llama-android.cpp

-3

@@ -283,9 +283,6 @@ Java_android_llama_cpp_LLamaAndroid_new_1batch(JNIEnv *, jobject, jint n_tokens,
         nullptr,
         nullptr,
         nullptr,
-        0,
-        0,
-        0,
     };

     if (embd) {

examples/llava/llava-cli.cpp

+1 -1

@@ -20,7 +20,7 @@ static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_toke
         if (n_eval > n_batch) {
             n_eval = n_batch;
         }
-        if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) {
+        if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval))) {
             LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
             return false;
         }

examples/llava/llava.cpp

+36 -2

@@ -401,6 +401,39 @@ bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, co
     return true;
 }

+struct llava_embd_batch {
+    std::vector<llama_pos>      pos;
+    std::vector<int32_t>        n_seq_id;
+    std::vector<llama_seq_id>   seq_id_0;
+    std::vector<llama_seq_id *> seq_ids;
+    std::vector<int8_t>         logits;
+    llama_batch batch;
+    llava_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
+        pos     .resize(n_tokens);
+        n_seq_id.resize(n_tokens);
+        seq_ids .resize(n_tokens + 1);
+        logits  .resize(n_tokens);
+        seq_id_0.resize(1);
+        seq_id_0[0] = seq_id;
+        seq_ids [n_tokens] = nullptr;
+        batch = {
+            /*n_tokens =*/ n_tokens,
+            /*tokens   =*/ nullptr,
+            /*embd     =*/ embd,
+            /*pos      =*/ pos.data(),
+            /*n_seq_id =*/ n_seq_id.data(),
+            /*seq_id   =*/ seq_ids.data(),
+            /*logits   =*/ logits.data(),
+        };
+        for (int i = 0; i < n_tokens; i++) {
+            batch.pos     [i] = pos_0 + i;
+            batch.n_seq_id[i] = 1;
+            batch.seq_id  [i] = seq_id_0.data();
+            batch.logits  [i] = false;
+        }
+    }
+};
+
 bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_embed * image_embed, int n_batch, int * n_past) {
     int n_embd = llama_n_embd(llama_get_model(ctx_llama));

@@ -409,8 +442,9 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
         if (n_eval > n_batch) {
             n_eval = n_batch;
         }
-        llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, };
-        if (llama_decode(ctx_llama, batch)) {
+        float * embd = image_embed->embed+i*n_embd;
+        llava_embd_batch llava_batch = llava_embd_batch(embd, n_eval, *n_past, 0);
+        if (llama_decode(ctx_llama, llava_batch.batch)) {
             LOG_ERR("%s : failed to eval\n", __func__);
             return false;
         }

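Since `llama_batch` no longer carries the trailing `pos_0`/`seq_id` convenience fields (see the removed `0, 0, 0` initializers in the batched-bench and Android examples), `llava_embd_batch` owns the per-token arrays that back the raw pointers in its `batch` member. A short usage sketch, assuming the struct defined above and an illustrative caller:

```cpp
// Illustrative caller: decode one image-embedding slice through the wrapper; the
// vectors inside llava_embd_batch keep the pointers in `batch` valid for the call.
static bool decode_image_slice(llama_context * ctx_llama, float * embd, int32_t n_eval, llama_pos pos_0) {
    llava_embd_batch llava_batch(embd, n_eval, pos_0, /*seq_id =*/ 0);
    return llama_decode(ctx_llama, llava_batch.batch) == 0;
}
```
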