
Commit 128dcbd

Authored Feb 1, 2024
add --no-mmap in llama-bench (ggml-org#5257)
* add --no-mmap, show SYCL backend
* fix conflict
* fix code format, change print for --no-mmap
* rename no_mmap to mmap, show mmap when not at its default value in the printer
* update guide for mmap
* move position to reduce model reload
1 parent 4d0924a commit 128dcbd

4 files changed: +89 −10 lines

README-sycl.md

+1 −1

```diff
@@ -405,7 +405,7 @@ Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
 
 llama.cpp use mmap as default way to read model file and copy to GPU. In some system, memcpy will be abnormal and block.
 
-Solution: add **--no-mmap**.
+Solution: add **--no-mmap** or **--mmap 0**.
 
 ## Q&A
 
```
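For illustration, the two spellings in use (binary names and model path are illustrative, following the build layout used elsewhere in README-sycl.md, not part of the commit):

```sh
# main-style tools take the boolean flag form:
./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Hello" --no-mmap

# llama-bench takes an explicit <0|1> value (the flag added by this commit):
./build/bin/llama-bench -m models/llama-2-7b.Q4_0.gguf --mmap 0
```
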
examples/llama-bench/llama-bench.cpp

+54 −6

```diff
@@ -20,6 +20,7 @@
 #include "llama.h"
 #include "common.h"
 #include "ggml-cuda.h"
+#include "ggml-sycl.h"
 
 // utils
 static uint64_t get_time_ns() {
@@ -120,6 +121,22 @@ static std::string get_gpu_info() {
             id += "/";
         }
     }
+#endif
+#ifdef GGML_USE_SYCL
+    int device_list[GGML_SYCL_MAX_DEVICES];
+    ggml_sycl_get_gpu_list(device_list, GGML_SYCL_MAX_DEVICES);
+
+    for (int i = 0; i < GGML_SYCL_MAX_DEVICES; i++) {
+        if (device_list[i] >0 ){
+            char buf[128];
+            ggml_sycl_get_device_description(i, buf, sizeof(buf));
+            id += buf;
+            id += "/";
+        }
+    }
+    if (id.length() >2 ) {
+        id.pop_back();
+    }
 #endif
     // TODO: other backends
     return id;
@@ -161,6 +178,7 @@ struct cmd_params {
     std::vector<bool> no_kv_offload;
     std::vector<bool> mul_mat_q;
     std::vector<std::vector<float>> tensor_split;
+    std::vector<bool> use_mmap;
     int reps;
     bool verbose;
     output_formats output_format;
@@ -180,6 +198,7 @@ static const cmd_params cmd_params_defaults = {
     /* no_kv_offload */ {false},
     /* mul_mat_q */ {true},
     /* tensor_split */ {std::vector<float>(llama_max_devices(), 0.0f)},
+    /* use_mmap */ {true},
     /* reps */ 5,
     /* verbose */ false,
     /* output_format */ MARKDOWN
@@ -201,6 +220,7 @@ static void print_usage(int /* argc */, char ** argv) {
     printf(" -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
     printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
     printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
+    printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
     printf(" -mmq, --mul-mat-q <0|1> (default: %s)\n", join(cmd_params_defaults.mul_mat_q, ",").c_str());
     printf(" -ts, --tensor_split <ts0/ts1/..> (default: 0)\n");
     printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
@@ -370,6 +390,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
            }
            auto p = split<bool>(argv[i], split_delim);
            params.mul_mat_q.insert(params.mul_mat_q.end(), p.begin(), p.end());
+        } else if (arg == "-mmp" || arg == "--mmap") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = split<bool>(argv[i], split_delim);
+            params.use_mmap.insert(params.use_mmap.end(), p.begin(), p.end());
        } else if (arg == "-ts" || arg == "--tensor-split") {
            if (++i >= argc) {
                invalid_param = true;
@@ -441,6 +468,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
     if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; }
     if (params.mul_mat_q.empty()) { params.mul_mat_q = cmd_params_defaults.mul_mat_q; }
     if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; }
+    if (params.use_mmap.empty()) { params.use_mmap = cmd_params_defaults.use_mmap; }
     if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; }
 
     return params;
@@ -460,6 +488,7 @@ struct cmd_params_instance {
     bool no_kv_offload;
     bool mul_mat_q;
     std::vector<float> tensor_split;
+    bool use_mmap;
 
     llama_model_params to_llama_mparams() const {
         llama_model_params mparams = llama_model_default_params();
@@ -468,6 +497,7 @@ struct cmd_params_instance {
         mparams.split_mode = split_mode;
         mparams.main_gpu = main_gpu;
         mparams.tensor_split = tensor_split.data();
+        mparams.use_mmap = use_mmap;
 
         return mparams;
     }
@@ -477,6 +507,7 @@ struct cmd_params_instance {
                n_gpu_layers == other.n_gpu_layers &&
                split_mode == other.split_mode &&
                main_gpu == other.main_gpu &&
+               use_mmap == other.use_mmap &&
                tensor_split == other.tensor_split;
     }
 
@@ -503,6 +534,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
     for (const auto & sm : params.split_mode)
     for (const auto & mg : params.main_gpu)
     for (const auto & ts : params.tensor_split)
+    for (const auto & mmp : params.use_mmap)
     for (const auto & nb : params.n_batch)
     for (const auto & tk : params.type_k)
     for (const auto & tv : params.type_v)
@@ -527,6 +559,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
             /* .no_kv_offload= */ nkvo,
             /* .mul_mat_q = */ mmq,
             /* .tensor_split = */ ts,
+            /* .use_mmap = */ mmp,
         };
         instances.push_back(instance);
     }
@@ -549,6 +582,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
             /* .no_kv_offload= */ nkvo,
             /* .mul_mat_q = */ mmq,
             /* .tensor_split = */ ts,
+            /* .use_mmap = */ mmp,
         };
         instances.push_back(instance);
     }
@@ -565,6 +599,7 @@ struct test {
     static const bool vulkan;
     static const bool kompute;
     static const bool metal;
+    static const bool sycl;
     static const bool gpu_blas;
     static const bool blas;
     static const std::string cpu_info;
@@ -583,6 +618,7 @@ struct test {
     bool no_kv_offload;
     bool mul_mat_q;
     std::vector<float> tensor_split;
+    bool use_mmap;
     int n_prompt;
     int n_gen;
     std::string test_time;
@@ -605,6 +641,7 @@ struct test {
        no_kv_offload = inst.no_kv_offload;
        mul_mat_q = inst.mul_mat_q;
        tensor_split = inst.tensor_split;
+        use_mmap = inst.use_mmap;
        n_prompt = inst.n_prompt;
        n_gen = inst.n_gen;
        // RFC 3339 date-time format
@@ -654,25 +691,29 @@ struct test {
         if (metal) {
             return "Metal";
         }
+        if (sycl) {
+            return GGML_SYCL_NAME;
+        }
         if (gpu_blas) {
             return "GPU BLAS";
         }
         if (blas) {
             return "BLAS";
         }
+
         return "CPU";
     }
 
     static const std::vector<std::string> & get_fields() {
         static const std::vector<std::string> fields = {
             "build_commit", "build_number",
-            "cuda", "opencl", "vulkan", "kompute", "metal", "gpu_blas", "blas",
+            "cuda", "opencl", "vulkan", "kompute", "metal", "sycl", "gpu_blas", "blas",
             "cpu_info", "gpu_info",
             "model_filename", "model_type", "model_size", "model_n_params",
             "n_batch", "n_threads", "type_k", "type_v",
             "n_gpu_layers", "split_mode",
             "main_gpu", "no_kv_offload",
-            "mul_mat_q", "tensor_split",
+            "mul_mat_q", "tensor_split", "use_mmap",
             "n_prompt", "n_gen", "test_time",
             "avg_ns", "stddev_ns",
             "avg_ts", "stddev_ts"
@@ -691,8 +732,8 @@ struct test {
            return INT;
        }
        if (field == "cuda" || field == "opencl" || field == "vulkan" || field == "kompute" || field == "metal" ||
-            field == "gpu_blas" || field == "blas" || field == "f16_kv" || field == "no_kv_offload" ||
-            field == "mul_mat_q") {
+            field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
+            field == "mul_mat_q" || field == "use_mmap") {
            return BOOL;
        }
        if (field == "avg_ts" || field == "stddev_ts") {
@@ -720,13 +761,13 @@ struct test {
        std::vector<std::string> values = {
            build_commit, std::to_string(build_number),
            std::to_string(cuda), std::to_string(opencl), std::to_string(vulkan), std::to_string(vulkan),
-            std::to_string(metal), std::to_string(gpu_blas), std::to_string(blas),
+            std::to_string(metal), std::to_string(sycl), std::to_string(gpu_blas), std::to_string(blas),
            cpu_info, gpu_info,
            model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
            std::to_string(n_batch), std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
            std::to_string(n_gpu_layers), split_mode_str(split_mode),
            std::to_string(main_gpu), std::to_string(no_kv_offload),
-            std::to_string(mul_mat_q), tensor_split_str,
+            std::to_string(mul_mat_q), tensor_split_str, std::to_string(use_mmap),
            std::to_string(n_prompt), std::to_string(n_gen), test_time,
            std::to_string(avg_ns()), std::to_string(stdev_ns()),
            std::to_string(avg_ts()), std::to_string(stdev_ts())
@@ -753,6 +794,7 @@ const bool test::kompute = !!ggml_cpu_has_kompute();
 const bool test::metal = !!ggml_cpu_has_metal();
 const bool test::gpu_blas = !!ggml_cpu_has_gpublas();
 const bool test::blas = !!ggml_cpu_has_blas();
+const bool test::sycl = !!ggml_cpu_has_sycl();
 const std::string test::cpu_info = get_cpu_info();
 const std::string test::gpu_info = get_gpu_info();
 
@@ -895,6 +937,9 @@ struct markdown_printer : public printer {
        if (field == "no_kv_offload") {
            return "nkvo";
        }
+        if (field == "use_mmap") {
+            return "mmap";
+        }
        if (field == "tensor_split") {
            return "ts";
        }
@@ -938,6 +983,9 @@ struct markdown_printer : public printer {
        if (params.tensor_split.size() > 1 || params.tensor_split != cmd_params_defaults.tensor_split) {
            fields.push_back("tensor_split");
        }
+        if (params.use_mmap.size() > 1 || params.use_mmap != cmd_params_defaults.use_mmap) {
+            fields.push_back("use_mmap");
+        }
        fields.push_back("test");
        fields.push_back("t/s");
 
```
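Because `--mmap` is parsed with `split<bool>` like the other list-valued flags (comma-separated values, as the usage text's joined defaults suggest), both settings can be swept in one run. A sketch, with the model path assumed:

```sh
# Benchmark with mmap disabled and then enabled in a single invocation.
# The markdown printer adds an "mmap" column whenever the value list
# differs from the default {1}.
./build/bin/llama-bench -m models/llama-2-7b.Q4_0.gguf -mmp 0,1
```

Note where the `use_mmap` loop sits in `get_cmd_params_instances`: next to the other model-level parameters (`split_mode`, `main_gpu`, `tensor_split`) rather than the per-test ones, so the model is reloaded only when the mmap setting changes; this appears to be the "move position to reduce model reload" item from the commit message.
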
ggml-sycl.cpp

+32 −2

```diff
@@ -2928,7 +2928,6 @@ void ggml_sycl_set_main_device(int main_device);
 void ggml_sycl_set_mul_mat_q(bool mul_mat_q);
 void ggml_sycl_set_scratch_size(size_t scratch_size);
 void ggml_sycl_free_scratch(void);
-int ggml_sycl_get_device_count(void);
 void ggml_sycl_get_device_description(int device, char * description, size_t description_size);
 bool ggml_backend_is_sycl(ggml_backend_t backend);
 int ggml_backend_sycl_get_device(ggml_backend_t backend);
@@ -14493,6 +14492,37 @@ bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_
     return true;
 }
 
+GGML_API GGML_CALL void ggml_sycl_get_gpu_list(int *id_list, int max_len) try {
+    int max_compute_units = -1;
+    for(int i=0;i<max_len;i++) id_list[i] = 0;
+
+    int device_count = dpct::dev_mgr::instance().device_count();
+
+    for(int id=0; id< device_count; id++){
+        sycl::device device = dpct::dev_mgr::instance().get_device(id);
+        if (!device.is_gpu()) continue;
+        dpct::device_info prop;
+        dpct::get_device_info(prop, device);
+        if(max_compute_units < prop.get_max_compute_units()) max_compute_units = prop.get_max_compute_units();
+    }
+
+    for(int id=0;id< device_count;id++){
+        sycl::device device = dpct::dev_mgr::instance().get_device(id);
+        if (!device.is_gpu()) continue;
+        dpct::device_info prop;
+        dpct::get_device_info(prop, device);
+        if(max_compute_units == prop.get_max_compute_units() && prop.get_major_version() == 1 ){
+            id_list[id] = 1;
+        }
+    }
+    return;
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
 int ggml_sycl_get_device_count() try {
     int device_count;
     if (CHECK_TRY_ERROR(device_count =
@@ -14507,7 +14537,7 @@
   std::exit(1);
 }
 
-void ggml_sycl_get_device_description(int device, char *description,
+GGML_API GGML_CALL void ggml_sycl_get_device_description(int device, char *description,
                                       size_t description_size) try {
     dpct::device_info prop;
     SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
```
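In short, the new `ggml_sycl_get_gpu_list` zeroes `id_list`, takes a first pass over all SYCL devices to find the highest compute-unit count among GPUs, then takes a second pass and sets `id_list[id] = 1` for every GPU that matches that maximum and reports major version 1. The `get_gpu_info` change in llama-bench (above) prints descriptions for exactly those flagged devices.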

ggml-sycl.h

+2 −1

```diff
@@ -22,7 +22,8 @@ GGML_API ggml_backend_t ggml_backend_sycl_init(int device);
 GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device);
 GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);
 GGML_API void ggml_backend_sycl_print_sycl_devices(void);
-
+GGML_API GGML_CALL void ggml_sycl_get_gpu_list(int *id_list, int max_len);
+GGML_API GGML_CALL void ggml_sycl_get_device_description(int device, char *description, size_t description_size);
 #ifdef __cplusplus
 }
 #endif
```
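The two newly exported helpers are consumed together, as the llama-bench change above does. A minimal standalone sketch, assuming a SYCL-enabled build and that `GGML_SYCL_MAX_DEVICES` is available via the backend headers:

```cpp
#include <cstdio>

#include "ggml-sycl.h"

int main() {
    // After the call, id_list[i] == 1 marks the GPUs the backend selected
    // (those with the highest compute-unit count); all other entries are 0.
    int device_list[GGML_SYCL_MAX_DEVICES];
    ggml_sycl_get_gpu_list(device_list, GGML_SYCL_MAX_DEVICES);

    for (int i = 0; i < GGML_SYCL_MAX_DEVICES; i++) {
        if (device_list[i] > 0) {
            char buf[128];
            ggml_sycl_get_device_description(i, buf, sizeof(buf));
            printf("SYCL GPU %d: %s\n", i, buf);
        }
    }
    return 0;
}
```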
