20
20
#include " llama.h"
21
21
#include " common.h"
22
22
#include " ggml-cuda.h"
23
+ #include " ggml-sycl.h"
23
24
24
25
// utils
25
26
static uint64_t get_time_ns () {
@@ -120,6 +121,22 @@ static std::string get_gpu_info() {
120
121
id += " /" ;
121
122
}
122
123
}
124
+ #endif
125
+ #ifdef GGML_USE_SYCL
126
+ int device_list[GGML_SYCL_MAX_DEVICES];
127
+ ggml_sycl_get_gpu_list (device_list, GGML_SYCL_MAX_DEVICES);
128
+
129
+ for (int i = 0 ; i < GGML_SYCL_MAX_DEVICES; i++) {
130
+ if (device_list[i] >0 ){
131
+ char buf[128 ];
132
+ ggml_sycl_get_device_description (i, buf, sizeof (buf));
133
+ id += buf;
134
+ id += " /" ;
135
+ }
136
+ }
137
+ if (id.length () >2 ) {
138
+ id.pop_back ();
139
+ }
123
140
#endif
124
141
// TODO: other backends
125
142
return id;
@@ -161,6 +178,7 @@ struct cmd_params {
161
178
std::vector<bool > no_kv_offload;
162
179
std::vector<bool > mul_mat_q;
163
180
std::vector<std::vector<float >> tensor_split;
181
+ std::vector<bool > use_mmap;
164
182
int reps;
165
183
bool verbose;
166
184
output_formats output_format;
@@ -180,6 +198,7 @@ static const cmd_params cmd_params_defaults = {
180
198
/* no_kv_offload */ {false },
181
199
/* mul_mat_q */ {true },
182
200
/* tensor_split */ {std::vector<float >(llama_max_devices (), 0 .0f )},
201
+ /* use_mmap */ {true },
183
202
/* reps */ 5 ,
184
203
/* verbose */ false ,
185
204
/* output_format */ MARKDOWN
@@ -201,6 +220,7 @@ static void print_usage(int /* argc */, char ** argv) {
201
220
printf (" -sm, --split-mode <none|layer|row> (default: %s)\n " , join (transform_to_str (cmd_params_defaults.split_mode , split_mode_str), " ," ).c_str ());
202
221
printf (" -mg, --main-gpu <i> (default: %s)\n " , join (cmd_params_defaults.main_gpu , " ," ).c_str ());
203
222
printf (" -nkvo, --no-kv-offload <0|1> (default: %s)\n " , join (cmd_params_defaults.no_kv_offload , " ," ).c_str ());
223
+ printf (" -mmp, --mmap <0|1> (default: %s)\n " , join (cmd_params_defaults.use_mmap , " ," ).c_str ());
204
224
printf (" -mmq, --mul-mat-q <0|1> (default: %s)\n " , join (cmd_params_defaults.mul_mat_q , " ," ).c_str ());
205
225
printf (" -ts, --tensor_split <ts0/ts1/..> (default: 0)\n " );
206
226
printf (" -r, --repetitions <n> (default: %d)\n " , cmd_params_defaults.reps );
@@ -370,6 +390,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
370
390
}
371
391
auto p = split<bool >(argv[i], split_delim);
372
392
params.mul_mat_q .insert (params.mul_mat_q .end (), p.begin (), p.end ());
393
+ } else if (arg == " -mmp" || arg == " --mmap" ) {
394
+ if (++i >= argc) {
395
+ invalid_param = true ;
396
+ break ;
397
+ }
398
+ auto p = split<bool >(argv[i], split_delim);
399
+ params.use_mmap .insert (params.use_mmap .end (), p.begin (), p.end ());
373
400
} else if (arg == " -ts" || arg == " --tensor-split" ) {
374
401
if (++i >= argc) {
375
402
invalid_param = true ;
@@ -441,6 +468,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
441
468
if (params.no_kv_offload .empty ()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload ; }
442
469
if (params.mul_mat_q .empty ()) { params.mul_mat_q = cmd_params_defaults.mul_mat_q ; }
443
470
if (params.tensor_split .empty ()) { params.tensor_split = cmd_params_defaults.tensor_split ; }
471
+ if (params.use_mmap .empty ()) { params.use_mmap = cmd_params_defaults.use_mmap ; }
444
472
if (params.n_threads .empty ()) { params.n_threads = cmd_params_defaults.n_threads ; }
445
473
446
474
return params;
@@ -460,6 +488,7 @@ struct cmd_params_instance {
460
488
bool no_kv_offload;
461
489
bool mul_mat_q;
462
490
std::vector<float > tensor_split;
491
+ bool use_mmap;
463
492
464
493
llama_model_params to_llama_mparams () const {
465
494
llama_model_params mparams = llama_model_default_params ();
@@ -468,6 +497,7 @@ struct cmd_params_instance {
468
497
mparams.split_mode = split_mode;
469
498
mparams.main_gpu = main_gpu;
470
499
mparams.tensor_split = tensor_split.data ();
500
+ mparams.use_mmap = use_mmap;
471
501
472
502
return mparams;
473
503
}
@@ -477,6 +507,7 @@ struct cmd_params_instance {
477
507
n_gpu_layers == other.n_gpu_layers &&
478
508
split_mode == other.split_mode &&
479
509
main_gpu == other.main_gpu &&
510
+ use_mmap == other.use_mmap &&
480
511
tensor_split == other.tensor_split ;
481
512
}
482
513
@@ -503,6 +534,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
503
534
for (const auto & sm : params.split_mode )
504
535
for (const auto & mg : params.main_gpu )
505
536
for (const auto & ts : params.tensor_split )
537
+ for (const auto & mmp : params.use_mmap )
506
538
for (const auto & nb : params.n_batch )
507
539
for (const auto & tk : params.type_k )
508
540
for (const auto & tv : params.type_v )
@@ -527,6 +559,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
527
559
/* .no_kv_offload= */ nkvo,
528
560
/* .mul_mat_q = */ mmq,
529
561
/* .tensor_split = */ ts,
562
+ /* .use_mmap = */ mmp,
530
563
};
531
564
instances.push_back (instance);
532
565
}
@@ -549,6 +582,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
549
582
/* .no_kv_offload= */ nkvo,
550
583
/* .mul_mat_q = */ mmq,
551
584
/* .tensor_split = */ ts,
585
+ /* .use_mmap = */ mmp,
552
586
};
553
587
instances.push_back (instance);
554
588
}
@@ -565,6 +599,7 @@ struct test {
565
599
static const bool vulkan;
566
600
static const bool kompute;
567
601
static const bool metal;
602
+ static const bool sycl;
568
603
static const bool gpu_blas;
569
604
static const bool blas;
570
605
static const std::string cpu_info;
@@ -583,6 +618,7 @@ struct test {
583
618
bool no_kv_offload;
584
619
bool mul_mat_q;
585
620
std::vector<float > tensor_split;
621
+ bool use_mmap;
586
622
int n_prompt;
587
623
int n_gen;
588
624
std::string test_time;
@@ -605,6 +641,7 @@ struct test {
605
641
no_kv_offload = inst.no_kv_offload ;
606
642
mul_mat_q = inst.mul_mat_q ;
607
643
tensor_split = inst.tensor_split ;
644
+ use_mmap = inst.use_mmap ;
608
645
n_prompt = inst.n_prompt ;
609
646
n_gen = inst.n_gen ;
610
647
// RFC 3339 date-time format
@@ -654,25 +691,29 @@ struct test {
654
691
if (metal) {
655
692
return " Metal" ;
656
693
}
694
+ if (sycl) {
695
+ return GGML_SYCL_NAME;
696
+ }
657
697
if (gpu_blas) {
658
698
return " GPU BLAS" ;
659
699
}
660
700
if (blas) {
661
701
return " BLAS" ;
662
702
}
703
+
663
704
return " CPU" ;
664
705
}
665
706
666
707
static const std::vector<std::string> & get_fields () {
667
708
static const std::vector<std::string> fields = {
668
709
" build_commit" , " build_number" ,
669
- " cuda" , " opencl" , " vulkan" , " kompute" , " metal" , " gpu_blas" , " blas" ,
710
+ " cuda" , " opencl" , " vulkan" , " kompute" , " metal" , " sycl " , " gpu_blas" , " blas" ,
670
711
" cpu_info" , " gpu_info" ,
671
712
" model_filename" , " model_type" , " model_size" , " model_n_params" ,
672
713
" n_batch" , " n_threads" , " type_k" , " type_v" ,
673
714
" n_gpu_layers" , " split_mode" ,
674
715
" main_gpu" , " no_kv_offload" ,
675
- " mul_mat_q" , " tensor_split" ,
716
+ " mul_mat_q" , " tensor_split" , " use_mmap " ,
676
717
" n_prompt" , " n_gen" , " test_time" ,
677
718
" avg_ns" , " stddev_ns" ,
678
719
" avg_ts" , " stddev_ts"
@@ -691,8 +732,8 @@ struct test {
691
732
return INT;
692
733
}
693
734
if (field == " cuda" || field == " opencl" || field == " vulkan" || field == " kompute" || field == " metal" ||
694
- field == " gpu_blas" || field == " blas" || field == " f16_kv" || field == " no_kv_offload" ||
695
- field == " mul_mat_q" ) {
735
+ field == " gpu_blas" || field == " blas" || field == " sycl " ||field == " f16_kv" || field == " no_kv_offload" ||
736
+ field == " mul_mat_q" || field == " use_mmap " ) {
696
737
return BOOL;
697
738
}
698
739
if (field == " avg_ts" || field == " stddev_ts" ) {
@@ -720,13 +761,13 @@ struct test {
720
761
std::vector<std::string> values = {
721
762
build_commit, std::to_string (build_number),
722
763
std::to_string (cuda), std::to_string (opencl), std::to_string (vulkan), std::to_string (vulkan),
723
- std::to_string (metal), std::to_string (gpu_blas), std::to_string (blas),
764
+ std::to_string (metal), std::to_string (sycl), std::to_string ( gpu_blas), std::to_string (blas),
724
765
cpu_info, gpu_info,
725
766
model_filename, model_type, std::to_string (model_size), std::to_string (model_n_params),
726
767
std::to_string (n_batch), std::to_string (n_threads), ggml_type_name (type_k), ggml_type_name (type_v),
727
768
std::to_string (n_gpu_layers), split_mode_str (split_mode),
728
769
std::to_string (main_gpu), std::to_string (no_kv_offload),
729
- std::to_string (mul_mat_q), tensor_split_str,
770
+ std::to_string (mul_mat_q), tensor_split_str, std::to_string (use_mmap),
730
771
std::to_string (n_prompt), std::to_string (n_gen), test_time,
731
772
std::to_string (avg_ns ()), std::to_string (stdev_ns ()),
732
773
std::to_string (avg_ts ()), std::to_string (stdev_ts ())
@@ -753,6 +794,7 @@ const bool test::kompute = !!ggml_cpu_has_kompute();
753
794
const bool test::metal = !!ggml_cpu_has_metal();
754
795
const bool test::gpu_blas = !!ggml_cpu_has_gpublas();
755
796
const bool test::blas = !!ggml_cpu_has_blas();
797
+ const bool test::sycl = !!ggml_cpu_has_sycl();
756
798
const std::string test::cpu_info = get_cpu_info();
757
799
const std::string test::gpu_info = get_gpu_info();
758
800
@@ -895,6 +937,9 @@ struct markdown_printer : public printer {
895
937
if (field == " no_kv_offload" ) {
896
938
return " nkvo" ;
897
939
}
940
+ if (field == " use_mmap" ) {
941
+ return " mmap" ;
942
+ }
898
943
if (field == " tensor_split" ) {
899
944
return " ts" ;
900
945
}
@@ -938,6 +983,9 @@ struct markdown_printer : public printer {
938
983
if (params.tensor_split .size () > 1 || params.tensor_split != cmd_params_defaults.tensor_split ) {
939
984
fields.push_back (" tensor_split" );
940
985
}
986
+ if (params.use_mmap .size () > 1 || params.use_mmap != cmd_params_defaults.use_mmap ) {
987
+ fields.push_back (" use_mmap" );
988
+ }
941
989
fields.push_back (" test" );
942
990
fields.push_back (" t/s" );
943
991
0 commit comments