
Commit ddb2ddd

Merge remote-tracking branch 'origin/master' into deepseek-v3
2 parents: dfffe67 + 4b0c638


71 files changed: +20735 -19986 lines

.github/workflows/build.yml (+13 -15)

@@ -60,8 +60,7 @@ jobs:
             -DLLAMA_CURL=ON \
             -DGGML_METAL_USE_BF16=ON \
             -DGGML_METAL_EMBED_LIBRARY=ON \
-            -DGGML_RPC=ON \
-            -DBUILD_SHARED_LIBS=OFF
+            -DGGML_RPC=ON
           cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)

       - name: Test
@@ -123,8 +122,7 @@ jobs:
             -DLLAMA_FATAL_WARNINGS=ON \
             -DLLAMA_CURL=ON \
             -DGGML_METAL=OFF \
-            -DGGML_RPC=ON \
-            -DBUILD_SHARED_LIBS=OFF
+            -DGGML_RPC=ON
           cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)

       - name: Test
@@ -181,7 +179,7 @@ jobs:
         run: |
           mkdir build
           cd build
-          cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF
+          cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON -DGGML_RPC=ON
          cmake --build . --config Release -j $(nproc)

       - name: Test
@@ -651,23 +649,23 @@ jobs:
      matrix:
        include:
          - build: 'noavx-x64'
-           defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DBUILD_SHARED_LIBS=ON'
+           defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF'
          - build: 'avx2-x64'
-           defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=ON'
+           defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON'
          - build: 'avx-x64'
-           defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
+           defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX2=OFF'
          - build: 'avx512-x64'
-           defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX512=ON -DBUILD_SHARED_LIBS=ON'
+           defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX512=ON'
          - build: 'openblas-x64'
-           defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BLAS=ON -DBUILD_SHARED_LIBS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
+           defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
          - build: 'kompute-x64'
-           defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON'
+           defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON'
          - build: 'vulkan-x64'
-           defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_VULKAN=ON -DBUILD_SHARED_LIBS=ON'
+           defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_VULKAN=ON'
          - build: 'llvm-arm64'
-           defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
+           defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON'
          - build: 'msvc-arm64'
-           defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
+           defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON'
          - build: 'llvm-arm64-opencl-adreno'
            defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON'
@@ -914,7 +912,7 @@ jobs:
        shell: cmd
        run: |
          call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
-         cmake -S . -B build -G "Ninja Multi-Config" -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON -DGGML_RPC=ON
+         cmake -S . -B build -G "Ninja Multi-Config" -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DGGML_RPC=ON
          set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
          cmake --build build --config Release -j %NINJA_JOBS% -t ggml
          cmake --build build --config Release

common/arg.cpp (+2 -2)

@@ -1512,15 +1512,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         {"--lora"}, "FNAME",
         "path to LoRA adapter (can be repeated to use multiple adapters)",
         [](common_params & params, const std::string & value) {
-            params.lora_adapters.push_back({ std::string(value), 1.0 });
+            params.lora_adapters.push_back({ std::string(value), 1.0, nullptr });
         }
         // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
     ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
     add_opt(common_arg(
         {"--lora-scaled"}, "FNAME", "SCALE",
         "path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters)",
         [](common_params & params, const std::string & fname, const std::string & scale) {
-            params.lora_adapters.push_back({ fname, std::stof(scale) });
+            params.lora_adapters.push_back({ fname, std::stof(scale), nullptr });
         }
         // we define this arg on both COMMON and EXPORT_LORA, so when showing help message of export-lora, it will be categorized as "example-specific" arg
     ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}));
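
Note: the two one-line changes above follow from common_lora_adapter_info gaining a raw, non-owning ptr member in this commit (see the common/common.h hunk further down), so the aggregate initializers need a third field. A minimal sketch of the resulting struct and its initialization, assuming the layout shown in that hunk (the adapter path below is a placeholder):

```cpp
#include <string>
#include <vector>

struct llama_lora_adapter;   // opaque handle declared by llama.h

// mirrors the reworked common_lora_adapter_info from this commit
struct common_lora_adapter_info {
    std::string path;
    float       scale;
    struct llama_lora_adapter * ptr;   // filled in later by common_init_from_params()
};

int main() {
    std::vector<common_lora_adapter_info> lora_adapters;
    // nothing is loaded at argument-parsing time, hence the trailing nullptr
    lora_adapters.push_back({ "my-adapter.gguf", 1.0f, nullptr });
    return 0;
}
```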

common/common.cpp (+15 -15)

@@ -889,9 +889,8 @@ struct common_init_result common_init_from_params(common_params & params) {
     }

     if (params.ctx_shift && !llama_kv_cache_can_shift(lctx)) {
-        LOG_ERR("%s: KV cache shifting is not supported for this model (--no-context-shift to disable)'\n", __func__);
-        llama_free_model(model);
-        return iparams;
+        LOG_WRN("%s: KV cache shifting is not supported for this model, disabling KV cache shifting\n", __func__);
+        params.ctx_shift = false;
     }

     if (!params.control_vectors.empty()) {
@@ -922,20 +921,21 @@ struct common_init_result common_init_from_params(common_params & params) {

     // load and optionally apply lora adapters
     for (auto & la : params.lora_adapters) {
-        common_lora_adapter_container loaded_la;
-        loaded_la.path = la.path;
-        loaded_la.scale = la.scale;
-        loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
-        if (loaded_la.adapter == nullptr) {
+        llama_lora_adapter_ptr lora;
+        lora.reset(llama_lora_adapter_init(model, la.path.c_str()));
+        if (lora == nullptr) {
             LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
             llama_free(lctx);
             llama_free_model(model);
             return iparams;
         }
-        iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters
+
+        la.ptr = lora.get();
+        iparams.lora.emplace_back(std::move(lora)); // copy to list of loaded adapters
     }
+
     if (!params.lora_init_without_apply) {
-        common_lora_adapters_apply(lctx, iparams.lora_adapters);
+        common_lora_adapters_apply(lctx, params.lora_adapters);
     }

     if (params.sampling.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) {
@@ -996,17 +996,17 @@ struct common_init_result common_init_from_params(common_params & params) {
         llama_perf_context_reset(lctx);
     }

-    iparams.model = model;
-    iparams.context = lctx;
+    iparams.model.reset(model);
+    iparams.context.reset(lctx);

     return iparams;
 }

-void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters) {
+void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_info> & lora) {
     llama_lora_adapter_clear(ctx);
-    for (auto & la : lora_adapters) {
+    for (auto & la : lora) {
         if (la.scale != 0.0f) {
-            llama_lora_adapter_set(ctx, la.adapter, la.scale);
+            llama_lora_adapter_set(ctx, la.ptr, la.scale);
         }
     }
 }
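
Note: llama_lora_adapter_ptr (together with the llama_model_ptr and llama_context_ptr used above) comes from the new llama-cpp.h header that common.h switches to in the next file; these are std::unique_ptr aliases with custom deleters, so the loaded adapters, the model and the context are released automatically when common_init_result goes out of scope. Roughly, such an alias looks like the following sketch (the exact deleter names in llama-cpp.h may differ):

```cpp
#include <memory>

struct llama_lora_adapter;   // opaque C handle from llama.h
void llama_lora_adapter_free(struct llama_lora_adapter * adapter);   // C API declared in llama.h

// calls the C free function when the unique_ptr is destroyed
struct llama_lora_adapter_deleter {
    void operator()(llama_lora_adapter * adapter) const { llama_lora_adapter_free(adapter); }
};

using llama_lora_adapter_ptr = std::unique_ptr<llama_lora_adapter, llama_lora_adapter_deleter>;
```

In the loop above, la.ptr keeps a non-owning copy of the raw handle for common_lora_adapters_apply(), while ownership is moved into iparams.lora.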

common/common.h (+15 -11)

@@ -2,7 +2,7 @@

 #pragma once

-#include "llama.h"
+#include "llama-cpp.h"

 #include <string>
 #include <vector>
@@ -27,10 +27,8 @@
 struct common_lora_adapter_info {
     std::string path;
     float scale;
-};

-struct common_lora_adapter_container : common_lora_adapter_info {
-    struct llama_lora_adapter * adapter;
+    struct llama_lora_adapter * ptr;
 };

 using llama_tokens = std::vector<llama_token>;
@@ -478,10 +476,12 @@ std::string fs_get_cache_file(const std::string & filename);
 // Model utils
 //

+// note: defines object's lifetime
 struct common_init_result {
-    struct llama_model * model = nullptr;
-    struct llama_context * context = nullptr;
-    std::vector<common_lora_adapter_container> lora_adapters;
+    llama_model_ptr model;
+    llama_context_ptr context;
+
+    std::vector<llama_lora_adapter_ptr> lora;
 };

 struct common_init_result common_init_from_params(common_params & params);
@@ -503,7 +503,7 @@ struct llama_model * common_load_model_from_hf(
     const struct llama_model_params & params);

 // clear LoRA adapters from context, then apply new list of adapters
-void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters);
+void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_info> & lora);

 //
 // Batch utils
@@ -640,6 +640,10 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c
 // Split utils
 //

-static const char * const LLM_KV_SPLIT_NO = "split.no";
-static const char * const LLM_KV_SPLIT_COUNT = "split.count";
-static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
+namespace {
+
+const char * const LLM_KV_SPLIT_NO = "split.no";
+const char * const LLM_KV_SPLIT_COUNT = "split.count";
+const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
+
+}
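
Note: with common_init_result now owning the model and context through llama_model_ptr and llama_context_ptr, callers obtain raw handles with .get() and drop their explicit llama_free()/llama_free_model() calls; that is the pattern the example programs below (cvector-generator, embedding, eval-callback, imatrix, infill) are updated to. A minimal caller sketch under those assumptions, with setup and inference details elided:

```cpp
#include "common.h"

// hypothetical helper illustrating the post-change calling convention
static int run_once(common_params & params) {
    common_init_result llama_init = common_init_from_params(params);

    llama_model   * model = llama_init.model.get();     // non-owning raw handles
    llama_context * ctx   = llama_init.context.get();
    if (model == nullptr || ctx == nullptr) {
        return 1;
    }

    // ... tokenize, decode, sample using model / ctx ...

    return 0;
    // no llama_free(ctx) / llama_free_model(model) here:
    // llama_init releases the context, model and LoRA adapters when it goes out of scope
}
```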

examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp (+5 -5)

@@ -434,12 +434,12 @@ static void print_matrix(struct ggml_tensor * probs) {
     }
 }

-struct llama_file {
+struct my_llama_file {
     // use FILE * so we don't have to re-open the file to mmap
     FILE * fp;
     size_t size;

-    llama_file(const char * fname, const char * mode) {
+    my_llama_file(const char * fname, const char * mode) {
         fp = std::fopen(fname, mode);
         if (fp == NULL) {
             size = 0;
@@ -500,15 +500,15 @@ struct llama_file {
         return std::string(chars.data(), len);
     }

-    ~llama_file() {
+    ~my_llama_file() {
         if (fp) {
             std::fclose(fp);
         }
     }
 };

 static bool is_ggml_file(const char * filename) {
-    llama_file file(filename, "rb");
+    my_llama_file file(filename, "rb");
     if (file.size < 4) {
         return false;
     }
@@ -576,7 +576,7 @@ static void load_vocab(const char * filename, const Config * config, struct my_l
     } else {
         // assume llama2.c vocabulary
         LOG_INF("%s: Assuming llama2.c vocabulary since %s is not a gguf file\n", __func__, filename);
-        llama_file file(filename, "rb");
+        my_llama_file file(filename, "rb");
         if (!file.fp) {
             die_fmt("%s: %s", strerror(errno), filename);
         }

examples/cvector-generator/cvector-generator.cpp (+3 -4)

@@ -415,12 +415,13 @@ int main(int argc, char ** argv) {
     // load the model to get hparams
     common_init_result llama_init = common_init_from_params(params);

-    llama_model * model = llama_init.model;
-    llama_context * ctx = llama_init.context;
+    llama_model * model = llama_init.model.get();
+    llama_context * ctx = llama_init.context.get();

     // int n_ctx = llama_n_ctx(ctx);
     int n_layers = llama_n_layer(model);
     int n_embd = llama_n_embd(model);
+
     // get model hint param (a.k.a model arch name)
     char model_hint[128];
     llama_model_meta_val_str(model, "general.architecture", model_hint, 128);
@@ -474,8 +475,6 @@ int main(int argc, char ** argv) {

     // done with the model, we can now free it to make gain some memory
     printf("Done evaluate prompts, unload model...\n");
-    llama_free(ctx);
-    llama_free_model(model);

     bool use_pca = params.cvector_dimre_method == DIMRE_METHOD_PCA;

examples/embedding/embedding.cpp (+3 -4)

@@ -97,8 +97,9 @@ int main(int argc, char ** argv) {
     // load the model
     common_init_result llama_init = common_init_from_params(params);

-    llama_model * model = llama_init.model;
-    llama_context * ctx = llama_init.context;
+    llama_model * model = llama_init.model.get();
+    llama_context * ctx = llama_init.context.get();
+
     if (model == NULL) {
         LOG_ERR("%s: unable to load model\n", __func__);
         return 1;
@@ -316,8 +317,6 @@ int main(int argc, char ** argv) {

     // clean up
     llama_batch_free(batch);
-    llama_free(ctx);
-    llama_free_model(model);
     llama_backend_free();

     return 0;

examples/eval-callback/eval-callback.cpp (+3 -5)

@@ -162,8 +162,9 @@ int main(int argc, char ** argv) {
     // init
     common_init_result llama_init = common_init_from_params(params);

-    llama_model * model = llama_init.model;
-    llama_context * ctx = llama_init.context;
+    llama_model * model = llama_init.model.get();
+    llama_context * ctx = llama_init.context.get();
+
     if (model == nullptr || ctx == nullptr) {
         LOG_ERR("%s : failed to init\n", __func__);
         return 1;
@@ -184,9 +185,6 @@ int main(int argc, char ** argv) {
     LOG("\n");
     llama_perf_context_print(ctx);

-    llama_free(ctx);
-    llama_free_model(model);
-
     llama_backend_free();

     return 0;

examples/gguf-split/gguf-split.cpp (+3 -4)

@@ -2,15 +2,14 @@
 #include "common.h"

 #include <algorithm>
-#include <cmath>
 #include <cstdlib>
 #include <fstream>
 #include <string>
 #include <vector>
-
-#include <stdio.h>
-#include <string.h>
 #include <climits>
+
+#include <cstdio>
+#include <cstring>
 #include <stdexcept>

 #if defined(_WIN32)

examples/imatrix/imatrix.cpp (+5 -6)

@@ -430,9 +430,10 @@ static void process_logits(

 static bool compute_imatrix(llama_context * ctx, const common_params & params) {
     const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
-    GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx)));
     const int n_ctx = llama_n_ctx(ctx);

+    GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx)));
+
     auto tim1 = std::chrono::high_resolution_clock::now();
     LOG_INF("%s: tokenizing the input ..\n", __func__);

@@ -618,8 +619,9 @@ int main(int argc, char ** argv) {
     // init
     common_init_result llama_init = common_init_from_params(params);

-    llama_model * model = llama_init.model;
-    llama_context * ctx = llama_init.context;
+    llama_model * model = llama_init.model.get();
+    llama_context * ctx = llama_init.context.get();
+
     if (model == nullptr || ctx == nullptr) {
         LOG_ERR("%s : failed to init\n", __func__);
         return 1;
@@ -655,9 +657,6 @@ int main(int argc, char ** argv) {
     LOG("\n");
     llama_perf_context_print(ctx);

-    llama_free(ctx);
-    llama_free_model(model);
-
     llama_backend_free();

     return 0;

examples/infill/infill.cpp (+2 -5)

@@ -131,8 +131,8 @@ int main(int argc, char ** argv) {
     LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__);
     common_init_result llama_init = common_init_from_params(params);

-    model = llama_init.model;
-    ctx = llama_init.context;
+    model = llama_init.model.get();
+    ctx = llama_init.context.get();

     if (model == NULL) {
         LOG_ERR("%s: unable to load model\n", __func__);
@@ -581,9 +581,6 @@ int main(int argc, char ** argv) {
     LOG("\n");
     common_perf_print(ctx, smpl);

-    llama_free(ctx);
-    llama_free_model(model);
-
     common_sampler_free(smpl);
     llama_backend_free();