
Commit 1824f8d

Merge pull request #96 from ggerganov/master
b2392
2 parents c2101a2 + bb6d00b commit 1824f8d

Note: this is a large commit, so only part of the diff is shown below.

45 files changed: +2398 -3670 lines

Diff for: .github/workflows/server.yml

+45 -1

@@ -47,6 +47,8 @@ jobs:
       - name: Clone
         id: checkout
         uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
 
       - name: Dependencies
         id: depends
@@ -58,7 +60,6 @@ jobs:
             cmake \
             python3-pip \
             wget \
-            psmisc \
             language-pack-en
 
       - name: Build
@@ -90,3 +91,46 @@ jobs:
         run: |
           cd examples/server/tests
           PORT=8888 ./tests.sh --stop --no-skipped --no-capture --tags slow
+
+
+  server-windows:
+    runs-on: windows-latest
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+
+      - name: Build
+        id: cmake_build
+        run: |
+          mkdir build
+          cd build
+          cmake .. -DLLAMA_BUILD_SERVER=ON -DCMAKE_BUILD_TYPE=Release ;
+          cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS} --target server
+
+      - name: Python setup
+        id: setup_python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+
+      - name: Tests dependencies
+        id: test_dependencies
+        run: |
+          pip install -r examples/server/tests/requirements.txt
+
+      - name: Tests
+        id: server_integration_tests
+        run: |
+          cd examples/server/tests
+          behave.exe --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp
+
+      - name: Slow tests
+        id: server_integration_tests_slow
+        if: ${{ github.event.schedule != '' || github.event.inputs.slow_tests == 'true' }}
+        run: |
+          cd examples/server/tests
+          behave.exe --stop --no-skipped --no-capture --tags slow

Diff for: .gitignore

+1

@@ -45,6 +45,7 @@ models-mnt
 /embedding
 /gguf
 /gguf-llama-simple
+/gritlm
 /imatrix
 /infill
 /libllama.so

Diff for: CMakeLists.txt

+2 -1

@@ -199,7 +199,8 @@ if (LLAMA_METAL)
     # get full path to the file
     #add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/")
 
-    # copy ggml-metal.metal to bin directory
+    # copy ggml-common.h and ggml-metal.metal to bin directory
+    configure_file(ggml-common.h    ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h    COPYONLY)
     configure_file(ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY)
 
     if (LLAMA_METAL_EMBED_LIBRARY)

Diff for: Makefile

+11 -3

@@ -2,7 +2,7 @@
 BUILD_TARGETS = \
 	main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
 	simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search \
-	speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey tests/test-c.o
+	speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o
 
 # Binaries only useful for tests
 TEST_TARGETS = \
@@ -201,6 +201,10 @@ ifdef LLAMA_SERVER_VERBOSE
 	MK_CPPFLAGS += -DSERVER_VERBOSE=$(LLAMA_SERVER_VERBOSE)
 endif
 
+ifdef LLAMA_SERVER_SSL
+	MK_CPPFLAGS += -DCPPHTTPLIB_OPENSSL_SUPPORT
+	MK_LDFLAGS += -lssl -lcrypto
+endif
 
 ifdef LLAMA_CODE_COVERAGE
 	MK_CXXFLAGS += -fprofile-arcs -ftest-coverage -dumpbase ''
@@ -449,7 +453,7 @@ endif # LLAMA_CUDA_PEER_MAX_BATCH_SIZE
 ifdef LLAMA_CUDA_CCBIN
 	MK_NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
 endif
-ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
+ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml-common.h
 ifdef JETSON_EOL_MODULE_DETECT
 	$(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
 else
@@ -626,7 +630,7 @@ ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h
 ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h
 	$(CC) $(CFLAGS) -c $< -o $@
 
-ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h
+ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h ggml-common.h
 	$(CC) $(CFLAGS) -c $< -o $@
 
 OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o
@@ -720,6 +724,10 @@ embedding: examples/embedding/embedding.cpp ggml.o llama.o $(C
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
+gritlm: examples/gritlm/gritlm.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

Diff for: README.md

+3 -5

@@ -16,11 +16,9 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
 
 ### Hot topics
 
-- The `api_like_OAI.py` script has been removed - use `server` instead ([#5766](https://github.com/ggerganov/llama.cpp/issues/5766#issuecomment-1969037761))
-- Support for chat templates: [Wiki (contributions welcome)](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)
-- Support for Gemma models: https://github.com/ggerganov/llama.cpp/pull/5631
-- Non-linear quantization IQ4_NL: https://github.com/ggerganov/llama.cpp/pull/5590
-- Looking for contributions to improve and maintain the `server` example: https://github.com/ggerganov/llama.cpp/issues/4216
+- Looking for contributions to add Deepseek support: https://github.com/ggerganov/llama.cpp/issues/5981
+- Quantization blind testing: https://github.com/ggerganov/llama.cpp/discussions/5962
+- Initial Mamba support has been added: https://github.com/ggerganov/llama.cpp/pull/5328
 
 ----

Diff for: common/common.cpp

+15

@@ -1852,3 +1852,18 @@ void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) {
 
     printf("\n=== Done dumping\n");
 }
+
+void llama_embd_normalize(const float * inp, float * out, int n) {
+    double sum = 0.0;
+    for (int i = 0; i < n; i++) {
+        sum += inp[i] * inp[i];
+    }
+    sum = sqrt(sum);
+
+    const float norm = sum > 0.0 ? 1.0f / sum : 0.0f;
+
+    for (int i = 0; i < n; i++) {
+        out[i] = inp[i] * norm;
+    }
+}
+
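For context, a minimal standalone sketch of how the new helper behaves. The buffer contents below are made up for illustration; in the tree the function lives in common/common.cpp and is fed embeddings produced by the model (see the embedding example change further down).

#include <cmath>
#include <cstdio>
#include <vector>

// Standalone copy of the helper, for illustration only; the real definition is the
// one added to common/common.cpp above. It scales the input vector by the inverse
// of its L2 norm, and writes zeros when the norm is zero.
static void llama_embd_normalize(const float * inp, float * out, int n) {
    double sum = 0.0;
    for (int i = 0; i < n; i++) {
        sum += inp[i] * inp[i];
    }
    sum = sqrt(sum);

    const float norm = sum > 0.0 ? 1.0f / sum : 0.0f;

    for (int i = 0; i < n; i++) {
        out[i] = inp[i] * norm;
    }
}

int main() {
    // Hypothetical 4-dimensional embedding; real values come from the model.
    std::vector<float> embd = { 3.0f, 0.0f, 4.0f, 0.0f };
    std::vector<float> unit(embd.size());

    llama_embd_normalize(embd.data(), unit.data(), (int) embd.size());

    // The L2 norm of {3, 0, 4, 0} is 5, so this prints: 0.6 0 0.8 0
    for (float v : unit) {
        printf("%g ", v);
    }
    printf("\n");
    return 0;
}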

Diff for: common/common.h

+7

@@ -260,3 +260,10 @@ void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size = 80);
 
 // Dump the KV cache view showing individual sequences in each cell (long output).
 void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
+
+//
+// Embedding utils
+//
+
+void llama_embd_normalize(const float * inp, float * out, int n);
+

Diff for: common/grammar-parser.cpp

+16

@@ -278,6 +278,22 @@ namespace grammar_parser {
             while (*pos) {
                 pos = parse_rule(state, pos);
             }
+            // Validate the state to ensure that all rules are defined
+            for (const auto & rule : state.rules) {
+                for (const auto & elem : rule) {
+                    if (elem.type == LLAMA_GRETYPE_RULE_REF) {
+                        // Ensure that the rule at that location exists
+                        if (elem.value >= state.rules.size() || state.rules[elem.value].empty()) {
+                            // Get the name of the rule that is missing
+                            for (const auto & kv : state.symbol_ids) {
+                                if (kv.second == elem.value) {
+                                    throw std::runtime_error("Undefined rule identifier '" + kv.first + "'");
+                                }
+                            }
+                        }
+                    }
+                }
+            }
             return state;
         } catch (const std::exception & err) {
             fprintf(stderr, "%s: error parsing grammar: %s\n", __func__, err.what());
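A minimal sketch of what this validation catches, assuming the grammar_parser::parse() entry point declared in common/grammar-parser.h; the GBNF string below is made up for illustration. A grammar that references a rule it never defines now produces an explicit error instead of an incomplete rule set.

#include "grammar-parser.h" // common/grammar-parser.h in the llama.cpp tree

#include <cstdio>

int main() {
    // "greeting" is referenced by root but never defined anywhere in the grammar.
    const char * bad_grammar = "root ::= greeting \"!\"\n";

    // With the validation above, parsing the rules succeeds but the post-parse check
    // throws "Undefined rule identifier 'greeting'"; the surrounding catch block
    // prints the error to stderr and the caller gets back a parse_state with no
    // validated rules instead of a silently broken grammar.
    grammar_parser::parse_state state = grammar_parser::parse(bad_grammar);

    printf("rules parsed: %zu\n", state.rules.size());
    return 0;
}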

Diff for: examples/CMakeLists.txt

+1

@@ -20,6 +20,7 @@ else()
     add_subdirectory(convert-llama2c-to-ggml)
     add_subdirectory(embedding)
     add_subdirectory(finetune)
+    add_subdirectory(gritlm)
     add_subdirectory(infill)
     add_subdirectory(llama-bench)
     add_subdirectory(llava)

Diff for: examples/benchmark/benchmark-matmult.cpp

+2 -4

@@ -189,12 +189,10 @@ int main(int argc, char ** argv) {
 
     int32_t nelements = sizex*sizey;
 
-    std::vector<int64_t> hist_cur(1 << 4, 0);
-
     // Set up a the benchmark matrices
     // printf("Creating new tensor q11 & Running quantize\n");
     struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
-    ggml_quantize_chunk(qtype, (const float *) m11->data, q11->data, 0, nelements/m11->ne[0], m11->ne[0], hist_cur.data(), nullptr);
+    ggml_quantize_chunk(qtype, (const float *) m11->data, q11->data, 0, nelements/m11->ne[0], m11->ne[0], nullptr);
 
     // Set up a the compute graph
     // printf("Creating new tensor q31\n");
@@ -207,7 +205,7 @@ int main(int argc, char ** argv) {
     // Set up a second graph computation to make sure we override the CPU cache lines
     // printf("Creating new tensor q12 & Running quantize\n");
     struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
-    ggml_quantize_chunk(qtype, (const float *) m12->data, q12->data, 0, nelements/m12->ne[0], m12->ne[0], hist_cur.data(), nullptr);
+    ggml_quantize_chunk(qtype, (const float *) m12->data, q12->data, 0, nelements/m12->ne[0], m12->ne[0], nullptr);
 
     // printf("Creating new tensor q32\n");
     struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2);

Diff for: examples/embedding/embedding.cpp

+1 -13

@@ -23,17 +23,6 @@ static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & toke
     }
 }
 
-static void normalize(const float * vec, float * out, int n) {
-    float norm = 0;
-    for (int i = 0; i < n; i++) {
-        norm += vec[i] * vec[i];
-    }
-    norm = sqrt(norm);
-    for (int i = 0; i < n; i++) {
-        out[i] = vec[i] / norm;
-    }
-}
-
 static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) {
     // clear previous kv_cache values (irrelevant for embeddings)
     llama_kv_cache_clear(ctx);
@@ -44,7 +33,6 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
         fprintf(stderr, "%s : failed to decode\n", __func__);
     }
 
-    // normalize on copy
     for (int i = 0; i < batch.n_tokens; i++) {
         if (!batch.logits[i]) {
             continue;
@@ -61,7 +49,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
         }
 
         float * out = output + batch.seq_id[i][0] * n_embd;
-        normalize(embd, out, n_embd);
+        llama_embd_normalize(embd, out, n_embd);
     }
 }

Diff for: examples/gritlm/CMakeLists.txt

+5

@@ -0,0 +1,5 @@
+set(TARGET gritlm)
+add_executable(${TARGET} gritlm.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
