
Commit 57b9328

Merge branch 'master' into multiple-chat-templates
2 parents 980bb16 + c71bfd7 commit 57b9328


71 files changed: +5930 −3579 lines

.github/workflows/build.yml

Lines changed: 15 additions & 7 deletions
@@ -52,7 +52,7 @@ jobs:
         id: cmake_test
         run: |
           cd build
-          ctest -L main --verbose --timeout 900
+          ctest -L 'main|curl' --verbose --timeout 900

       - name: Determine tag name
         id: tag
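
Note: ctest interprets the -L argument as a regular expression matched against test labels, so the quoted 'main|curl' pattern selects tests labeled either main or curl. A minimal local equivalent of the updated test step:

    cd build
    # -L filters tests by label regex; --timeout caps each test at 900 seconds
    ctest -L 'main|curl' --verbose --timeout 900
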
@@ -101,7 +101,9 @@ jobs:
           sysctl -a
           mkdir build
           cd build
-          cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON ..
+          # Metal is disabled due to intermittent failures with Github runners not having a GPU:
+          # https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
+          cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL=OFF -DLLAMA_CURL=ON ..
           cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)

       - name: Test
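
To reproduce the updated macOS configure step locally (a sketch, assuming CMake and the curl development files are already installed, e.g. via Homebrew):

    mkdir build && cd build
    # Metal off to mirror the CI change above; curl-based model downloads on
    cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL=OFF -DLLAMA_CURL=ON ..
    cmake --build . --config Release -j "$(sysctl -n hw.logicalcpu)"
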
@@ -209,21 +211,21 @@ jobs:
         id: depends
         run: |
           sudo apt-get update
-          sudo apt-get install build-essential
+          sudo apt-get install build-essential libcurl4-openssl-dev

       - name: Build
         id: cmake_build
         run: |
           mkdir build
           cd build
-          cmake .. -DLLAMA_FATAL_WARNINGS=ON
+          cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON
           cmake --build . --config Release -j $(nproc)

       - name: Test
         id: cmake_test
         run: |
           cd build
-          ctest -L main --verbose --timeout 900
+          ctest -L 'main|curl' --verbose --timeout 900

       - name: Test llama2c conversion
         id: llama2c_test
@@ -938,6 +940,12 @@ jobs:
       - name: Download artifacts
         id: download-artifact
         uses: actions/download-artifact@v4
+        with:
+          path: ./artifact
+
+      - name: Move artifacts
+        id: move_artifacts
+        run: mkdir -p ./artifact/release && mv ./artifact/*/*.zip ./artifact/release

       - name: Create release
         id: create_release
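
Note: with actions/download-artifact@v4, each downloaded artifact is extracted into its own subdirectory (named after the artifact) under the given path, so the new move step flattens that layout into one directory for the upload loop below. A sketch of what the step does (the artifact directory name is hypothetical):

    mkdir -p ./artifact/release
    # e.g. ./artifact/llama-bin-win-avx2-x64/some-build.zip -> ./artifact/release/
    mv ./artifact/*/*.zip ./artifact/release
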
@@ -956,15 +964,15 @@ jobs:
             const path = require('path');
             const fs = require('fs');
             const release_id = '${{ steps.create_release.outputs.id }}';
-            for (let file of await fs.readdirSync('./artifact')) {
+            for (let file of await fs.readdirSync('./artifact/release')) {
               if (path.extname(file) === '.zip') {
                 console.log('uploadReleaseAsset', file);
                 await github.repos.uploadReleaseAsset({
                   owner: context.repo.owner,
                   repo: context.repo.repo,
                   release_id: release_id,
                   name: file,
-                  data: await fs.readFileSync(`./artifact/${file}`)
+                  data: await fs.readFileSync(`./artifact/release/${file}`)
                 });
               }
             }

.github/workflows/docker.yml

Lines changed: 8 additions & 2 deletions
@@ -91,14 +91,20 @@ jobs:
             echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
           fi

+      - name: Downcase github.repository_owner
+        run: |
+          echo "repository_owner_lowercase=${GITHUB_REPOSITORY_OWNER@L}" >> $GITHUB_ENV
+        env:
+          GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'
+
       - name: Build and push Docker image (versioned)
         if: github.event_name == 'push'
         uses: docker/build-push-action@v4
         with:
           context: .
           push: true
           platforms: ${{ matrix.config.platforms }}
-          tags: "ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
+          tags: "ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
           file: ${{ matrix.config.dockerfile }}

       - name: Build and push Docker image (tagged)
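
Note: ${GITHUB_REPOSITORY_OWNER@L} uses the bash @L parameter transformation (bash 5.1 or newer) to lowercase the value; registries such as ghcr.io require lowercase image names, while a GitHub owner name may contain capitals. For illustration:

    # ${VAR@L} expands to the value of VAR converted to lowercase (bash >= 5.1)
    GITHUB_REPOSITORY_OWNER='SomeOwner'
    echo "${GITHUB_REPOSITORY_OWNER@L}"   # prints: someowner
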
@@ -107,5 +113,5 @@ jobs:
           context: .
           push: ${{ github.event_name == 'push' }}
           platforms: ${{ matrix.config.platforms }}
-          tags: "ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }},ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }}-${{ steps.tag.outputs.name }}"
+          tags: "ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }},ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ steps.tag.outputs.name }}"
           file: ${{ matrix.config.dockerfile }}

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -48,6 +48,7 @@ models-mnt
 /convert-llama2c-to-ggml
 /embd-input-test
 /embedding
+/eval-callback
 /gguf
 /gguf-llama-simple
 /gguf-split

CMakeLists.txt

Lines changed: 31 additions & 9 deletions
@@ -43,6 +43,18 @@ else()
     set(LLAMA_METAL_DEFAULT OFF)
 endif()

+# TODO: fix this for Android CI
+#       https://github.com/ggerganov/llama.cpp/pull/6716#issuecomment-2061509191
+#if (CMAKE_SYSTEM_NAME MATCHES "ANDROID")
+#    set(LLAMA_LLAMAFILE_DEFAULT OFF)
+#else()
+#    set(LLAMA_LLAMAFILE_DEFAULT ON)
+#endif()
+
+# TODO: temporary disable until MoE is fixed
+#       https://github.com/ggerganov/llama.cpp/pull/6716
+set(LLAMA_LLAMAFILE_DEFAULT OFF)
+
 # general
 option(BUILD_SHARED_LIBS "build shared libraries" OFF)
 option(LLAMA_STATIC "llama: static link libraries" OFF)
@@ -88,6 +100,7 @@ endif()
 # 3rd party libs
 option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
 option(LLAMA_BLAS "llama: use BLAS" OFF)
+option(LLAMA_LLAMAFILE "llama: use llamafile SGEMM" ${LLAMA_LLAMAFILE_DEFAULT})
 set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
 option(LLAMA_CUDA "llama: use CUDA" OFF)
 option(LLAMA_CUBLAS "llama: use CUDA (deprecated, use LLAMA_CUDA)" OFF)
@@ -286,6 +299,7 @@ if (LLAMA_METAL)
         ${METALKIT_FRAMEWORK}
         )
 endif()
+
 if (LLAMA_BLAS)
     if (LLAMA_STATIC)
         set(BLA_STATIC ON)
@@ -368,6 +382,13 @@ if (LLAMA_BLAS)
     endif()
 endif()

+if (LLAMA_LLAMAFILE)
+    add_compile_definitions(GGML_USE_LLAMAFILE)
+
+    set(GGML_HEADERS_LLAMAFILE sgemm.h)
+    set(GGML_SOURCES_LLAMAFILE sgemm.cpp)
+endif()
+
 if (LLAMA_QKK_64)
     add_compile_definitions(GGML_QKK_64)
 endif()
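
With the new option in place, llamafile SGEMM can still be opted into explicitly while the default stays OFF; a hypothetical configure invocation:

    # explicit opt-in overriding LLAMA_LLAMAFILE_DEFAULT (sketch; the default
    # is kept OFF until the MoE issue in PR 6716 is resolved)
    cmake .. -DLLAMA_LLAMAFILE=ON
    cmake --build . --config Release
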
@@ -1151,15 +1172,16 @@ add_library(ggml OBJECT
             ggml-backend.h
             ggml-quants.c
             ggml-quants.h
-            ${GGML_SOURCES_CUDA}     ${GGML_HEADERS_CUDA}
-            ${GGML_SOURCES_OPENCL}   ${GGML_HEADERS_OPENCL}
-            ${GGML_SOURCES_METAL}    ${GGML_HEADERS_METAL}
-            ${GGML_SOURCES_MPI}      ${GGML_HEADERS_MPI}
-            ${GGML_SOURCES_EXTRA}    ${GGML_HEADERS_EXTRA}
-            ${GGML_SOURCES_SYCL}     ${GGML_HEADERS_SYCL}
-            ${GGML_SOURCES_KOMPUTE}  ${GGML_HEADERS_KOMPUTE}
-            ${GGML_SOURCES_VULKAN}   ${GGML_HEADERS_VULKAN}
-            ${GGML_SOURCES_ROCM}     ${GGML_HEADERS_ROCM}
+            ${GGML_SOURCES_CUDA}      ${GGML_HEADERS_CUDA}
+            ${GGML_SOURCES_OPENCL}    ${GGML_HEADERS_OPENCL}
+            ${GGML_SOURCES_METAL}     ${GGML_HEADERS_METAL}
+            ${GGML_SOURCES_MPI}       ${GGML_HEADERS_MPI}
+            ${GGML_SOURCES_EXTRA}     ${GGML_HEADERS_EXTRA}
+            ${GGML_SOURCES_SYCL}      ${GGML_HEADERS_SYCL}
+            ${GGML_SOURCES_KOMPUTE}   ${GGML_HEADERS_KOMPUTE}
+            ${GGML_SOURCES_VULKAN}    ${GGML_HEADERS_VULKAN}
+            ${GGML_SOURCES_ROCM}      ${GGML_HEADERS_ROCM}
+            ${GGML_SOURCES_LLAMAFILE} ${GGML_HEADERS_LLAMAFILE}
             )

 target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES})

Makefile

Lines changed: 23 additions & 7 deletions
@@ -1,7 +1,7 @@
 # Define the default target now so that it is always the first target
 BUILD_TARGETS = \
 	main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
-	simple batched batched-bench save-load-state server gguf gguf-split llama-bench libllava.a llava-cli baby-llama beam-search \
+	simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama beam-search \
 	retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o

 # Binaries only useful for tests
@@ -384,6 +384,15 @@ ifdef LLAMA_OPENBLAS
 	MK_LDFLAGS  += $(shell pkg-config --libs openblas)
 endif # LLAMA_OPENBLAS

+# TODO: temporary disable until MoE is fixed
+#       https://github.com/ggerganov/llama.cpp/pull/6716
+LLAMA_NO_LLAMAFILE := 1
+
+ifndef LLAMA_NO_LLAMAFILE
+	MK_CPPFLAGS += -DGGML_USE_LLAMAFILE
+	OBJS        += sgemm.o
+endif
+
 ifdef LLAMA_BLIS
 	MK_CPPFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis
 	MK_LDFLAGS  += -lblis -L/usr/local/lib
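
Note on the make logic: GNU make's ifndef takes the branch when the variable's value is empty, and command-line assignments override ordinary := assignments in the makefile, so in principle an empty command-line value would re-enable the llamafile path despite the hard-coded 1. Treat this as a sketch of make semantics, not a supported switch while the MoE bug stands:

    # command-line variables beat the makefile's ':=' assignment; the empty
    # value makes 'ifndef LLAMA_NO_LLAMAFILE' take the enabled branch
    make LLAMA_NO_LLAMAFILE= main
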
@@ -480,11 +489,9 @@ ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/com

 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
 	$(NVCC_COMPILE)
-
 endif # LLAMA_CUDA

 ifdef LLAMA_CLBLAST
-
 	MK_CPPFLAGS += -DGGML_USE_CLBLAST $(shell pkg-config --cflags-only-I clblast OpenCL)
 	MK_CFLAGS   += $(shell pkg-config --cflags-only-other clblast OpenCL)
 	MK_CXXFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL)
@@ -603,6 +610,11 @@ ggml-mpi.o: ggml-mpi.c ggml-mpi.h
 	$(CC) $(CFLAGS) -c $< -o $@
 endif # LLAMA_MPI

+ifndef LLAMA_NO_LLAMAFILE
+sgemm.o: sgemm.cpp sgemm.h ggml.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+endif
+
 GF_CC := $(CC)
 include scripts/get-flags.mk

@@ -646,7 +658,7 @@ CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])'
 ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
 ifndef CUDA_DOCKER_ARCH
 ifndef CUDA_POWER_ARCH
-$(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be explicitly provided via CUDA_DOCKER_ARCH)
+$(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be explicitly provided via environment variable CUDA_DOCKER_ARCH, e.g. by running "export CUDA_DOCKER_ARCH=compute_XX" on Unix-like systems, where XX is the minimum compute capability that the code needs to run on. A list with compute capabilities can be found here: https://developer.nvidia.com/cuda-gpus )
 endif # CUDA_POWER_ARCH
 endif # CUDA_DOCKER_ARCH
 endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1)
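
Following the expanded error message, a user on an older CUDA toolkit would export the minimum compute capability their GPU needs before building; for example (GPU and capability chosen for illustration):

    # a GTX 1080 has compute capability 6.1 -> compute_61
    export CUDA_DOCKER_ARCH=compute_61
    make LLAMA_CUDA=1
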
@@ -688,7 +700,7 @@ llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml
 	$(CXX) $(CXXFLAGS) -c $< -o $@

 COMMON_H_DEPS = common/common.h common/sampling.h common/log.h
-COMMON_DEPS   = common.o sampling.o grammar-parser.o build-info.o
+COMMON_DEPS   = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o

 common.o: common/common.cpp $(COMMON_H_DEPS)
 	$(CXX) $(CXXFLAGS) -c $< -o $@
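
Note: folding json-schema-to-grammar.o into COMMON_DEPS means any target that lists $(COMMON_DEPS) now links it implicitly, which is why the server rule further down drops its explicit json-schema-to-grammar.o prerequisite. A hypothetical rule following the same pattern:

    # hypothetical target: $(COMMON_DEPS) now transitively supplies
    # json-schema-to-grammar.o without naming it
    my-tool: examples/my-tool/my-tool.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
    	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) -o $@ $(LDFLAGS)
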
@@ -756,7 +768,7 @@ batched: examples/batched/batched.cpp ggml.o llama.o $(C
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.o llama.o common.o $(OBJS)
+batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -788,7 +800,7 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp json-schema-to-grammar.o common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)

@@ -800,6 +812,10 @@ gguf-split: examples/gguf-split/gguf-split.cpp ggml.o llama.o $(COMMON_DEPS) $(O
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

+eval-callback: examples/eval-callback/eval-callback.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
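
The new eval-callback target builds the example added by this merge; a hypothetical invocation, assuming the binary accepts the -m/-p flags common to the other llama.cpp examples:

    make eval-callback
    # prints intermediate tensor values via the eval callback while running the prompt
    ./eval-callback -m models/7B/ggml-model-q4_0.gguf -p "hello world"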

Package.swift

Lines changed: 43 additions & 27 deletions
@@ -2,6 +2,45 @@

 import PackageDescription

+var sources = [
+    "ggml.c",
+    "sgemm.cpp",
+    "llama.cpp",
+    "unicode.cpp",
+    "unicode-data.cpp",
+    "ggml-alloc.c",
+    "ggml-backend.c",
+    "ggml-quants.c",
+]
+
+var resources: [Resource] = []
+var linkerSettings: [LinkerSetting] = []
+var cSettings: [CSetting] = [
+    .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
+    .unsafeFlags(["-fno-objc-arc"]),
+    // NOTE: NEW_LAPACK will required iOS version 16.4+
+    // We should consider add this in the future when we drop support for iOS 14
+    // (ref: ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc)
+    // .define("ACCELERATE_NEW_LAPACK"),
+    // .define("ACCELERATE_LAPACK_ILP64")
+]
+
+#if canImport(Darwin)
+sources.append("ggml-metal.m")
+resources.append(.process("ggml-metal.metal"))
+linkerSettings.append(.linkedFramework("Accelerate"))
+cSettings.append(
+    contentsOf: [
+        .define("GGML_USE_ACCELERATE"),
+        .define("GGML_USE_METAL")
+    ]
+)
+#endif
+
+#if os(Linux)
+cSettings.append(.define("_GNU_SOURCE"))
+#endif
+
 let package = Package(
     name: "llama",
     platforms: [
@@ -28,34 +67,11 @@ let package = Package(
                 "ggml-cuda.h",
                 "Makefile"
             ],
-            sources: [
-                "ggml.c",
-                "llama.cpp",
-                "unicode.cpp",
-                "unicode-data.cpp",
-                "ggml-alloc.c",
-                "ggml-backend.c",
-                "ggml-quants.c",
-                "ggml-metal.m",
-            ],
-            resources: [
-                .process("ggml-metal.metal")
-            ],
+            sources: sources,
+            resources: resources,
             publicHeadersPath: "spm-headers",
-            cSettings: [
-                .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
-                .define("GGML_USE_ACCELERATE"),
-                .unsafeFlags(["-fno-objc-arc"]),
-                .define("GGML_USE_METAL"),
-                // NOTE: NEW_LAPACK will required iOS version 16.4+
-                // We should consider add this in the future when we drop support for iOS 14
-                // (ref: ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc)
-                // .define("ACCELERATE_NEW_LAPACK"),
-                // .define("ACCELERATE_LAPACK_ILP64")
-            ],
-            linkerSettings: [
-                .linkedFramework("Accelerate")
-            ]
+            cSettings: cSettings,
+            linkerSettings: linkerSettings
         ),
     ],
     cxxLanguageStandard: .cxx11
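
Because the #if canImport(Darwin) and #if os(Linux) branches are evaluated when the manifest itself is compiled, a plain SwiftPM build now picks the right sources per platform without edits; e.g.:

    # on macOS this compiles ggml-metal.m and links Accelerate;
    # on Linux it skips both and defines _GNU_SOURCE instead
    swift build -c release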
