
Commit 0723ac5

cmake build update
and correct warning
1 parent a8e7e6f commit 0723ac5

File tree

8 files changed: +109 -67 lines changed


CMakeLists.txt (+4)

@@ -88,6 +88,10 @@ if (NOT DEFINED GGML_LLAMAFILE)
     set(GGML_LLAMAFILE_DEFAULT ON)
 endif()
 
+if (NOT DEFINED GGML_OPENMP_SIMD)
+    set(GGML_OPENMP_SIMD_DEFAULT ON)
+endif()
+
 if (NOT DEFINED GGML_AMX)
     set(GGML_AMX ON)
 endif()

examples/perplexity/perplexity.cpp (+2 -2)

@@ -1846,9 +1846,9 @@ static void kl_divergence(llama_context * ctx, const common_params & params) {
             total_seconds = total_seconds % (60*60);
         }
         LOG("%.2f minutes\n", total_seconds / 60.0);
+        LOG("\n");
+        LOG("chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p\n");
     }
-    LOG("\n");
-    LOG("chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p\n");
 
     const int first = n_ctx/2;
     const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);

ggml/CMakeLists.txt (+9 -4)

@@ -61,6 +61,10 @@ if (NOT GGML_LLAMAFILE_DEFAULT)
     set(GGML_LLAMAFILE_DEFAULT OFF)
 endif()
 
+if (NOT GGML_OPENMP_SIMD_DEFAULT)
+    set(GGML_OPENMP_SIMD_DEFAULT OFF)
+endif()
+
 if (NOT GGML_CUDA_GRAPHS_DEFAULT)
     set(GGML_CUDA_GRAPHS_DEFAULT OFF)
 endif()

@@ -109,6 +113,7 @@ endif()
 option(GGML_LASX "ggml: enable lasx" ON)
 option(GGML_LSX "ggml: enable lsx" ON)
 option(GGML_SVE "ggml: enable SVE" OFF)
+option(GGML_OPENMP_SIMD "ggml: enable OPENMP_SIMD" ${GGML_OPENMP_SIMD_DEFAULT})
 
 if (WIN32)
     set(GGML_WIN_VER "0x602" CACHE STRING "ggml: Windows Version")

@@ -178,11 +183,11 @@ option(GGML_BUILD_EXAMPLES "ggml: build examples" ${GGML_STANDALONE})
 set(CMAKE_C_STANDARD 11)
 set(CMAKE_C_STANDARD_REQUIRED true)
 
-if (GGML_SYCL)
+#if (GGML_SYCL)
     set(CMAKE_CXX_STANDARD 17)
-else()
-    set(CMAKE_CXX_STANDARD 11)
-endif()
+#else()
+#    set(CMAKE_CXX_STANDARD 11)
+#endif()
 set(CMAKE_CXX_STANDARD_REQUIRED true)
 
 set(THREADS_PREFER_PTHREAD_FLAG ON)
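
Note on the last hunk: with the SYCL conditional commented out, the project now always builds with CMAKE_CXX_STANDARD 17. A plausible reason is the new FP8 code, which relies on C++17 `if constexpr` in its compile-time EXP2<N>() helper (see ggml-fp8.cpp below). A minimal sketch of that pattern, with illustrative names rather than the repository's exact code:

// Sketch only: a compile-time power-of-two helper in the style of EXP2<N>()
// from ggml-fp8.cpp. 'if constexpr' is a C++17 feature, which is why the
// build now pins CMAKE_CXX_STANDARD to 17.
template <int N> constexpr float exp2_ct() {
    if constexpr (N == 0) {
        return 1.0f;                      // 2^0
    } else if constexpr (N > 0) {
        return 2.0f * exp2_ct<N - 1>();   // 2^N = 2 * 2^(N-1)
    } else {
        return 0.5f * exp2_ct<N + 1>();   // 2^N = 0.5 * 2^(N+1)
    }
}
static_assert(exp2_ct<4>() == 16.0f && exp2_ct<-3>() == 0.125f, "compile-time check");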

ggml/include/ggml.h (+1 -1)

@@ -394,7 +394,7 @@ extern "C" {
         GGML_TYPE_E4M3 = 37,
         GGML_TYPE_E4M3_Q = 38,
         GGML_TYPE_E3M4_Q = 39,
-        // E5M6 => 12 bits vs 16 bits for BF16 = E8M7 / FP16 = E5M10
+        // E5M6 => 12 bits vs 16 bits for BF16 = E8M7 / FP16 = E5M10
        GGML_TYPE_COUNT,
     };
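
The comment touched here names floating-point layouts by their exponent/mantissa split: E4M3 and E3M4 are 8-bit types, E5M6 would be 12 bits, and BF16/FP16 are E8M7/E5M10. As a hedged illustration of what such a split means (assuming an IEEE-like layout with exponent bias 7 and ignoring the format's NaN convention; this is not the converter ggml uses), an E4M3 byte decodes like this:

// Illustrative E4M3 decode: 1 sign bit, 4 exponent bits (bias 7), 3 mantissa bits.
// NaN/saturation conventions are ignored; this only shows what "E4M3" denotes.
#include <cmath>
#include <cstdint>

static float e4m3_to_float_sketch(uint8_t v) {
    const int exp = (v >> 3) & 0xF;  // 4 exponent bits
    const int man = v & 0x7;         // 3 mantissa bits
    const float f = exp ? std::ldexp(1.0f + man / 8.0f, exp - 7)  // normal: implicit leading 1
                        : std::ldexp(man / 8.0f, -6);             // subnormal
    return (v & 0x80) ? -f : f;
}
// Example: 0x38 = 0 0111 000 -> exponent 7-7=0, mantissa 1.0 -> +1.0f.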

ggml/src/CMakeLists.txt (+18)

@@ -154,6 +154,7 @@ if (GGML_MUSA)
 endif()
 
 if (GGML_OPENMP)
+    set(OpenMP_RUNTIME_MSVC "experimental")
     find_package(OpenMP)
     if (OpenMP_FOUND)
         message(STATUS "OpenMP found")

@@ -171,6 +172,18 @@ if (GGML_OPENMP)
     endif()
 endif()
 
+if (GGML_OPENMP_SIMD)
+    check_cxx_compiler_flag("-fopenmp-simd" SUPPORTS_OPENMP_SIMD)
+    if(SUPPORTS_OPENMP_SIMD)
+        # OpenMP_RUNTIME_MSVC=experimental / if (MSVC)
+        message(STATUS "Using openmp_simd.")
+        add_compile_definitions(GGML_USE_OPENMP_SIMD)
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp-simd")
+    else()
+        message(FATAL_ERROR, "C++ compiler lacks OPENMP_SIMD support.")
+    endif()
+endif()
+
 if (GGML_BLAS)
     if (GGML_STATIC)
         set(BLA_STATIC ON)

@@ -1360,6 +1373,10 @@ endif()
 # libraries
 #
 
+# FP8
+file(GLOB GGML_HEADERS_FP8 "ggml-fp8.h")
+file(GLOB GGML_SOURCES_FP8 "ggml-fp8.cpp")
+
 # ggml
 
 add_library(ggml

@@ -1384,6 +1401,7 @@ add_library(ggml
             ${GGML_SOURCES_AMX} ${GGML_HEADERS_AMX}
             ${GGML_SOURCES_CANN} ${GGML_HEADERS_CANN}
             ggml-aarch64.c ggml-aarch64.h
+            ${GGML_SOURCES_FP8} ${GGML_HEADERS_FP8}
             )
 
 if (EMSCRIPTEN)
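
When SUPPORTS_OPENMP_SIMD is detected, the build adds -fopenmp-simd and defines GGML_USE_OPENMP_SIMD. Unlike full OpenMP, -fopenmp-simd only honors the simd directives and links no runtime. A hedged sketch of how compute code could use the new definition (the function and its name are illustrative, not part of this commit):

// Illustrative use of the GGML_USE_OPENMP_SIMD definition added by this hunk.
// '#pragma omp simd' asks the compiler to vectorize the loop; with
// -fopenmp-simd no OpenMP runtime or threading is involved.
#include <cstddef>

static void scale_row_sketch(float * y, const float * x, float d, std::size_t n) {
#ifdef GGML_USE_OPENMP_SIMD
    #pragma omp simd
#endif
    for (std::size_t i = 0; i < n; ++i) {
        y[i] = d * x[i];
    }
}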

ggml/src/ggml-common.h (+17 -16)

@@ -8,6 +8,15 @@ typedef uint32_t ggml_half2;
 
 #define GGML_COMMON_AGGR
 
+#define GGML_COMMON_DECL
+#elif defined(GGML_COMMON_DECL_CPP)
+#include <cstdint>
+
+typedef uint16_t ggml_half;
+typedef uint32_t ggml_half2;
+
+#define GGML_COMMON_AGGR data
+
 #define GGML_COMMON_DECL
 #elif defined(GGML_COMMON_DECL_METAL)
 #include <metal_stdlib>

@@ -419,22 +428,7 @@ typedef struct {
 static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");
 
 // the fp8 types.
-typedef uint8_t ggml_e5m2;
-typedef uint8_t ggml_e4m3;
-typedef uint8_t ggml_e3m4;
-
-// fp8 with bloc delta => 8.125 bpw
-typedef struct {
-    float d; // delta
-    ggml_e4m3 qs[QK_K];
-} block_e4m3_q;
-static_assert(sizeof(block_e4m3_q) == sizeof(float) + QK_K, "wrong block_e4m3_q block size/padding");
-
-typedef struct {
-    float d; // delta
-    ggml_e3m4 qs[QK_K];
-} block_e3m4_q;
-static_assert(sizeof(block_e3m4_q) == sizeof(float) + QK_K, "wrong block_e3m4_q block size/padding");
+#include "ggml-fp8.h"
 
 #endif // GGML_COMMON_DECL
 #endif // GGML_COMMON_DECL

@@ -449,6 +443,13 @@ static_assert(sizeof(block_e3m4_q) == sizeof(float) + QK_K, "wrong block_e3m4_q block size/padding");
 #define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = {
 #define GGML_TABLE_END() };
 
+#define GGML_COMMON_IMPL
+#elif defined(GGML_COMMON_IMPL_CPP)
+#include <cstdint>
+
+#define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = {
+#define GGML_TABLE_END() };
+
 #define GGML_COMMON_IMPL
 #elif defined(GGML_COMMON_IMPL_METAL)
 #include <metal_stdlib>
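
ggml-common.h is a single header that emits language-specific declarations depending on which GGML_COMMON_DECL_* / GGML_COMMON_IMPL_* macro the including file defines; this hunk adds a C++ branch (the _CPP variants) that uses <cstdint> and expands GGML_COMMON_AGGR to `data`, and it moves the FP8 types out into ggml-fp8.h. A minimal sketch of how a C++ file would opt into the new branch, assuming it is consumed the same way the existing _C branch is used elsewhere in ggml:

// Sketch: a C++ translation unit selecting the new CPP declaration branch.
// The macro must be defined before the include; everything else is illustrative.
#define GGML_COMMON_DECL_CPP
#include "ggml-common.h"
// After this include, ggml_half / ggml_half2 are typedef'd via <cstdint>
// and GGML_COMMON_AGGR expands to 'data'.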

ggml/src/ggml-fp8.cpp (+27 -23)

@@ -1,22 +1,26 @@
-#define GGML_COMMON_IMPL_C
-#include "ggml-common.h"
+// #include <iostream>
+#include <cstdint>
+#include <cassert>
+#include <cmath>
 
 #include "ggml-fp8.h"
 
-#include <cassert>
-
 /*
+make clean
+make -j8
 # ./llama-quantize --output-tensor-type fp8_e3m4_q ~/LLM/Mistral-Nemo-Instruct-2407.BF16.gguf ~/LLM/Mistral-Nemo-Instruct-2407.E3M4_Q.gguf E3M4_Q
 ./llama-quantize ~/LLM/Mistral-Nemo-Instruct-2407.BF16.gguf ~/LLM/Mistral-Nemo-Instruct-2407.E3M4_Q.gguf E3M4_Q
 ./llama-cli -c 1024 -m ~/LLM/Mistral-Nemo-Instruct-2407.E3M4_Q.gguf -p "[INST]bonjour a tu un nom. je ne sais pas comment t'appeler. Si tu n'en as pas je peux t'appeler TINTIN[/INST]" -s 42
-# ./llama-perplexity -f ~/LLM/wikitext-2-raw/wiki.test.raw -s 31337 -m ~/LLM/Mistral-Nemo-Instruct-2407.E3M4_Q.gguf
 ./llama-perplexity --kl-divergence-base ~/LLM/Mistral-Nemo-Instruct-2407.BF16.kld --kl-divergence -s 31337 -m ~/LLM/Mistral-Nemo-Instruct-2407.E3M4_Q.gguf
 
-*/
+rm -rf build
+cmake -B build
+cmake --build build --config Release
+./build/bin/llama-quantize ~/LLM/Mistral-Nemo-Instruct-2407.BF16.gguf ~/LLM/Mistral-Nemo-Instruct-2407.E3M4_Q.gguf E3M4_Q
+./build/bin/llama-cli -c 1024 -m ~/LLM/Mistral-Nemo-Instruct-2407.E3M4_Q.gguf -p "[INST]bonjour a tu un nom. je ne sais pas comment t'appeler. Si tu n'en as pas je peux t'appeler TINTIN[/INST]" -s 42
+./build/bin/llama-perplexity --kl-divergence-base ~/LLM/Mistral-Nemo-Instruct-2407.BF16.kld --kl-divergence -s 31337 -m ~/LLM/Mistral-Nemo-Instruct-2407.E3M4_Q.gguf
 
-#include <iostream>
-#include <cstdint>
-#include <immintrin.h>
+*/
 
 template<int N> constexpr float EXP2() {
     if constexpr (N==0) return 1;

@@ -188,29 +192,29 @@ void ggml_fp32_to_e4m3_row_ref(const float * GGML_RESTRICT x, ggml_e4m3_t * GGML_RESTRICT y, int64_t k) {
 }
 
 void dequantize_row_e4m3_q(const block_e4m3_q * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
-    assert(k % QK_K == 0);
-    conv(reinterpret_cast<const bloc_fp8<4, QK_K>*>(x), y, k);
+    assert(k % FP8_QK == 0);
+    conv(reinterpret_cast<const bloc_fp8<4, FP8_QK>*>(x), y, k);
 }
 void quantize_row_e4m3_q(const float * GGML_RESTRICT x, block_e4m3_q * GGML_RESTRICT y, int64_t k) {
-    assert(k % QK_K == 0);
-    conv(x, reinterpret_cast<bloc_fp8<4, QK_K>*>(y), k);
+    assert(k % FP8_QK == 0);
+    conv(x, reinterpret_cast<bloc_fp8<4, FP8_QK>*>(y), k);
 }
 void quantize_row_e4m3_q_ref(const float * GGML_RESTRICT x, block_e4m3_q * GGML_RESTRICT y, int64_t k) {
-    assert(k % QK_K == 0);
-    conv(x, reinterpret_cast<bloc_fp8<4, QK_K>*>(y), k);
+    assert(k % FP8_QK == 0);
+    conv(x, reinterpret_cast<bloc_fp8<4, FP8_QK>*>(y), k);
 }
 
 void dequantize_row_e3m4_q(const block_e3m4_q * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
-    assert(k % QK_K == 0);
-    conv(reinterpret_cast<const bloc_fp8<3, QK_K>*>(x), y, k);
+    assert(k % FP8_QK == 0);
+    conv(reinterpret_cast<const bloc_fp8<3, FP8_QK>*>(x), y, k);
 }
 void quantize_row_e3m4_q(const float * GGML_RESTRICT x, block_e3m4_q * GGML_RESTRICT y, int64_t k) {
-    assert(k % QK_K == 0);
-    conv(x, reinterpret_cast<bloc_fp8<3, QK_K>*>(y), k);
+    assert(k % FP8_QK == 0);
+    conv(x, reinterpret_cast<bloc_fp8<3, FP8_QK>*>(y), k);
 }
 void quantize_row_e3m4_q_ref(const float * GGML_RESTRICT x, block_e3m4_q * GGML_RESTRICT y, int64_t k) {
-    assert(k % QK_K == 0);
-    conv(x, reinterpret_cast<bloc_fp8<3, QK_K>*>(y), k);
+    assert(k % FP8_QK == 0);
+    conv(x, reinterpret_cast<bloc_fp8<3, FP8_QK>*>(y), k);
 }
 
 // the dot product for FP8 weight

@@ -238,7 +242,7 @@ void ggml_vec_dot_e4m3_q(int n, float * GGML_RESTRICT s, size_t bs, const block_e4m3_q * GGML_RESTRICT vx, size_t bx, const float * GGML_RESTRICT vy, size_t by, int nrc) {
     GGML_UNUSED(bx);
     GGML_UNUSED(by);
     GGML_UNUSED(bs);
-    *s = dot(reinterpret_cast<const bloc_fp8<4, QK_K>*>(vx), vy, n);
+    *s = dot(reinterpret_cast<const bloc_fp8<4, FP8_QK>*>(vx), vy, n);
 }
 
 void ggml_vec_dot_e3m4_q(int n, float * GGML_RESTRICT s, size_t bs, const block_e3m4_q * GGML_RESTRICT vx, size_t bx, const float * GGML_RESTRICT vy, size_t by, int nrc) {

@@ -247,5 +251,5 @@ void ggml_vec_dot_e3m4_q(int n, float * GGML_RESTRICT s, size_t bs, const block_e3m4_q * GGML_RESTRICT vx, size_t bx, const float * GGML_RESTRICT vy, size_t by, int nrc) {
     GGML_UNUSED(bx);
     GGML_UNUSED(by);
     GGML_UNUSED(bs);
-    *s = dot(reinterpret_cast<const bloc_fp8<3, QK_K>*>(vx), vy, n);
+    *s = dot(reinterpret_cast<const bloc_fp8<3, FP8_QK>*>(vx), vy, n);
 }
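
The bulk of this file's change is the rename of the block constant from QK_K to FP8_QK, decoupling the FP8 block size from the K-quant constant; the "_q" variants keep one float delta per FP8_QK = 256 values. A hedged sketch of the block-delta dequantization idea behind these row functions (the struct, the loop, and decode_e4m3_sketch are illustrative stand-ins for the repository's templated bloc_fp8/conv code):

// Sketch of the block-delta scheme: y[i] = d * decode(qs[i]), one scale per
// FP8_QK values. Not the repository's implementation.
#include <cassert>
#include <cmath>
#include <cstdint>

#define FP8_QK 256

struct block_fp8_q_sketch {
    float   d;            // per-block delta (scale)
    uint8_t qs[FP8_QK];   // FP8-encoded values
};

static float decode_e4m3_sketch(uint8_t v) {  // bias-7 decode, NaN handling omitted
    const int e = (v >> 3) & 0xF, m = v & 7;
    const float f = e ? std::ldexp(1.0f + m / 8.0f, e - 7) : std::ldexp(m / 8.0f, -6);
    return (v & 0x80) ? -f : f;
}

static void dequantize_row_sketch(const block_fp8_q_sketch * x, float * y, int64_t k) {
    assert(k % FP8_QK == 0);
    const int64_t nb = k / FP8_QK;
    for (int64_t b = 0; b < nb; ++b) {
        for (int i = 0; i < FP8_QK; ++i) {
            y[b*FP8_QK + i] = x[b].d * decode_e4m3_sketch(x[b].qs[i]);
        }
    }
}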

ggml/src/ggml-fp8.h (+31 -21)

@@ -1,41 +1,51 @@
 #pragma once
-
-#define GGML_COMMON_DECL_C
-#include "ggml-common.h"
-
 #include "ggml.h"
 
-// les definitions / converstion FP8 <=> FP32
 #ifdef __cplusplus
 extern "C" {
 #endif
 
+#define FP8_QK 256
+
 typedef struct { uint8_t bits; } ggml_e5m2_t;
 typedef struct { uint8_t bits; } ggml_e4m3_t;
 typedef struct { uint8_t bits; } ggml_e3m4_t;
 
-void ggml_e5m2_to_fp32_row(const ggml_e5m2_t * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
-void ggml_fp32_to_e5m2_row(const float * GGML_RESTRICT x, ggml_e5m2_t * GGML_RESTRICT y, int64_t k);
-void ggml_fp32_to_e5m2_row_ref(const float * GGML_RESTRICT x, ggml_e5m2_t * GGML_RESTRICT y, int64_t k);
+// fp8 with bloc delta => 8.125 bpw
+typedef struct {
+    float d; // delta
+    uint8_t qs[FP8_QK];
+} block_e4m3_q;
+static_assert(sizeof(block_e4m3_q) == sizeof(float) + FP8_QK, "wrong block_e4m3_q block size/padding");
+
+typedef struct {
+    float d; // delta
+    uint8_t qs[FP8_QK];
+} block_e3m4_q;
+static_assert(sizeof(block_e3m4_q) == sizeof(float) + FP8_QK, "wrong block_e3m4_q block size/padding");
+
+GGML_API void ggml_e5m2_to_fp32_row(const ggml_e5m2_t * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+GGML_API void ggml_fp32_to_e5m2_row(const float * GGML_RESTRICT x, ggml_e5m2_t * GGML_RESTRICT y, int64_t k);
+GGML_API void ggml_fp32_to_e5m2_row_ref(const float * GGML_RESTRICT x, ggml_e5m2_t * GGML_RESTRICT y, int64_t k);
 
-void ggml_e4m3_to_fp32_row(const ggml_e4m3_t * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
-void ggml_fp32_to_e4m3_row(const float * GGML_RESTRICT x, ggml_e4m3_t * GGML_RESTRICT y, int64_t k);
-void ggml_fp32_to_e4m3_row_ref(const float * GGML_RESTRICT x, ggml_e4m3_t * GGML_RESTRICT y, int64_t k);
+GGML_API void ggml_e4m3_to_fp32_row(const ggml_e4m3_t * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+GGML_API void ggml_fp32_to_e4m3_row(const float * GGML_RESTRICT x, ggml_e4m3_t * GGML_RESTRICT y, int64_t k);
+GGML_API void ggml_fp32_to_e4m3_row_ref(const float * GGML_RESTRICT x, ggml_e4m3_t * GGML_RESTRICT y, int64_t k);
 
-void dequantize_row_e4m3_q(const block_e4m3_q * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
-void quantize_row_e4m3_q(const float * GGML_RESTRICT x, block_e4m3_q * GGML_RESTRICT y, int64_t k);
-void quantize_row_e4m3_q_ref(const float * GGML_RESTRICT x, block_e4m3_q * GGML_RESTRICT y, int64_t k);
+GGML_API void dequantize_row_e4m3_q(const block_e4m3_q * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+GGML_API void quantize_row_e4m3_q(const float * GGML_RESTRICT x, block_e4m3_q * GGML_RESTRICT y, int64_t k);
+GGML_API void quantize_row_e4m3_q_ref(const float * GGML_RESTRICT x, block_e4m3_q * GGML_RESTRICT y, int64_t k);
 
-void dequantize_row_e3m4_q(const block_e3m4_q * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
-void quantize_row_e3m4_q(const float * GGML_RESTRICT x, block_e3m4_q * GGML_RESTRICT y, int64_t k);
-void quantize_row_e3m4_q_ref(const float * GGML_RESTRICT x, block_e3m4_q * GGML_RESTRICT y, int64_t k);
+GGML_API void dequantize_row_e3m4_q(const block_e3m4_q * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
+GGML_API void quantize_row_e3m4_q(const float * GGML_RESTRICT x, block_e3m4_q * GGML_RESTRICT y, int64_t k);
+GGML_API void quantize_row_e3m4_q_ref(const float * GGML_RESTRICT x, block_e3m4_q * GGML_RESTRICT y, int64_t k);
 
 // TODO: the best depend on the CPU fp32 / bf16 / fp16
 #define GGML_FP8_VECT_DOT_TYPE GGML_TYPE_F32
-void ggml_vec_dot_e5m2(int n, float * GGML_RESTRICT s, size_t bs, const ggml_e5m2_t * GGML_RESTRICT vx, size_t bx, const float * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_e4m3(int n, float * GGML_RESTRICT s, size_t bs, const ggml_e4m3_t * GGML_RESTRICT vx, size_t bx, const float * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_e4m3_q(int n, float * GGML_RESTRICT s, size_t bs, const block_e4m3_q * GGML_RESTRICT vx, size_t bx, const float * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_e3m4_q(int n, float * GGML_RESTRICT s, size_t bs, const block_e3m4_q * GGML_RESTRICT vx, size_t bx, const float * GGML_RESTRICT vy, size_t by, int nrc);
+GGML_API void ggml_vec_dot_e5m2(int n, float * GGML_RESTRICT s, size_t bs, const ggml_e5m2_t * GGML_RESTRICT vx, size_t bx, const float * GGML_RESTRICT vy, size_t by, int nrc);
+GGML_API void ggml_vec_dot_e4m3(int n, float * GGML_RESTRICT s, size_t bs, const ggml_e4m3_t * GGML_RESTRICT vx, size_t bx, const float * GGML_RESTRICT vy, size_t by, int nrc);
+GGML_API void ggml_vec_dot_e4m3_q(int n, float * GGML_RESTRICT s, size_t bs, const block_e4m3_q * GGML_RESTRICT vx, size_t bx, const float * GGML_RESTRICT vy, size_t by, int nrc);
+GGML_API void ggml_vec_dot_e3m4_q(int n, float * GGML_RESTRICT s, size_t bs, const block_e3m4_q * GGML_RESTRICT vx, size_t bx, const float * GGML_RESTRICT vy, size_t by, int nrc);
 
 #ifdef __cplusplus
 }

0 commit comments
