Skip to content

Commit 4b4fe02

Browse files
committedOct 26, 2024
cmake build update
and correct warning
1 parent a8e7e6f commit 4b4fe02

File tree

8 files changed

+77
-29
lines changed

8 files changed

+77
-29
lines changed
 

‎CMakeLists.txt

+4
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,10 @@ if (NOT DEFINED GGML_LLAMAFILE)
8888
set(GGML_LLAMAFILE_DEFAULT ON)
8989
endif()
9090

91+
if (NOT DEFINED GGML_OPENMP_SIMD)
92+
set(GGML_OPENMP_SIMD_DEFAULT ON)
93+
endif()
94+
9195
if (NOT DEFINED GGML_AMX)
9296
set(GGML_AMX ON)
9397
endif()

‎examples/perplexity/perplexity.cpp

+2-2
Original file line numberDiff line numberDiff line change
@@ -1846,9 +1846,9 @@ static void kl_divergence(llama_context * ctx, const common_params & params) {
18461846
total_seconds = total_seconds % (60*60);
18471847
}
18481848
LOG("%.2f minutes\n", total_seconds / 60.0);
1849+
LOG("\n");
1850+
LOG("chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p\n");
18491851
}
1850-
LOG("\n");
1851-
LOG("chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p\n");
18521852

18531853
const int first = n_ctx/2;
18541854
const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);

‎ggml/CMakeLists.txt

+9-4
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,10 @@ if (NOT GGML_LLAMAFILE_DEFAULT)
6161
set(GGML_LLAMAFILE_DEFAULT OFF)
6262
endif()
6363

64+
if (NOT GGML_OPENMP_SIMD_DEFAULT)
65+
set(GGML_OPENMP_SIMD_DEFAULT OFF)
66+
endif()
67+
6468
if (NOT GGML_CUDA_GRAPHS_DEFAULT)
6569
set(GGML_CUDA_GRAPHS_DEFAULT OFF)
6670
endif()
@@ -109,6 +113,7 @@ endif()
109113
option(GGML_LASX "ggml: enable lasx" ON)
110114
option(GGML_LSX "ggml: enable lsx" ON)
111115
option(GGML_SVE "ggml: enable SVE" OFF)
116+
option(GGML_OPENMP_SIMD "ggml: enable OPENMP_SIMD" ${GGML_OPENMP_SIMD_DEFAULT})
112117

113118
if (WIN32)
114119
set(GGML_WIN_VER "0x602" CACHE STRING "ggml: Windows Version")
@@ -178,11 +183,11 @@ option(GGML_BUILD_EXAMPLES "ggml: build examples" ${GGML_STANDALONE})
178183
set(CMAKE_C_STANDARD 11)
179184
set(CMAKE_C_STANDARD_REQUIRED true)
180185

181-
if (GGML_SYCL)
186+
#if (GGML_SYCL)
182187
set(CMAKE_CXX_STANDARD 17)
183-
else()
184-
set(CMAKE_CXX_STANDARD 11)
185-
endif()
188+
#else()
189+
# set(CMAKE_CXX_STANDARD 11)
190+
#endif()
186191
set(CMAKE_CXX_STANDARD_REQUIRED true)
187192

188193
set(THREADS_PREFER_PTHREAD_FLAG ON)

‎ggml/include/ggml.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -394,7 +394,7 @@ extern "C" {
394394
GGML_TYPE_E4M3 = 37,
395395
GGML_TYPE_E4M3_Q = 38,
396396
GGML_TYPE_E3M4_Q = 39,
397-
// E5M6 => 12 bits vs 16 bits for BF16 = E8M7 / FP16 = E5M10
397+
// E5M6 => 12 bits vs 16 bits for BF16 = E8M7 / FP16 = E5M10
398398
GGML_TYPE_COUNT,
399399
};
400400

‎ggml/src/CMakeLists.txt

+18
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,7 @@ if (GGML_MUSA)
154154
endif()
155155

156156
if (GGML_OPENMP)
157+
# Ask FindOpenMP for MSVC's "experimental" OpenMP runtime; set before find_package(OpenMP) so the lookup sees it.
# NOTE(review): OpenMP_RUNTIME_MSVC is only honored by CMake >= 3.30 — confirm against the project's minimum version.
set(OpenMP_RUNTIME_MSVC "experimental")
157158
find_package(OpenMP)
158159
if (OpenMP_FOUND)
159160
message(STATUS "OpenMP found")
@@ -171,6 +172,18 @@ if (GGML_OPENMP)
171172
endif()
172173
endif()
173174

175+
if (GGML_OPENMP_SIMD)
176+
check_cxx_compiler_flag("-fopenmp-simd" SUPPORTS_OPENMP_SIMD)
177+
if(SUPPORTS_OPENMP_SIMD)
178+
# OpenMP_RUNTIME_MSVC=experimental / if (MSVC)
179+
message(STATUS "Using openmp_simd.")
180+
add_compile_definitions(GGML_USE_OPENMP_SIMD)
181+
# Add the SIMD-only OpenMP flag to the C++ compile flags (C flags are left untouched).
string(APPEND CMAKE_CXX_FLAGS " -fopenmp-simd")
182+
else()
183+
# GGML_OPENMP_SIMD was requested but the compiler rejected -fopenmp-simd: stop configuring.
# The mode keyword must be a bare FATAL_ERROR; with a trailing comma CMake treats it as
# message text, prints a notice and keeps going.
message(FATAL_ERROR "C++ compiler lacks OPENMP_SIMD support.")
184+
endif()
185+
endif()
186+
174187
if (GGML_BLAS)
175188
if (GGML_STATIC)
176189
set(BLA_STATIC ON)
@@ -1360,6 +1373,10 @@ endif()
13601373
# libraries
13611374
#
13621375

1376+
# FP8
1377+
# Name the FP8 header explicitly: a glob over a single literal filename silently yields an
# empty list when the file is missing, whereas an explicit entry fails at configure time.
set(GGML_HEADERS_FP8 ggml-fp8.h)
1378+
# Name the FP8 source explicitly: a glob over a single literal filename silently yields an
# empty list when the file is missing, which would only show up later as link errors.
set(GGML_SOURCES_FP8 ggml-fp8.cpp)
1379+
13631380
# ggml
13641381

13651382
add_library(ggml
@@ -1384,6 +1401,7 @@ add_library(ggml
13841401
${GGML_SOURCES_AMX} ${GGML_HEADERS_AMX}
13851402
${GGML_SOURCES_CANN} ${GGML_HEADERS_CANN}
13861403
ggml-aarch64.c ggml-aarch64.h
1404+
${GGML_SOURCES_FP8} ${GGML_HEADERS_FP8}
13871405
)
13881406

13891407
if (EMSCRIPTEN)

‎ggml/src/ggml-common.h

+16
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,15 @@ typedef uint32_t ggml_half2;
88

99
#define GGML_COMMON_AGGR
1010

11+
#define GGML_COMMON_DECL
12+
#elif defined(GGML_COMMON_DECL_CPP)
13+
#include <cstdint>
14+
15+
typedef uint16_t ggml_half;
16+
typedef uint32_t ggml_half2;
17+
18+
#define GGML_COMMON_AGGR data
19+
1120
#define GGML_COMMON_DECL
1221
#elif defined(GGML_COMMON_DECL_METAL)
1322
#include <metal_stdlib>
@@ -449,6 +458,13 @@ static_assert(sizeof(block_e3m4_q) == sizeof(float) + QK_K, "wrong block_e3m4_q
449458
#define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = {
450459
#define GGML_TABLE_END() };
451460

461+
#define GGML_COMMON_IMPL
462+
#elif defined(GGML_COMMON_IMPL_CPP)
463+
#include <cstdint>
464+
465+
#define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = {
466+
#define GGML_TABLE_END() };
467+
452468
#define GGML_COMMON_IMPL
453469
#elif defined(GGML_COMMON_IMPL_METAL)
454470
#include <metal_stdlib>

‎ggml/src/ggml-fp8.cpp

+11-4
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,29 @@
1-
#define GGML_COMMON_IMPL_C
1+
#define GGML_COMMON_DECL_CPP
2+
#define GGML_COMMON_IMPL_CPP
23
#include "ggml-common.h"
3-
44
#include "ggml-fp8.h"
55

66
#include <cassert>
77

88
/*
9+
make clean
10+
make -j8
911
# ./llama-quantize --output-tensor-type fp8_e3m4_q ~/LLM/Mistral-Nemo-Instruct-2407.BF16.gguf ~/LLM/Mistral-Nemo-Instruct-2407.E3M4_Q.gguf E3M4_Q
1012
./llama-quantize ~/LLM/Mistral-Nemo-Instruct-2407.BF16.gguf ~/LLM/Mistral-Nemo-Instruct-2407.E3M4_Q.gguf E3M4_Q
1113
./llama-cli -c 1024 -m ~/LLM/Mistral-Nemo-Instruct-2407.E3M4_Q.gguf -p "[INST]bonjour a tu un nom. je ne sais pas comment t'appeler. Si tu n'en as pas je peux t'appeler TINTIN[/INST]" -s 42
12-
# ./llama-perplexity -f ~/LLM/wikitext-2-raw/wiki.test.raw -s 31337 -m ~/LLM/Mistral-Nemo-Instruct-2407.E3M4_Q.gguf
1314
./llama-perplexity --kl-divergence-base ~/LLM/Mistral-Nemo-Instruct-2407.BF16.kld --kl-divergence -s 31337 -m ~/LLM/Mistral-Nemo-Instruct-2407.E3M4_Q.gguf
1415
16+
rm -rf build
17+
cmake -B build
18+
cmake --build build --config Release
19+
./build/bin/llama-quantize ~/LLM/Mistral-Nemo-Instruct-2407.BF16.gguf ~/LLM/Mistral-Nemo-Instruct-2407.E3M4_Q.gguf E3M4_Q
20+
./build/bin/llama-cli -c 1024 -m ~/LLM/Mistral-Nemo-Instruct-2407.E3M4_Q.gguf -p "[INST]bonjour a tu un nom. je ne sais pas comment t'appeler. Si tu n'en as pas je peux t'appeler TINTIN[/INST]" -s 42
21+
./build/bin/llama-perplexity --kl-divergence-base ~/LLM/Mistral-Nemo-Instruct-2407.BF16.kld --kl-divergence -s 31337 -m ~/LLM/Mistral-Nemo-Instruct-2407.E3M4_Q.gguf
22+
1523
*/
1624

1725
#include <iostream>
1826
#include <cstdint>
19-
#include <immintrin.h>
2027

2128
template<int N> constexpr float EXP2() {
2229
if constexpr (N==0) return 1;

‎ggml/src/ggml-fp8.h

+16-18
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,8 @@
22

33
#define GGML_COMMON_DECL_C
44
#include "ggml-common.h"
5-
65
#include "ggml.h"
76

8-
// les definitions / converstion FP8 <=> FP32
97
#ifdef __cplusplus
108
extern "C" {
119
#endif
@@ -14,28 +12,28 @@ extern "C" {
1412
typedef struct { uint8_t bits; } ggml_e4m3_t;
1513
typedef struct { uint8_t bits; } ggml_e3m4_t;
1614

17-
void ggml_e5m2_to_fp32_row(const ggml_e5m2_t * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
18-
void ggml_fp32_to_e5m2_row(const float * GGML_RESTRICT x, ggml_e5m2_t * GGML_RESTRICT y, int64_t k);
19-
void ggml_fp32_to_e5m2_row_ref(const float * GGML_RESTRICT x, ggml_e5m2_t * GGML_RESTRICT y, int64_t k);
15+
GGML_API void ggml_e5m2_to_fp32_row(const ggml_e5m2_t * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
16+
GGML_API void ggml_fp32_to_e5m2_row(const float * GGML_RESTRICT x, ggml_e5m2_t * GGML_RESTRICT y, int64_t k);
17+
GGML_API void ggml_fp32_to_e5m2_row_ref(const float * GGML_RESTRICT x, ggml_e5m2_t * GGML_RESTRICT y, int64_t k);
2018

21-
void ggml_e4m3_to_fp32_row(const ggml_e4m3_t * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
22-
void ggml_fp32_to_e4m3_row(const float * GGML_RESTRICT x, ggml_e4m3_t * GGML_RESTRICT y, int64_t k);
23-
void ggml_fp32_to_e4m3_row_ref(const float * GGML_RESTRICT x, ggml_e4m3_t * GGML_RESTRICT y, int64_t k);
19+
GGML_API void ggml_e4m3_to_fp32_row(const ggml_e4m3_t * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
20+
GGML_API void ggml_fp32_to_e4m3_row(const float * GGML_RESTRICT x, ggml_e4m3_t * GGML_RESTRICT y, int64_t k);
21+
GGML_API void ggml_fp32_to_e4m3_row_ref(const float * GGML_RESTRICT x, ggml_e4m3_t * GGML_RESTRICT y, int64_t k);
2422

25-
void dequantize_row_e4m3_q(const block_e4m3_q * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
26-
void quantize_row_e4m3_q(const float * GGML_RESTRICT x, block_e4m3_q * GGML_RESTRICT y, int64_t k);
27-
void quantize_row_e4m3_q_ref(const float * GGML_RESTRICT x, block_e4m3_q * GGML_RESTRICT y, int64_t k);
23+
GGML_API void dequantize_row_e4m3_q(const block_e4m3_q * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
24+
GGML_API void quantize_row_e4m3_q(const float * GGML_RESTRICT x, block_e4m3_q * GGML_RESTRICT y, int64_t k);
25+
GGML_API void quantize_row_e4m3_q_ref(const float * GGML_RESTRICT x, block_e4m3_q * GGML_RESTRICT y, int64_t k);
2826

29-
void dequantize_row_e3m4_q(const block_e3m4_q * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
30-
void quantize_row_e3m4_q(const float * GGML_RESTRICT x, block_e3m4_q * GGML_RESTRICT y, int64_t k);
31-
void quantize_row_e3m4_q_ref(const float * GGML_RESTRICT x, block_e3m4_q * GGML_RESTRICT y, int64_t k);
27+
GGML_API void dequantize_row_e3m4_q(const block_e3m4_q * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
28+
GGML_API void quantize_row_e3m4_q(const float * GGML_RESTRICT x, block_e3m4_q * GGML_RESTRICT y, int64_t k);
29+
GGML_API void quantize_row_e3m4_q_ref(const float * GGML_RESTRICT x, block_e3m4_q * GGML_RESTRICT y, int64_t k);
3230

3331
// TODO: the best type depends on the CPU (fp32 / bf16 / fp16)
3432
#define GGML_FP8_VECT_DOT_TYPE GGML_TYPE_F32
35-
void ggml_vec_dot_e5m2(int n, float * GGML_RESTRICT s, size_t bs, const ggml_e5m2_t * GGML_RESTRICT vx, size_t bx, const float * GGML_RESTRICT vy, size_t by, int nrc);
36-
void ggml_vec_dot_e4m3(int n, float * GGML_RESTRICT s, size_t bs, const ggml_e4m3_t * GGML_RESTRICT vx, size_t bx, const float * GGML_RESTRICT vy, size_t by, int nrc);
37-
void ggml_vec_dot_e4m3_q(int n, float * GGML_RESTRICT s, size_t bs, const block_e4m3_q * GGML_RESTRICT vx, size_t bx, const float * GGML_RESTRICT vy, size_t by, int nrc);
38-
void ggml_vec_dot_e3m4_q(int n, float * GGML_RESTRICT s, size_t bs, const block_e3m4_q * GGML_RESTRICT vx, size_t bx, const float * GGML_RESTRICT vy, size_t by, int nrc);
33+
GGML_API void ggml_vec_dot_e5m2(int n, float * GGML_RESTRICT s, size_t bs, const ggml_e5m2_t * GGML_RESTRICT vx, size_t bx, const float * GGML_RESTRICT vy, size_t by, int nrc);
34+
GGML_API void ggml_vec_dot_e4m3(int n, float * GGML_RESTRICT s, size_t bs, const ggml_e4m3_t * GGML_RESTRICT vx, size_t bx, const float * GGML_RESTRICT vy, size_t by, int nrc);
35+
GGML_API void ggml_vec_dot_e4m3_q(int n, float * GGML_RESTRICT s, size_t bs, const block_e4m3_q * GGML_RESTRICT vx, size_t bx, const float * GGML_RESTRICT vy, size_t by, int nrc);
36+
GGML_API void ggml_vec_dot_e3m4_q(int n, float * GGML_RESTRICT s, size_t bs, const block_e3m4_q * GGML_RESTRICT vx, size_t bx, const float * GGML_RESTRICT vy, size_t by, int nrc);
3937

4038
#ifdef __cplusplus
4139
}

0 commit comments

Comments
 (0)