Skip to content

Commit e2bc44b

Browse files
committed
Add FP8 support to gguf/llama:
E5M2 & E4M3: for use with FP8 distributed model E4M3_Q & E3M4_Q: for gguf quantized model. E5M2 and A4M3 type are use like FP16 / BF16 native. E4M3_Q and E3M4_Q are define like Q8_0 with bloc size of 256 (like QK_K)
1 parent 8836bff commit e2bc44b

17 files changed

+571
-124
lines changed

CMakeLists.txt

+4
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,10 @@ if (NOT DEFINED GGML_LLAMAFILE)
9696
set(GGML_LLAMAFILE_DEFAULT ON)
9797
endif()
9898

99+
if (NOT DEFINED GGML_OPENMP_SIMD)
100+
set(GGML_OPENMP_SIMD_DEFAULT ON)
101+
endif()
102+
99103
if (NOT DEFINED GGML_AMX)
100104
set(GGML_AMX ON)
101105
endif()

Makefile

+16-12
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,10 @@ GGML_NO_OPENMP := 1
138138
DEPRECATE_WARNING := 1
139139
endif
140140

141+
ifdef LLAMA_NO_OPENMP_SIMD
142+
GGML_NO_OPENMP_SIMD := 1
143+
endif
144+
141145
ifdef LLAMA_NO_METAL
142146
GGML_NO_METAL := 1
143147
DEPRECATE_WARNING := 1
@@ -542,6 +546,13 @@ ifndef GGML_NO_OPENMP
542546
MK_CXXFLAGS += -fopenmp
543547
endif # GGML_NO_OPENMP
544548

549+
ifndef GGML_NO_OPENMP_SIMD
550+
MK_CPPFLAGS += -DGGML_USE_OPENMP_SIMD
551+
MK_CFLAGS += -fopenmp-simd
552+
MK_CXXFLAGS += -fopenmp-simd
553+
# -openmp:experimental pour MSVC?
554+
endif # GGML_NO_OPENMP_SIMD
555+
545556
ifdef GGML_OPENBLAS
546557
MK_CPPFLAGS += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas)
547558
MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas)
@@ -945,11 +956,12 @@ OBJ_GGML = \
945956
$(DIR_GGML)/src/ggml-alloc.o \
946957
$(DIR_GGML)/src/ggml-backend.o \
947958
$(DIR_GGML)/src/ggml-backend-reg.o \
959+
$(DIR_GGML)/src/ggml-fp8_cpp17.o \
948960
$(DIR_GGML)/src/ggml-opt.o \
949961
$(DIR_GGML)/src/ggml-quants.o \
950962
$(DIR_GGML)/src/ggml-threading.o \
951963
$(DIR_GGML)/src/ggml-cpu/ggml-cpu.o \
952-
$(DIR_GGML)/src/ggml-cpu/ggml-cpu-cpp.o \
964+
$(DIR_GGML)/src/ggml-cpu/ggml-cpu_cpp17.o \
953965
$(DIR_GGML)/src/ggml-cpu/ggml-cpu-aarch64.o \
954966
$(DIR_GGML)/src/ggml-cpu/ggml-cpu-quants.o \
955967
$(OBJ_GGML_EXT)
@@ -1091,17 +1103,9 @@ DEP_FILES = $(OBJ_GGML:.o=.d) $(OBJ_LLAMA:.o=.d) $(OBJ_COMMON:.o=.d)
10911103
# Default target
10921104
all: $(BUILD_TARGETS)
10931105

1094-
# Note: need this exception because `ggml-cpu.c` and `ggml-cpu.cpp` both produce the same obj/dep files
1095-
# g++ -M -I ./ggml/include/ -I ./ggml/src ggml/src/ggml-cpu/ggml-cpu.cpp | grep ggml
1096-
$(DIR_GGML)/src/ggml-cpu/ggml-cpu-cpp.o: \
1097-
ggml/src/ggml-cpu/ggml-cpu.cpp \
1098-
ggml/include/ggml-backend.h \
1099-
ggml/include/ggml.h \
1100-
ggml/include/ggml-alloc.h \
1101-
ggml/src/ggml-backend-impl.h \
1102-
ggml/include/ggml-cpu.h \
1103-
ggml/src/ggml-impl.h
1104-
$(CXX) $(CXXFLAGS) -c $< -o $@
1106+
# for c++17 build
1107+
$(DIR_GGML)/%_cpp17.o: $(DIR_GGML)/%.cpp
1108+
$(CXX) $(CXXFLAGS) -MMD -std=c++17 -c $< -o $@
11051109

11061110
# Rules for building object files
11071111
$(DIR_GGML)/%.o: $(DIR_GGML)/%.c

Package.swift

+1
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ var sources = [
2020
"ggml/src/ggml-cpu/ggml-cpu-quants.c",
2121
"ggml/src/ggml-threading.cpp",
2222
"ggml/src/ggml-quants.c",
23+
"ggml/src/ggml-fp8.cpp",
2324
]
2425

2526
var resources: [Resource] = []

examples/quantize/quantize.cpp

+2
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,8 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
5151
{ "Q4_0_4_4", LLAMA_FTYPE_MOSTLY_Q4_0_4_4, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
5252
{ "Q4_0_4_8", LLAMA_FTYPE_MOSTLY_Q4_0_4_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
5353
{ "Q4_0_8_8", LLAMA_FTYPE_MOSTLY_Q4_0_8_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
54+
{ "E4M3_Q", LLAMA_FTYPE_MOSTLY_E4M3_Q, "12.21G, 0.0050 kld @ Mistral-Nemo", },
55+
{ "E3M4_Q", LLAMA_FTYPE_MOSTLY_E3M4_Q, "12.21G, 0.0016 kld @ Mistral-Nemo", },
5456
{ "F16", LLAMA_FTYPE_MOSTLY_F16, "14.00G, +0.0020 ppl @ Mistral-7B", },
5557
{ "BF16", LLAMA_FTYPE_MOSTLY_BF16, "14.00G, -0.0050 ppl @ Mistral-7B", },
5658
{ "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },

ggml/CMakeLists.txt

+5
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,10 @@ if (NOT GGML_LLAMAFILE_DEFAULT)
6262
set(GGML_LLAMAFILE_DEFAULT OFF)
6363
endif()
6464

65+
if (NOT GGML_OPENMP_SIMD_DEFAULT)
66+
set(GGML_OPENMP_SIMD_DEFAULT OFF)
67+
endif()
68+
6569
if (NOT GGML_CUDA_GRAPHS_DEFAULT)
6670
set(GGML_CUDA_GRAPHS_DEFAULT OFF)
6771
endif()
@@ -112,6 +116,7 @@ option(GGML_LASX "ggml: enable lasx" ON)
112116
option(GGML_LSX "ggml: enable lsx" ON)
113117
option(GGML_RVV "ggml: enable rvv" ON)
114118
option(GGML_SVE "ggml: enable SVE" OFF)
119+
option(GGML_OPENMP_SIMD "ggml: enable OPENMP_SIMD" ${GGML_OPENMP_SIMD_DEFAULT})
115120

116121
if (WIN32)
117122
set(GGML_WIN_VER "0x602" CACHE STRING "ggml: Windows Version")

ggml/include/ggml.h

+8
Original file line numberDiff line numberDiff line change
@@ -392,6 +392,10 @@ extern "C" {
392392
GGML_TYPE_IQ4_NL_4_4 = 36,
393393
// GGML_TYPE_IQ4_NL_4_8 = 37,
394394
// GGML_TYPE_IQ4_NL_8_8 = 38,
395+
GGML_TYPE_E5M2 = 39,
396+
GGML_TYPE_E4M3 = 40,
397+
GGML_TYPE_E4M3_Q = 41,
398+
GGML_TYPE_E3M4_Q = 42,
395399
GGML_TYPE_COUNT,
396400
};
397401

@@ -436,6 +440,10 @@ extern "C" {
436440
GGML_FTYPE_MOSTLY_Q4_0_4_4 = 25, // except 1d tensors
437441
GGML_FTYPE_MOSTLY_Q4_0_4_8 = 26, // except 1d tensors
438442
GGML_FTYPE_MOSTLY_Q4_0_8_8 = 27, // except 1d tensors
443+
GGML_FTYPE_MOSTLY_E5M2 = 28, // except 1d tensors
444+
GGML_FTYPE_MOSTLY_E4M3 = 29, // except 1d tensors
445+
GGML_FTYPE_MOSTLY_E4M3_Q = 30, // except 1d tensors
446+
GGML_FTYPE_MOSTLY_E3M4_Q = 31, // except 1d tensors
439447
};
440448

441449
// available tensor operations:

ggml/src/CMakeLists.txt

+23-1
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,18 @@ if (GGML_CCACHE)
104104
endif ()
105105
endif()
106106

107+
if (GGML_OPENMP_SIMD)
108+
check_cxx_compiler_flag("-fopenmp-simd" SUPPORTS_OPENMP_SIMD)
109+
if (SUPPORTS_OPENMP_SIMD)
110+
# OpenMP_RUNTIME_MSVC=experimental / if (MSVC)
111+
message(STATUS "Using openmp_simd.")
112+
add_compile_definitions(GGML_USE_OPENMP_SIMD)
113+
set(OPENMP_SIMD_FLAGS -fopenmp-simd)
114+
else()
115+
message(WARNING "C++ compiler lacks OPENMP_SIMD support.")
116+
endif()
117+
endif()
118+
107119
# this version of Apple ld64 is buggy
108120
execute_process(
109121
COMMAND ${CMAKE_C_COMPILER} ${CMAKE_EXE_LINKER_FLAGS} -Wl,-v
@@ -203,6 +215,14 @@ endif()
203215

204216
# ggml
205217

218+
# FP8
219+
file(GLOB GGML_HEADERS_FP8 "ggml-fp8.h")
220+
file(GLOB GGML_SOURCES_FP8 "ggml-fp8.cpp")
221+
222+
if (OPENMP_SIMD_FLAGS)
223+
set_source_files_properties(${GGML_SOURCES_FP8} PROPERTIES COMPILE_FLAGS ${OPENMP_SIMD_FLAGS})
224+
endif()
225+
206226
if (GGML_BACKEND_DL AND NOT BUILD_SHARED_LIBS)
207227
message(FATAL_ERROR "GGML_BACKEND_DL requires BUILD_SHARED_LIBS")
208228
endif()
@@ -222,7 +242,9 @@ add_library(ggml-base
222242
ggml-quants.c
223243
ggml-quants.h
224244
ggml-aarch64.c
225-
ggml-aarch64.h)
245+
ggml-aarch64.h
246+
${GGML_SOURCES_FP8} ${GGML_HEADERS_FP8}
247+
)
226248

227249
target_include_directories(ggml-base PRIVATE .)
228250

ggml/src/ggml-common.h

+59-17
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,20 @@
66
typedef uint16_t ggml_half;
77
typedef uint32_t ggml_half2;
88

9-
#define GGML_COMMON_AGGR
9+
#define GGML_COMMON_AGGR_U
10+
#define GGML_COMMON_AGGR_S
11+
12+
#define GGML_COMMON_DECL
13+
#elif defined(GGML_COMMON_DECL_CPP)
14+
#include <cstdint>
15+
16+
typedef uint16_t ggml_half;
17+
typedef uint32_t ggml_half2;
18+
19+
// std-c++ allow anonymous unions but some compiler warn on it
20+
#define GGML_COMMON_AGGR_U data
21+
// std-c++ do not allow it.
22+
#define GGML_COMMON_AGGR_S data
1023

1124
#define GGML_COMMON_DECL
1225
#elif defined(GGML_COMMON_DECL_METAL)
@@ -15,7 +28,8 @@ typedef uint32_t ggml_half2;
1528
typedef half ggml_half;
1629
typedef half2 ggml_half2;
1730

18-
#define GGML_COMMON_AGGR
31+
#define GGML_COMMON_AGGR_U
32+
#define GGML_COMMON_AGGR_S
1933

2034
#define GGML_COMMON_DECL
2135
#elif defined(GGML_COMMON_DECL_CUDA)
@@ -29,7 +43,8 @@ typedef half2 ggml_half2;
2943
typedef half ggml_half;
3044
typedef half2 ggml_half2;
3145

32-
#define GGML_COMMON_AGGR data
46+
#define GGML_COMMON_AGGR_U
47+
#define GGML_COMMON_AGGR_S data
3348

3449
#define GGML_COMMON_DECL
3550
#elif defined(GGML_COMMON_DECL_HIP)
@@ -39,7 +54,8 @@ typedef half2 ggml_half2;
3954
typedef half ggml_half;
4055
typedef half2 ggml_half2;
4156

42-
#define GGML_COMMON_AGGR data
57+
#define GGML_COMMON_AGGR_U
58+
#define GGML_COMMON_AGGR_S data
4359

4460
#define GGML_COMMON_DECL
4561
#elif defined(GGML_COMMON_DECL_SYCL)
@@ -49,7 +65,8 @@ typedef half2 ggml_half2;
4965
typedef sycl::half ggml_half;
5066
typedef sycl::half2 ggml_half2;
5167

52-
#define GGML_COMMON_AGGR data
68+
#define GGML_COMMON_AGGR_U
69+
#define GGML_COMMON_AGGR_S data
5370

5471
#define GGML_COMMON_DECL
5572
#endif
@@ -154,9 +171,9 @@ typedef struct {
154171
struct {
155172
ggml_half d; // delta
156173
ggml_half m; // min
157-
} GGML_COMMON_AGGR;
174+
} GGML_COMMON_AGGR_S;
158175
ggml_half2 dm;
159-
};
176+
} GGML_COMMON_AGGR_U;
160177
uint8_t qs[QK4_1 / 2]; // nibbles / quants
161178
} block_q4_1;
162179
static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_half) + QK4_1 / 2, "wrong q4_1 block size/padding");
@@ -175,9 +192,9 @@ typedef struct {
175192
struct {
176193
ggml_half d; // delta
177194
ggml_half m; // min
178-
} GGML_COMMON_AGGR;
195+
} GGML_COMMON_AGGR_S;
179196
ggml_half2 dm;
180-
};
197+
} GGML_COMMON_AGGR_U;
181198
uint8_t qh[4]; // 5-th bit of quants
182199
uint8_t qs[QK5_1 / 2]; // nibbles / quants
183200
} block_q5_1;
@@ -196,9 +213,9 @@ typedef struct {
196213
struct {
197214
ggml_half d; // delta
198215
ggml_half s; // d * sum(qs[i])
199-
} GGML_COMMON_AGGR;
216+
} GGML_COMMON_AGGR_S;
200217
ggml_half2 ds;
201-
};
218+
} GGML_COMMON_AGGR_U;
202219
int8_t qs[QK8_1]; // quants
203220
} block_q8_1;
204221
static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_half) + QK8_1, "wrong q8_1 block size/padding");
@@ -261,9 +278,9 @@ typedef struct {
261278
struct {
262279
ggml_half d; // super-block scale for quantized scales
263280
ggml_half dmin; // super-block scale for quantized mins
264-
} GGML_COMMON_AGGR;
281+
} GGML_COMMON_AGGR_S;
265282
ggml_half2 dm;
266-
};
283+
} GGML_COMMON_AGGR_U;
267284
} block_q2_K;
268285
static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_half) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
269286

@@ -288,9 +305,9 @@ typedef struct {
288305
struct {
289306
ggml_half d; // super-block scale for quantized scales
290307
ggml_half dmin; // super-block scale for quantized mins
291-
} GGML_COMMON_AGGR;
308+
} GGML_COMMON_AGGR_S;
292309
ggml_half2 dm;
293-
};
310+
} GGML_COMMON_AGGR_U;
294311
uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
295312
uint8_t qs[QK_K/2]; // 4--bit quants
296313
} block_q4_K;
@@ -305,9 +322,9 @@ typedef struct {
305322
struct {
306323
ggml_half d; // super-block scale for quantized scales
307324
ggml_half dmin; // super-block scale for quantized mins
308-
} GGML_COMMON_AGGR;
325+
} GGML_COMMON_AGGR_S;
309326
ggml_half2 dm;
310-
};
327+
} GGML_COMMON_AGGR_U;
311328
uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
312329
uint8_t qh[QK_K/8]; // quants, high bit
313330
uint8_t qs[QK_K/2]; // quants, low 4 bits
@@ -424,6 +441,24 @@ typedef struct {
424441
} block_iq4_nlx4;
425442
static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wrong iq4_nlx4 block size/padding");
426443

444+
// fp8 support
445+
// - fp8 simple type
446+
typedef struct { uint8_t bits; } ggml_e5m2_t;
447+
typedef struct { uint8_t bits; } ggml_e4m3_t;
448+
449+
// - fp8 with bloc delta => 8.125 bpw
450+
typedef struct {
451+
float d; // delta
452+
uint8_t qs[QK_K];
453+
} block_e4m3_q;
454+
static_assert(sizeof(block_e4m3_q) == sizeof(float) + QK_K, "wrong block_e4m3_q block size/padding");
455+
456+
typedef struct {
457+
float d; // delta
458+
uint8_t qs[QK_K];
459+
} block_e3m4_q;
460+
static_assert(sizeof(block_e3m4_q) == sizeof(float) + QK_K, "wrong block_e3m4_q block size/padding");
461+
427462
#endif // GGML_COMMON_DECL
428463
#endif // GGML_COMMON_DECL
429464

@@ -437,6 +472,13 @@ static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wro
437472
#define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = {
438473
#define GGML_TABLE_END() };
439474

475+
#define GGML_COMMON_IMPL
476+
#elif defined(GGML_COMMON_IMPL_CPP)
477+
#include <cstdint>
478+
479+
#define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = {
480+
#define GGML_TABLE_END() };
481+
440482
#define GGML_COMMON_IMPL
441483
#elif defined(GGML_COMMON_IMPL_METAL)
442484
#include <metal_stdlib>

0 commit comments

Comments
 (0)