1
- // #include <iostream>
2
- #include < cstdint>
3
1
#include < cassert>
4
2
#include < algorithm>
5
3
4
+ #define GGML_COMMON_DECL_CPP
5
+ #include " ggml-common.h"
6
+ #include " ggml.h"
7
+
6
8
#include " ggml-fp8.h"
7
9
8
10
/*
@@ -20,6 +22,11 @@ cmake --build build --config Release
20
22
./build/bin/llama-cli -c 1024 -m ~/LLM/Mistral-Nemo-Instruct-2407.E3M4_Q.gguf -p "[INST]bonjour a tu un nom. je ne sais pas comment t'appeler. Si tu n'en as pas je peux t'appeler TINTIN[/INST]" -s 42
21
23
./build/bin/llama-perplexity --kl-divergence-base ~/LLM/Mistral-Nemo-Instruct-2407.BF16.kld --kl-divergence -s 31337 -m ~/LLM/Mistral-Nemo-Instruct-2407.E3M4_Q.gguf
22
24
25
+ # run the CI locally:
26
+ rm -rf tmp
27
+ mkdir tmp
28
+ bash ./ci/run.sh ./tmp/results ./tmp/mnt
29
+
23
30
*/
24
31
25
32
template <int N> constexpr float EXP2 () {
@@ -45,7 +52,9 @@ struct FP8 {
45
52
static constexpr float MIN () { return EXP2<-M>()*EXP2<2 -EXP_I2<_E-1 >()>(); }
46
53
// =============================================
47
54
55
+ #ifdef GGML_USE_OPENMP_SIMD
48
56
#pragma omp declare simd
57
+ #endif
49
58
void operator =(float value) {
50
59
union {
51
60
float f;
@@ -67,7 +76,9 @@ struct FP8 {
67
76
}
68
77
}
69
78
79
+ #ifdef GGML_USE_OPENMP_SIMD
70
80
#pragma omp declare simd
81
+ #endif
71
82
operator float () const {
72
83
union {
73
84
float f;
@@ -84,23 +95,21 @@ struct FP8 {
84
95
}
85
96
};
86
97
87
- // block_e4m3_q
88
- // typedef struct {
89
- // float d; // delta
90
- // ggml_e4m3 qs[QK_K];
91
- // } block_e4m3_q;
92
-
93
98
template <int E>
94
99
static inline void conv (const FP8<E>* x, float * y, int64_t size) {
100
+ #ifdef GGML_USE_OPENMP_SIMD
95
101
#pragma omp simd
102
+ #endif
96
103
for (int64_t i=0 ; i<size; i++) {
97
104
y[i] = (float ) x[i];
98
105
}
99
106
}
100
107
101
108
template <int E>
102
109
static inline void conv (const float * x, FP8<E>* y, int64_t size) {
110
+ #ifdef GGML_USE_OPENMP_SIMD
103
111
#pragma omp simd
112
+ #endif
104
113
for (int64_t i=0 ; i<size; i++) {
105
114
y[i] = x[i];
106
115
}
@@ -109,7 +118,9 @@ static inline void conv(const float* x, FP8<E>* y, int64_t size) {
109
118
template <int E>
110
119
static inline float dot (const FP8<E>* x, const float * y, int64_t size) {
111
120
float z = 0 ;
121
+ #ifdef GGML_USE_OPENMP_SIMD
112
122
#pragma omp simd reduction(+:z)
123
+ #endif
113
124
for (int64_t i=0 ; i<size; i++) {
114
125
z += ((float )x[i])*y[i];
115
126
}
@@ -126,7 +137,9 @@ template <int E, int QK>
126
137
static inline void conv (const bloc_fp8<E, QK>* x, float * y, int64_t size) {
127
138
const auto qk_size = size / QK;
128
139
for (int64_t q=0 ; q<qk_size; ++q) {
140
+ #ifdef GGML_USE_OPENMP_SIMD
129
141
#pragma omp simd
142
+ #endif
130
143
for (int64_t i=0 ; i<QK; i++) {
131
144
y[q*QK+i] = ((float ) x[q].qs [i])*(x[q]).d ;
132
145
}
@@ -138,13 +151,18 @@ static inline void conv(const float* x, bloc_fp8<E, QK>* y, int64_t size) {
138
151
const auto qk_size = size / QK;
139
152
for (int64_t q=0 ; q<qk_size; ++q) {
140
153
float m = 0 ;
154
+ // TODO: check whether this reduction is what causes the problem, and whether it happens on all targets
155
+ #ifdef GGML_USE_OPENMP_SIMD
141
156
#pragma omp simd reduction(max:m)
157
+ #endif
142
158
for (int64_t i=0 ; i<QK; i++) {
143
159
m = std::max (std::abs (x[q*QK+i]),m);
144
160
}
145
161
const float D = FP8<E>::MAX ()/m;
146
162
y[q].d = m/FP8<E>::MAX ();
163
+ #ifdef GGML_USE_OPENMP_SIMD
147
164
#pragma omp simd
165
+ #endif
148
166
for (int64_t i=0 ; i<QK; i++) {
149
167
y[q].qs [i] = x[q*QK+i]*D;
150
168
}
@@ -157,7 +175,9 @@ static inline float dot(const bloc_fp8<E, QK>* x, const float* y, int64_t size)
157
175
const auto qk_size = size / QK;
158
176
for (int64_t q=0 ; q<qk_size; ++q) {
159
177
float z0 = 0 ;
178
+ #ifdef GGML_USE_OPENMP_SIMD
160
179
#pragma omp simd reduction(+:z0)
180
+ #endif
161
181
for (int64_t i=0 ; i<QK; i++) {
162
182
z0 += ((float )x[q].qs [i])*y[q*QK+i];
163
183
}
@@ -192,29 +212,29 @@ void ggml_fp32_to_e4m3_row_ref(const float * GGML_RESTRICT x, ggml_e4m3_t * GGML
192
212
}
193
213
194
214
// Dequantizes k values from the E4M3 quantized blocks in x into the float array y.
//   k must be a multiple of QK_K (checked with assert).
// The block pointer is reinterpreted as bloc_fp8<4, QK_K> and handed to the
// block-wise conv() overload.
// NOTE(review): relies on block_e4m3_q having the same layout as bloc_fp8<4, QK_K> — confirm.
void dequantize_row_e4m3_q(const block_e4m3_q * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
    assert(k % QK_K == 0);
    conv(reinterpret_cast<const bloc_fp8<4, QK_K>*>(x), y, k);
}
198
218
// Quantizes k floats from x into the E4M3 quantized blocks in y.
//   k must be a multiple of QK_K (checked with assert).
// The block pointer is reinterpreted as bloc_fp8<4, QK_K> and handed to the
// block-wise conv() overload.
// NOTE(review): relies on block_e4m3_q having the same layout as bloc_fp8<4, QK_K> — confirm.
void quantize_row_e4m3_q(const float * GGML_RESTRICT x, block_e4m3_q * GGML_RESTRICT y, int64_t k) {
    assert(k % QK_K == 0);
    conv(x, reinterpret_cast<bloc_fp8<4, QK_K>*>(y), k);
}
202
222
// Reference entry point for quantizing k floats from x into the E4M3 quantized blocks in y.
//   k must be a multiple of QK_K (checked with assert).
// The block pointer is reinterpreted as bloc_fp8<4, QK_K> and handed to the
// block-wise conv() overload.
// NOTE(review): relies on block_e4m3_q having the same layout as bloc_fp8<4, QK_K> — confirm.
void quantize_row_e4m3_q_ref(const float * GGML_RESTRICT x, block_e4m3_q * GGML_RESTRICT y, int64_t k) {
    assert(k % QK_K == 0);
    conv(x, reinterpret_cast<bloc_fp8<4, QK_K>*>(y), k);
}
206
226
207
227
// Dequantizes k values from the E3M4 quantized blocks in x into the float array y.
//   k must be a multiple of QK_K (checked with assert).
// The block pointer is reinterpreted as bloc_fp8<3, QK_K> and handed to the
// block-wise conv() overload.
// NOTE(review): relies on block_e3m4_q having the same layout as bloc_fp8<3, QK_K> — confirm.
void dequantize_row_e3m4_q(const block_e3m4_q * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
    assert(k % QK_K == 0);
    conv(reinterpret_cast<const bloc_fp8<3, QK_K>*>(x), y, k);
}
211
231
// Quantizes k floats from x into the E3M4 quantized blocks in y.
//   k must be a multiple of QK_K (checked with assert).
// The block pointer is reinterpreted as bloc_fp8<3, QK_K> and handed to the
// block-wise conv() overload.
// NOTE(review): relies on block_e3m4_q having the same layout as bloc_fp8<3, QK_K> — confirm.
void quantize_row_e3m4_q(const float * GGML_RESTRICT x, block_e3m4_q * GGML_RESTRICT y, int64_t k) {
    assert(k % QK_K == 0);
    conv(x, reinterpret_cast<bloc_fp8<3, QK_K>*>(y), k);
}
215
235
// Reference entry point for quantizing k floats from x into the E3M4 quantized blocks in y.
//   k must be a multiple of QK_K (checked with assert).
// The block pointer is reinterpreted as bloc_fp8<3, QK_K> and handed to the
// block-wise conv() overload.
// NOTE(review): relies on block_e3m4_q having the same layout as bloc_fp8<3, QK_K> — confirm.
void quantize_row_e3m4_q_ref(const float * GGML_RESTRICT x, block_e3m4_q * GGML_RESTRICT y, int64_t k) {
    assert(k % QK_K == 0);
    conv(x, reinterpret_cast<bloc_fp8<3, QK_K>*>(y), k);
}
219
239
220
240
// the dot product for FP8 weight
@@ -242,7 +262,7 @@ void ggml_vec_dot_e4m3_q(int n, float * GGML_RESTRICT s, size_t bs, const block_
242
262
GGML_UNUSED (bx);
243
263
GGML_UNUSED (by);
244
264
GGML_UNUSED (bs);
245
- *s = dot (reinterpret_cast <const bloc_fp8<4 , FP8_QK >*>(vx), vy, n);
265
+ *s = dot (reinterpret_cast <const bloc_fp8<4 , QK_K >*>(vx), vy, n);
246
266
}
247
267
248
268
void ggml_vec_dot_e3m4_q (int n, float * GGML_RESTRICT s, size_t bs, const block_e3m4_q * GGML_RESTRICT vx, size_t bx, const float * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -251,5 +271,5 @@ void ggml_vec_dot_e3m4_q(int n, float * GGML_RESTRICT s, size_t bs, const block_
251
271
GGML_UNUSED (bx);
252
272
GGML_UNUSED (by);
253
273
GGML_UNUSED (bs);
254
- *s = dot (reinterpret_cast <const bloc_fp8<3 , FP8_QK >*>(vx), vy, n);
274
+ *s = dot (reinterpret_cast <const bloc_fp8<3 , QK_K >*>(vx), vy, n);
255
275
}
0 commit comments