7
7
8
8
#include " ggml-fp8.h"
9
9
10
- /*
11
- make clean
12
- make -j8
13
- # ./llama-quantize --output-tensor-type fp8_e3m4_q ~/LLM/Mistral-Nemo-Instruct-2407.BF16.gguf ~/LLM/Mistral-Nemo-Instruct-2407.E3M4_Q.gguf E3M4_Q
14
- ./llama-quantize ~/LLM/Mistral-Nemo-Instruct-2407.BF16.gguf ~/LLM/Mistral-Nemo-Instruct-2407.E3M4_Q.gguf E3M4_Q
15
- ./llama-cli -c 1024 -m ~/LLM/Mistral-Nemo-Instruct-2407.E3M4_Q.gguf -p "[INST]bonjour a tu un nom. je ne sais pas comment t'appeler. Si tu n'en as pas je peux t'appeler TINTIN[/INST]" -s 42
16
- ./llama-perplexity --kl-divergence-base ~/LLM/Mistral-Nemo-Instruct-2407.BF16.kld --kl-divergence -s 31337 -m ~/LLM/Mistral-Nemo-Instruct-2407.E3M4_Q.gguf
17
-
18
- rm -rf build
19
- cmake -B build
20
- cmake --build build --config Release -j $(nproc)
21
- ./build/bin/llama-quantize ~/LLM/Mistral-Nemo-Instruct-2407.BF16.gguf ~/LLM/Mistral-Nemo-Instruct-2407.E3M4_Q.gguf E3M4_Q
22
- ./build/bin/llama-cli -c 1024 -m ~/LLM/Mistral-Nemo-Instruct-2407.E3M4_Q.gguf -p "[INST]bonjour a tu un nom. je ne sais pas comment t'appeler. Si tu n'en as pas je peux t'appeler TINTIN[/INST]" -s 42
23
- ./build/bin/llama-perplexity --kl-divergence-base ~/LLM/Mistral-Nemo-Instruct-2407.BF16.kld --kl-divergence -s 31337 -m ~/LLM/Mistral-Nemo-Instruct-2407.E3M4_Q.gguf
24
-
25
- # la CI local:
26
- rm -rf tmp
27
- mkdir tmp
28
- bash ./ci/run.sh ./tmp/results ./tmp/mnt
29
-
30
- # HIP legacy target?
31
- cmake -B build2 -S . -DCMAKE_C_COMPILER=hipcc -DCMAKE_CXX_COMPILER=hipcc -DGGML_HIPBLAS=ON
32
- cmake --build build2 --config Release -j $(nproc) -v
33
-
34
- */
35
-
36
- template <int N> constexpr float EXP2 () {
37
- if constexpr (N==0 ) return 1 ;
38
- if constexpr (N>0 ) return EXP2<N-1 >()*2 ;
39
- if constexpr (N<0 ) return EXP2<N+1 >()/2 ;
10
+ template <int N>
11
+ constexpr float exp_p2 () {
12
+ return exp_p2<N-1 >()*2 ;
40
13
}
41
-
42
- // 2^N avec N>0 en entier
43
- template <int N> constexpr int EXP_I2 () {
44
- if constexpr (N==0 ) return 1 ;
45
- if constexpr (N>0 ) return EXP_I2<N-1 >()*2 ;
14
+ template <int N>
15
+ constexpr float exp_m2 () {
16
+ return exp_m2<N+1 >()/2 ;
17
+ }
18
+ template <int N>
19
+ constexpr int exp_i2 () {
20
+ return 1 << N;
46
21
}
22
+ template <> constexpr float exp_p2<0 >() { return 1 ;}
23
+ template <> constexpr float exp_m2<0 >() { return 1 ;}
47
24
48
- template <int _E > // , int M=7-E> 1.7 bits!
25
+ template <int E > // , int M=7-E> 1.7 bits!
49
26
struct FP8 {
50
27
uint8_t bits;
51
- using type = FP8<_E >;
52
- static constexpr int E=_E;
53
- static constexpr int M= 7 -_E;
54
- static constexpr int E_BIAS=EXP2<_E -1 >()-1 ;
55
- static constexpr float MAX () { return (2 -EXP2 <-M+1 >())*EXP2<EXP_I2<_E -1 >()>(); }
56
- static constexpr float MIN () { return EXP2 <-M>()*EXP2 <2 -EXP_I2<_E -1 >()>(); }
28
+ using type = FP8<E >;
29
+ // static constexpr int E=_E;
30
+ static constexpr int M () { return 7 -E; }
31
+ static constexpr int E_BIAS () { return exp_i2< E -1 >()-1 ; }
32
+ static constexpr float MAX () { return (2 -exp_m2 <-M () +1 >())*exp_p2<exp_i2< E -1 >()>(); }
33
+ static constexpr float MIN () { return exp_m2 <-M () >()*exp_m2 <2 -exp_i2< E -1 >()>(); }
57
34
// =============================================
58
35
59
36
#ifdef GGML_USE_OPENMP_SIMD
@@ -64,19 +41,19 @@ struct FP8 {
64
41
float f;
65
42
uint32_t bits;
66
43
} in = {value};
67
- // le signe:
44
+ // the sign:
68
45
bits = (in.bits >> 24 ) & 0x80 ;
69
- // la valeur sans la signe!
46
+ // the value without the sign!
70
47
in.bits &= 0x7fffffff ;
71
48
// GGML_ASSERT(in.bits < 0x7f800000); // +/- infinity or NaN
72
49
if (in.f >= MAX ()) {
73
50
bits |= 0x7E ;
74
51
} else if (in.f <MIN ()) { // => 0.
75
52
// OK: S.0000000
76
53
} else {
77
- in.f *= EXP2 <E_BIAS-127 >();
78
- in.bits += 1 <<(22 -M); // for rounding
79
- bits |= (in.bits >> (23 -M)) & 0x7F ;
54
+ in.f *= exp_m2 <E_BIAS () -127 >();
55
+ in.bits += 1 <<(22 -M () ); // for rounding
56
+ bits |= (in.bits >> (23 -M () )) & 0x7F ;
80
57
}
81
58
}
82
59
@@ -88,13 +65,12 @@ struct FP8 {
88
65
float f;
89
66
uint32_t bits;
90
67
} out = {0 };
91
- // le signe:
92
68
out.bits = bits & 0x80 ;
93
69
out.bits <<= 24 ;
94
70
uint32_t _bits = bits & 0x7F ;
95
- _bits <<= (23 -M);
71
+ _bits <<= (23 -M () );
96
72
out.bits |= _bits;
97
- out.f *= EXP2 <127 -E_BIAS>();
73
+ out.f *= exp_p2 <127 -E_BIAS () >();
98
74
return out.f ;
99
75
}
100
76
};
@@ -156,7 +132,7 @@ static inline void conv(const float* x, bloc_fp8<E, QK>* y, int64_t size) {
156
132
for (int64_t q=0 ; q<qk_size; ++q) {
157
133
float m = 0 ;
158
134
#ifdef GGML_USE_OPENMP_SIMD
159
- // not work on macos and warn.
135
+ // does not work on macOS and triggers a warning.
160
136
// #pragma omp simd reduction(max:m)
161
137
#endif
162
138
for (int64_t i=0 ; i<QK; i++) {
0 commit comments