#include "ggml-fp8.h"

/*
- make clean
- make -j8
- # ./llama-quantize --output-tensor-type fp8_e3m4_q ~/LLM/Mistral-Nemo-Instruct-2407.BF16.gguf ~/LLM/Mistral-Nemo-Instruct-2407.E3M4_Q.gguf E3M4_Q
- ./llama-quantize ~/LLM/Mistral-Nemo-Instruct-2407.BF16.gguf ~/LLM/Mistral-Nemo-Instruct-2407.E3M4_Q.gguf E3M4_Q
- ./llama-cli -c 1024 -m ~/LLM/Mistral-Nemo-Instruct-2407.E3M4_Q.gguf -p "[INST]bonjour a tu un nom. je ne sais pas comment t'appeler. Si tu n'en as pas je peux t'appeler TINTIN[/INST]" -s 42
- ./llama-perplexity --kl-divergence-base ~/LLM/Mistral-Nemo-Instruct-2407.BF16.kld --kl-divergence -s 31337 -m ~/LLM/Mistral-Nemo-Instruct-2407.E3M4_Q.gguf
-
- rm -rf build
- cmake -B build
- cmake --build build --config Release -j $(nproc)
- ./build/bin/llama-quantize ~/LLM/Mistral-Nemo-Instruct-2407.BF16.gguf ~/LLM/Mistral-Nemo-Instruct-2407.E3M4_Q.gguf E3M4_Q
- ./build/bin/llama-cli -c 1024 -m ~/LLM/Mistral-Nemo-Instruct-2407.E3M4_Q.gguf -p "[INST]bonjour a tu un nom. je ne sais pas comment t'appeler. Si tu n'en as pas je peux t'appeler TINTIN[/INST]" -s 42
- ./build/bin/llama-perplexity --kl-divergence-base ~/LLM/Mistral-Nemo-Instruct-2407.BF16.kld --kl-divergence -s 31337 -m ~/LLM/Mistral-Nemo-Instruct-2407.E3M4_Q.gguf
-
- # the local CI:
- rm -rf tmp
- mkdir tmp
- bash ./ci/run.sh ./tmp/results ./tmp/mnt
-
- # HIP legacy target?
- cmake -B build2 -S . -DCMAKE_C_COMPILER=hipcc -DCMAKE_CXX_COMPILER=hipcc -DGGML_HIPBLAS=ON
- cmake --build build2 --config Release -j $(nproc) -v
-
+ template<int N>
+ constexpr float exp2() {
+     union {
+         float f;
+         uint32_t bits;
+     } out = {0};
+     out.bits = (N+127)<<23;
+     return out.f;
+ }
*/
-
- template <int N> constexpr float EXP2() {
-     if constexpr (N==0) return 1;
-     if constexpr (N>0) return EXP2<N-1>()*2;
-     if constexpr (N<0) return EXP2<N+1>()/2;
+ template <int N>
+ constexpr float exp2() {
+     union { float f; uint32_t bits; } out = {0};
+     out.bits = (N+127)<<23;
+     return out.f;
}
-
- // 2^N with N>0, as an integer
- template <int N> constexpr int EXP_I2() {
-     if constexpr (N==0) return 1;
-     if constexpr (N>0) return EXP_I2<N-1>()*2;
+ template <int N>
+ constexpr int exp_i2() {
+     return 1 << N;
}

- template<int _E> //, int M=7-E> 1.7 bits!
+ template<int E> //, int M=7-E> 1.7 bits!
struct FP8 {
    uint8_t bits;
-     using type = FP8<_E>;
-     static constexpr int E=_E;
-     static constexpr int M=7-_E;
-     static constexpr int E_BIAS=EXP2<_E-1>()-1;
-     static constexpr float MAX() { return (2-EXP2<-M+1>())*EXP2<EXP_I2<_E-1>()>(); }
-     static constexpr float MIN() { return EXP2<-M>()*EXP2<2-EXP_I2<_E-1>()>(); }
+     using type = FP8<E>;
+     // static constexpr int E=_E;
+     static constexpr int M() { return 7-E; }
+     static constexpr int E_BIAS() { return exp_i2<E-1>()-1; }
+     static constexpr float MAX() { return (2-exp2<-M()+1>())*exp2<exp_i2<E-1>()>(); }
+     static constexpr float MIN() { return exp2<-M()>()*exp2<2-exp_i2<E-1>()>(); }
    // =============================================

#ifdef GGML_USE_OPENMP_SIMD
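For illustration only (not part of the patch; exp2_bits is a made-up name): the new exp2<N>() builds 2^N by writing the biased exponent N+127 straight into bits 23..30 of an IEEE-754 single with a zero mantissa, so the recursion of the old EXP2 is no longer needed, and exp_i2<N>() covers the positive integer case with a plain shift. A standalone sketch of the same bit trick:

    #include <cstdint>
    #include <cstring>
    #include <cstdio>

    // 2^N as a float: biased exponent N+127 in bits 23..30, mantissa 0 => 1.0 * 2^N
    template <int N>
    float exp2_bits() {
        uint32_t bits = (uint32_t)(N + 127) << 23;
        float f;
        std::memcpy(&f, &bits, sizeof(f));
        return f;
    }

    int main() {
        std::printf("%g %g %g\n", exp2_bits<4>(), exp2_bits<0>(), exp2_bits<-3>()); // 16 1 0.125
        return 0;
    }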
@@ -64,19 +47,19 @@ struct FP8 {
        float f;
        uint32_t bits;
    } in = {value};
-     // le signe:
+     // the sign:
    bits = (in.bits >> 24) & 0x80;
-     // la valeur sans la signe!
+     // the value, without the sign!
    in.bits &= 0x7fffffff;
    // GGML_ASSERT(in.bits < 0x7f800000); // +/- infinity or NAN
    if (in.f >= MAX()) {
        bits |= 0x7E;
    } else if (in.f < MIN()) { // => 0.
        // OK: S.0000000
    } else {
-         in.f *= EXP2<E_BIAS-127>();
-         in.bits += 1<<(22-M); // for rounding
-         bits |= (in.bits >> (23-M)) & 0x7F;
+         in.f *= exp2<E_BIAS()-127>();
+         in.bits += 1<<(22-M()); // for rounding
+         bits |= (in.bits >> (23-M())) & 0x7F;
    }
}

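To make the rebias-round-shift sequence concrete, here is a standalone sketch of the same steps (not the patch itself; the helper name encode_e4m3 is made up, it hard-codes E=4, so M()=3 and E_BIAS()=7, and it skips the MAX()/MIN() clamping). Encoding 3.0f gives 0x44: sign 0, exponent field 8 under bias 7, mantissa 0b100, i.e. 1.5 * 2^1.

    #include <cstdint>
    #include <cstring>
    #include <cstdio>

    // Same steps as the encoder above, specialized to E=4 / M=3 / E_BIAS=7.
    static uint8_t encode_e4m3(float value) {
        uint32_t u;
        std::memcpy(&u, &value, sizeof(u));
        uint8_t bits = (u >> 24) & 0x80;   // sign bit moves down to bit 7
        u &= 0x7fffffff;                   // drop the sign
        float f;
        std::memcpy(&f, &u, sizeof(f));
        f *= 0x1p-120f;                    // exp2<E_BIAS()-127>() = 2^-120
        std::memcpy(&u, &f, sizeof(u));
        u += 1u << 19;                     // 1 << (22-M()): round to nearest
        bits |= (u >> 20) & 0x7F;          // keep the exponent and M() mantissa bits
        return bits;
    }

    int main() {
        std::printf("0x%02X\n", encode_e4m3(3.0f)); // prints 0x44
        return 0;
    }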
@@ -88,13 +71,12 @@ struct FP8 {
        float f;
        uint32_t bits;
    } out = {0};
-     // the sign:
    out.bits = bits & 0x80;
    out.bits <<= 24;
    uint32_t _bits = bits & 0x7F;
-     _bits <<= (23-M);
+     _bits <<= (23-M());
    out.bits |= _bits;
-     out.f *= EXP2<127-E_BIAS>();
+     out.f *= exp2<127-E_BIAS()>();
    return out.f;
}
};
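A quick check on the decode side, again only a sketch under the same hard-coded E=4 assumption (decode_e4m3 is a made-up name): shifting the 7 value bits of 0x44 up by 23-M() = 20 and scaling by exp2<127-E_BIAS()>() = 2^120 recovers 3.0f. For the limits, the formulas above give MAX() = (2 - 2^-2) * 2^8 = 448 and MIN() = 2^-3 * 2^-6 = 2^-9 for E=4, and MAX() = (2 - 2^-3) * 2^4 = 30, MIN() = 2^-4 * 2^-2 = 2^-6 for E=3 (the E3M4_Q type used in the commands at the top).

    #include <cstdint>
    #include <cstring>
    #include <cstdio>

    // Mirror of the decode path above, specialized to E=4 / M=3 / E_BIAS=7.
    static float decode_e4m3(uint8_t bits) {
        uint32_t u = (uint32_t)(bits & 0x80) << 24;  // sign back to bit 31
        u |= (uint32_t)(bits & 0x7F) << 20;          // 23 - M()
        float f;
        std::memcpy(&f, &u, sizeof(f));
        return f * 0x1p120f;                         // exp2<127-E_BIAS()>() = 2^120
    }

    int main() {
        std::printf("%g\n", decode_e4m3(0x44)); // prints 3
        return 0;
    }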
@@ -156,7 +138,7 @@ static inline void conv(const float* x, bloc_fp8<E, QK>* y, int64_t size) {
    for (int64_t q=0; q<qk_size; ++q) {
        float m = 0;
#ifdef GGML_USE_OPENMP_SIMD
-         // not work on macos and warn.
+         // does not work on macOS and triggers a warning.
        // #pragma omp simd reduction(max:m)
#endif
        for (int64_t i=0; i<QK; i++) {