try to refactor include and simd

Djip007 · Djip007 · commit 05c55e5ad663 · 2024-10-26T20:52:34.000+02:00
- correct local CI.
diff --git a/Makefile b/Makefile
@@ -556,6 +556,7 @@ ifndef GGML_NO_OPENMP_SIMD
 	MK_CPPFLAGS += -DGGML_USE_OPENMP_SIMD
 	MK_CFLAGS   += -fopenmp-simd
 	MK_CXXFLAGS += -fopenmp-simd
+	# -openmp:experimental for MSVC?
 endif # GGML_NO_OPENMP_SIMD
 
 ifdef GGML_OPENBLAS
@@ -1085,6 +1086,7 @@ ggml/src/ggml-aarch64.o: \
 	ggml/src/ggml-common.h
 	$(CC) $(CFLAGS)    -c $< -o $@
 
+#$(CXX) $(CXXFLAGS) -std=c++17 -fopt-info-vec-missed -c $< -o $@
 ggml/src/ggml-fp8.o: \
 	ggml/src/ggml-fp8.cpp \
 	ggml/src/ggml-fp8.h \
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
@@ -154,7 +154,7 @@ if (GGML_MUSA)
 endif()
 
 if (GGML_OPENMP)
-    set(OpenMP_RUNTIME_MSVC "experimental")
+    # set(OpenMP_RUNTIME_MSVC "experimental")
     find_package(OpenMP)
     if (OpenMP_FOUND)
         message(STATUS "OpenMP found")
diff --git a/ggml/src/ggml-common.h b/ggml/src/ggml-common.h
@@ -427,8 +427,24 @@ typedef struct {
 } block_iq4_xs;
 static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");
 
-// the fp8 types.
-#include "ggml-fp8.h"
+// fp8 support
+// - fp8 simple type
+typedef struct { uint8_t bits; } ggml_e5m2_t;
+typedef struct { uint8_t bits; } ggml_e4m3_t;
+typedef struct { uint8_t bits; } ggml_e3m4_t;
+
+// - fp8 with a per-block delta (scale) => 8.125 bpw
+typedef struct {
+    float d;  // delta
+    uint8_t qs[QK_K];
+} block_e4m3_q;
+static_assert(sizeof(block_e4m3_q) == sizeof(float) + QK_K, "wrong block_e4m3_q block size/padding");
+
+typedef struct {
+    float d;  // delta
+    uint8_t qs[QK_K];
+} block_e3m4_q;
+static_assert(sizeof(block_e3m4_q) == sizeof(float) + QK_K, "wrong block_e3m4_q block size/padding");
 
 #endif // GGML_COMMON_DECL
 #endif // GGML_COMMON_DECL
diff --git a/ggml/src/ggml-fp8.cpp b/ggml/src/ggml-fp8.cpp
@@ -1,8 +1,10 @@
-// #include <iostream>
-#include <cstdint>
 #include <cassert>
 #include <algorithm>
 
+#define GGML_COMMON_DECL_CPP
+#include "ggml-common.h"
+#include "ggml.h"
+
 #include "ggml-fp8.h"
 
 /*
@@ -20,6 +22,11 @@ cmake --build build --config Release
 ./build/bin/llama-cli -c 1024 -m ~/LLM/Mistral-Nemo-Instruct-2407.E3M4_Q.gguf -p "[INST]bonjour a tu un nom. je ne sais pas comment t'appeler. Si tu n'en as pas je peux t'appeler TINTIN[/INST]" -s 42
 ./build/bin/llama-perplexity --kl-divergence-base ~/LLM/Mistral-Nemo-Instruct-2407.BF16.kld --kl-divergence -s 31337 -m ~/LLM/Mistral-Nemo-Instruct-2407.E3M4_Q.gguf
 
+# running the local CI:
+rm -rf tmp
+mkdir tmp
+bash ./ci/run.sh ./tmp/results ./tmp/mnt
+
 */
 
 template<int N> constexpr float EXP2() {
@@ -45,7 +52,9 @@ struct FP8 {
     static constexpr float MIN() { return EXP2<-M>()*EXP2<2-EXP_I2<_E-1>()>(); }
     //=============================================
 
+#ifdef GGML_USE_OPENMP_SIMD
     #pragma omp declare simd
+#endif
     void operator=(float value) {
         union {
             float f;
@@ -67,7 +76,9 @@ struct FP8 {
         }
     }
 
+#ifdef GGML_USE_OPENMP_SIMD
     #pragma omp declare simd
+#endif
     operator float () const {
         union {
             float f;
@@ -84,23 +95,21 @@ struct FP8 {
     }
 };
 
-// block_e4m3_q
-//typedef struct {
-//    float d;  // delta
-//    ggml_e4m3 qs[QK_K];
-//} block_e4m3_q;
-
 template<int E>
 static inline void conv(const FP8<E>* x, float* y, int64_t size) {
+#ifdef GGML_USE_OPENMP_SIMD
     #pragma omp simd
+#endif
     for (int64_t i=0; i<size; i++) {
         y[i] = (float) x[i];
     }
 }
 
 template<int E>
 static inline void conv(const float* x, FP8<E>* y, int64_t size) {
+#ifdef GGML_USE_OPENMP_SIMD
     #pragma omp simd
+#endif
     for (int64_t i=0; i<size; i++) {
         y[i] = x[i];
     }
@@ -109,7 +118,9 @@ static inline void conv(const float* x, FP8<E>* y, int64_t size) {
 template<int E>
 static inline float dot(const FP8<E>* x, const float* y, int64_t size) {
     float z = 0;
+#ifdef GGML_USE_OPENMP_SIMD
     #pragma omp simd reduction(+:z)
+#endif
     for (int64_t i=0; i<size; i++) {
         z += ((float)x[i])*y[i];
     }
@@ -126,7 +137,9 @@ template <int E, int QK>
 static inline void conv(const bloc_fp8<E, QK>* x, float* y, int64_t size) {
     const auto qk_size = size / QK;
     for (int64_t q=0; q<qk_size; ++q) {
+#ifdef GGML_USE_OPENMP_SIMD
         #pragma omp simd
+#endif
         for (int64_t i=0; i<QK; i++) {
             y[q*QK+i] = ((float) x[q].qs[i])*(x[q]).d;
         }
@@ -138,13 +151,18 @@ static inline void conv(const float* x, bloc_fp8<E, QK>* y, int64_t size) {
     const auto qk_size = size / QK;
     for (int64_t q=0; q<qk_size; ++q) {
         float m = 0;
+        // TODO: check whether this reduction is the one causing the problem, and whether it happens on all targets
+#ifdef GGML_USE_OPENMP_SIMD
         #pragma omp simd reduction(max:m)
+#endif
         for (int64_t i=0; i<QK; i++) {
             m = std::max(std::abs(x[q*QK+i]),m);
         }
         const float D = FP8<E>::MAX()/m;
         y[q].d = m/FP8<E>::MAX();
+#ifdef GGML_USE_OPENMP_SIMD
         #pragma omp simd
+#endif
         for (int64_t i=0; i<QK; i++) {
             y[q].qs[i] = x[q*QK+i]*D;
         }
@@ -157,7 +175,9 @@ static inline float dot(const bloc_fp8<E, QK>* x, const float* y, int64_t size)
     const auto qk_size = size / QK;
     for (int64_t q=0; q<qk_size; ++q) {
         float z0 = 0;
+#ifdef GGML_USE_OPENMP_SIMD
         #pragma omp simd reduction(+:z0)
+#endif
         for (int64_t i=0; i<QK; i++) {
             z0 += ((float)x[q].qs[i])*y[q*QK+i];
         }
@@ -192,29 +212,29 @@ void ggml_fp32_to_e4m3_row_ref(const float * GGML_RESTRICT x, ggml_e4m3_t * GGML
 }
 
 void dequantize_row_e4m3_q(const block_e4m3_q * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
-    assert(k % FP8_QK == 0);
-    conv(reinterpret_cast<const bloc_fp8<4, FP8_QK>*>(x), y, k);
+    assert(k % QK_K == 0);
+    conv(reinterpret_cast<const bloc_fp8<4, QK_K>*>(x), y, k);
 }
 void quantize_row_e4m3_q(const float * GGML_RESTRICT x, block_e4m3_q * GGML_RESTRICT y, int64_t k) {
-    assert(k % FP8_QK == 0);
-    conv(x, reinterpret_cast<bloc_fp8<4, FP8_QK>*>(y), k);
+    assert(k % QK_K == 0);
+    conv(x, reinterpret_cast<bloc_fp8<4, QK_K>*>(y), k);
 }
 void quantize_row_e4m3_q_ref(const float * GGML_RESTRICT x, block_e4m3_q * GGML_RESTRICT y, int64_t k) {
-    assert(k % FP8_QK == 0);
-    conv(x, reinterpret_cast<bloc_fp8<4, FP8_QK>*>(y), k);
+    assert(k % QK_K == 0);
+    conv(x, reinterpret_cast<bloc_fp8<4, QK_K>*>(y), k);
 }
 
 void dequantize_row_e3m4_q(const block_e3m4_q * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
-    assert(k % FP8_QK == 0);
-    conv(reinterpret_cast<const bloc_fp8<3, FP8_QK>*>(x), y, k);
+    assert(k % QK_K == 0);
+    conv(reinterpret_cast<const bloc_fp8<3, QK_K>*>(x), y, k);
 }
 void quantize_row_e3m4_q(const float * GGML_RESTRICT x, block_e3m4_q * GGML_RESTRICT y, int64_t k) {
-    assert(k % FP8_QK == 0);
-    conv(x, reinterpret_cast<bloc_fp8<3, FP8_QK>*>(y), k);
+    assert(k % QK_K == 0);
+    conv(x, reinterpret_cast<bloc_fp8<3, QK_K>*>(y), k);
 }
 void quantize_row_e3m4_q_ref(const float * GGML_RESTRICT x, block_e3m4_q * GGML_RESTRICT y, int64_t k) {
-    assert(k % FP8_QK == 0);
-    conv(x, reinterpret_cast<bloc_fp8<3, FP8_QK>*>(y), k);
+    assert(k % QK_K == 0);
+    conv(x, reinterpret_cast<bloc_fp8<3, QK_K>*>(y), k);
 }
 
 // the dot product for FP8 weight
@@ -242,7 +262,7 @@ void ggml_vec_dot_e4m3_q(int n, float * GGML_RESTRICT s, size_t bs, const block_
     GGML_UNUSED(bx);
     GGML_UNUSED(by);
     GGML_UNUSED(bs);
-    *s = dot(reinterpret_cast<const bloc_fp8<4, FP8_QK>*>(vx), vy, n);
+    *s = dot(reinterpret_cast<const bloc_fp8<4, QK_K>*>(vx), vy, n);
 }
 
 void ggml_vec_dot_e3m4_q(int n, float * GGML_RESTRICT s, size_t bs, const block_e3m4_q * GGML_RESTRICT vx, size_t bx, const float * GGML_RESTRICT vy, size_t by, int nrc) {
@@ -251,5 +271,5 @@ void ggml_vec_dot_e3m4_q(int n, float * GGML_RESTRICT s, size_t bs, const block_
     GGML_UNUSED(bx);
     GGML_UNUSED(by);
     GGML_UNUSED(bs);
-    *s = dot(reinterpret_cast<const bloc_fp8<3, FP8_QK>*>(vx), vy, n);
+    *s = dot(reinterpret_cast<const bloc_fp8<3, QK_K>*>(vx), vy, n);
 }
diff --git a/ggml/src/ggml-fp8.h b/ggml/src/ggml-fp8.h
@@ -1,28 +1,9 @@
-#pragma once
-#include "ggml.h"
-
+// Note: this file is more of a .inc fragment than a standalone header.
 #ifdef  __cplusplus
 extern "C" {
 #endif
 
-#define FP8_QK 256
-
-    typedef struct { uint8_t bits; } ggml_e5m2_t;
-    typedef struct { uint8_t bits; } ggml_e4m3_t;
-    typedef struct { uint8_t bits; } ggml_e3m4_t;
-
-    // fp8 with bloc delta => 8.125 bpw
-    typedef struct {
-        float d;  // delta
-        uint8_t qs[FP8_QK];
-    } block_e4m3_q;
-    static_assert(sizeof(block_e4m3_q) == sizeof(float) + FP8_QK, "wrong block_e4m3_q block size/padding");
-
-    typedef struct {
-        float d;  // delta
-        uint8_t qs[FP8_QK];
-    } block_e3m4_q;
-    static_assert(sizeof(block_e3m4_q) == sizeof(float) + FP8_QK, "wrong block_e3m4_q block size/padding");
+    // Note: the fp8 types are defined in ggml-common.h
 
     GGML_API void ggml_e5m2_to_fp32_row(const ggml_e5m2_t * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
     GGML_API void ggml_fp32_to_e5m2_row(const float * GGML_RESTRICT x, ggml_e5m2_t * GGML_RESTRICT y, int64_t k);
diff --git a/scripts/hf.sh b/scripts/hf.sh
@@ -26,7 +26,7 @@ function has_cmd {
 }
 
 if has_cmd wget; then
-    cmd="wget -q --show-progress -c -O %s/%s %s"
+    cmd="wget -q -c -O %s/%s %s"
 elif has_cmd curl; then
     cmd="curl -C - -f --output-dir %s -o %s -L %s"
 else

Original file line number	Diff line number	Diff line change
`@@ -26,7 +26,7 @@ function has_cmd {`
`26`	`26`	`}`
`27`	`27`
`28`	`28`	`if has_cmd wget; then`
`29`		`- cmd="wget -q --show-progress -c -O %s/%s %s"`
	`29`	`+ cmd="wget -q -c -O %s/%s %s"`
`30`	`30`	`elif has_cmd curl; then`
`31`	`31`	`cmd="curl -C - -f --output-dir %s -o %s -L %s"`
`32`	`32`	`else`