
Commit ed670d9

Revert "start to implement IK quants"
This reverts commit bb8afeeae4915e76c9c5f441f755d6917e1b1b7a.
1 parent: cedc9a6

52 files changed: +40 −9283 lines

Only a subset of the 52 changed files is shown below; the rest are hidden in the large-commit view.


examples/quantize/quantize.cpp (1 deletion)
@@ -21,7 +21,6 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
     { "Q4_1",    LLAMA_FTYPE_MOSTLY_Q4_1,    " 4.78G, +0.4511 ppl @ Llama-3-8B", },
     { "Q5_0",    LLAMA_FTYPE_MOSTLY_Q5_0,    " 5.21G, +0.1316 ppl @ Llama-3-8B", },
     { "Q5_1",    LLAMA_FTYPE_MOSTLY_Q5_1,    " 5.65G, +0.1062 ppl @ Llama-3-8B", },
-    { "Q6_0",    LLAMA_FTYPE_MOSTLY_Q6_0,    " 6.5 bpw quantization", },
     { "IQ2_XXS", LLAMA_FTYPE_MOSTLY_IQ2_XXS, " 2.06 bpw quantization", },
     { "IQ2_XS",  LLAMA_FTYPE_MOSTLY_IQ2_XS,  " 2.31 bpw quantization", },
     { "IQ2_S",   LLAMA_FTYPE_MOSTLY_IQ2_S,   " 2.5 bpw quantization", },

ggml/include/ggml.h (1 addition, 13 deletions)
@@ -385,15 +385,10 @@ extern "C" {
         // GGML_TYPE_Q4_0_8_8 = 33,
         GGML_TYPE_TQ1_0 = 34,
         GGML_TYPE_TQ2_0 = 35,
-
         // GGML_TYPE_IQ4_NL_4_4 = 36,
         // GGML_TYPE_IQ4_NL_4_8 = 37,
         // GGML_TYPE_IQ4_NL_8_8 = 38,
-        // GGML_TYPE_COUNT = 39,
-
-        //
-        GGML_TYPE_Q6_0 = 133,
-        GGML_TYPE_COUNT,
+        GGML_TYPE_COUNT = 39,
     };
 
     // precision
@@ -428,13 +423,6 @@ extern "C" {
         GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors
         GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors
-
-        // GGML_FTYPE_MOSTLY_Q4_0_4_4 = 25, // except 1d tensors
-        // GGML_FTYPE_MOSTLY_Q4_0_4_8 = 26, // except 1d tensors
-        // GGML_FTYPE_MOSTLY_Q4_0_8_8 = 27, // except 1d tensors
-        //
-        GGML_FTYPE_MOSTLY_Q6_0 = 127, // except 1d tensors
-
     };
 
     // available tensor operations:

ggml/src/ggml-common.h (11 deletions)
@@ -105,9 +105,6 @@ typedef sycl::half2 ggml_half2;
 #define QI5_1 (QK5_1 / (4 * QR5_1))
 #define QR5_1 2
 
-#define QI6_0 (QK6_0 / (4 * QR6_0))
-#define QR6_0 2
-
 #define QI8_0 (QK8_0 / (4 * QR8_0))
 #define QR8_0 1
 
@@ -203,14 +200,6 @@ typedef struct {
 } block_q5_1;
 static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_half) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");
 
-#define QK6_0 32
-typedef struct {
-    ggml_half d;           // delta
-    uint8_t   qh[QK6_0/4]; // 5+6-th bit of quants
-    uint8_t   qs[QK6_0/2]; // nibbles / quants
-} block_q6_0;
-static_assert(sizeof(block_q6_0) == sizeof(ggml_half) + QK6_0/2 + QK6_0/4, "wrong q6_0 block size/padding");
-
 #define QK8_0 32
 typedef struct {
     ggml_half d; // delta
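For context on what the revert removes: the block_q6_0 layout above packs 32 weights per block as an fp16 scale d, 16 bytes of low nibbles in qs, and 8 bytes in qh carrying the two high bits of each 6-bit code, with each weight decoding to d * (code - 32). The following is a minimal, self-contained sketch of that bit layout; it is hypothetical illustration, not code from this tree, and the uint16_t stand-in for ggml_half and the names block_q6_0_sketch/q6_0_code are made up here.

#include <stdint.h>

#define QK6_0 32
typedef struct {
    uint16_t d;            // fp16 scale (ggml_half in the tree)
    uint8_t  qh[QK6_0/4];  // two high bits of each 6-bit code, packed four codes per nibble pair
    uint8_t  qs[QK6_0/2];  // low nibbles: elements j and j+16 share byte j
} block_q6_0_sketch;

// Recover the raw 6-bit code (0..63) of element j, 0 <= j < QK6_0,
// following the packing used by the reverted CUDA kernels shown below.
static inline int q6_0_code(const block_q6_0_sketch * b, int j) {
    const int p  = j % (QK6_0/2);                                   // pair/byte index 0..15
    const int hi = j / (QK6_0/2);                                   // 0 = first half, 1 = second half
    const uint8_t nib = hi ? (b->qs[p] >> 4) : (b->qs[p] & 0x0F);   // low 4 bits of the code
    const uint8_t h   = b->qh[p % (QK6_0/4)] >> 4*(p / (QK6_0/4));  // nibble holding the pair's high bits
    const uint8_t top = hi ? ((h >> 2) & 0x3) : (h & 0x3);          // 2 high bits of the code
    return nib | (top << 4);                                        // weight = d * (code - 32)
}

At 2 + 8 + 16 = 26 bytes per 32 weights this works out to 6.5 bits per weight, which matches the " 6.5 bpw quantization" description removed from quantize.cpp above.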

ggml/src/ggml-cuda/common.cuh (7 deletions)
@@ -487,13 +487,6 @@ struct ggml_cuda_type_traits<GGML_TYPE_Q5_1> {
     static constexpr int qi = QI5_1;
 };
 
-template<>
-struct ggml_cuda_type_traits<GGML_TYPE_Q6_0> {
-    static constexpr int qk = QK6_0;
-    static constexpr int qr = QR6_0;
-    static constexpr int qi = QI6_0;
-};
-
 template<>
 struct ggml_cuda_type_traits<GGML_TYPE_Q8_0> {
     static constexpr int qk = QK8_0;

ggml/src/ggml-cuda/convert.cu (41 deletions)
@@ -122,36 +122,6 @@ static __global__ void dequantize_block_q4_1(const void * __restrict__ vx, dst_t
     }
 }
 
-template<typename dst_t>
-static __global__ void dequantize_block_q6_0(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32) {
-
-    const int64_t i = blockIdx.x;
-
-    // assume 32 threads
-    const int64_t tid = threadIdx.x;
-    const int64_t il = tid/8;
-    const int64_t ir = tid%8;
-    const int64_t ib = 8*i + ir;
-    if (ib >= nb32) {
-        return;
-    }
-
-    dst_t * y = yy + 256*i + 32*ir + 4*il;
-
-    const block_q6_0 * x = (const block_q6_0 *)vx + ib;
-    const float d = __half2float(x->d);
-    const float dm = -32*d;
-
-    const uint8_t * qs = x->qs + 4*il;
-    const uint8_t * qh = x->qh + 4*(il%2);
-
-    for (int l = 0; l < 4; ++l) {
-        const uint8_t h = qh[l] >> 4*(il/2);
-        y[l+ 0] = d * ((qs[l] & 0xF) | ((h << 4) & 0x30)) + dm;
-        y[l+16] = d * ((qs[l] >> 4) | ((h << 2) & 0x30)) + dm;
-    }
-}
-
 //================================== k-quants
 
 template<typename dst_t>
@@ -527,13 +497,6 @@ static void dequantize_row_q4_1_cuda(const void * vx, dst_t * y, const int64_t k
     dequantize_block_q4_1<<<nb, 32, 0, stream>>>(vx, y, nb32);
 }
 
-template<typename dst_t>
-static void dequantize_row_q6_0_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
-    const int nb32 = k / 32;
-    const int nb = (k + 255) / 256;
-    dequantize_block_q6_0<<<nb, 32, 0, stream>>>(vx, y, nb32);
-}
-
 template<typename dst_t>
 static void dequantize_row_q4_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
     const int nb = k / QK_K;
@@ -635,8 +598,6 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
             return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
         case GGML_TYPE_Q5_1:
             return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
-        case GGML_TYPE_Q6_0:
-            return dequantize_row_q6_0_cuda;
        case GGML_TYPE_Q8_0:
             if (fp16_available(ggml_cuda_info().devices[ggml_cuda_get_device()].cc)) {
                 return dequantize_block_q8_0_f16_cuda;
@@ -687,8 +648,6 @@ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
             return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
         case GGML_TYPE_Q5_1:
             return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
-        case GGML_TYPE_Q6_0:
-            return dequantize_row_q6_0_cuda;
         case GGML_TYPE_Q8_0:
             return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
         case GGML_TYPE_Q2_K:
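The removed dequantize_block_q6_0 kernel above maps each 6-bit code q to d*q - 32*d. For readers following the revert, here is a hedged scalar sketch of the same arithmetic; it assumes the QK6_0/block_q6_0 definitions from the removed ggml-common.h hunk and ggml's GGML_FP16_TO_FP32 macro are in scope, and the function name is made up for illustration.

// Hypothetical CPU reference for the reverted Q6_0 dequantization.
// k is the row length in floats and must be a multiple of QK6_0.
static void dequantize_row_q6_0_ref(const block_q6_0 * x, float * y, int64_t k) {
    const int64_t nb = k / QK6_0;
    for (int64_t ib = 0; ib < nb; ++ib) {
        const float d  = GGML_FP16_TO_FP32(x[ib].d);
        const float dm = -32.0f*d;                        // same offset the CUDA kernel uses
        for (int j = 0; j < QK6_0/2; ++j) {
            const uint8_t h  = x[ib].qh[j % (QK6_0/4)] >> 4*(j / (QK6_0/4));
            const int     q0 = (x[ib].qs[j] & 0x0F) | ((h << 4) & 0x30);
            const int     q1 = (x[ib].qs[j] >>   4) | ((h << 2) & 0x30);
            y[ib*QK6_0 + j]           = d*q0 + dm;        // first half of the block
            y[ib*QK6_0 + QK6_0/2 + j] = d*q1 + dm;        // second half
        }
    }
}

The deleted CUDA path did the same work in parallel: dequantize_row_q6_0_cuda launched one 32-thread block per 256 values, each thread covering four values in each half of one Q6_0 block.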

ggml/src/ggml-cuda/cpy.cu (2 additions, 72 deletions)
@@ -251,59 +251,6 @@ static __device__ __forceinline__ int best_index_int8(int n, const int8_t * val,
     return x - val[mu-1] < val[mu] - x ? mu-1 : mu;
 }
 
-static __device__ void cpy_blck_f32_q6_0(const char * cxi, char * cdsti) {
-    const float * xi = (const float *) cxi;
-    block_q6_0 * dsti = (block_q6_0 *) cdsti;
-
-    float amax = 0.0f;
-    float vmax = 0.0f;
-
-    for (int j = 0; j < QK6_0; ++j) {
-        const float v = xi[j];
-        const float av = fabsf(xi[j]);
-        if (amax < av) {
-            amax = av;
-            vmax = v;
-        }
-    }
-
-    const float d = vmax / -32;
-    const float id = d ? 1.0f/d : 0.0f;
-
-    dsti->d = d;
-    memset(dsti->qh, 0, QK6_0/4);
-
-    for (int j = 0; j < QK6_0/2; ++j) {
-        const float x0 = xi[0 + j]*id;
-        const float x1 = xi[QK4_0/2 + j]*id;
-
-        const uint8_t xi0 = min(63, (int8_t)(x0 + 32.5f));
-        const uint8_t xi1 = min(63, (int8_t)(x1 + 32.5f));
-
-        dsti->qs[j] = (xi0 & 0xf) | ((xi1 & 0xf) << 4);
-        const uint8_t h = (xi0 >> 4) | ((xi1 >> 4) << 2);
-        dsti->qh[j%(QK6_0/4)] |= (h << 4*(j/(QK6_0/4)));
-    }
-}
-
-static __device__ const int8_t iq4nl_index[241] = {
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16, 16, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 17, 17, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 18, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
-    3, 3, 3, 3, 3, 3, 19, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 20, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-    5, 5, 21, 21, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 22, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 23, 23, 8, 8, 8, 8,
-    8, 8, 8, 8, 8, 8, 24, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 25, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 26, 26,
-    11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 27, 27, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 28, 13, 13, 13,
-    13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 29, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
-    14, 14, 14, 14, 30, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15
-};
-
-static __device__ __forceinline__ int best_index_iq4nl(const int8_t * values, float x) {
-    int ix = (int)x - values[0];
-    if (ix < 0 || ix >= 241) return ix < 0 ? 0 : 15;
-    ix = iq4nl_index[ix];
-    return ix < 16 ? ix : x - values[ix-16] < values[ix-15] - x ? ix-16 : ix-15;
-}
-
 static __device__ void cpy_blck_f32_iq4_nl(const char * cxi, char * cdsti) {
     const float * xi = (const float *) cxi;
     block_iq4_nl * dsti = (block_iq4_nl *) cdsti;
@@ -322,14 +269,12 @@ static __device__ void cpy_blck_f32_iq4_nl(const char * cxi, char * cdsti) {
     float d = vmax / kvalues_iq4nl[0];
     const float id = d ? 1.0f/d : 0.0f;
 
-    //dsti->d = d;
-
     float sumqx = 0, sumq2 = 0;
     for (int j = 0; j < QK4_NL/2; ++j) {
         const float x0 = xi[0 + j]*id;
         const float x1 = xi[QK4_NL/2 + j]*id;
-        const uint8_t xi0 = best_index_iq4nl(kvalues_iq4nl, x0);
-        const uint8_t xi1 = best_index_iq4nl(kvalues_iq4nl, x1);
+        const uint8_t xi0 = best_index_int8(16, kvalues_iq4nl, x0);
+        const uint8_t xi1 = best_index_int8(16, kvalues_iq4nl, x1);
         dsti->qs[j] = xi0 | (xi1 << 4);
         const float v0 = kvalues_iq4nl[xi0];
         const float v1 = kvalues_iq4nl[xi1];
@@ -541,17 +486,6 @@ static void ggml_cpy_q5_1_f32_cuda(
         ne10, ne11, ne12, nb10, nb11, nb12, nb13);
 }
 
-static void ggml_cpy_f32_q6_0_cuda(
-    const char * cx, char * cdst, const int ne,
-    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
-
-    GGML_ASSERT(ne % QK6_0 == 0);
-    const int num_blocks = ne / QK6_0;
-    cpy_f32_q<cpy_blck_f32_q6_0, QK6_0><<<num_blocks, 1, 0, stream>>>
-        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
-}
-
 static void ggml_cpy_f32_iq4_nl_cuda(
     const char * cx, char * cdst, const int ne,
     const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
@@ -639,8 +573,6 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
         ggml_cpy_f32_q5_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
     } else if (src0->type == GGML_TYPE_Q5_1 && src1->type == GGML_TYPE_F32) {
         ggml_cpy_q5_1_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
-    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q6_0) {
-        ggml_cpy_f32_q6_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
     } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
         ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
     } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
@@ -685,8 +617,6 @@ void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
         return (void*) cpy_f32_q<cpy_blck_f32_q5_1, QK5_1>;
     } else if (src0->type == GGML_TYPE_Q5_1 && src1->type == GGML_TYPE_F32) {
         return (void*) cpy_q_f32<cpy_blck_q_f32<dequantize_q5_1, QK5_1>, QK5_1>;
-    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q6_0) {
-        return (void*) cpy_f32_q<cpy_blck_f32_q6_0, QK6_0>;
     } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
         return (void*) cpy_f32_f16<cpy_1_f32_f16>;
     } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
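On the quantization side, the removed cpy_blck_f32_q6_0 picks the value of largest magnitude, maps it to code 0 (d = vmax / -32), and rounds every weight to a 6-bit code. Below is a host-side sketch of that rounding; it again assumes the QK6_0/block_q6_0 definitions from the removed ggml-common.h hunk and ggml's GGML_FP32_TO_FP16 macro, and the function name is invented for illustration.

#include <math.h>    // fabsf
#include <string.h>  // memset

// Hypothetical scalar version of the reverted f32 -> Q6_0 block conversion.
static void quantize_block_q6_0_ref(const float * x, block_q6_0 * dst) {
    float amax = 0.0f, vmax = 0.0f;
    for (int j = 0; j < QK6_0; ++j) {                  // find the value with the largest magnitude
        const float av = fabsf(x[j]);
        if (av > amax) { amax = av; vmax = x[j]; }
    }
    const float d  = vmax / -32.0f;                    // vmax lands on code 0, i.e. -32
    const float id = d ? 1.0f/d : 0.0f;

    dst->d = GGML_FP32_TO_FP16(d);
    memset(dst->qh, 0, QK6_0/4);
    for (int j = 0; j < QK6_0/2; ++j) {
        const int v0 = (int)(x[j]*id + 32.5f);                 // shift codes into 0..63
        const int v1 = (int)(x[QK6_0/2 + j]*id + 32.5f);
        const uint8_t q0 = (uint8_t)(v0 < 63 ? v0 : 63);
        const uint8_t q1 = (uint8_t)(v1 < 63 ? v1 : 63);
        dst->qs[j] = (q0 & 0x0F) | ((q1 & 0x0F) << 4);         // low nibbles of the pair
        const uint8_t h = (q0 >> 4) | ((q1 >> 4) << 2);        // two high bits of each code
        dst->qh[j % (QK6_0/4)] |= h << 4*(j / (QK6_0/4));
    }
}

Note that the reverted device code indexes the second half of the block as xi[QK4_0/2 + j]; since QK4_0 and QK6_0 are both 32, this is equivalent to the QK6_0/2 + j used in the sketch.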
