
Commit ed670d9

Revert "start to implement IK quants"
This reverts commit bb8afeeae4915e76c9c5f441f755d6917e1b1b7a.
1 parent: cedc9a6

52 files changed: +40 −9283 lines

Only a subset of the 52 changed files is shown below; the rest are hidden in the large-commit view.


examples/quantize/quantize.cpp (1 deletion)
@@ -21,7 +21,6 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
     { "Q4_1",    LLAMA_FTYPE_MOSTLY_Q4_1,    " 4.78G, +0.4511 ppl @ Llama-3-8B", },
     { "Q5_0",    LLAMA_FTYPE_MOSTLY_Q5_0,    " 5.21G, +0.1316 ppl @ Llama-3-8B", },
     { "Q5_1",    LLAMA_FTYPE_MOSTLY_Q5_1,    " 5.65G, +0.1062 ppl @ Llama-3-8B", },
-    { "Q6_0",    LLAMA_FTYPE_MOSTLY_Q6_0,    " 6.5 bpw quantization", },
     { "IQ2_XXS", LLAMA_FTYPE_MOSTLY_IQ2_XXS, " 2.06 bpw quantization", },
     { "IQ2_XS",  LLAMA_FTYPE_MOSTLY_IQ2_XS,  " 2.31 bpw quantization", },
     { "IQ2_S",   LLAMA_FTYPE_MOSTLY_IQ2_S,   " 2.5 bpw quantization", },

ggml/include/ggml.h (1 addition, 13 deletions)
@@ -385,15 +385,10 @@ extern "C" {
         // GGML_TYPE_Q4_0_8_8 = 33,
         GGML_TYPE_TQ1_0 = 34,
         GGML_TYPE_TQ2_0 = 35,
-
         // GGML_TYPE_IQ4_NL_4_4 = 36,
         // GGML_TYPE_IQ4_NL_4_8 = 37,
         // GGML_TYPE_IQ4_NL_8_8 = 38,
-        // GGML_TYPE_COUNT = 39,
-
-        //
-        GGML_TYPE_Q6_0 = 133,
-        GGML_TYPE_COUNT,
+        GGML_TYPE_COUNT = 39,
     };
 
     // precision
@@ -428,13 +423,6 @@ extern "C" {
         GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors
         GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors
         GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors
-
-        // GGML_FTYPE_MOSTLY_Q4_0_4_4 = 25, // except 1d tensors
-        // GGML_FTYPE_MOSTLY_Q4_0_4_8 = 26, // except 1d tensors
-        // GGML_FTYPE_MOSTLY_Q4_0_8_8 = 27, // except 1d tensors
-        //
-        GGML_FTYPE_MOSTLY_Q6_0 = 127, // except 1d tensors
-
     };
 
     // available tensor operations:

ggml/src/ggml-common.h (11 deletions)
@@ -105,9 +105,6 @@ typedef sycl::half2 ggml_half2;
 #define QI5_1 (QK5_1 / (4 * QR5_1))
 #define QR5_1 2
 
-#define QI6_0 (QK6_0 / (4 * QR6_0))
-#define QR6_0 2
-
 #define QI8_0 (QK8_0 / (4 * QR8_0))
 #define QR8_0 1
 
@@ -203,14 +200,6 @@ typedef struct {
 } block_q5_1;
 static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_half) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");
 
-#define QK6_0 32
-typedef struct {
-    ggml_half d;           // delta
-    uint8_t   qh[QK6_0/4]; // 5+6-th bit of quants
-    uint8_t   qs[QK6_0/2]; // nibbles / quants
-} block_q6_0;
-static_assert(sizeof(block_q6_0) == sizeof(ggml_half) + QK6_0/2 + QK6_0/4, "wrong q6_0 block size/padding");
-
 #define QK8_0 32
 typedef struct {
     ggml_half d; // delta
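For context on what the revert removes: the block_q6_0 layout above packs 32 weights per block as an fp16 scale d, 16 bytes of low nibbles in qs, and 8 bytes in qh carrying the two high bits of each 6-bit code, with each weight decoding to d * (code - 32). The following is a minimal, self-contained sketch of that bit layout; it is hypothetical illustration, not code from this tree, and the uint16_t stand-in for ggml_half and the names block_q6_0_sketch/q6_0_code are made up here.

#include <stdint.h>

#define QK6_0 32
typedef struct {
    uint16_t d;            // fp16 scale (ggml_half in the tree)
    uint8_t  qh[QK6_0/4];  // two high bits of each 6-bit code, packed four codes per nibble pair
    uint8_t  qs[QK6_0/2];  // low nibbles: elements j and j+16 share byte j
} block_q6_0_sketch;

// Recover the raw 6-bit code (0..63) of element j, 0 <= j < QK6_0,
// following the packing used by the reverted CUDA kernels shown below.
static inline int q6_0_code(const block_q6_0_sketch * b, int j) {
    const int p  = j % (QK6_0/2);                                   // pair/byte index 0..15
    const int hi = j / (QK6_0/2);                                   // 0 = first half, 1 = second half
    const uint8_t nib = hi ? (b->qs[p] >> 4) : (b->qs[p] & 0x0F);   // low 4 bits of the code
    const uint8_t h   = b->qh[p % (QK6_0/4)] >> 4*(p / (QK6_0/4));  // nibble holding the pair's high bits
    const uint8_t top = hi ? ((h >> 2) & 0x3) : (h & 0x3);          // 2 high bits of the code
    return nib | (top << 4);                                        // weight = d * (code - 32)
}

At 2 + 8 + 16 = 26 bytes per 32 weights this works out to 6.5 bits per weight, which matches the " 6.5 bpw quantization" description removed from quantize.cpp above.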

ggml/src/ggml-cuda/common.cuh (7 deletions)
@@ -487,13 +487,6 @@ struct ggml_cuda_type_traits<GGML_TYPE_Q5_1> {
     static constexpr int qi = QI5_1;
 };
 
-template<>
-struct ggml_cuda_type_traits<GGML_TYPE_Q6_0> {
-    static constexpr int qk = QK6_0;
-    static constexpr int qr = QR6_0;
-    static constexpr int qi = QI6_0;
-};
-
 template<>
 struct ggml_cuda_type_traits<GGML_TYPE_Q8_0> {
     static constexpr int qk = QK8_0;

ggml/src/ggml-cuda/convert.cu (41 deletions)
@@ -122,36 +122,6 @@ static __global__ void dequantize_block_q4_1(const void * __restrict__ vx, dst_t
     }
 }
 
-template<typename dst_t>
-static __global__ void dequantize_block_q6_0(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32) {
-
-    const int64_t i = blockIdx.x;
-
-    // assume 32 threads
-    const int64_t tid = threadIdx.x;
-    const int64_t il = tid/8;
-    const int64_t ir = tid%8;
-    const int64_t ib = 8*i + ir;
-    if (ib >= nb32) {
-        return;
-    }
-
-    dst_t * y = yy + 256*i + 32*ir + 4*il;
-
-    const block_q6_0 * x = (const block_q6_0 *)vx + ib;
-    const float d = __half2float(x->d);
-    const float dm = -32*d;
-
-    const uint8_t * qs = x->qs + 4*il;
-    const uint8_t * qh = x->qh + 4*(il%2);
-
-    for (int l = 0; l < 4; ++l) {
-        const uint8_t h = qh[l] >> 4*(il/2);
-        y[l+ 0] = d * ((qs[l] & 0xF) | ((h << 4) & 0x30)) + dm;
-        y[l+16] = d * ((qs[l] >> 4) | ((h << 2) & 0x30)) + dm;
-    }
-}
-
 //================================== k-quants
 
 template<typename dst_t>
@@ -527,13 +497,6 @@ static void dequantize_row_q4_1_cuda(const void * vx, dst_t * y, const int64_t k
     dequantize_block_q4_1<<<nb, 32, 0, stream>>>(vx, y, nb32);
 }
 
-template<typename dst_t>
-static void dequantize_row_q6_0_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
-    const int nb32 = k / 32;
-    const int nb = (k + 255) / 256;
-    dequantize_block_q6_0<<<nb, 32, 0, stream>>>(vx, y, nb32);
-}
-
 template<typename dst_t>
 static void dequantize_row_q4_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) {
     const int nb = k / QK_K;
@@ -635,8 +598,6 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
             return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
         case GGML_TYPE_Q5_1:
             return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
-        case GGML_TYPE_Q6_0:
-            return dequantize_row_q6_0_cuda;
        case GGML_TYPE_Q8_0:
             if (fp16_available(ggml_cuda_info().devices[ggml_cuda_get_device()].cc)) {
                 return dequantize_block_q8_0_f16_cuda;
@@ -687,8 +648,6 @@ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
             return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
         case GGML_TYPE_Q5_1:
             return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
-        case GGML_TYPE_Q6_0:
-            return dequantize_row_q6_0_cuda;
         case GGML_TYPE_Q8_0:
             return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
         case GGML_TYPE_Q2_K:
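The removed dequantize_block_q6_0 kernel above maps each 6-bit code q to d*q - 32*d. For readers following the revert, here is a hedged scalar sketch of the same arithmetic; it assumes the QK6_0/block_q6_0 definitions from the removed ggml-common.h hunk and ggml's GGML_FP16_TO_FP32 macro are in scope, and the function name is made up for illustration.

// Hypothetical CPU reference for the reverted Q6_0 dequantization.
// k is the row length in floats and must be a multiple of QK6_0.
static void dequantize_row_q6_0_ref(const block_q6_0 * x, float * y, int64_t k) {
    const int64_t nb = k / QK6_0;
    for (int64_t ib = 0; ib < nb; ++ib) {
        const float d  = GGML_FP16_TO_FP32(x[ib].d);
        const float dm = -32.0f*d;                        // same offset the CUDA kernel uses
        for (int j = 0; j < QK6_0/2; ++j) {
            const uint8_t h  = x[ib].qh[j % (QK6_0/4)] >> 4*(j / (QK6_0/4));
            const int     q0 = (x[ib].qs[j] & 0x0F) | ((h << 4) & 0x30);
            const int     q1 = (x[ib].qs[j] >>   4) | ((h << 2) & 0x30);
            y[ib*QK6_0 + j]           = d*q0 + dm;        // first half of the block
            y[ib*QK6_0 + QK6_0/2 + j] = d*q1 + dm;        // second half
        }
    }
}

The deleted CUDA path did the same work in parallel: dequantize_row_q6_0_cuda launched one 32-thread block per 256 values, each thread covering four values in each half of one Q6_0 block.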

ggml/src/ggml-cuda/cpy.cu (2 additions, 72 deletions)
@@ -251,59 +251,6 @@ static __device__ __forceinline__ int best_index_int8(int n, const int8_t * val,
     return x - val[mu-1] < val[mu] - x ? mu-1 : mu;
 }
 
-static __device__ void cpy_blck_f32_q6_0(const char * cxi, char * cdsti) {
-    const float * xi = (const float *) cxi;
-    block_q6_0 * dsti = (block_q6_0 *) cdsti;
-
-    float amax = 0.0f;
-    float vmax = 0.0f;
-
-    for (int j = 0; j < QK6_0; ++j) {
-        const float v = xi[j];
-        const float av = fabsf(xi[j]);
-        if (amax < av) {
-            amax = av;
-            vmax = v;
-        }
-    }
-
-    const float d = vmax / -32;
-    const float id = d ? 1.0f/d : 0.0f;
-
-    dsti->d = d;
-    memset(dsti->qh, 0, QK6_0/4);
-
-    for (int j = 0; j < QK6_0/2; ++j) {
-        const float x0 = xi[0 + j]*id;
-        const float x1 = xi[QK4_0/2 + j]*id;
-
-        const uint8_t xi0 = min(63, (int8_t)(x0 + 32.5f));
-        const uint8_t xi1 = min(63, (int8_t)(x1 + 32.5f));
-
-        dsti->qs[j] = (xi0 & 0xf) | ((xi1 & 0xf) << 4);
-        const uint8_t h = (xi0 >> 4) | ((xi1 >> 4) << 2);
-        dsti->qh[j%(QK6_0/4)] |= (h << 4*(j/(QK6_0/4)));
-    }
-}
-
-static __device__ const int8_t iq4nl_index[241] = {
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16, 16, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 17, 17, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 18, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
-    3, 3, 3, 3, 3, 3, 19, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 20, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-    5, 5, 21, 21, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 22, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 23, 23, 8, 8, 8, 8,
-    8, 8, 8, 8, 8, 8, 24, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 25, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 26, 26,
-    11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 27, 27, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 28, 13, 13, 13,
-    13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 29, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
-    14, 14, 14, 14, 30, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15
-};
-
-static __device__ __forceinline__ int best_index_iq4nl(const int8_t * values, float x) {
-    int ix = (int)x - values[0];
-    if (ix < 0 || ix >= 241) return ix < 0 ? 0 : 15;
-    ix = iq4nl_index[ix];
-    return ix < 16 ? ix : x - values[ix-16] < values[ix-15] - x ? ix-16 : ix-15;
-}
-
 static __device__ void cpy_blck_f32_iq4_nl(const char * cxi, char * cdsti) {
     const float * xi = (const float *) cxi;
     block_iq4_nl * dsti = (block_iq4_nl *) cdsti;
@@ -322,14 +269,12 @@ static __device__ void cpy_blck_f32_iq4_nl(const char * cxi, char * cdsti) {
     float d = vmax / kvalues_iq4nl[0];
     const float id = d ? 1.0f/d : 0.0f;
 
-    //dsti->d = d;
-
     float sumqx = 0, sumq2 = 0;
     for (int j = 0; j < QK4_NL/2; ++j) {
         const float x0 = xi[0 + j]*id;
         const float x1 = xi[QK4_NL/2 + j]*id;
-        const uint8_t xi0 = best_index_iq4nl(kvalues_iq4nl, x0);
-        const uint8_t xi1 = best_index_iq4nl(kvalues_iq4nl, x1);
+        const uint8_t xi0 = best_index_int8(16, kvalues_iq4nl, x0);
+        const uint8_t xi1 = best_index_int8(16, kvalues_iq4nl, x1);
         dsti->qs[j] = xi0 | (xi1 << 4);
         const float v0 = kvalues_iq4nl[xi0];
         const float v1 = kvalues_iq4nl[xi1];
@@ -541,17 +486,6 @@ static void ggml_cpy_q5_1_f32_cuda(
         ne10, ne11, ne12, nb10, nb11, nb12, nb13);
 }
 
-static void ggml_cpy_f32_q6_0_cuda(
-    const char * cx, char * cdst, const int ne,
-    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
-
-    GGML_ASSERT(ne % QK6_0 == 0);
-    const int num_blocks = ne / QK6_0;
-    cpy_f32_q<cpy_blck_f32_q6_0, QK6_0><<<num_blocks, 1, 0, stream>>>
-        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
-}
-
 static void ggml_cpy_f32_iq4_nl_cuda(
     const char * cx, char * cdst, const int ne,
     const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
@@ -639,8 +573,6 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
         ggml_cpy_f32_q5_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
     } else if (src0->type == GGML_TYPE_Q5_1 && src1->type == GGML_TYPE_F32) {
         ggml_cpy_q5_1_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
-    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q6_0) {
-        ggml_cpy_f32_q6_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
     } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
         ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
     } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
@@ -685,8 +617,6 @@ void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
         return (void*) cpy_f32_q<cpy_blck_f32_q5_1, QK5_1>;
     } else if (src0->type == GGML_TYPE_Q5_1 && src1->type == GGML_TYPE_F32) {
         return (void*) cpy_q_f32<cpy_blck_q_f32<dequantize_q5_1, QK5_1>, QK5_1>;
-    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q6_0) {
-        return (void*) cpy_f32_q<cpy_blck_f32_q6_0, QK6_0>;
     } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
         return (void*) cpy_f32_f16<cpy_1_f32_f16>;
     } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
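On the quantization side, the removed cpy_blck_f32_q6_0 picks the value of largest magnitude, maps it to code 0 (d = vmax / -32), and rounds every weight to a 6-bit code. Below is a host-side sketch of that rounding; it again assumes the QK6_0/block_q6_0 definitions from the removed ggml-common.h hunk and ggml's GGML_FP32_TO_FP16 macro, and the function name is invented for illustration.

#include <math.h>    // fabsf
#include <string.h>  // memset

// Hypothetical scalar version of the reverted f32 -> Q6_0 block conversion.
static void quantize_block_q6_0_ref(const float * x, block_q6_0 * dst) {
    float amax = 0.0f, vmax = 0.0f;
    for (int j = 0; j < QK6_0; ++j) {                  // find the value with the largest magnitude
        const float av = fabsf(x[j]);
        if (av > amax) { amax = av; vmax = x[j]; }
    }
    const float d  = vmax / -32.0f;                    // vmax lands on code 0, i.e. -32
    const float id = d ? 1.0f/d : 0.0f;

    dst->d = GGML_FP32_TO_FP16(d);
    memset(dst->qh, 0, QK6_0/4);
    for (int j = 0; j < QK6_0/2; ++j) {
        const int v0 = (int)(x[j]*id + 32.5f);                 // shift codes into 0..63
        const int v1 = (int)(x[QK6_0/2 + j]*id + 32.5f);
        const uint8_t q0 = (uint8_t)(v0 < 63 ? v0 : 63);
        const uint8_t q1 = (uint8_t)(v1 < 63 ? v1 : 63);
        dst->qs[j] = (q0 & 0x0F) | ((q1 & 0x0F) << 4);         // low nibbles of the pair
        const uint8_t h = (q0 >> 4) | ((q1 >> 4) << 2);        // two high bits of each code
        dst->qh[j % (QK6_0/4)] |= h << 4*(j / (QK6_0/4));
    }
}

Note that the reverted device code indexes the second half of the block as xi[QK4_0/2 + j]; since QK4_0 and QK6_0 are both 32, this is equivalent to the QK6_0/2 + j used in the sketch.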
