@@ -251,59 +251,6 @@ static __device__ __forceinline__ int best_index_int8(int n, const int8_t * val,
251
251
return x - val[mu-1 ] < val[mu] - x ? mu-1 : mu;
252
252
}
253
253
254
- static __device__ void cpy_blck_f32_q6_0 (const char * cxi, char * cdsti) {
255
- const float * xi = (const float *) cxi;
256
- block_q6_0 * dsti = (block_q6_0 *) cdsti;
257
-
258
- float amax = 0 .0f ;
259
- float vmax = 0 .0f ;
260
-
261
- for (int j = 0 ; j < QK6_0; ++j) {
262
- const float v = xi[j];
263
- const float av = fabsf (xi[j]);
264
- if (amax < av) {
265
- amax = av;
266
- vmax = v;
267
- }
268
- }
269
-
270
- const float d = vmax / -32 ;
271
- const float id = d ? 1 .0f /d : 0 .0f ;
272
-
273
- dsti->d = d;
274
- memset (dsti->qh , 0 , QK6_0/4 );
275
-
276
- for (int j = 0 ; j < QK6_0/2 ; ++j) {
277
- const float x0 = xi[0 + j]*id;
278
- const float x1 = xi[QK4_0/2 + j]*id;
279
-
280
- const uint8_t xi0 = min (63 , (int8_t )(x0 + 32 .5f ));
281
- const uint8_t xi1 = min (63 , (int8_t )(x1 + 32 .5f ));
282
-
283
- dsti->qs [j] = (xi0 & 0xf ) | ((xi1 & 0xf ) << 4 );
284
- const uint8_t h = (xi0 >> 4 ) | ((xi1 >> 4 ) << 2 );
285
- dsti->qh [j%(QK6_0/4 )] |= (h << 4 *(j/(QK6_0/4 )));
286
- }
287
- }
288
-
289
- static __device__ const int8_t iq4nl_index[241 ] = {
290
- 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 16 , 16 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ,
291
- 1 , 17 , 17 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 2 , 18 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 , 3 ,
292
- 3 , 3 , 3 , 3 , 3 , 3 , 19 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 4 , 20 , 5 , 5 , 5 , 5 , 5 , 5 , 5 , 5 , 5 , 5 ,
293
- 5 , 5 , 21 , 21 , 6 , 6 , 6 , 6 , 6 , 6 , 6 , 6 , 6 , 6 , 6 , 22 , 7 , 7 , 7 , 7 , 7 , 7 , 7 , 7 , 7 , 7 , 23 , 23 , 8 , 8 , 8 , 8 ,
294
- 8 , 8 , 8 , 8 , 8 , 8 , 24 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 9 , 25 , 10 , 10 , 10 , 10 , 10 , 10 , 10 , 10 , 10 , 10 , 10 , 26 , 26 ,
295
- 11 , 11 , 11 , 11 , 11 , 11 , 11 , 11 , 11 , 11 , 11 , 11 , 27 , 27 , 12 , 12 , 12 , 12 , 12 , 12 , 12 , 12 , 12 , 12 , 12 , 12 , 12 , 12 , 28 , 13 , 13 , 13 ,
296
- 13 , 13 , 13 , 13 , 13 , 13 , 13 , 13 , 13 , 13 , 13 , 13 , 13 , 13 , 29 , 14 , 14 , 14 , 14 , 14 , 14 , 14 , 14 , 14 , 14 , 14 , 14 , 14 , 14 , 14 , 14 , 14 ,
297
- 14 , 14 , 14 , 14 , 30 , 15 , 15 , 15 , 15 , 15 , 15 , 15 , 15 , 15 , 15 , 15 , 15
298
- };
299
-
300
- static __device__ __forceinline__ int best_index_iq4nl (const int8_t * values, float x) {
301
- int ix = (int )x - values[0 ];
302
- if (ix < 0 || ix >= 241 ) return ix < 0 ? 0 : 15 ;
303
- ix = iq4nl_index[ix];
304
- return ix < 16 ? ix : x - values[ix-16 ] < values[ix-15 ] - x ? ix-16 : ix-15 ;
305
- }
306
-
307
254
static __device__ void cpy_blck_f32_iq4_nl (const char * cxi, char * cdsti) {
308
255
const float * xi = (const float *) cxi;
309
256
block_iq4_nl * dsti = (block_iq4_nl *) cdsti;
@@ -322,14 +269,12 @@ static __device__ void cpy_blck_f32_iq4_nl(const char * cxi, char * cdsti) {
322
269
float d = vmax / kvalues_iq4nl[0 ];
323
270
const float id = d ? 1 .0f /d : 0 .0f ;
324
271
325
- // dsti->d = d;
326
-
327
272
float sumqx = 0 , sumq2 = 0 ;
328
273
for (int j = 0 ; j < QK4_NL/2 ; ++j) {
329
274
const float x0 = xi[0 + j]*id;
330
275
const float x1 = xi[QK4_NL/2 + j]*id;
331
- const uint8_t xi0 = best_index_iq4nl ( kvalues_iq4nl, x0);
332
- const uint8_t xi1 = best_index_iq4nl ( kvalues_iq4nl, x1);
276
+ const uint8_t xi0 = best_index_int8 ( 16 , kvalues_iq4nl, x0);
277
+ const uint8_t xi1 = best_index_int8 ( 16 , kvalues_iq4nl, x1);
333
278
dsti->qs [j] = xi0 | (xi1 << 4 );
334
279
const float v0 = kvalues_iq4nl[xi0];
335
280
const float v1 = kvalues_iq4nl[xi1];
@@ -541,17 +486,6 @@ static void ggml_cpy_q5_1_f32_cuda(
541
486
ne10, ne11, ne12, nb10, nb11, nb12, nb13);
542
487
}
543
488
544
- static void ggml_cpy_f32_q6_0_cuda (
545
- const char * cx, char * cdst, const int ne,
546
- const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
547
- const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
548
-
549
- GGML_ASSERT (ne % QK6_0 == 0 );
550
- const int num_blocks = ne / QK6_0;
551
- cpy_f32_q<cpy_blck_f32_q6_0, QK6_0><<<num_blocks, 1 , 0 , stream>>>
552
- (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
553
- }
554
-
555
489
static void ggml_cpy_f32_iq4_nl_cuda (
556
490
const char * cx, char * cdst, const int ne,
557
491
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
@@ -639,8 +573,6 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
639
573
ggml_cpy_f32_q5_1_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
640
574
} else if (src0->type == GGML_TYPE_Q5_1 && src1->type == GGML_TYPE_F32) {
641
575
ggml_cpy_q5_1_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
642
- } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q6_0) {
643
- ggml_cpy_f32_q6_0_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
644
576
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
645
577
ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
646
578
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
@@ -685,8 +617,6 @@ void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
685
617
return (void *) cpy_f32_q<cpy_blck_f32_q5_1, QK5_1>;
686
618
} else if (src0->type == GGML_TYPE_Q5_1 && src1->type == GGML_TYPE_F32) {
687
619
return (void *) cpy_q_f32<cpy_blck_q_f32<dequantize_q5_1, QK5_1>, QK5_1>;
688
- } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q6_0) {
689
- return (void *) cpy_f32_q<cpy_blck_f32_q6_0, QK6_0>;
690
620
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
691
621
return (void *) cpy_f32_f16<cpy_1_f32_f16>;
692
622
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
0 commit comments