@@ -1347,10 +1347,30 @@ static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) {
     GGML_UNUSED(main_device);
 }
 
+static cudaError_t ggml_cuda_Memcpy2DPeerAsync(
+    void * dst, int dstDevice, size_t dpitch, void * src, int srcDevice, size_t spitch, size_t width, size_t height, cudaStream_t stream) {
+
+#if !defined(GGML_USE_HIPBLAS)
+    // cudaMemcpy2DAsync may fail with copies between vmm pools of different devices
+    cudaMemcpy3DPeerParms p = {};
+    p.dstDevice = dstDevice;
+    p.dstPtr = make_cudaPitchedPtr(dst, dpitch, dpitch, height);
+    p.srcDevice = srcDevice;
+    p.srcPtr = make_cudaPitchedPtr(src, spitch, spitch, height);
+    p.extent = make_cudaExtent(width, height, 1);
+    return cudaMemcpy3DPeerAsync(&p, stream);
+#else
+    // HIP does not support cudaMemcpy3DPeerAsync or vmm pools
+    GGML_UNUSED(dstDevice);
+    GGML_UNUSED(srcDevice);
+    return cudaMemcpy2DAsync(dst, dpitch, src, spitch, width, height, cudaMemcpyDeviceToDevice, stream);
+#endif // !defined(GGML_USE_HIPBLAS)
+}
+
 static void ggml_cuda_op_mul_mat(
     ggml_backend_cuda_context & ctx,
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_cuda_op_mul_mat_t op,
-    const bool convert_src1_to_q8_1) {
+    quantize_cuda_t quantize_src1) {
 
     const int64_t ne00 = src0->ne[0];
     const int64_t ne01 = src0->ne[1];
@@ -1407,7 +1427,9 @@ static void ggml_cuda_op_mul_mat(
     }
 
     struct dev_data {
-        ggml_cuda_pool_alloc<char>  src0_dd_alloc;
+        int cc;
+
+        ggml_cuda_pool_alloc<char>  src0_dd_alloc;
         ggml_cuda_pool_alloc<float> src1_ddf_alloc;
         ggml_cuda_pool_alloc<char>  src1_ddq_alloc;
         ggml_cuda_pool_alloc<float> dst_dd_alloc;
@@ -1426,6 +1448,8 @@ static void ggml_cuda_op_mul_mat(
     int used_devices = 0;
 
     for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) {
+        dev[id].cc = ggml_cuda_info().devices[id].cc;
+
         // by default, use all rows
         dev[id].row_low  = 0;
         dev[id].row_high = ne01;
@@ -1476,11 +1500,15 @@ static void ggml_cuda_op_mul_mat(
             dev[id].src1_ddf = dev[id].src1_ddf_alloc.alloc(ctx.pool(id), ggml_nelements(src1));
         }
 
-        if (convert_src1_to_q8_1) {
-            dev[id].src1_ddq = dev[id].src1_ddq_alloc.alloc(ctx.pool(id), nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs);
+        if (quantize_src1) {
+            size_t src_1_ddq_size = nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs;
+            if (quantize_src1 == quantize_mmq_q8_1_cuda) {
+                src_1_ddq_size += get_mmq_x_max_host(dev[id].cc)*sizeof(block_q8_1_mmq);
+            }
+            dev[id].src1_ddq = dev[id].src1_ddq_alloc.alloc(ctx.pool(id), src_1_ddq_size);
 
             if (src1_on_device && src1_is_contiguous) {
-                quantize_row_q8_1_cuda(dev[id].src1_ddf, dev[id].src1_ddq, ne10, nrows1, src1_padded_col_size, stream);
+                quantize_src1(dev[id].src1_ddf, dev[id].src1_ddq, ne10, ne11, ne12*ne13, src1_padded_col_size, src0->type, stream);
                 CUDA_CHECK(cudaGetLastError());
             }
         }
@@ -1526,7 +1554,12 @@ static void ggml_cuda_op_mul_mat(
                 const int64_t i03 = i0 / ne12;
                 const int64_t i02 = i0 % ne12;
 
-                const size_t src1_ddq_i_offset = (i0*ne11 + src1_col_0) * src1_padded_col_size*q8_1_ts/q8_1_bs;
+                size_t src1_ddq_i_offset = i0*ne11 * src1_padded_col_size*q8_1_ts/q8_1_bs;
+                if (quantize_src1 == quantize_mmq_q8_1_cuda) {
+                    src1_ddq_i_offset += src1_col_0 * sizeof(block_q8_1_mmq);
+                } else {
+                    src1_ddq_i_offset += src1_col_0 * src1_padded_col_size*q8_1_ts/q8_1_bs;
+                }
 
                 // for split tensors the data begins at i0 == i0_offset_low
                 char * src0_dd_i = dev[id].src0_dd + (i0/i02_divisor) * (ne01*ne00*src0_ts)/src0_bs;
@@ -1543,10 +1576,17 @@ static void ggml_cuda_op_mul_mat(
                 // copy src0, src1 to device if necessary
                 if (src1_is_contiguous) {
                     if (id != ctx.device) {
-                        if (convert_src1_to_q8_1) {
+                        if (quantize_src1) {
                             char * src1_ddq_i_source = dev[ctx.device].src1_ddq + src1_ddq_i_offset;
-                            CUDA_CHECK(cudaMemcpyPeerAsync(src1_ddq_i, id, src1_ddq_i_source, ctx.device,
-                                src1_ncols*src1_padded_col_size*q8_1_ts/q8_1_bs, stream));
+                            if (quantize_src1 == quantize_mmq_q8_1_cuda) {
+                                const size_t pitch = ne11*sizeof(block_q8_1_mmq);
+                                const size_t width = src1_ncols*sizeof(block_q8_1_mmq);
+                                const size_t height = src1_padded_col_size/(4*QK8_1);
+                                CUDA_CHECK(ggml_cuda_Memcpy2DPeerAsync(src1_ddq_i, id, pitch, src1_ddq_i_source, ctx.device, pitch, width, height, stream));
+                            } else {
+                                CUDA_CHECK(cudaMemcpyPeerAsync(
+                                    src1_ddq_i, id, src1_ddq_i_source, ctx.device, src1_ncols*src1_padded_col_size*q8_1_ts/q8_1_bs, stream));
+                            }
                         } else {
                             float * src1_ddf_i_source = (float *) src1->data;
                             src1_ddf_i_source += (i0*ne11 + src1_col_0) * ne10;
@@ -1561,8 +1601,8 @@ static void ggml_cuda_op_mul_mat(
                     GGML_ASSERT(false);
                 }
 
-                if (convert_src1_to_q8_1 && !src1_is_contiguous) {
-                    quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
+                if (quantize_src1 && !src1_is_contiguous) {
+                    quantize_src1(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, 1, src1_padded_col_size, src0->type, stream);
                     CUDA_CHECK(cudaGetLastError());
                 }
 
@@ -1587,22 +1627,8 @@ static void ggml_cuda_op_mul_mat(
                         float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
                         GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
                         dhf_dst_i += src1_col_0*ne0 + dev[id].row_low;
-#if !defined(GGML_USE_HIPBLAS)
-                        // cudaMemcpy2DAsync may fail with copies between vmm pools of different devices
-                        cudaMemcpy3DPeerParms p = {};
-                        p.dstDevice = ctx.device;
-                        p.dstPtr = make_cudaPitchedPtr(dhf_dst_i, ne0*sizeof(float), row_diff, src1_ncols);
-                        p.srcDevice = id;
-                        p.srcPtr = make_cudaPitchedPtr(dst_dd_i, row_diff*sizeof(float), row_diff, src1_ncols);
-                        p.extent = make_cudaExtent(row_diff*sizeof(float), src1_ncols, 1);
-                        CUDA_CHECK(cudaMemcpy3DPeerAsync(&p, stream));
-#else
-                        // HIP does not support cudaMemcpy3DPeerAsync or vmm pools
-                        CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float),
-                                                     dst_dd_i, row_diff*sizeof(float),
-                                                     row_diff*sizeof(float), src1_ncols,
-                                                     cudaMemcpyDeviceToDevice, stream));
-#endif
+                        CUDA_CHECK(ggml_cuda_Memcpy2DPeerAsync(
+                            dhf_dst_i, ctx.device, ne0*sizeof(float), dst_dd_i, id, row_diff*sizeof(float), row_diff*sizeof(float), src1_ncols, stream));
                     } else {
                         float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
                         GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
@@ -1941,13 +1967,13 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
         // KQ + KQV multi-batch
         ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
     } else if (use_dequantize_mul_mat_vec) {
-        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
+        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, nullptr);
     } else if (use_mul_mat_vec_q) {
-        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true);
+        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, quantize_row_q8_1_cuda);
     } else if (use_mul_mat_q) {
-        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_q, true);
+        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_q, quantize_mmq_q8_1_cuda);
     } else {
-        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
+        ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_cublas, nullptr);
     }
 }
 