Skip to content

Commit 62fead3

Browse files
authored
cuda : fix tensor size calculation for non-split buffer (ggml-org#5145)
1 parent 15b4538 commit 62fead3

File tree

2 files changed

+8
-15
lines changed

2 files changed

+8
-15
lines changed

ggml-backend.c

+3-1
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,9 @@ size_t ggml_backend_buft_get_alignment(ggml_backend_buffer_type_t buft) {
 GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
     // get_alloc_size is optional, defaults to ggml_nbytes
     if (buft->iface.get_alloc_size) {
-        return buft->iface.get_alloc_size(buft, tensor);
+        size_t size = buft->iface.get_alloc_size(buft, tensor);
+        assert(size >= ggml_nbytes(tensor));
+        return size;
     }
     return ggml_nbytes(tensor);
 }

ggml-cuda.cu

+5-14
Original file line numberDiff line numberDiff line change
@@ -9790,8 +9790,8 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
     // TODO: mmq/mmv support
 #endif

-    const int64_t nb11 = src1->nb[1];
-    const int64_t nb1  = dst->nb[1];
+    const size_t nb11 = src1->nb[1];
+    const size_t nb1  = dst->nb[1];

     const struct ggml_tensor * ids = src0;
     const int32_t id = ((int32_t *) dst->op_params)[0];
@@ -10304,15 +10304,11 @@ GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t

     if (ggml_is_quantized(tensor->type)) {
         // initialize padding to 0 to avoid possible NaN values
-        int64_t row_low = 0;
-        int64_t row_high = ggml_nrows(tensor);
-        int64_t nrows_split = row_high - row_low;
-
-        size_t original_size = ggml_nbytes_split(tensor, nrows_split);
+        size_t original_size = ggml_nbytes(tensor);
         size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);

         if (padded_size > original_size && tensor->view_src == nullptr) {
-            CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + original_size, 0, padded_size - original_size, g_cudaStreams[ctx->device][0]));
+            CUDA_CHECK(cudaMemset((char *)tensor->data + original_size, 0, padded_size - original_size));
         }
     }
 }
@@ -10415,12 +10411,7 @@ GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend
 }

 GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
-    int64_t row_low = 0;
-    int64_t row_high = ggml_nrows(tensor);
-    int64_t nrows_split = row_high - row_low;
-
-    size_t size = ggml_nbytes_split(tensor, nrows_split);
-
+    size_t size = ggml_nbytes(tensor);
     int64_t ne0 = tensor->ne[0];

     if (ggml_is_quantized(tensor->type)) {

0 commit comments

Comments (0)