Skip to content

Commit a17aab4

Browse files
committed
NXS_Llama.cpp Alpha 0.05 - b5506 - IQ5_KS quantization works.
WIP for the rest. Reverted to b5506 for parity with Croco.cpp.
1 parent f69aee3 commit a17aab4

File tree

3 files changed

+10
-4
lines changed

3 files changed

+10
-4
lines changed

ggml/src/ggml-quants.c

Lines changed: 3 additions & 2 deletions
Original file line number · Diff line number · Diff line change
@@ -5410,8 +5410,9 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
5410 5410
return false;
5411 5411
}
5412 5412

5413 -
if (nbytes % ggml_type_size(type) != 0) {
5414 -
// if (type != GGML_TYPE_IQ2_TN && type != GGML_TYPE_IQ1_TN && type != GGML_TYPE_IQ4_KS && type != GGML_TYPE_IQ3_KS && nbytes % ggml_type_size(type) != 0) {
5413 +
if (type != GGML_TYPE_IQ2_BN && type != GGML_TYPE_IQ1_BN
5414 +
// && type != GGML_TYPE_IQ2_TN && type != GGML_TYPE_IQ1_TN &&
5415 +
&& type != GGML_TYPE_IQ2_KS && type != GGML_TYPE_IQ4_KS && type != GGML_TYPE_IQ4_KSS && type != GGML_TYPE_IQ3_KS && type != GGML_TYPE_IQ5_KS && type != GGML_TYPE_IQ2_KT && type != GGML_TYPE_IQ3_KT && type != GGML_TYPE_IQ4_KT && nbytes % ggml_type_size(type) != 0) {
5415 5416
fprintf(stderr, "%s: invalid size %zu for type %s (type size = %zu)\n", __func__, nbytes, ggml_type_name(type), ggml_type_size(type));
5416 5417
return false;
5417 5418
}

ggml/src/gguf.cpp

Lines changed: 5 additions & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -599,6 +599,7 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
599 599
}
600 600
const size_t type_size = ggml_type_size(info.t.type);
601 601
const int64_t blck_size = ggml_blck_size(info.t.type);
602 +
// const int64_t row_size = ggml_row_size(info.t.type, info.t.ne[0]);
602 603

603 604
// check that row size is divisible by block size
604 605
if (blck_size == 0 || info.t.ne[0] % blck_size != 0) {
@@ -611,6 +612,7 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
611 612

612 613
// calculate byte offsets given the tensor shape and type
613 614
info.t.nb[0] = type_size;
615 +
// info.t.nb[1] = info.t.nb[0]*(info.t.ne[0]/blck_size);
614 616
info.t.nb[1] = ggml_row_size(info.t.type, info.t.ne[0]);
615 617
for (int j = 2; j < GGML_MAX_DIMS; ++j) {
616 618
info.t.nb[j] = info.t.nb[j - 1]*info.t.ne[j - 1];
@@ -1162,7 +1164,9 @@ void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggm
1162 1164
GGML_ASSERT(tensor->ne[0] % blck_size == 0 && "tensor row size not divisible by block size of new type");
1163 1165

1164 1166
tensor->nb[0] = type_size;
1165 -
tensor->nb[1] = tensor->nb[0]*(tensor->ne[0]/blck_size);
1167 +
// tensor->nb[1] = tensor->nb[0]*(tensor->ne[0]/blck_size);
1168 +
tensor->nb[1] = ggml_row_size(type, tensor->ne[0]);
1169 +

1166 1170
for (int i = 2; i < GGML_MAX_DIMS; i++) {
1167 1171
tensor->nb[i] = tensor->nb[i - 1]*tensor->ne[i - 1];
1168 1172
}

src/llama-quant.cpp

Lines changed: 2 additions & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -945,8 +945,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
945 945

946 946
// update the gguf meta data as we go
947 947
gguf_set_tensor_type(ctx_outs[cur_split].get(), name.c_str(), new_type);
948 -
GGML_ASSERT(gguf_get_tensor_size(ctx_outs[cur_split].get(), gguf_find_tensor(ctx_outs[cur_split].get(), name.c_str())) == new_size);
948 +
// GGML_ASSERT(gguf_get_tensor_size(ctx_outs[cur_split].get(), gguf_find_tensor(ctx_outs[cur_split].get(), name.c_str())) == new_size);
949 949
gguf_set_tensor_data(ctx_outs[cur_split].get(), name.c_str(), new_data);
950 +
// gguf_set_tensor_data(ctx_outs[cur_split], name.c_str(), new_data, new_size);
950 951

951 952
// write tensor data + padding
952 953
fout.write((const char *) new_data, new_size);

0 commit comments

Comments (0)