Skip to content

Commit a17aab4

Browse files
committed
NXS_Llama.cpp Alpha 0.05 - b5506 - IQ5_KS quantization works.
WIP for the rest. Reverted to b5506 for parity with Croco.cpp.
1 parent f69aee3 commit a17aab4

File tree

3 files changed

+10
-4
lines changed

3 files changed

+10
-4
lines changed

ggml/src/ggml-quants.c

Lines changed: 3 additions & 2 deletions
Original file line number · Diff line number · Diff line change
@@ -5410,8 +5410,9 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
5410 5410
return false;
5411 5411
}
5412 5412

5413 -
if (nbytes % ggml_type_size(type) != 0) {
5414 -
// if (type != GGML_TYPE_IQ2_TN && type != GGML_TYPE_IQ1_TN && type != GGML_TYPE_IQ4_KS && type != GGML_TYPE_IQ3_KS && nbytes % ggml_type_size(type) != 0) {
5413 +
if (type != GGML_TYPE_IQ2_BN && type != GGML_TYPE_IQ1_BN
5414 +
// && type != GGML_TYPE_IQ2_TN && type != GGML_TYPE_IQ1_TN &&
5415 +
&& type != GGML_TYPE_IQ2_KS && type != GGML_TYPE_IQ4_KS && type != GGML_TYPE_IQ4_KSS && type != GGML_TYPE_IQ3_KS && type != GGML_TYPE_IQ5_KS && type != GGML_TYPE_IQ2_KT && type != GGML_TYPE_IQ3_KT && type != GGML_TYPE_IQ4_KT && nbytes % ggml_type_size(type) != 0) {
5415 5416
fprintf(stderr, "%s: invalid size %zu for type %s (type size = %zu)\n", __func__, nbytes, ggml_type_name(type), ggml_type_size(type));
5416 5417
return false;
5417 5418
}

ggml/src/gguf.cpp

Lines changed: 5 additions & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -599,6 +599,7 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
599 599
}
600 600
const size_t type_size = ggml_type_size(info.t.type);
601 601
const int64_t blck_size = ggml_blck_size(info.t.type);
602 +
// const int64_t row_size = ggml_row_size(info.t.type, info.t.ne[0]);
602 603

603 604
// check that row size is divisible by block size
604 605
if (blck_size == 0 || info.t.ne[0] % blck_size != 0) {
@@ -611,6 +612,7 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
611 612

612 613
// calculate byte offsets given the tensor shape and type
613 614
info.t.nb[0] = type_size;
615 +
// info.t.nb[1] = info.t.nb[0]*(info.t.ne[0]/blck_size);
614 616
info.t.nb[1] = ggml_row_size(info.t.type, info.t.ne[0]);
615 617
for (int j = 2; j < GGML_MAX_DIMS; ++j) {
616 618
info.t.nb[j] = info.t.nb[j - 1]*info.t.ne[j - 1];
@@ -1162,7 +1164,9 @@ void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggm
1162 1164
GGML_ASSERT(tensor->ne[0] % blck_size == 0 && "tensor row size not divisible by block size of new type");
1163 1165

1164 1166
tensor->nb[0] = type_size;
1165 -
tensor->nb[1] = tensor->nb[0]*(tensor->ne[0]/blck_size);
1167 +
// tensor->nb[1] = tensor->nb[0]*(tensor->ne[0]/blck_size);
1168 +
tensor->nb[1] = ggml_row_size(type, tensor->ne[0]);
1169 +

1166 1170
for (int i = 2; i < GGML_MAX_DIMS; i++) {
1167 1171
tensor->nb[i] = tensor->nb[i - 1]*tensor->ne[i - 1];
1168 1172
}

src/llama-quant.cpp

Lines changed: 2 additions & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -945,8 +945,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
945 945

946 946
// update the gguf meta data as we go
947 947
gguf_set_tensor_type(ctx_outs[cur_split].get(), name.c_str(), new_type);
948 -
GGML_ASSERT(gguf_get_tensor_size(ctx_outs[cur_split].get(), gguf_find_tensor(ctx_outs[cur_split].get(), name.c_str())) == new_size);
948 +
// GGML_ASSERT(gguf_get_tensor_size(ctx_outs[cur_split].get(), gguf_find_tensor(ctx_outs[cur_split].get(), name.c_str())) == new_size);
949 949
gguf_set_tensor_data(ctx_outs[cur_split].get(), name.c_str(), new_data);
950 +
// gguf_set_tensor_data(ctx_outs[cur_split], name.c_str(), new_data, new_size);
950 951

951 952
// write tensor data + padding
952 953
fout.write((const char *) new_data, new_size);

0 commit comments

Comments (0)