Commit c5aa5b9

Clean Q4_0_N_M ref
Enable restrict on C++
1 parent b0eb6bf commit c5aa5b9

11 files changed: +85 −48 lines

docs/build.md (+1 −1)

@@ -55,7 +55,7 @@ cmake --build build --config Release
 cmake --preset arm64-windows-llvm-release -D GGML_OPENMP=OFF
 cmake --build build-arm64-windows-llvm-release
 ```
-Building for arm64 can also be done with the MSVC compiler with the build-arm64-windows-MSVC preset, or the standard CMake build instructions. However, note that the MSVC compiler does not support inline ARM assembly code, used e.g. for the accelerated Q4_0_4_8 CPU kernels.
+Building for arm64 can also be done with the MSVC compiler with the build-arm64-windows-MSVC preset, or the standard CMake build instructions. However, note that the MSVC compiler does not support inline ARM assembly code, used e.g. for the accelerated Q4_0_N_M CPU kernels.


 ## BLAS Build

examples/quantize/README.md (−2)

@@ -54,8 +54,6 @@ As the models are currently fully loaded into memory, you will need adequate dis
 Several quantization methods are supported. They differ in the resulting model disk size and inference speed.

-The quantization formats `Q4_0_4_4`, `Q4_0_4_8` and `Q4_0_8_8` are block interleaved variants of the `Q4_0` format, providing a data layout that is better suited for specific implementations of optimized mulmat kernels. Since these formats differ only in data layout, they have the same quantized size as the `Q4_0` format.
-
 *(outdated)*

 | Model | Measure | F16 | Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 |

examples/quantize/quantize.cpp (−3)

@@ -48,9 +48,6 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
     { "Q5_K_M",   LLAMA_FTYPE_MOSTLY_Q5_K_M,   " 5.33G, +0.0569 ppl @ Llama-3-8B", },
     { "Q6_K",     LLAMA_FTYPE_MOSTLY_Q6_K,     " 6.14G, +0.0217 ppl @ Llama-3-8B", },
     { "Q8_0",     LLAMA_FTYPE_MOSTLY_Q8_0,     " 7.96G, +0.0026 ppl @ Llama-3-8B", },
-    { "Q4_0_4_4", LLAMA_FTYPE_MOSTLY_Q4_0_4_4, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
-    { "Q4_0_4_8", LLAMA_FTYPE_MOSTLY_Q4_0_4_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
-    { "Q4_0_8_8", LLAMA_FTYPE_MOSTLY_Q4_0_8_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
     { "F16",      LLAMA_FTYPE_MOSTLY_F16,      "14.00G, +0.0020 ppl @ Mistral-7B", },
     { "BF16",     LLAMA_FTYPE_MOSTLY_BF16,     "14.00G, -0.0050 ppl @ Mistral-7B", },
     { "F32",      LLAMA_FTYPE_ALL_F32,         "26.00G @ 7B", },

ggml/include/ggml.h (+13 −5)

@@ -392,7 +392,7 @@ extern "C" {
         // GGML_TYPE_IQ4_NL_4_4 = 36,
         // GGML_TYPE_IQ4_NL_4_8 = 37,
         // GGML_TYPE_IQ4_NL_8_8 = 38,
-        GGML_TYPE_COUNT,
+        GGML_TYPE_COUNT = 39,
     };

     // precision
@@ -2194,11 +2194,19 @@ extern "C" {
     GGML_API size_t gguf_get_meta_size(const struct gguf_context * ctx);
     GGML_API void gguf_get_meta_data(const struct gguf_context * ctx, void * data);

-#ifdef __cplusplus
-    // restrict not standard in C++
-#define GGML_RESTRICT
+#ifdef __cplusplus
+    // restrict not standard in C++
+#    if defined(__GNUC__)
+#        define GGML_RESTRICT __restrict__
+#    elif defined(__clang__)
+#        define GGML_RESTRICT __restrict
+#    elif defined(_MSC_VER)
+#        define GGML_RESTRICT __restrict
+#    else
+#        define GGML_RESTRICT
+#    endif
 #else
-#define GGML_RESTRICT restrict
+#    define GGML_RESTRICT restrict
 #endif
     typedef void (*ggml_to_float_t)  (const void  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
     typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void  * GGML_RESTRICT y, int64_t k);
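Note on the change above: `restrict` promises the compiler that a pointer is the only way the pointed-to data is accessed, which lets it vectorize loops and keep values in registers instead of reloading after every store. A minimal sketch of the effect (hypothetical function, not from this commit), using the GCC/Clang spelling selected above:

```cpp
// Hypothetical illustration of GGML_RESTRICT. With the no-alias promise the
// compiler may vectorize this loop and avoid reloading src after each store;
// without it, every write to dst could legally modify *src. Passing
// overlapping buffers to a restrict-qualified function is undefined behavior,
// so the qualifier is only applied where callers guarantee distinct buffers.
#include <cstdint>

#define GGML_RESTRICT __restrict__  // GCC/Clang spelling, as selected above

static void scale_row(float * GGML_RESTRICT dst,
                      const float * GGML_RESTRICT src,
                      float scale, int64_t n) {
    for (int64_t i = 0; i < n; ++i) {
        dst[i] = src[i] * scale;
    }
}
```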

ggml/src/ggml-cpu/amx/amx.h (−1)

@@ -3,7 +3,6 @@

 // GGML internal header

-// if defined(GGML_USE_CPU_AMX) ?
 #if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
 ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void);
 #endif

ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp (+17 −18)

@@ -222,12 +222,12 @@ static inline __m256i mul_sum_i8_pairs_int32x8(const __m256i x, const __m256i y)
 static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};

-static void quantize_q8_0_4x4(const float * x, void * vy, int64_t k) {
+static void quantize_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
     assert(QK8_0 == 32);
     assert(k % QK8_0 == 0);
     const int nb = k / QK8_0;

-    block_q8_0x4 * y = (block_q8_0x4 *) vy;
+    block_q8_0x4 * GGML_RESTRICT y = (block_q8_0x4 *) vy;

 #if defined(__ARM_NEON)
     float32x4_t srcv[4][8];
@@ -316,12 +316,12 @@ static void quantize_q8_0_4x4(const float * x, void * vy, int64_t k) {
 #endif
 }

-static void quantize_q8_0_4x8(const float * x, void * vy, int64_t k) {
+static void quantize_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
     assert(QK8_0 == 32);
     assert(k % QK8_0 == 0);
     const int nb = k / QK8_0;

-    block_q8_0x4 * y = (block_q8_0x4 *) vy;
+    block_q8_0x4 * GGML_RESTRICT y = (block_q8_0x4 *) vy;

 #if defined(__ARM_NEON)
     float32x4_t srcv[4][8];
@@ -531,7 +531,7 @@ static void quantize_q8_0_4x8(const float * x, void * vy, int64_t k) {
 #endif
 }

-static void quantize_mat_q8_0(const float * x, void * vy, int64_t nrow, int64_t n_per_row, int64_t blck_size_interleave) {
+static void quantize_mat_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row, int64_t blck_size_interleave) {
     assert(nrow == 4);
     UNUSED(nrow);
     if (blck_size_interleave == 4) {
@@ -543,7 +543,7 @@ static void quantize_mat_q8_0(const float * x, void * vy, int64_t nrow, int64_t
     }
 }

-static void ggml_gemv_q4_0_4x4_q8_0(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+static void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
     const int qk = QK8_0;
     const int nb = n / qk;
     const int ncols_interleaved = 4;
@@ -628,7 +628,7 @@ static void ggml_gemv_q4_0_4x4_q8_0(int n, float * s, size_t bs, const void * vx
     }
 }

-static void ggml_gemv_q4_0_4x8_q8_0(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+static void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
     const int qk = QK8_0;
     const int nb = n / qk;
     const int ncols_interleaved = 4;
@@ -738,7 +738,7 @@ static void ggml_gemv_q4_0_4x8_q8_0(int n, float * s, size_t bs, const void * vx
     }
 }

-static void ggml_gemv_q4_0_8x8_q8_0(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+static void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
     const int qk = QK8_0;
     const int nb = n / qk;
     const int ncols_interleaved = 8;
@@ -1011,7 +1011,7 @@ static void ggml_gemv_q4_0_8x8_q8_0(int n, float * s, size_t bs, const void * vx
     }
 }

-static void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+static void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
     const int qk = QK8_0;
     const int nb = n / qk;
     const int ncols_interleaved = 4;
@@ -1107,7 +1107,7 @@ static void ggml_gemv_iq4_nl_4x4_q8_0(int n, float * s, size_t bs, const void *
     }
 }

-static void ggml_gemm_q4_0_4x4_q8_0(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+static void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
     const int qk = QK8_0;
     const int nb = n / qk;
     const int ncols_interleaved = 4;
@@ -1623,7 +1623,7 @@ static void ggml_gemm_q4_0_4x4_q8_0(int n, float * s, size_t bs, const void * vx
     }
 }

-static void ggml_gemm_q4_0_4x8_q8_0(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+static void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
     const int qk = QK8_0;
     const int nb = n / qk;
     const int ncols_interleaved = 4;
@@ -2077,7 +2077,7 @@ static void ggml_gemm_q4_0_4x8_q8_0(int n, float * s, size_t bs, const void * vx
     }
 }

-static void ggml_gemm_q4_0_8x8_q8_0(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+static void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
     const int qk = QK8_0;
     const int nb = n / qk;
     const int ncols_interleaved = 8;
@@ -3497,7 +3497,7 @@ static void ggml_gemm_q4_0_8x8_q8_0(int n, float * s, size_t bs, const void * vx
     }
 }

-static void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
+static void ggml_gemm_iq4_nl_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
     const int qk = QK8_0;
     const int nb = n / qk;
     const int ncols_interleaved = 4;
@@ -3677,7 +3677,7 @@ static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int blck_size_in
     return out;
 }

-static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block, const void * data, size_t data_size) {
+static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
     GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
     GGML_ASSERT(interleave_block == 4 || interleave_block == 8);

@@ -3708,7 +3708,7 @@ static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block
     GGML_UNUSED(data_size);
 }

-static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor *t, int interleave_block, const void * data, size_t data_size) {
+static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor *t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
     GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
     GGML_ASSERT(interleave_block == 8);

@@ -3772,7 +3772,7 @@ static block_iq4_nlx4 make_block_iq4_nlx4(block_iq4_nl * in, unsigned int blck_s
     return out;
 }

-static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_block, const void * data, size_t data_size) {
+static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
     GGML_ASSERT(t->type == GGML_TYPE_IQ4_NL);
     GGML_ASSERT(interleave_block == 4 || interleave_block == 8);

@@ -3971,8 +3971,7 @@ static const tensor_traits<block_iq4_nl, 4, 4> iq4_nl_4x4_q8_0;

 static const ggml::cpu::tensor_traits * ggml_aarch64_get_optimal_repack_type(const struct ggml_tensor * cur) {
     if (cur->type == GGML_TYPE_Q4_0) {
-        // TODO: enable for AVX2 - currently disabled due to bad gemv performance
-        if (/* ggml_cpu_has_avx2() || */ (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) {
+        if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) {
             return &ggml::cpu::aarch64::q4_0_8x8_q8_0;
         }
         if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
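For readers new to the "runtime repacking" these kernels rely on: a block-interleaved layout stores block j of four (or eight) consecutive rows contiguously, so a gemv/gemm kernel can stream one contiguous run and feed all interleaved rows at once. A simplified sketch of the interleaving step (hypothetical block type and layout, not the exact ggml format):

```cpp
#include <cstdint>
#include <vector>

// Hypothetical simplified block; the real block_q4_0 holds an fp16 scale
// plus 16 bytes of packed 4-bit weights.
struct Block { uint16_t scale; uint8_t q[16]; };

// Interleave 4 rows of blocks so that block j of rows r..r+3 become
// adjacent in memory -- the layout the 4x-wide kernels stream through.
// Assumes nrows is a multiple of 4.
static std::vector<Block> repack_4rows(const std::vector<Block> & src,
                                       int64_t nrows, int64_t blocks_per_row) {
    std::vector<Block> dst(src.size());
    int64_t o = 0;
    for (int64_t r = 0; r < nrows; r += 4) {
        for (int64_t j = 0; j < blocks_per_row; ++j) {
            for (int64_t i = 0; i < 4; ++i) {
                dst[o++] = src[(r + i) * blocks_per_row + j];
            }
        }
    }
    return dst;
}
```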

ggml/src/ggml-cpu/ggml-cpu-traits.h (+2 −2)

@@ -19,14 +19,14 @@ namespace ggml::cpu {
 // register in tensor->extra
 class tensor_traits {
   public:
-    ~tensor_traits();
+    virtual ~tensor_traits();
     virtual bool work_size(int n_threads, const struct ggml_tensor * op, size_t & size) = 0;
     virtual bool compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) = 0;
 };

 class extra_buffer_type {
   public:
-    ~extra_buffer_type();
+    virtual ~extra_buffer_type();
     virtual bool supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) = 0;
     virtual tensor_traits * get_tensor_traits(const struct ggml_tensor * op) = 0;
 };
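The two `virtual` destructors added above fix a classic C++ pitfall: deleting a derived object through a base-class pointer is undefined behavior unless the base destructor is virtual, and these classes are used only polymorphically. A minimal illustration (hypothetical names, not ggml code):

```cpp
#include <memory>

struct base {
    virtual ~base() = default;  // without 'virtual' here, destroying a
    virtual void run() = 0;     // derived object via base* would be UB
};

struct derived : base {
    ~derived() override { /* release derived-owned resources */ }
    void run() override {}
};

int main() {
    std::unique_ptr<base> p = std::make_unique<derived>();
    p->run();
}   // ~derived() runs here only because ~base() is virtual
```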

ggml/src/ggml.c (+46 −1)

@@ -791,6 +791,24 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .to_float = (ggml_to_float_t) ggml_bf16_to_fp32_row,
         .from_float_ref = (ggml_from_float_t) ggml_fp32_to_bf16_row_ref,
     },
+    [31] = { // GGML_TYPE_Q4_0_4_4
+        .type_name = "TYPE_Q4_0_4_4 REMOVED, use Q4_0 with runtime repacking",
+        .blck_size = 0,
+        .type_size = 0,
+        .is_quantized = false,
+    },
+    [32] = { // GGML_TYPE_Q4_0_4_8
+        .type_name = "TYPE_Q4_0_4_8 REMOVED, use Q4_0 with runtime repacking",
+        .blck_size = 0,
+        .type_size = 0,
+        .is_quantized = false,
+    },
+    [33] = { // GGML_TYPE_Q4_0_8_8
+        .type_name = "TYPE_Q4_0_8_8 REMOVED, use Q4_0 with runtime repacking",
+        .blck_size = 0,
+        .type_size = 0,
+        .is_quantized = false,
+    },
     [GGML_TYPE_TQ1_0] = {
         .type_name = "tq1_0",
         .blck_size = QK_K,
@@ -807,6 +825,24 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
         .to_float = (ggml_to_float_t) dequantize_row_tq2_0,
         .from_float_ref = (ggml_from_float_t) quantize_row_tq2_0_ref,
     },
+    [36] = { // GGML_TYPE_IQ4_NL_4_4
+        .type_name = "TYPE_IQ4_NL_4_4 REMOVED, use IQ4_NL with runtime repacking",
+        .blck_size = 0,
+        .type_size = 0,
+        .is_quantized = false,
+    },
+    [37] = { // GGML_TYPE_IQ4_NL_4_8
+        .type_name = "TYPE_IQ4_NL_4_8 REMOVED, use IQ4_NL with runtime repacking",
+        .blck_size = 0,
+        .type_size = 0,
+        .is_quantized = false,
+    },
+    [38] = { // GGML_TYPE_IQ4_NL_8_8
+        .type_name = "TYPE_IQ4_NL_8_8 REMOVED, use IQ4_NL with runtime repacking",
+        .blck_size = 0,
+        .type_size = 0,
+        .is_quantized = false,
+    },
 };

 const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type) {
@@ -6766,7 +6802,16 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
             (int64_t) info->ne[2] *
             (int64_t) info->ne[3];

-        if (ggml_blck_size(info->type) == 0 || ne % ggml_blck_size(info->type) != 0) {
+        if (ggml_blck_size(info->type) == 0 ) {
+            // this tensor type support have been removed:
+            fprintf(stderr, "%s: tensor '%s' of type %d: %s\n",
+                    __func__, info->name.data, (int) info->type, ggml_type_name(info->type));
+            fclose(file);
+            gguf_free(ctx);
+            return NULL;
+        }
+
+        if (ne % ggml_blck_size(info->type) != 0) {
             fprintf(stderr, "%s: tensor '%s' of type %d (%s) number of elements (%" PRId64 ") is not a multiple of block size (%" PRId64 ")\n",
                     __func__, info->name.data, (int) info->type, ggml_type_name(info->type), ne, ggml_blck_size(info->type));
             fclose(file);
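Taken together, the placeholder traits entries and the new loader check mean `ggml_blck_size()` now returns 0 for the retired type IDs, so old files fail with a clear message instead of tripping a modulo-by-zero. A sketch of probing for this from application code (assumed usage, not part of this commit; `ggml_blck_size()` and `ggml_type_name()` are existing ggml API):

```cpp
#include <cstdio>
#include "ggml.h"

int main() {
    // Old GGML_TYPE_Q4_0_4_4 slot; its traits entry now reports blck_size 0.
    const enum ggml_type t = (enum ggml_type) 31;
    if (ggml_blck_size(t) == 0) {
        std::fprintf(stderr, "type %d (%s) is no longer supported\n",
                     (int) t, ggml_type_name(t));
    }
    return 0;
}
```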

gguf-py/gguf/constants.py (+3 −9)

@@ -1432,9 +1432,6 @@ class GGMLQuantizationType(IntEnum):
     F64 = 28
     IQ1_M = 29
     BF16 = 30
-    Q4_0_4_4 = 31
-    Q4_0_4_8 = 32
-    Q4_0_8_8 = 33
     TQ1_0 = 34
     TQ2_0 = 35

@@ -1478,9 +1475,9 @@ class LlamaFileType(IntEnum):
     MOSTLY_IQ4_XS = 30  # except 1d tensors
     MOSTLY_IQ1_M = 31   # except 1d tensors
     MOSTLY_BF16 = 32    # except 1d tensors
-    MOSTLY_Q4_0_4_4 = 33  # except 1d tensors
-    MOSTLY_Q4_0_4_8 = 34  # except 1d tensors
-    MOSTLY_Q4_0_8_8 = 35  # except 1d tensors
+    # MOSTLY_Q4_0_4_4 = 33  # removed from gguf files, use Q4_0 and runtime repack
+    # MOSTLY_Q4_0_4_8 = 34  # removed from gguf files, use Q4_0 and runtime repack
+    # MOSTLY_Q4_0_8_8 = 35  # removed from gguf files, use Q4_0 and runtime repack
     MOSTLY_TQ1_0 = 36   # except 1d tensors
     MOSTLY_TQ2_0 = 37   # except 1d tensors

@@ -1556,9 +1553,6 @@ def get_type(val: Any) -> GGUFValueType:
     GGMLQuantizationType.F64:   (1, 8),
     GGMLQuantizationType.IQ1_M: (256, QK_K // 8 + QK_K // 16 + QK_K // 32),
     GGMLQuantizationType.BF16:  (1, 2),
-    GGMLQuantizationType.Q4_0_4_4: (32, 2 + 16),
-    GGMLQuantizationType.Q4_0_4_8: (32, 2 + 16),
-    GGMLQuantizationType.Q4_0_8_8: (32, 2 + 16),
     GGMLQuantizationType.TQ1_0: (256, 2 + 4 * 13),
     GGMLQuantizationType.TQ2_0: (256, 2 + 64),
 }

include/llama.h (+3 −3)

@@ -171,9 +171,9 @@ extern "C" {
     LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors
     LLAMA_FTYPE_MOSTLY_IQ1_M = 31,  // except 1d tensors
     LLAMA_FTYPE_MOSTLY_BF16 = 32,   // except 1d tensors
-    LLAMA_FTYPE_MOSTLY_Q4_0_4_4 = 33, // except 1d tensors
-    LLAMA_FTYPE_MOSTLY_Q4_0_4_8 = 34, // except 1d tensors
-    LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // except 1d tensors
+    //LLAMA_FTYPE_MOSTLY_Q4_0_4_4 = 33, // removed from gguf files, use Q4_0 and runtime repack
+    //LLAMA_FTYPE_MOSTLY_Q4_0_4_8 = 34, // removed from gguf files, use Q4_0 and runtime repack
+    //LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // removed from gguf files, use Q4_0 and runtime repack
     LLAMA_FTYPE_MOSTLY_TQ1_0 = 36,  // except 1d tensors
     LLAMA_FTYPE_MOSTLY_TQ2_0 = 37,  // except 1d tensors

src/llama.cpp (−3)

@@ -5341,9 +5341,6 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ3_S:  return "IQ3_S - 3.4375 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ3_M:  return "IQ3_S mix - 3.66 bpw";
-        case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: return "Q4_0_4_4";
-        case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: return "Q4_0_4_8";
-        case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: return "Q4_0_8_8";

         default: return "unknown, may not work";
     }
