
Commit 675450d

Add Q6 and Q8 cache options to eval scripts
1 parent f3596fc · commit 675450d

3 files changed: +18 −2 lines

doc/eval.md (+8)

@@ -58,6 +58,10 @@ prefix for the response.
   performance.
 
 - **-cq4 / --cache_q4**: Use Q4 cache
+
+- **-cq6 / --cache_q6**: Use Q6 cache
+
+- **-cq8 / --cache_q8**: Use Q8 cache
 
 ## MMLU
 
@@ -83,3 +87,7 @@ the full list of subjects.
   performance.
 
 - **-cq4 / --cache_q4**: Use Q4 cache
+
+- **-cq6 / --cache_q6**: Use Q6 cache
+
+- **-cq8 / --cache_q8**: Use Q8 cache
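
With these options documented, a typical run passes one of the new flags next to the usual model argument. A usage sketch (the -m/--model_dir argument comes from exllamav2's model_init helper and is assumed here rather than added by this commit):

    python eval/humaneval.py -m /path/to/exl2-model -cq6
    python eval/mmlu.py -m /path/to/exl2-model -cq8 -sub all

If more than one cache flag is given, the scripts take the first match in the if/elif chain (Q4, then Q6, then Q8); with none of them, they fall back to the default FP16 ExLlamaV2Cache.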

eval/humaneval.py (+5 −1)

@@ -3,7 +3,7 @@
 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 from human_eval.data import write_jsonl, read_problems
 from exllamav2 import model_init
-from exllamav2 import ExLlamaV2Cache, ExLlamaV2Cache_Q4
+from exllamav2 import ExLlamaV2Cache, ExLlamaV2Cache_Q4, ExLlamaV2Cache_Q6, ExLlamaV2Cache_Q8
 from exllamav2.generator import ExLlamaV2DynamicGenerator, ExLlamaV2DynamicJob, ExLlamaV2Sampler
 import argparse, contextlib
 import util
@@ -15,6 +15,8 @@
 parser.add_argument("-cs", "--cache_size", type = int, default = None)
 parser.add_argument("-spt", "--samples_per_task", type = int, default = 200)
 parser.add_argument("-cq4", "--cache_q4", action = "store_true", help = "Use Q4 cache")
+parser.add_argument("-cq6", "--cache_q6", action = "store_true", help = "Use Q6 cache")
+parser.add_argument("-cq8", "--cache_q8", action = "store_true", help = "Use Q8 cache")
 parser.add_argument("--max_tokens", type = int, default = 768, help = "Max number of tokens for each completion")
 parser.add_argument("-pf", "--prompt_format", type = str, help = "Instruct format to apply. Default is raw completion (for base models) ")
 parser.add_argument("-v", "--verbose", action = "store_true", help = "Spam completions to console while generating")
@@ -75,6 +77,8 @@
 )
 
 if args.cache_q4: cache_type = ExLlamaV2Cache_Q4
+elif args.cache_q6: cache_type = ExLlamaV2Cache_Q6
+elif args.cache_q8: cache_type = ExLlamaV2Cache_Q8
 else: cache_type = ExLlamaV2Cache
 cache = cache_type(
     model,
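
For reference, the cache selection added above is the same if/elif chain in both scripts: pick a quantized cache class when a flag is set, otherwise use the FP16 cache. Below is a minimal standalone sketch of that pattern, assuming the usual exllamav2 loading flow (ExLlamaV2Config / ExLlamaV2 / ExLlamaV2Tokenizer with load_autosplit) instead of the model_init helper the eval scripts actually use; the dict lookup and the build_generator name are illustrative, not part of this commit.

# Sketch only: standalone version of the cache selection the eval scripts perform.
# The loading flow below (ExLlamaV2Config / load_autosplit) and the build_generator
# name are assumptions for illustration; the eval scripts use model_init instead.

from exllamav2 import (
    ExLlamaV2, ExLlamaV2Config, ExLlamaV2Tokenizer,
    ExLlamaV2Cache, ExLlamaV2Cache_Q4, ExLlamaV2Cache_Q6, ExLlamaV2Cache_Q8,
)
from exllamav2.generator import ExLlamaV2DynamicGenerator

def build_generator(model_dir, cache_bits = None):
    config = ExLlamaV2Config(model_dir)
    model = ExLlamaV2(config)
    tokenizer = ExLlamaV2Tokenizer(config)

    # Same mapping as the --cache_q4/--cache_q6/--cache_q8 flags: pick a quantized
    # cache class, or fall back to the FP16 ExLlamaV2Cache.
    cache_type = {
        4: ExLlamaV2Cache_Q4,
        6: ExLlamaV2Cache_Q6,
        8: ExLlamaV2Cache_Q8,
    }.get(cache_bits, ExLlamaV2Cache)

    # lazy = True defers allocation so load_autosplit can place the cache tensors
    # alongside the model weights across available GPUs.
    cache = cache_type(model, lazy = True)
    model.load_autosplit(cache)

    return ExLlamaV2DynamicGenerator(model = model, cache = cache, tokenizer = tokenizer)

The quantized variants trade KV-cache VRAM for precision: Q4 is the smallest, Q8 stays closest to the FP16 baseline, and Q6 sits in between.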

eval/mmlu.py (+5 −1)

@@ -2,7 +2,7 @@
 import sys, os
 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 from exllamav2 import model_init
-from exllamav2 import ExLlamaV2Cache, ExLlamaV2Cache_Q4
+from exllamav2 import ExLlamaV2Cache, ExLlamaV2Cache_Q4, ExLlamaV2Cache_Q6, ExLlamaV2Cache_Q8
 from exllamav2.generator import ExLlamaV2DynamicGenerator, ExLlamaV2DynamicJob, ExLlamaV2Sampler
 import argparse, contextlib
 import torch
@@ -14,6 +14,8 @@
 parser = argparse.ArgumentParser(description = "Run MMLU evaluation on EXL2 model")
 parser.add_argument("-cs", "--cache_size", type = int, default = None)
 parser.add_argument("-cq4", "--cache_q4", action = "store_true", help = "Use Q4 cache")
+parser.add_argument("-cq6", "--cache_q6", action = "store_true", help = "Use Q6 cache")
+parser.add_argument("-cq8", "--cache_q8", action = "store_true", help = "Use Q8 cache")
 parser.add_argument("-sub", "--subjects", type = str, default = "all", help = "Comma-separated list of categories to test, or 'all'")
 parser.add_argument("-fs", "--fewshot_examples", type = int, default = 5, help = "Number of examples for fewshot examples, max 5")
 parser.add_argument("-shf", "--shuffle", action = "store_true", help = "Shuffle choices randomly")
@@ -33,6 +35,8 @@
 )
 
 if args.cache_q4: cache_type = ExLlamaV2Cache_Q4
+elif args.cache_q6: cache_type = ExLlamaV2Cache_Q6
+elif args.cache_q8: cache_type = ExLlamaV2Cache_Q8
 else: cache_type = ExLlamaV2Cache
 cache = cache_type(
     model,
