
Commit 03255d4

Merge remote-tracking branch 'turboderp/master'
2 parents: 130ec91 + 57ee846

Note: large commits have some content hidden by default, so only a subset of the changed files is shown below.

84 files changed: +6122 -887 lines

README.md (+15 -9)
@@ -2,6 +2,12 @@
 
 ExLlamaV2 is an inference library for running local LLMs on modern consumer GPUs.
 
+The official and recommended backend server for ExLlamaV2 is [TabbyAPI](https://github.com/theroyallab/tabbyAPI/),
+which provides an OpenAI-compatible API for local or remote inference, with extended features like HF model
+downloading, embedding model support and support for HF Jinja2 chat templates.
+
+See the [wiki](https://github.com/theroyallab/tabbyAPI/wiki/1.-Getting-Started) for help getting started.
+
 
 ## New in v0.1.0+:
 
@@ -56,17 +62,17 @@ and speeds will vary across GPUs, with slow CPUs still being a potential bottleneck
 
 | Model      | Mode         | Size  | grpsz | act | 3090Ti  | 4090        |
 |------------|--------------|-------|-------|-----|---------|-------------|
-| Llama      | GPTQ         | 7B    | 128   | no  | 177 t/s | **198** t/s |
-| Llama      | GPTQ         | 13B   | 128   | no  | 109 t/s | **111** t/s |
+| Llama      | GPTQ         | 7B    | 128   | no  | 181 t/s | **205** t/s |
+| Llama      | GPTQ         | 13B   | 128   | no  | 110 t/s | **114** t/s |
 | Llama      | GPTQ         | 33B   | 128   | yes | 44 t/s  | **48** t/s  |
-| OpenLlama  | GPTQ         | 3B    | 128   | yes | 252 t/s | **283** t/s |
+| OpenLlama  | GPTQ         | 3B    | 128   | yes | 259 t/s | **296** t/s |
 | CodeLlama  | EXL2 4.0 bpw | 34B   | -     | -   | 44 t/s  | **50** t/s  |
-| Llama2     | EXL2 3.0 bpw | 7B    | -     | -   | 211 t/s | **245** t/s |
-| Llama2     | EXL2 4.0 bpw | 7B    | -     | -   | 179 t/s | **207** t/s |
-| Llama2     | EXL2 5.0 bpw | 7B    | -     | -   | 159 t/s | **170** t/s |
-| Llama2     | EXL2 2.5 bpw | 70B   | -     | -   | 33 t/s  | **37** t/s  |
-| TinyLlama  | EXL2 3.0 bpw | 1.1B  | -     | -   | 623 t/s | **730** t/s |
-| TinyLlama  | EXL2 4.0 bpw | 1.1B  | -     | -   | 560 t/s | **643** t/s |
+| Llama2     | EXL2 3.0 bpw | 7B    | -     | -   | 217 t/s | **257** t/s |
+| Llama2     | EXL2 4.0 bpw | 7B    | -     | -   | 185 t/s | **211** t/s |
+| Llama2     | EXL2 5.0 bpw | 7B    | -     | -   | 164 t/s | **179** t/s |
+| Llama2     | EXL2 2.5 bpw | 70B   | -     | -   | 33 t/s  | **38** t/s  |
+| TinyLlama  | EXL2 3.0 bpw | 1.1B  | -     | -   | 656 t/s | **770** t/s |
+| TinyLlama  | EXL2 4.0 bpw | 1.1B  | -     | -   | 602 t/s | **700** t/s |
 
 
 ## How to
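As a point of reference for the TabbyAPI recommendation added above, here is a minimal sketch of what the OpenAI-compatible API looks like from a client's perspective. The host, port, API key and model name are placeholder assumptions, not values from this commit; see the linked wiki for actual setup.

```python
# Minimal sketch: querying a running TabbyAPI instance over its OpenAI-compatible API.
# Host/port, API key and model name below are placeholder assumptions.
import requests

response = requests.post(
    "http://localhost:5000/v1/chat/completions",          # assumed local endpoint
    headers = {"Authorization": "Bearer your-api-key"},    # assumed API key
    json = {
        "model": "llama3.1-70b-instruct-exl2",             # whichever model the server has loaded
        "messages": [{"role": "user", "content": "Hello!"}],
        "max_tokens": 64,
    },
    timeout = 120,
)
print(response.json()["choices"][0]["message"]["content"])
```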

eval/humaneval.py (+16 -6)
@@ -21,6 +21,7 @@
 parser.add_argument("-pf", "--prompt_format", type = str, help = "Instruct format to apply. Default is raw completion (for base models) ")
 parser.add_argument("-v", "--verbose", action = "store_true", help = "Spam completions to console while generating")
 parser.add_argument("-e", "--eval", action = "store_true", help = "Run evaluation script on output file after sampling")
+parser.add_argument("-temp", "--temperature", type = float, help = "Sampling temperature (0 for greedy), default: 0.6")
 model_init.add_args(parser)
 args = parser.parse_args()
 
@@ -42,7 +43,16 @@
     ),
     "granite": (
         "Question:\nComplete the following Python function:\n\n{{problem}}\n\nAnswer:\n"
-        "Sure! Here is how you might implement the function:\n\n```python\n{{problem}} ",
+        "Sure! Here is how you might implement the function:\n\n```python\n{{problem}}",
+        " "
+    ),
+    "llama": (
+        "[INST] <<SYS>>\n"
+        "You are a helpful AI coding assistant.\n"
+        "<</SYS>>\n\n"
+        "Complete the following Python function:\n\n"
+        "{{problem}} [/INST] "
+        "Sure! Here is how you might implement the function:\n\n```python\n{{problem}}",
         " "
     ),
     "llama3": (
@@ -51,14 +61,14 @@
         "<|start_header_id|>user<|end_header_id|>\n\n"
         "Complete the following Python function:\n\n{{problem}}<|eot_id|>"
         "<|start_header_id|>assistant<|end_header_id|>\n\n"
-        "Sure! Here is how you might implement the function:\n\n```python\n{{problem}} ",
+        "Sure! Here is how you might implement the function:\n\n```python\n{{problem}}",
         " "
     ),
     "gemma": (
         "<bos><start_of_turn>user\n"
         "Complete the following Python function:\n\n{{problem}}<|eot_id|>"
         "<start_of_turn>model\n"
-        "```python\n{{problem}} ",
+        "```python\n{{problem}}",
         " "
     )
 }
@@ -109,9 +119,9 @@
 
 gen_settings = ExLlamaV2Sampler.Settings(
     token_repetition_penalty = 1.0,
-    temperature = 0.8,
-    top_k = 100,
-    top_p = 0.8
+    temperature = 0.6,
+    top_k = 50,
+    top_p = 0.6
 )
 
 # Get problems
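The new "llama" entry follows the same convention as the other formats: a template containing a {{problem}} placeholder, paired with a second string element (here a single space). A small, self-contained sketch of how such a template could be filled for a HumanEval-style problem; the helper function and the sample problem are illustrative, not part of the script.

```python
# Illustrative only: filling the "llama" prompt format from the diff above.
# build_prompt and sample_problem are hypothetical names for demonstration.
llama_template = (
    "[INST] <<SYS>>\n"
    "You are a helpful AI coding assistant.\n"
    "<</SYS>>\n\n"
    "Complete the following Python function:\n\n"
    "{{problem}} [/INST] "
    "Sure! Here is how you might implement the function:\n\n```python\n{{problem}}"
)

def build_prompt(problem: str, template: str = llama_template) -> str:
    # Substitute the raw problem text (signature + docstring) into the template
    return template.replace("{{problem}}", problem)

sample_problem = 'def add(a: int, b: int) -> int:\n    """Return the sum of a and b."""\n'
print(build_prompt(sample_problem))
```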

examples/bulk_inference.py (+1 -1)
@@ -93,6 +93,7 @@
 
     # We'll always get at least one result for each active job, even if the result contains no output text
     bsz = len(set([r["identifier"] for r in results]))
+    num_tokens += bsz
 
     for result in results:
         if not result["eos"]: continue
@@ -104,7 +105,6 @@
 
         # Measure performance
         num_completions += 1
-        num_tokens += result["new_tokens"]
         elapsed_time = time.time() - time_begin
         rpm = num_completions / (elapsed_time / 60)
         tps = num_tokens / elapsed_time
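The accounting change above replaces summing result["new_tokens"] at EOS with adding the number of active jobs on every iteration. Since each iteration yields a result (typically one new token) per active job, the totals should agree, while tokens from jobs still in flight when timing stops also get counted. A toy illustration of that equivalence; this is not the real generator loop, just a counting model.

```python
# Toy counting model (not the real generator loop): one token per active job per step.
# Counting the batch size each step gives the same total as summing each job's token
# count at completion, provided every job runs to EOS.
jobs = {"a": 5, "b": 3, "c": 7}      # hypothetical job id -> tokens it will generate

per_step_total = 0                   # new scheme: num_tokens += bsz each iteration
per_completion_total = 0             # old scheme: num_tokens += new_tokens at EOS
remaining = dict(jobs)

while remaining:
    per_step_total += len(remaining)
    for ident in list(remaining):
        remaining[ident] -= 1
        if remaining[ident] == 0:
            per_completion_total += jobs[ident]
            del remaining[ident]

assert per_step_total == per_completion_total == sum(jobs.values())
```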

examples/chat.py (+18 -7)
@@ -10,6 +10,7 @@
     ExLlamaV2Cache_Q4,
     ExLlamaV2Cache_Q6,
     ExLlamaV2Cache_Q8,
+    ExLlamaV2Cache_TP,
     ExLlamaV2Tokenizer,
     model_init,
 )
@@ -94,7 +95,7 @@
 
 model_init.check_args(args)
 model_init.print_options(args)
-model, tokenizer = model_init.init(args, allow_auto_split = True, max_output_len = 16)
+model, tokenizer = model_init.init(args, allow_auto_split = True, max_output_len = 16, skip_load = True)
 
 # Initialize draft model if provided, assume it always fits on first device
 
@@ -139,24 +140,34 @@
 else:
     draft_cache = ExLlamaV2Cache(draft_model)
 
+# Load model after draft model
+
+print(" -- Loading model...")
+
+model_init.post_init_load(model, args, allow_auto_split = True)
+
 # Create cache
 
 if args.cache_8bit:
-    cache = ExLlamaV2Cache_8bit(model, lazy = not model.loaded)
+    cache_type = ExLlamaV2Cache_8bit
 elif args.cache_q4:
-    cache = ExLlamaV2Cache_Q4(model, lazy = not model.loaded)
+    cache_type = ExLlamaV2Cache_Q4
 elif args.cache_q6:
-    cache = ExLlamaV2Cache_Q6(model, lazy=not model.loaded)
+    cache_type = ExLlamaV2Cache_Q6
 elif args.cache_q8:
-    cache = ExLlamaV2Cache_Q8(model, lazy = not model.loaded)
+    cache_type = ExLlamaV2Cache_Q8
+else:
+    cache_type = ExLlamaV2Cache
+
+if model.tp_context:
+    cache = ExLlamaV2Cache_TP(model, base = cache_type)
 else:
-    cache = ExLlamaV2Cache(model, lazy = not model.loaded)
+    cache = cache_type(model, lazy = not model.loaded)
 
 # Load model now if auto split enabled
 
 if not model.loaded:
 
-    print(" -- Loading model...")
     model.load_autosplit(cache)
 
 # Chat context
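Distilled from the new cache logic above: a cache class is chosen first (quantized or not), then wrapped in ExLlamaV2Cache_TP when the model is running in tensor-parallel mode. A standalone sketch of that pattern; `model` is assumed to be an already-initialized ExLlamaV2 instance and `use_q4_cache` stands in for the script's --cache_q4 flag.

```python
# Sketch of the cache-selection pattern from the diff above, outside of chat.py.
# Assumes `model` is an initialized ExLlamaV2; `use_q4_cache` is a stand-in flag.
from exllamav2 import ExLlamaV2Cache, ExLlamaV2Cache_Q4, ExLlamaV2Cache_TP

use_q4_cache = True
cache_type = ExLlamaV2Cache_Q4 if use_q4_cache else ExLlamaV2Cache

if model.tp_context:   # truthy when the model was loaded in tensor-parallel mode (per the check above)
    cache = ExLlamaV2Cache_TP(model, base = cache_type)
else:
    cache = cache_type(model, lazy = not model.loaded)
```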

examples/inference_tp.py (new file, +86)
@@ -0,0 +1,86 @@
+
+import sys, os
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from exllamav2 import ExLlamaV2, ExLlamaV2Config, ExLlamaV2Cache_TP, ExLlamaV2Tokenizer, Timer
+from exllamav2.generator import ExLlamaV2DynamicGenerator, ExLlamaV2Sampler
+
+model_dir = "/mnt/str/models/llama3.1-70b-instruct-exl2/6.0bpw"
+config = ExLlamaV2Config(model_dir)
+config.arch_compat_overrides()
+config.no_graphs = True
+model = ExLlamaV2(config)
+
+# Load the model in tensor-parallel mode. With no gpu_split specified, the model will attempt to split across
+# all visible devices according to the currently available VRAM on each. expect_cache_tokens is necessary for
+# balancing the split, in case the GPUs are of uneven sizes, or if the number of GPUs doesn't divide the number
+# of KV heads in the model
+#
+# The cache type for a TP model is always ExLlamaV2Cache_TP and should be allocated after the model. To use a
+# quantized cache, add a `base = ExLlamaV2Cache_Q6` etc. argument to the cache constructor. It's advisable
+# to also add `expect_cache_base = ExLlamaV2Cache_Q6` to load_tp() as well so the size can be correctly
+# accounted for when splitting the model.
+
+model.load_tp(progress = True, expect_cache_tokens = 16384)
+cache = ExLlamaV2Cache_TP(model, max_seq_len = 16384)
+
+# After loading the model, all other functions should work the same
+
+print("Loading tokenizer...")
+tokenizer = ExLlamaV2Tokenizer(config)
+
+# Initialize the generator with all default parameters
+
+generator = ExLlamaV2DynamicGenerator(
+    model = model,
+    cache = cache,
+    tokenizer = tokenizer,
+)
+
+max_new_tokens = 200
+
+# Warmup generator. The function runs a small completion job to allow all the kernels to fully initialize and
+# autotune before we do any timing measurements. It can be a little slow for larger models and is not needed
+# to produce correct output.
+
+generator.warmup()
+
+# Generate one completion, using default settings
+
+prompt = "Our story begins in the Scottish town of"
+
+with Timer() as t_single:
+    output = generator.generate(
+        prompt = prompt,
+        max_new_tokens = max_new_tokens,
+        add_bos = True,
+    )
+
+print("-----------------------------------------------------------------------------------")
+print("- Single completion")
+print("-----------------------------------------------------------------------------------")
+print(output)
+print()
+
+# Do a batched generation
+
+prompts = [
+    "Once upon a time,",
+    "The secret to success is",
+    "There's no such thing as",
+    "Here's why you should adopt a cat:",
+]
+
+with Timer() as t_batched:
+    outputs = generator.generate(prompt = prompts, max_new_tokens = max_new_tokens, add_bos = True)
+
+for idx, output in enumerate(outputs):
+    print("-----------------------------------------------------------------------------------")
+    print(f"- Batched completion #{idx + 1}")
+    print("-----------------------------------------------------------------------------------")
+    print(output)
+    print()
+
+print("-----------------------------------------------------------------------------------")
+print(f"speed, bsz 1: {max_new_tokens / t_single.interval:.2f} tokens/second")
+print(f"speed, bsz {len(prompts)}: {max_new_tokens * len(prompts) / t_batched.interval:.2f} tokens/second")

exllamav2/__init__.py (+1)
@@ -7,6 +7,7 @@
 from exllamav2.cache import ExLlamaV2Cache_Q6
 from exllamav2.cache import ExLlamaV2Cache_Q8
 from exllamav2.cache import ExLlamaV2Cache_8bit
+from exllamav2.cache import ExLlamaV2Cache_TP
 from exllamav2.config import ExLlamaV2Config
 from exllamav2.tokenizer.tokenizer import ExLlamaV2Tokenizer
 from exllamav2.lora import ExLlamaV2Lora

exllamav2/architecture.py (+8 -2)
@@ -1,4 +1,4 @@
-from enum import Enum
+from enum import IntEnum
 
 # Common keys
 
@@ -94,7 +94,7 @@
                     (".attention.", ".self_attn."),
                     (".wo.", ".o_proj.")]
 
-class RopeStyle(Enum):
+class RopeStyle(IntEnum):
     NONE = 0
     GPTJ = 1
     NEOX = 2
@@ -181,6 +181,9 @@ def __init__(self, arch_string, read_config):
         # Scale attn weights (GPT2 quirk, not important for inference)
         self.scale_attn_weights = False
 
+        # Model implementation works in tensor-parallel mode
+        self.supports_tp = False
+
         # Mistral
 
         if arch_string == "MistralForCausalLM":
@@ -201,6 +204,7 @@ def __init__(self, arch_string, read_config):
             self.mlp_act_func = "silu"
             self.norm = "rmsnorm"
             self.rope_style = RopeStyle.NEOX
+            self.supports_tp = True
 
         # Mixtral
 
@@ -288,6 +292,7 @@ def __init__(self, arch_string, read_config):
             self.norm = "rmsnorm"
             self.rope_style = RopeStyle.NEOX
             self.attention_bias_qkv = True
+            self.supports_tp = True
 
         # Gemma
 
@@ -613,6 +618,7 @@ def __init__(self, arch_string, read_config):
             self.mlp_act_func = "silu"
             self.norm = "rmsnorm"
             self.rope_style = RopeStyle.NEOX
+            self.supports_tp = True
 
         # Arch overrides
 