
Commit 03255d4

Merge remote-tracking branch 'turboderp/master'
2 parents: 130ec91 + 57ee846

Note: large commits have some content hidden by default, so only a subset of the changed files is shown below.

84 files changed: +6122 -887 lines

README.md (+15 -9)
@@ -2,6 +2,12 @@
 
 ExLlamaV2 is an inference library for running local LLMs on modern consumer GPUs.
 
+The official and recommended backend server for ExLlamaV2 is [TabbyAPI](https://github.com/theroyallab/tabbyAPI/),
+which provides an OpenAI-compatible API for local or remote inference, with extended features like HF model
+downloading, embedding model support and support for HF Jinja2 chat templates.
+
+See the [wiki](https://github.com/theroyallab/tabbyAPI/wiki/1.-Getting-Started) for help getting started.
+
 
 ## New in v0.1.0+:
 
@@ -56,17 +62,17 @@ and speeds will vary across GPUs, with slow CPUs still being a potential bottleneck
 
 | Model      | Mode         | Size  | grpsz | act | 3090Ti  | 4090        |
 |------------|--------------|-------|-------|-----|---------|-------------|
-| Llama      | GPTQ         | 7B    | 128   | no  | 177 t/s | **198** t/s |
-| Llama      | GPTQ         | 13B   | 128   | no  | 109 t/s | **111** t/s |
+| Llama      | GPTQ         | 7B    | 128   | no  | 181 t/s | **205** t/s |
+| Llama      | GPTQ         | 13B   | 128   | no  | 110 t/s | **114** t/s |
 | Llama      | GPTQ         | 33B   | 128   | yes | 44 t/s  | **48** t/s  |
-| OpenLlama  | GPTQ         | 3B    | 128   | yes | 252 t/s | **283** t/s |
+| OpenLlama  | GPTQ         | 3B    | 128   | yes | 259 t/s | **296** t/s |
 | CodeLlama  | EXL2 4.0 bpw | 34B   | -     | -   | 44 t/s  | **50** t/s  |
-| Llama2     | EXL2 3.0 bpw | 7B    | -     | -   | 211 t/s | **245** t/s |
-| Llama2     | EXL2 4.0 bpw | 7B    | -     | -   | 179 t/s | **207** t/s |
-| Llama2     | EXL2 5.0 bpw | 7B    | -     | -   | 159 t/s | **170** t/s |
-| Llama2     | EXL2 2.5 bpw | 70B   | -     | -   | 33 t/s  | **37** t/s  |
-| TinyLlama  | EXL2 3.0 bpw | 1.1B  | -     | -   | 623 t/s | **730** t/s |
-| TinyLlama  | EXL2 4.0 bpw | 1.1B  | -     | -   | 560 t/s | **643** t/s |
+| Llama2     | EXL2 3.0 bpw | 7B    | -     | -   | 217 t/s | **257** t/s |
+| Llama2     | EXL2 4.0 bpw | 7B    | -     | -   | 185 t/s | **211** t/s |
+| Llama2     | EXL2 5.0 bpw | 7B    | -     | -   | 164 t/s | **179** t/s |
+| Llama2     | EXL2 2.5 bpw | 70B   | -     | -   | 33 t/s  | **38** t/s  |
+| TinyLlama  | EXL2 3.0 bpw | 1.1B  | -     | -   | 656 t/s | **770** t/s |
+| TinyLlama  | EXL2 4.0 bpw | 1.1B  | -     | -   | 602 t/s | **700** t/s |
 
 
 ## How to
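As a point of reference for the TabbyAPI recommendation added above, here is a minimal sketch of what the OpenAI-compatible API looks like from a client's perspective. The host, port, API key and model name are placeholder assumptions, not values from this commit; see the linked wiki for actual setup.

```python
# Minimal sketch: querying a running TabbyAPI instance over its OpenAI-compatible API.
# Host/port, API key and model name below are placeholder assumptions.
import requests

response = requests.post(
    "http://localhost:5000/v1/chat/completions",          # assumed local endpoint
    headers = {"Authorization": "Bearer your-api-key"},    # assumed API key
    json = {
        "model": "llama3.1-70b-instruct-exl2",             # whichever model the server has loaded
        "messages": [{"role": "user", "content": "Hello!"}],
        "max_tokens": 64,
    },
    timeout = 120,
)
print(response.json()["choices"][0]["message"]["content"])
```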

eval/humaneval.py (+16 -6)
@@ -21,6 +21,7 @@
 parser.add_argument("-pf", "--prompt_format", type = str, help = "Instruct format to apply. Default is raw completion (for base models) ")
 parser.add_argument("-v", "--verbose", action = "store_true", help = "Spam completions to console while generating")
 parser.add_argument("-e", "--eval", action = "store_true", help = "Run evaluation script on output file after sampling")
+parser.add_argument("-temp", "--temperature", type = float, help = "Sampling temperature (0 for greedy), default: 0.6")
 model_init.add_args(parser)
 args = parser.parse_args()
 
@@ -42,7 +43,16 @@
     ),
     "granite": (
         "Question:\nComplete the following Python function:\n\n{{problem}}\n\nAnswer:\n"
-        "Sure! Here is how you might implement the function:\n\n```python\n{{problem}} ",
+        "Sure! Here is how you might implement the function:\n\n```python\n{{problem}}",
+        " "
+    ),
+    "llama": (
+        "[INST] <<SYS>>\n"
+        "You are a helpful AI coding assistant.\n"
+        "<</SYS>>\n\n"
+        "Complete the following Python function:\n\n"
+        "{{problem}} [/INST] "
+        "Sure! Here is how you might implement the function:\n\n```python\n{{problem}}",
         " "
     ),
     "llama3": (
@@ -51,14 +61,14 @@
         "<|start_header_id|>user<|end_header_id|>\n\n"
         "Complete the following Python function:\n\n{{problem}}<|eot_id|>"
         "<|start_header_id|>assistant<|end_header_id|>\n\n"
-        "Sure! Here is how you might implement the function:\n\n```python\n{{problem}} ",
+        "Sure! Here is how you might implement the function:\n\n```python\n{{problem}}",
         " "
     ),
     "gemma": (
         "<bos><start_of_turn>user\n"
         "Complete the following Python function:\n\n{{problem}}<|eot_id|>"
         "<start_of_turn>model\n"
-        "```python\n{{problem}} ",
+        "```python\n{{problem}}",
         " "
     )
 }
@@ -109,9 +119,9 @@
 
 gen_settings = ExLlamaV2Sampler.Settings(
     token_repetition_penalty = 1.0,
-    temperature = 0.8,
-    top_k = 100,
-    top_p = 0.8
+    temperature = 0.6,
+    top_k = 50,
+    top_p = 0.6
 )
 
 # Get problems
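The new "llama" entry follows the same convention as the other formats: a template containing a {{problem}} placeholder, paired with a second string element (here a single space). A small, self-contained sketch of how such a template could be filled for a HumanEval-style problem; the helper function and the sample problem are illustrative, not part of the script.

```python
# Illustrative only: filling the "llama" prompt format from the diff above.
# build_prompt and sample_problem are hypothetical names for demonstration.
llama_template = (
    "[INST] <<SYS>>\n"
    "You are a helpful AI coding assistant.\n"
    "<</SYS>>\n\n"
    "Complete the following Python function:\n\n"
    "{{problem}} [/INST] "
    "Sure! Here is how you might implement the function:\n\n```python\n{{problem}}"
)

def build_prompt(problem: str, template: str = llama_template) -> str:
    # Substitute the raw problem text (signature + docstring) into the template
    return template.replace("{{problem}}", problem)

sample_problem = 'def add(a: int, b: int) -> int:\n    """Return the sum of a and b."""\n'
print(build_prompt(sample_problem))
```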

examples/bulk_inference.py (+1 -1)
@@ -93,6 +93,7 @@
 
     # We'll always get at least one result for each active job, even if the result contains no output text
     bsz = len(set([r["identifier"] for r in results]))
+    num_tokens += bsz
 
     for result in results:
         if not result["eos"]: continue
@@ -104,7 +105,6 @@
 
         # Measure performance
         num_completions += 1
-        num_tokens += result["new_tokens"]
         elapsed_time = time.time() - time_begin
         rpm = num_completions / (elapsed_time / 60)
         tps = num_tokens / elapsed_time
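The accounting change above replaces summing result["new_tokens"] at EOS with adding the number of active jobs on every iteration. Since each iteration yields a result (typically one new token) per active job, the totals should agree, while tokens from jobs still in flight when timing stops also get counted. A toy illustration of that equivalence; this is not the real generator loop, just a counting model.

```python
# Toy counting model (not the real generator loop): one token per active job per step.
# Counting the batch size each step gives the same total as summing each job's token
# count at completion, provided every job runs to EOS.
jobs = {"a": 5, "b": 3, "c": 7}      # hypothetical job id -> tokens it will generate

per_step_total = 0                   # new scheme: num_tokens += bsz each iteration
per_completion_total = 0             # old scheme: num_tokens += new_tokens at EOS
remaining = dict(jobs)

while remaining:
    per_step_total += len(remaining)
    for ident in list(remaining):
        remaining[ident] -= 1
        if remaining[ident] == 0:
            per_completion_total += jobs[ident]
            del remaining[ident]

assert per_step_total == per_completion_total == sum(jobs.values())
```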

examples/chat.py (+18 -7)
@@ -10,6 +10,7 @@
     ExLlamaV2Cache_Q4,
     ExLlamaV2Cache_Q6,
     ExLlamaV2Cache_Q8,
+    ExLlamaV2Cache_TP,
     ExLlamaV2Tokenizer,
     model_init,
 )
@@ -94,7 +95,7 @@
 
 model_init.check_args(args)
 model_init.print_options(args)
-model, tokenizer = model_init.init(args, allow_auto_split = True, max_output_len = 16)
+model, tokenizer = model_init.init(args, allow_auto_split = True, max_output_len = 16, skip_load = True)
 
 # Initialize draft model if provided, assume it always fits on first device
 
@@ -139,24 +140,34 @@
 else:
     draft_cache = ExLlamaV2Cache(draft_model)
 
+# Load model after draft model
+
+print(" -- Loading model...")
+
+model_init.post_init_load(model, args, allow_auto_split = True)
+
 # Create cache
 
 if args.cache_8bit:
-    cache = ExLlamaV2Cache_8bit(model, lazy = not model.loaded)
+    cache_type = ExLlamaV2Cache_8bit
 elif args.cache_q4:
-    cache = ExLlamaV2Cache_Q4(model, lazy = not model.loaded)
+    cache_type = ExLlamaV2Cache_Q4
 elif args.cache_q6:
-    cache = ExLlamaV2Cache_Q6(model, lazy=not model.loaded)
+    cache_type = ExLlamaV2Cache_Q6
 elif args.cache_q8:
-    cache = ExLlamaV2Cache_Q8(model, lazy = not model.loaded)
+    cache_type = ExLlamaV2Cache_Q8
+else:
+    cache_type = ExLlamaV2Cache
+
+if model.tp_context:
+    cache = ExLlamaV2Cache_TP(model, base = cache_type)
 else:
-    cache = ExLlamaV2Cache(model, lazy = not model.loaded)
+    cache = cache_type(model, lazy = not model.loaded)
 
 # Load model now if auto split enabled
 
 if not model.loaded:
 
-    print(" -- Loading model...")
     model.load_autosplit(cache)
 
 # Chat context
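Distilled from the new cache logic above: a cache class is chosen first (quantized or not), then wrapped in ExLlamaV2Cache_TP when the model is running in tensor-parallel mode. A standalone sketch of that pattern; `model` is assumed to be an already-initialized ExLlamaV2 instance and `use_q4_cache` stands in for the script's --cache_q4 flag.

```python
# Sketch of the cache-selection pattern from the diff above, outside of chat.py.
# Assumes `model` is an initialized ExLlamaV2; `use_q4_cache` is a stand-in flag.
from exllamav2 import ExLlamaV2Cache, ExLlamaV2Cache_Q4, ExLlamaV2Cache_TP

use_q4_cache = True
cache_type = ExLlamaV2Cache_Q4 if use_q4_cache else ExLlamaV2Cache

if model.tp_context:   # truthy when the model was loaded in tensor-parallel mode (per the check above)
    cache = ExLlamaV2Cache_TP(model, base = cache_type)
else:
    cache = cache_type(model, lazy = not model.loaded)
```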

examples/inference_tp.py (new file, +86)
@@ -0,0 +1,86 @@
+
+import sys, os
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from exllamav2 import ExLlamaV2, ExLlamaV2Config, ExLlamaV2Cache_TP, ExLlamaV2Tokenizer, Timer
+from exllamav2.generator import ExLlamaV2DynamicGenerator, ExLlamaV2Sampler
+
+model_dir = "/mnt/str/models/llama3.1-70b-instruct-exl2/6.0bpw"
+config = ExLlamaV2Config(model_dir)
+config.arch_compat_overrides()
+config.no_graphs = True
+model = ExLlamaV2(config)
+
+# Load the model in tensor-parallel mode. With no gpu_split specified, the model will attempt to split across
+# all visible devices according to the currently available VRAM on each. expect_cache_tokens is necessary for
+# balancing the split, in case the GPUs are of uneven sizes, or if the number of GPUs doesn't divide the number
+# of KV heads in the model
+#
+# The cache type for a TP model is always ExLlamaV2Cache_TP and should be allocated after the model. To use a
+# quantized cache, add a `base = ExLlamaV2Cache_Q6` etc. argument to the cache constructor. It's advisable
+# to also add `expect_cache_base = ExLlamaV2Cache_Q6` to load_tp() as well so the size can be correctly
+# accounted for when splitting the model.
+
+model.load_tp(progress = True, expect_cache_tokens = 16384)
+cache = ExLlamaV2Cache_TP(model, max_seq_len = 16384)
+
+# After loading the model, all other functions should work the same
+
+print("Loading tokenizer...")
+tokenizer = ExLlamaV2Tokenizer(config)
+
+# Initialize the generator with all default parameters
+
+generator = ExLlamaV2DynamicGenerator(
+    model = model,
+    cache = cache,
+    tokenizer = tokenizer,
+)
+
+max_new_tokens = 200
+
+# Warmup generator. The function runs a small completion job to allow all the kernels to fully initialize and
+# autotune before we do any timing measurements. It can be a little slow for larger models and is not needed
+# to produce correct output.
+
+generator.warmup()
+
+# Generate one completion, using default settings
+
+prompt = "Our story begins in the Scottish town of"
+
+with Timer() as t_single:
+    output = generator.generate(
+        prompt = prompt,
+        max_new_tokens = max_new_tokens,
+        add_bos = True,
+    )
+
+print("-----------------------------------------------------------------------------------")
+print("- Single completion")
+print("-----------------------------------------------------------------------------------")
+print(output)
+print()
+
+# Do a batched generation
+
+prompts = [
+    "Once upon a time,",
+    "The secret to success is",
+    "There's no such thing as",
+    "Here's why you should adopt a cat:",
+]
+
+with Timer() as t_batched:
+    outputs = generator.generate(prompt = prompts, max_new_tokens = max_new_tokens, add_bos = True)
+
+for idx, output in enumerate(outputs):
+    print("-----------------------------------------------------------------------------------")
+    print(f"- Batched completion #{idx + 1}")
+    print("-----------------------------------------------------------------------------------")
+    print(output)
+    print()
+
+print("-----------------------------------------------------------------------------------")
+print(f"speed, bsz 1: {max_new_tokens / t_single.interval:.2f} tokens/second")
+print(f"speed, bsz {len(prompts)}: {max_new_tokens * len(prompts) / t_batched.interval:.2f} tokens/second")

exllamav2/__init__.py (+1)
@@ -7,6 +7,7 @@
 from exllamav2.cache import ExLlamaV2Cache_Q6
 from exllamav2.cache import ExLlamaV2Cache_Q8
 from exllamav2.cache import ExLlamaV2Cache_8bit
+from exllamav2.cache import ExLlamaV2Cache_TP
 from exllamav2.config import ExLlamaV2Config
 from exllamav2.tokenizer.tokenizer import ExLlamaV2Tokenizer
 from exllamav2.lora import ExLlamaV2Lora

exllamav2/architecture.py (+8 -2)
@@ -1,4 +1,4 @@
-from enum import Enum
+from enum import IntEnum
 
 # Common keys
 
@@ -94,7 +94,7 @@
                     (".attention.", ".self_attn."),
                     (".wo.", ".o_proj.")]
 
-class RopeStyle(Enum):
+class RopeStyle(IntEnum):
     NONE = 0
     GPTJ = 1
     NEOX = 2
@@ -181,6 +181,9 @@ def __init__(self, arch_string, read_config):
         # Scale attn weights (GPT2 quirk, not important for inference)
         self.scale_attn_weights = False
 
+        # Model implementation works in tensor-parallel mode
+        self.supports_tp = False
+
         # Mistral
 
         if arch_string == "MistralForCausalLM":
@@ -201,6 +204,7 @@ def __init__(self, arch_string, read_config):
             self.mlp_act_func = "silu"
             self.norm = "rmsnorm"
             self.rope_style = RopeStyle.NEOX
+            self.supports_tp = True
 
         # Mixtral
 
@@ -288,6 +292,7 @@ def __init__(self, arch_string, read_config):
             self.norm = "rmsnorm"
             self.rope_style = RopeStyle.NEOX
             self.attention_bias_qkv = True
+            self.supports_tp = True
 
         # Gemma
 
@@ -613,6 +618,7 @@ def __init__(self, arch_string, read_config):
             self.mlp_act_func = "silu"
             self.norm = "rmsnorm"
             self.rope_style = RopeStyle.NEOX
+            self.supports_tp = True
 
         # Arch overrides
 